# DATA COLLECTION

In [10]:
import pandas as pd
from data_collection import read_aclImdb

In [5]:
train = read_aclImdb('train')

  0%|                                                                                        | 0/12500 [00:00<?, ?it/s]

Positive data collection progress


100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:12<00:00, 1032.47it/s]
  1%|▍                                                                             | 72/12500 [00:00<00:17, 719.39it/s]

Negative data collection progress


100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:11<00:00, 1111.36it/s]


In [17]:
train = pd.DataFrame({'movie_review': train[0], 'label': train[1]})

# DATA PREPROCESSING

In [18]:
from preprocess import preprocess

In [19]:
train['preprocess_review'] = train.loc[:,'movie_review'].apply(preprocess)

# TRAIN/VAL/TEST SPLIT

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.loc[:,'preprocess_review'], 
                                                    train.loc[:,'label'], 
                                                    test_size=0.2,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# MODEL TRAINING - BAYES

In [7]:
import collections
import nltk

VOCAB_SIZE = 2000

def text_tofeatures(text, word_features):
    text = nltk.word_tokenize(text)
    text_words = set(text)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in text_words)
    return features

In [8]:
words = []
for text in X_train:
    words += nltk.word_tokenize(text)
print('N words:  ', len(words))
all_words = nltk.FreqDist(w.lower() for w in words)
word_features = list(all_words)[:VOCAB_SIZE]

N words:   1964719


In [9]:
train_feat = [(text_tofeatures(movie_review, word_features), label) for (movie_review, label) in zip(X_train, y_train)]
val_feat = [(text_tofeatures(movie_review, word_features), label) for (movie_review, label) in zip(X_val, y_val)]

In [10]:
classifier = nltk.NaiveBayesClassifier.train(train_feat)

In [11]:
print(nltk.classify.accuracy(classifier, val_feat))

0.84025


In [12]:
classifier.show_most_informative_features(20)

Most Informative Features
         contains(waste) = True              neg : pos    =     13.8 : 1.0
       contains(unfunny) = True              neg : pos    =     11.7 : 1.0
         contains(worst) = True              neg : pos    =      9.8 : 1.0
     contains(laughable) = True              neg : pos    =      9.6 : 1.0
     contains(pointless) = True              neg : pos    =      9.3 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.8 : 1.0
     contains(redeeming) = True              neg : pos    =      8.7 : 1.0
        contains(poorly) = True              neg : pos    =      8.1 : 1.0
         contains(awful) = True              neg : pos    =      8.0 : 1.0
      contains(pathetic) = True              neg : pos    =      7.4 : 1.0
          contains(lame) = True              neg : pos    =      7.1 : 1.0
    contains(underrated) = True              pos : neg    =      7.0 : 1.0
        contains(wasted) = True              neg : pos    =      6.2 : 1.0

# TEST EVALUATION - BAYES

In [48]:
from sklearn import metrics
import pandas as pd

In [31]:
test_feat = [text_tofeatures(movie_review, word_features) for movie_review in X_test]

In [39]:
test_pred = classifier.classify_many(test_feat)

In [42]:
metrics.accuracy_score(y_test, test_pred)

0.8422

In [51]:
cf_bayes = pd.DataFrame(metrics.confusion_matrix(y_test, test_pred, labels = ['pos', 'neg']))
cf_bayes.columns = ['pos_predicted', 'neg_predicted']
cf_bayes.index = ['pos_real', 'neg_real']

In [52]:
cf_bayes

Unnamed: 0,pos_predicted,neg_predicted
pos_real,2142,373
neg_real,416,2069


# MODEL TRAINING - CONV1D

In [27]:
from featurizer import Featurizer
from featurizer import VOCAB_SIZE
from featurizer import SENTENCE_MAX_SIZE

In [22]:
featurizer = Featurizer()
#featurizer.load()
featurizer.fit(X_train)

In [23]:
X_train_feat = featurizer.apply(X_train)
X_val_feat = featurizer.apply(X_val)
y_train_feat = featurizer.y_format(y_train)
y_val_feat = featurizer.y_format(y_val)

In [24]:
import keras

In [25]:
EMBEDDED_DIM = 50

In [28]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(VOCAB_SIZE, EMBEDDED_DIM, input_length = SENTENCE_MAX_SIZE, dropout = 0.2))
model.add(keras.layers.Conv1D(filters=20, kernel_size=10, activation='relu'))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

  


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 50)          100000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 991, 20)           10020     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 495, 20)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9900)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9901      
Total params: 119,921
Trainable params: 119,921
Non-trainable params: 0
_________________________________________________________________
None


In [29]:
BATCH_SIZE = 50
TRAINING_EPOCHS = 3

In [32]:
model.fit(X_train_feat, y_train_feat, 
          batch_size = BATCH_SIZE, 
          epochs = TRAINING_EPOCHS,
          validation_data = (X_val_feat, y_val_feat),
          shuffle=True,
          verbose = 1)

Train on 16000 samples, validate on 4000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x16d5f33bfd0>

In [37]:
model.evaluate(X_val_feat, y_val_feat)



[0.340039741396904, 0.8595]

# TEST EVALUATION - CONV1D

In [62]:
X_test_feat = featurize(X_test, tokenizer)
y_test_feat = y_formatting(y_test)

In [70]:
test_pred = model.predict(X_test_feat)

In [71]:
test_pred = [1 if pred >= 0.5 else 0 for pred in test_pred]

In [72]:
metrics.accuracy_score(y_test_feat, test_pred)

0.8624

In [77]:
cf_conv1d = pd.DataFrame(metrics.confusion_matrix(y_test_feat, test_pred, labels = [1, 0]))
cf_conv1d.columns = ['pos_predicted', 'neg_predicted']
cf_conv1d.index = ['pos_real', 'neg_real']
cf_conv1d

Unnamed: 0,pos_predicted,neg_predicted
pos_real,2138,377
neg_real,311,2174
