# DATA COLLECTION

In [1]:
from data_collection import read_train_data

In [2]:
train = read_train_data()

Positive data collection progress


100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:11<00:00, 1120.91it/s]


Negative data collection progress


100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:11<00:00, 1115.71it/s]


# DATA PREPROCESSING

In [3]:
from preprocess import preprocess

In [4]:
train['preprocess_review'] = train.loc[:,'movie_review'].apply(preprocess)

# TRAIN/VAL/TEST SPLIT

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train.loc[:,'preprocess_review'], 
                                                    train.loc[:,'label'], 
                                                    test_size=0.2,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# MODEL TRAINING - BAYES

In [6]:
import collections
import nltk

VOCAB_SIZE = 2000

def text_tofeatures(text, word_features):
    text = nltk.word_tokenize(text)
    text_words = set(text)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in text_words)
    return features

In [7]:
words = []
for text in X_train:
    words += nltk.word_tokenize(text)
print('N words:  ', len(words))
all_words = nltk.FreqDist(w.lower() for w in words)
word_features = list(all_words)[:VOCAB_SIZE]

N words:   1964719


In [8]:
train_feat = [(text_tofeatures(movie_review, word_features), label) for (movie_review, label) in zip(X_train, y_train)]
val_feat = [(text_tofeatures(movie_review, word_features), label) for (movie_review, label) in zip(X_val, y_val)]

In [9]:
classifier = nltk.NaiveBayesClassifier.train(train_feat)

In [10]:
print(nltk.classify.accuracy(classifier, val_feat))

0.84025


In [11]:
classifier.show_most_informative_features(20)

Most Informative Features
         contains(waste) = True              neg : pos    =     13.8 : 1.0
       contains(unfunny) = True              neg : pos    =     11.7 : 1.0
         contains(worst) = True              neg : pos    =      9.8 : 1.0
     contains(laughable) = True              neg : pos    =      9.6 : 1.0
     contains(pointless) = True              neg : pos    =      9.3 : 1.0
   contains(wonderfully) = True              pos : neg    =      8.8 : 1.0
     contains(redeeming) = True              neg : pos    =      8.7 : 1.0
        contains(poorly) = True              neg : pos    =      8.1 : 1.0
         contains(awful) = True              neg : pos    =      8.0 : 1.0
      contains(pathetic) = True              neg : pos    =      7.4 : 1.0
          contains(lame) = True              neg : pos    =      7.1 : 1.0
    contains(underrated) = True              pos : neg    =      7.0 : 1.0
        contains(wasted) = True              neg : pos    =      6.2 : 1.0

# MODEL TRAINING - CONV1D

In [None]:
import keras
from keras import preprocessing
import numpy as np

VOCAB_SIZE = 2000
SENTENCE_MAX_SIZE = 1000

tokenizer = preprocessing.text.Tokenizer(
    num_words=VOCAB_SIZE,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True, 
    split=' ')

def featurize(reviews_list, tokenizer):
    reviews_feat = tokenizer.texts_to_sequences(reviews_list)
    reviews_feat = preprocessing.sequence.pad_sequences(reviews_feat, maxlen=SENTENCE_MAX_SIZE, dtype='int32', 
                                                        padding='pre', truncating='post', value=0)
    return reviews_feat

def y_formatting(y_data):
    y = []
    for label in y_data:
        y.append(1 if label == 'pos' else 0)
    return np.array(y)
        

In [7]:
tokenizer.fit_on_texts(X_train)
X_train_feat = featurize(X_train, tokenizer)
X_val_feat = featurize(X_val, tokenizer)

In [8]:
y_train_feat = y_formatting(y_train)
y_val_feat = y_formatting(y_val)

In [9]:
EMBEDDED_DIM = 50
INPUT_LENGTH = X_train_feat.shape[1]

In [10]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(VOCAB_SIZE, EMBEDDED_DIM, input_length = INPUT_LENGTH, dropout = 0.2))
model.add(keras.layers.Conv1D(filters=20, kernel_size=10, activation='relu'))
model.add(keras.layers.MaxPooling1D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 50)          100000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 991, 20)           10020     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 495, 20)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 9900)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 9901      
Total params: 119,921
Trainable params: 119,921
Non-trainable params: 0
_________________________________________________________________
None


  


In [11]:
BATCH_SIZE = 50
TRAINING_EPOCHS = 3

In [12]:
model.fit(X_train_feat, y_train_feat, 
          batch_size = BATCH_SIZE, 
          epochs = TRAINING_EPOCHS,
          validation_data = (X_val_feat, y_val_feat),
          verbose = 1)

Train on 16000 samples, validate on 4000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x18e2f3be0f0>

In [13]:
model.evaluate(X_val_feat, y_val_feat)



[0.35925557315349577, 0.85225]