In [10]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, GlobalMaxPooling1D

In [12]:
# Preprocessor to load your documents
import glob
import os
from random import shuffle

In [23]:
def preprocess_data(filepath):
    """Load labelled text samples from the ``pos`` and ``neg`` sub-folders.

    Every ``*.txt`` file found directly under ``filepath/pos`` is paired with
    label 1 and every one under ``filepath/neg`` with label 0.  Files are read
    as UTF-8.  The combined list is shuffled in place with ``random.shuffle``
    before it is returned, so the order depends on the global random state.

    Args:
        filepath: Directory containing the ``pos`` and ``neg`` sub-folders.

    Returns:
        list: ``(label, text)`` tuples in shuffled order.
    """
    # Positive samples are read first, then negative ones.
    labelled_dirs = (
        (1, os.path.join(filepath, 'pos')),
        (0, os.path.join(filepath, 'neg')),
    )

    samples = []
    for label, directory in labelled_dirs:
        pattern = os.path.join(directory, '*.txt')
        for path in glob.glob(pattern):
            with open(path, 'r', encoding="utf8") as handle:
                samples.append((label, handle.read()))

    shuffle(samples)
    return samples

In [27]:
# Read the labelled reviews under the aclImdb training folder, then display a
# small slice as a sanity check.
dataset = preprocess_data("../Data/aclImdb/train")
dataset[1:5]

[(1,
  "Father of the Pride was the best new show to hit television since Family Guy. It was yet another masterpiece from the talented people at Dreamworks Animation. Like The Simpsons, the show centers around a nuclear family (of white lions, in this case). It also contains many memorable supporting characters including Roger the surly orangutan, Vincent the Italian-American flamingo, the eccentric white tigers Blake and Victoria, the faux patriotic Snout Brothers and Chutney the elephant. The other stars of the show are the Sigfreid and Roy. They are incredibly eccentric and do everything in a grandiose manner, making the most mundane activities entertaining. The combination of cute animal characters with very adult dialog and controversial issues (drugs, prejudice, etc) is the source of the program's brilliance.<br /><br />The blame for this show's failure lies with NBC. They opted to broadcast the episodes in no particular order (perhaps being influenced by which guest stars they c

In [28]:
# Vectorizer and tokenizer
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# Load the word2vec vectors from the binary word2vec-format file.
word_vectors = KeyedVectors.load_word2vec_format(
    '../Data/GoogleNews-vectors-negative300.bin.gz',
    binary=True,
)

In [82]:
def tokenize_and_vectorize(dataset):
    """Turn each sample's text into a list of word vectors.

    The text of a sample is read from ``sample[1]``, split into tokens with
    NLTK's ``TreebankWordTokenizer`` and every token is looked up in the
    notebook-level ``word_vectors`` object.  Tokens whose lookup raises
    ``KeyError`` are skipped, so a sample can yield fewer vectors than it has
    tokens (possibly none at all).

    Args:
        dataset: Iterable of indexable samples whose element 1 is the text.

    Returns:
        list: One list of vectors per sample, in the same order as
        ``dataset``.  Labels are not returned; callers extract them
        separately.
    """
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []

    for sample in dataset:
        sample_vecs = []
        for token in tokenizer.tokenize(sample[1]):
            try:
                sample_vecs.append(word_vectors[token])
            except KeyError:
                # Deliberate best-effort: the token has no entry in the
                # loaded word-vector vocabulary, so it is dropped.
                continue
        vectorized_data.append(sample_vecs)

    return vectorized_data

In [83]:
# Features: one list of word vectors per sample.
vectorized_data = tokenize_and_vectorize(dataset)

# Targets: the label stored as the first element of every sample.
expected = [labelled_sample[0] for labelled_sample in dataset]

In [86]:
# Train
from sklearn import model_selection

# Features and labels must have the same number of entries.
print(len(vectorized_data), len(expected), sep="\n")

25000
25000


In [87]:
# Hold out 20% of the samples; the fixed random_state makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = model_selection.train_test_split(
    vectorized_data,
    expected,
    test_size=0.2,
    random_state=42,
)

In [88]:
# CNN parameters

In [89]:
maxlen = 400           # number of vectors every sample is padded/truncated to
batch_size = 32        # passed to model.fit
embedding_dims = 300   # length of each word vector (last axis of the model input)
filters = 250          # first argument of the Conv1D layer
kernel_size = 3        # second argument of the Conv1D layer
hidden_dims = 250      # units of the hidden Dense layer
epochs = 2             # passed to model.fit

In [90]:
def pad_trunc(data, maxlen):
    """Pad with zero vectors or truncate every sample to exactly ``maxlen``.

    Unlike a naive in-place implementation, the input samples are never
    modified: each returned sample is a fresh list.

    Args:
        data: Sequence of samples, each a sequence of equal-length vectors.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        list: One list of ``maxlen`` vectors per input sample.  Samples that
        were too long keep their first ``maxlen`` vectors; samples that were
        too short are extended with zero vectors.  An empty ``data`` yields
        an empty list.
    """
    # Take the vector length from the first non-empty sample, so an empty
    # first sample (or empty data) does not raise IndexError.
    vector_len = next(
        (len(sample[0]) for sample in data if len(sample) > 0),
        0,
    )

    # One zero vector is shared by all padding slots on purpose: making a
    # copy per slot would multiply memory use for no benefit.  Treat it as
    # read-only.
    zero_vector = [0.0] * vector_len

    new_data = []
    for sample in data:
        # Slicing copies, so the caller's sample is left untouched.
        fitted = list(sample[:maxlen])
        missing = maxlen - len(fitted)
        if missing > 0:
            fitted.extend([zero_vector] * missing)
        new_data.append(fitted)
    return new_data

In [91]:
# Bring every sample of both splits to exactly `maxlen` vectors.
X_train, X_test = pad_trunc(X_train, maxlen), pad_trunc(X_test, maxlen)

In [92]:
# Both splits were already padded/truncated to `maxlen` in the preceding cell,
# so the copy-pasted second pad_trunc pass (a no-op re-scan of every sample)
# is not repeated here.

# Convert to arrays shaped (samples, maxlen, embedding_dims) for Conv1D;
# np.reshape raises if the element count does not match that shape.
X_train = np.reshape(X_train, (len(X_train), maxlen, embedding_dims))
y_train = np.array(y_train)
X_test = np.reshape(X_test, (len(X_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [97]:
# BUILD MODEL
model = Sequential()
model.add(Conv1D(
    filters,
    kernel_size,
    padding='valid',
    activation='relu',
    strides=1,
    input_shape=(maxlen, embedding_dims)
))

In [98]:
# Save model (the architecture JSON and the weights are written after training, in the next cell)

In [99]:
# Global max pooling over the convolution output.
model.add(GlobalMaxPooling1D())

# Plain fully connected hidden layer; dropout is applied before the ReLU.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# Output: one unit squashed by a sigmoid.
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train; the held-out split is passed as validation data.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# Persist the architecture (JSON) and the weights (HDF5) as separate files.
model_structure = model.to_json()
with open("cnn_model.json", "w") as structure_file:
    structure_file.write(model_structure)

model.save_weights("cnn_weights.h5")

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
