In [1]:
import numpy as np
import pandas as pd
import scipy as sp

from scipy import sparse

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from tqdm import tqdm

import string
import re
import glob

import keras
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Embedding, LSTM, Dropout
from keras.models import Sequential, load_model

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

# keras architecture visualization
from keras.utils import plot_model
from keras.utils.vis_utils import model_to_dot

from IPython.display import SVG

import matplotlib.pyplot as plt

print('Keras version: %s' % keras.__version__)

PATH = "data/aclImdb"

Using TensorFlow backend.


Keras version: 2.1.5


In [2]:
# read files in the given tree, using subfolders as the target classes
def read_files(folder, subfolders):
    corpus, labels = [], []
    for index, label in enumerate(subfolders):
        path = '/'.join([folder, label, '*.txt'])
        for filename in glob.glob(path):
            corpus.append(open(filename, 'r').read())
            labels.append(index)
    return corpus, np.array(labels).astype(np.int)


In [3]:
#coprus_train_pos = [open(filename, 'r').read() for filename in glob.glob(PATH + '/train/pos/*.txt')]
#coprus_train_neg = [open(filename, 'r').read() for filename in glob.glob(PATH + '/train/neg/*.txt')]
corpus_train, y_train = read_files(PATH + '/train', ['neg', 'pos'])
corpus_test, y_test = read_files(PATH + '/test', ['neg', 'pos'])

In [4]:
len(corpus_train), len(y_train), corpus_train[0], y_train[0], corpus_train[24999], y_train[24999]

(25000,
 25000,
 'Hi, Everyone, If you saw "Singing in the Rain," you remember the scene of Gene Kelly dancing in the rain. You also remember the dance number of Donald O\'Connor, "Make \'em Laugh." If you saw "Royal Wedding," you will remember Fred Astaire dancing on the ceiling. If you saw "Jailhouse Rock," you will even remember the title dance number choreographed by The King himself.<br /><br />That is what is missing here. There could have been some blockbuster dance numbers in this presentation. The closest was Chuck McGowan\'s "I Can Do That." the mere fact that you have some talented people on stage moving together does not make a great dance film. Richard Attenborough was to blame for this failure. He pointed the camera at the stage and thought that would be a good thing.<br /><br />Yelling at people auditioning for a part in a Broadway production is not entertainment. Michael Douglas would be just as badly cast if he were in a Western or a comedy. He is OK when he is in a Mi

In [5]:
len(corpus_test), len(y_test), corpus_test[0], y_test[0]

(25000,
 25000,
 'Yes, in this movie you are treated to multiple little snowmen on the attack in apparently a very warm climate so yes this movie is definitely not to be taken seriously. It is in fact a much worse movie than the original as at least with that one the whole production looked like it cost more than a couple of bucks and a video camera to make. It has its funny moments, but really playing off the cheapness of your movie and making that be your intended laughs is kind of weak film making if you ask me. You can not come up with a good story, your effects are going to really be bad, hey let us just make the movie look as bad as possible with horrible one liners and we have our movie. The first one at least had a somewhat credible story as the snowman in that one attacked during the winter and not what amounts to a resort. It also had better effects too, this one is just a step or two ahead of "Hobgoblins" as far as the monsters are concerned and you really want to be more th

In [6]:
# Vectorizing training/test sequence data.
class Vectorizer():
    def __init__(self, vocab_size, max_len):
        self.vocab_size = vocab_size
        self.max_len = max_len
        # init tokenizer
        self.tokenizer = Tokenizer(num_words=vocab_size)

    # this is old don't use this
    def __tranform(self, corpus):
        sequences, lengths = [], []
        # transform word sequences into indices sequences
        for sentence in tqdm(corpus_train):
            encoded = one_hot(sentence, self.vocab_size)
            sequences.append(encoded)
            lengths.append(len(encoded))
        # pad sequences to have one length
        sequences = pad_sequences(sequences, maxlen=self.max_len)
        return np.array(sequences), lengths
    
    def fit_transform(self, corpus):
        """Use this with training set to initialzer the tokenizer word dictionnary
        """
        # train tokenizer on training corpus
        self.tokenizer.fit_on_texts(corpus)
        return self.transform(corpus)
    
    def transform(self, corpus):
        """Use this with test set
        """
        # generate sequences of indices
        sequences = self.tokenizer.texts_to_sequences(corpus)
        # pad sequences to have one length
        sequences = pad_sequences(sequences, maxlen=self.max_len, padding='post')
        return sequences
    
    @property
    def get_vocab_size(self):
        return self.vocab_size
    
    @property
    def get_sentence_max_len(self):
        return self.max_len
    
    def get_words(self):
        """Get the list of words learned by this tokenizer
        """
        return [word for word, index in self.tokenizer.word_index.items()]

In [7]:
# generate the index sequences
indexer = Vectorizer(vocab_size=50000, max_len=200)
term_idx_train = indexer.fit_transform(corpus_train)
term_idx_test = indexer.transform(corpus_test)

In [8]:
term_idx_train.shape, term_idx_test.shape

((25000, 200), (25000, 200))

In [9]:
term_idx_train[0]

array([  374,     1,   422,   833,   608,  4311,    31,     1,   708,
         306,     7,     7,    12,     6,    48,     6,  1009,   130,
          47,    97,    25,    74,    46,  2640,   833,  1391,     8,
          11,  2974,     1,  4483,    13,  3432, 40830,    10,    67,
          78,    12,     1,  2682,   189,    12,    22,    25,    46,
        1016,    81,    20,   864,   724,   292,   124,    21,    94,
           3,    84,   833,    19,   742,  6259,    13,     5,  1816,
          15,    11,  2095,    26,  3366,     1,   367,    30,     1,
         864,     2,   194,    12,    59,    27,     3,    49,   151,
           7,     7,  4558,    30,    81, 11926,    15,     3,   170,
           8,     3,  2130,   362,     6,    21,   718,   485,  1763,
          59,    27,    40,    14,   906,   174,    44,    26,    68,
           8,     3,  1005,    39,     3,   209,    26,     6,   605,
          51,    26,     6,     8,     3,   485,  1763,    17,   118,
          72,    64,

In [10]:
# input params
vocab_size             = indexer.get_vocab_size
word_embed_vector_size = 16
sentence_len_max       = indexer.get_sentence_max_len
epochs                 = 100
batch_size             = 1024

# load pre-trained embedding GloVe https://nlp.stanford.edu/projects/glove/

# Keras model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=word_embed_vector_size, input_length=sentence_len_max))
model.add(Dropout(0.04))
model.add(LSTM(units=64))
model.add(Dropout(0.02))
model.add(Dense(1, activation='sigmoid'))

In [11]:
# compile the model
adam = Adam(lr=1e-4)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy', 'binary_accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 16)           800000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200, 16)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                20736     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 820,801
Trainable params: 820,801
Non-trainable params: 0
_________________________________________________________________


In [12]:
# store the network architecture visualization graph to disk 
#plot_model(model, to_file='model.png', show_shapes=True)

# obtain the pydot.Graph object and render it
#SVG(model_to_dot(model).create(prog='dot', format='svg'))

In [13]:
# combine all data
X = np.concatenate((term_idx_train, term_idx_test), axis=0)
y = np.append([], [y_train, y_test])

print('vocab_size=%s sentence_len_max=%s training=%s label=%s' % (vocab_size, sentence_len_max, X.shape, len(y)))

vocab_size=50000 sentence_len_max=200 training=(50000, 200) label=50000


In [14]:
# save callback
ckpt_callback = ModelCheckpoint('sentiment_weights.{epoch:02d}-{val_loss:.2f}.hdf5', 
                                 monitor='val_loss', 
                                 verbose=1, 
                                 save_best_only=True, 
                                 mode='auto')

# train model
history = model.fit(
    x                = X,
    y                = y,
    epochs           = epochs,
    batch_size       = batch_size,
    validation_split = 0.2,
    callbacks        = [ckpt_callback],
    verbose          = 1
)

Train on 40000 samples, validate on 10000 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.73487, saving model to sentiment_weights.01-0.73.hdf5
Epoch 2/100

Epoch 00002: val_loss did not improve
Epoch 3/100

Epoch 00003: val_loss did not improve
Epoch 4/100

Epoch 00004: val_loss did not improve
Epoch 5/100

Epoch 00005: val_loss did not improve
Epoch 6/100

Epoch 00006: val_loss did not improve
Epoch 7/100

Epoch 00007: val_loss did not improve
Epoch 8/100

Epoch 00008: val_loss did not improve
Epoch 9/100

Epoch 00009: val_loss did not improve
Epoch 10/100

Epoch 00010: val_loss did not improve
Epoch 11/100

Epoch 00011: val_loss did not improve
Epoch 12/100

Epoch 00012: val_loss did not improve
Epoch 13/100

Epoch 00013: val_loss did not improve
Epoch 14/100

Epoch 00014: val_loss improved from 0.73487 to 0.72885, saving model to sentiment_weights.14-0.73.hdf5
Epoch 15/100

Epoch 00015: val_loss did not improve
Epoch 16/100

Epoch 00016: val_loss did not improve
E


Epoch 00034: val_loss did not improve
Epoch 35/100

Epoch 00035: val_loss did not improve
Epoch 36/100

Epoch 00036: val_loss did not improve
Epoch 37/100

Epoch 00037: val_loss did not improve
Epoch 38/100

Epoch 00038: val_loss did not improve
Epoch 39/100

Epoch 00039: val_loss did not improve
Epoch 40/100

Epoch 00040: val_loss did not improve
Epoch 41/100

Epoch 00041: val_loss did not improve
Epoch 42/100

Epoch 00042: val_loss did not improve
Epoch 43/100

Epoch 00043: val_loss did not improve
Epoch 44/100

Epoch 00044: val_loss did not improve
Epoch 45/100

Epoch 00045: val_loss did not improve
Epoch 46/100

Epoch 00046: val_loss did not improve
Epoch 47/100

KeyboardInterrupt: 

In [15]:
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation binary accuracy values
plt.plot(history.history['binary_accuracy'])
plt.plot(history.history['val_binary_accuracy'])
plt.title('Model binary accuracy')
plt.ylabel('Binary Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

NameError: name 'history' is not defined

In [17]:
y_pred_lstm = model.predict(x=term_idx_train, batch_size=batch_size)
accuracy_train_lstm = (y_train == y_pred_lstm).mean()

In [18]:
pd.DataFrame(y_pred_lstm).describe()

Unnamed: 0,0
count,25000.0
mean,0.499034
std,0.000301
min,0.497457
25%,0.499093
50%,0.499093
75%,0.499108
max,0.500543


In [19]:
y_test_pred_lstm = model.predict(x=term_idx_test, batch_size=batch_size)
accuracy_test_lstm = (y_test == y_test_pred_lstm).mean()

In [20]:
pd.DataFrame(y_pred_lstm).describe()

Unnamed: 0,0
count,25000.0
mean,0.499034
std,0.000301
min,0.497457
25%,0.499093
50%,0.499093
75%,0.499108
max,0.500543


In [22]:
embeddings = model.layers[0].get_weights()
embeddings

In [27]:
words = indexer.get_words()

[[5, 9], [5, 8], [5, 8], [3, 8], [1], [6], [5, 8], [7, 5], [5, 8], [6, 3, 9, 7]]
[[5 9 0 0 0]
 [5 8 0 0 0]
 [5 8 0 0 0]
 [3 8 0 0 0]
 [1 0 0 0 0]
 [6 0 0 0 0]
 [5 8 0 0 0]
 [7 5 0 0 0]
 [5 8 0 0 0]
 [6 3 9 7 0]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 5, 32)             320       
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 161       
Total params: 481
Trainable params: 481
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 0.800


In [None]:
# conclusion, we're doomed to fail, not able to train the embedding layer
# accuracy is stuck at a local minima but why always same? (optimizer is not improving from one epoch to the next)
# I will retry by hot encoding 'y', also check this https://jovianlin.io/embeddings-in-keras/ 