In [1]:
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dropout
from math import floor
import torch
import sys

Using TensorFlow backend.


In [2]:
# pipe output to text file. This has to be done as we loose socket connection if output of RNN model is piped
# to std_out
old_stdout = sys.stdout
sys.stdout = open('RNN_word_embedding_model_output.txt', 'w')

In [3]:
# check that gpu is available
torch.cuda.is_available()

True

In [4]:
'''
An RNN (architecture details mentioned in docstring) is used to perform sentiment classification for movie reviews.
The data source is Kaggle IMDB movie review data (https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data).
As a preprocessing step, we use word-embedding (Facebook's Glove embedding) as inputs to the RNN. 
This ensures that we do not use 1-hot sparse vectors as inputs to RNN.
'''

"\nAn RNN (architecture details mentioned in docstring) is used to perform sentiment classification for movie reviews.\nThe data source is Kaggle IMDB movie review data (https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data).\nAs a preprocessing step, we use word-embedding (Facebook's Glove embedding) as inputs to the RNN. \nThis ensures that we do not use 1-hot sparse vectors as inputs to RNN.\n"

In [5]:
'''
Loads the IMDB data from Kaggle (https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data). The numbers range from 0 (most negative) to 5 (most positive). 
This functions threholds the label to be 1 if (label >=3), else 0

Parameters:
    filename: the relative path of the dataset
'''
def getData(filename):
    fp = open(filename)
    line = fp.readline()

    cnt = 1
    x=[]
    y=[]
    max_length = 0
    while line:
       lst = line.strip().split()    
       # first two items in the line are indices
       temp_sentence = lst[2:-1]
       max_length = max(max_length,len(temp_sentence))

       if len(temp_sentence) != 0:
            sentence = ' '.join(temp_sentence[:-1])
            category = lst[-1]
            x.append(sentence)
            y.append(category)
       line = fp.readline()
       cnt += 1
    fp.close()
    assert(len(x) == len(y))
    # remove the headers from x and y
    x = x[1:]
    y = y[1:]

    # modify Y to be 0 or 1
    # 1 - positive, 0 - negative
    y = [1 if int(i) >= 3 else 0 for i in y]

    return (x, y, max_length)


In [6]:
'''
Preprocess the data: (1) tokenize the data (2) Pad the data to ensure inputs are of the same length when
sent to RNN

Parameters:
    x: the vector of text that is tokenized
    max_length: the maximum length of a sequence of words from vector x 
'''

def preprocess(x, max_length):

    # prepare tokenizer
    t = Tokenizer()
    t.fit_on_texts(x)
    vocab_size = len(t.word_index) + 1

    # integer (unique) encode the documents
    # pad documents to a max_length words
    encode_x = t.texts_to_sequences(x)
    padded_x = pad_sequences(encode_x, maxlen=max_length, padding='post')

    return padded_x, vocab_size, t

In [7]:
'''
Wrapper function that (1) Loads the IMDB data (2) Preprocesses the data 
(3) Creates an embedding matrix using Facebook's glove embedding
'''

def model():
  
    x_train, y_train, max_length_train = getData('data/train.tsv')

    # use 10% of the data as test data at the end
    split = floor(len(x_train)*0.9)
    temp_x_train = x_train
    temp_y_train = y_train
    
    # preprocess the training data for RNN computation
    # padding the data ensures the length of each input fed to RNN is equal
    x_train = temp_x_train[:split]
    y_train = temp_y_train[:split]
    padded_x_train, vocab_size_train, train_tokenizer = preprocess(x_train, max_length_train)

    # similar preprocessing for test input
    x_test = temp_x_train[split:]
    y_test = temp_y_train[split:]

    # pad by the test data by the same amount as training data
    # to ensure model input matches
    padded_x_test, _ ,_ = preprocess(x_test, max_length_train)
    
    # load the glove embedding into memory
    # From glove, 400000 word vectors loaded into memory
    embeddings_index = dict()
    fp = open('data/glove.6B.100d.txt')
    for line in fp:
        values = line.split()
        word = values[0]
        coefs = asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    fp.close()
    print('Loaded %s word vectors.' % len(embeddings_index))

    # create a weight matrix for words in training docs
    # the dimension of the matrix is (training vocab size x 100)
    embedding_matrix = zeros((vocab_size_train, 100))
    
    total_words = 0    
    found_words = 0
    for word, i in train_tokenizer.word_index.items():
        total_words += 1
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            found_words += 1
            embedding_matrix[i] = embedding_vector

    print("Percent of words found from the embedding matrix for training data: ", (found_words/total_words))
    
    return (vocab_size_train, embedding_matrix, max_length_train, padded_x_train, y_train, padded_x_test, y_test)


In [8]:
'''
Defines a RNN architecture: 2 layer bi-directional LSTM with sigmoid output for sentiment binary classification

Hyper-parameters:
    Dropout: 0.5 dropout for the nodes in a given layer, if applied
    Hidden size: 100
    num_epochs: 3
    Validation data split: 20% of training data such that model is trained on 80% of data and validated on 20% in each epoch

Parameters:
    vocab_size_train: the size (# words) of the vocabulary being trained
    
    embedding_matrix: the 100-dimensional Glove embedding used as inputs to the RNN
    
    max_length_train: the maximum length of a sequence of words from the vector being trained on
    
    padded_x_train: the padded training data, done to ensure the inputs feed to the RNN a constant length
    
    y_train: The labels for the training data
    
    padded_x_test: the padded test (hold-out) data, done to ensure the inputs feed to the RNN a constant length
    
    y_test: the labels for the test data 
    
    dropout: A boolean flag to embed drop-out in the RNN
'''
def build_and_run_model(vocab_size_train, embedding_matrix, max_length_train, padded_x_train, y_train, padded_x_test, y_test, dropout=False):
    
    print("RNN model with dropout") if dropout else print("RNN model without dropout")
    
    # define hyperparameters used for model
    use_dropout = dropout
    model = Sequential()
    hidden_size = 100
    num_epochs = 3
    

    e = Embedding(vocab_size_train, hidden_size, weights=[embedding_matrix], input_length=max_length_train, trainable=False)
    model.add(e)
    model.add(LSTM(hidden_size, return_sequences=True))
    model.add(LSTM(hidden_size, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    if use_dropout:
        model.add(Dropout(0.5))
    
    # print summary of model
    print("Model summary: ")
    print(model.summary())

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    hist = model.fit(padded_x_train, y_train, epochs=num_epochs, validation_split = 0.2, verbose=1)
    
    scores = model.evaluate(padded_x_test, y_test, verbose=1)
    print(hist.history)
    print('Test accuracy: ', scores[1])

In [9]:
(vocab_size_train, embedding_matrix, max_length_train, padded_x_train, y_train, padded_x_test, y_test) = model()
# run the model without dropout
build_and_run_model(vocab_size_train, embedding_matrix, max_length_train, padded_x_train, y_train, padded_x_test, y_test, dropout=False)

In [10]:
# run the model with dropout to compare performance
build_and_run_model(vocab_size_train, embedding_matrix, max_length_train, padded_x_train, y_train, padded_x_test, y_test, dropout=True)

In [11]:
# close piping std_out to file
sys.stdout = old_stdout

In [1]:
!cat RNN_word_embedding_model_output.txt

Loaded 194504 word vectors.
Percent of words found from the embedding matrix for training data:  0.9562231164145969
RNN model without dropout
Model summary: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 52, 100)           1441500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 52, 100)           80400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 101       
Total params: 1,602,401
Trainable params: 160,901
Non-trainable params: 1,441,500
_________________________________________________________________
None
Train on 112362 samples, validate on 28091 samples
Epoch 1/3


Epoch 2/3




Epoch 3/3


{'val_loss': [0.4482373547188635, 0.4421253055710718, 0.459706536178671], 'val_acc': [0.8081235982970197, 0.8119326474586729, 0.8101171193535859], 'loss': [0.46026072596708384, 0.42070968814238097, 0.3934688964291281], 'acc': [0.8014008294629786, 0.8252968085295741, 0.8379968316680086]}
Test accuracy:  0.6846084839546358
RNN model with dropout
Model summary: 
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 52, 100)           1441500   
_________________________________________________________________
lstm_3 (LSTM)                (None, 52, 100)           80400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
_______________________________



Epoch 2/3




Epoch 3/3




{'val_loss': [0.5482055095468436, 0.5559732835717439, 0.5513360203299126], 'val_acc': [0.7203374746253952, 0.7203374746253952, 0.7203018760350994], 'loss': [2.428531735244824, 2.558720492150784, 2.3838787364504785], 'acc': [0.7526209928644775, 0.7593047471575729, 0.7704384044449051]}
Test accuracy:  0.7359348968727443


In [2]:
# The final Test accuracy for training the model with the architecture is 73%.