Set the seed

In [0]:
import numpy as np

# Seed NumPy's global random generator so that NumPy-driven randomness
# repeats from run to run.
SEED = 42
np.random.seed(SEED)

Data can be downloaded from Kaggle -> https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [0]:
import pandas as pd

# Tab-separated file whose first row holds the column names.
# quoting=3 is the value of csv.QUOTE_NONE: quote characters inside the
# reviews are kept as ordinary text instead of being parsed as delimiters.
train_path = 'kaggle/labeledTrainData.tsv.zip'  #filepath
df = pd.read_csv(train_path, sep="\t", header=0, quoting=3)

In [0]:
# Show the (rows, columns) shape of the loaded DataFrame as a quick sanity check.
df.shape

## Data Preprocessing

1. Split the data into training and test sets

In [0]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts, targets are the sentiment labels.
reviews, labels = df['review'], df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42)

2. Build a Tokenizer to convert each review into a sequence of word indices

In [0]:
# Import through the public `tensorflow.keras` namespace. `tensorflow.python.*`
# is a private implementation path that is not guaranteed to exist across
# TensorFlow releases.
from tensorflow.keras.preprocessing.text import Tokenizer

#Vocab size
top_words = 10000

# Learn the vocabulary from the training reviews only, so that no information
# from the test reviews leaks into the word index.
t = Tokenizer(num_words=top_words)
t.fit_on_texts(X_train.tolist())

#Get the word index for each of the word in the review
# NOTE: this overwrites the raw review text held in X_train / X_test, so this
# cell cannot be re-run on its own; re-run the train/test split cell first.
X_train = t.texts_to_sequences(X_train.tolist())
X_test = t.texts_to_sequences(X_test.tolist())

3. Pad sequences so that every review has the same length

In [0]:
# Import through the public `tensorflow.keras` namespace. `tensorflow.python.*`
# is a private implementation path that is not guaranteed to exist across
# TensorFlow releases.
from tensorflow.keras.preprocessing import sequence

#Each review size
max_review_length = 300

# Bring every review to exactly max_review_length indices; padding='post'
# appends the padding value after the review's own tokens.
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length, padding='post')

## Build Embedding Matrix from Pre-Trained Word2Vec model

Load pre-trained Gensim Embeddings

In [0]:
#Install gensim
!pip install gensim --quiet

#Load pre-trained model
import gensim
word2vec = gensim.models.Word2Vec.load('word2vec-movie-50')

#Embedding Length
embedding_vector_length = word2vec.wv.vectors.shape[1]

print('Loaded word2vec model..')
print('Model shape: ', word2vec.wv.vectors.shape)

Build matrix for current data

In [0]:
#Initialize embedding matrix to all zeros
# Row i will hold the pre-trained vector of the word whose tokenizer index is
# i; rows for words missing from the word2vec model stay all-zero.
embedding_matrix = np.zeros((top_words + 1, #Vocabulary size + 1
                             embedding_vector_length))

#Steps for populating embedding matrix

#1. Check each word in tokenizer vocabulary to see if it exists in pre-trained
# word2vec model.
#2. If found, update embedding matrix with embeddings for the word
# from word2vec model

# Walk the vocabulary in ascending index order so the loop can stop as soon
# as an index falls outside the matrix.
for word, i in sorted(t.word_index.items(), key=lambda x: x[1]):
    if i > top_words:
        break
    # Membership is tested on the KeyedVectors object itself: the `.vocab`
    # attribute was removed in Gensim 4, while `in` works on 3.x and 4.x.
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

In [0]:
#Check embeddings for word 'great'
# Sanity check: display the matrix row stored at the tokenizer index of
# 'great'. An all-zero row means the word was not found in the word2vec model,
# because the matrix is zero-initialised.
embedding_matrix[t.word_index['great']]

## Build the Graph

In [0]:
# Import through the public `tensorflow.keras` namespace. `tensorflow.python.*`
# is a private implementation path that is not guaranteed to exist across
# TensorFlow releases.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Embedding, Flatten, LSTM

#Build a sequential model
# Layers are appended to this (initially empty) stack in the cells below.
model = Sequential()

Add Embedding layer

In [0]:
# Look-up layer that maps each word index to its pre-trained vector.
embedding_layer = Embedding(
    input_dim=top_words + 1,              # number of rows in embedding_matrix
    output_dim=embedding_vector_length,   # size of each word vector
    input_length=max_review_length,       # every review is padded to this length
    weights=[embedding_matrix],           # start from the pre-trained embeddings
    trainable=False,                      # keep the embeddings frozen during training
)
model.add(embedding_layer)

The output of the Embedding layer is 3-dimensional:
- batch_size x max_review_length x embedding_vector_length. 

In [0]:
# Recurrent layer with 100 LSTM memory units.
recurrent_layer = LSTM(
    units=100,
    dropout=0.2,            # fraction dropped for the transformation of the inputs
    recurrent_dropout=0.2,  # fraction dropped for the transformation of the recurrent state
)
model.add(recurrent_layer)

# Output layer: a single sigmoid unit, i.e. one value in (0, 1) per review.
model.add(Dense(units=1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported alongside the loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

## Execute the graph

In [0]:
# One pass over the training data in mini-batches of 128 reviews; the held-out
# split is scored at the end of the epoch.
fit_options = dict(
    epochs=1,
    batch_size=128,
    validation_data=(X_test, y_test),
    verbose=1,
)
model.fit(X_train, y_train, **fit_options)

In [0]:
# Score two held-out reviews (rows 100 and 101 of X_test). With the sigmoid
# output unit each prediction lies in (0, 1) — presumably the probability
# that sentiment == 1; confirm against the dataset's label definition.
model.predict(X_test[100:102])

In [0]:
# Print the layer-by-layer overview of the model built above.
model.summary()