Set the seed

In [0]:
import numpy as np

# Fix the state of NumPy's global random generator so that any
# NumPy-based randomness is repeatable from run to run.
SEED = 42
np.random.seed(SEED)

The data can be downloaded from Kaggle at the following URL:

- https://www.kaggle.com/c/word2vec-nlp-tutorial/data

In [0]:
import csv

import pandas as pd

# Change this path based on where you have stored the Kaggle download.
DATA_PATH = 'kaggle/labeledTrainData.tsv.zip'

# The file is tab-separated with a header row. csv.QUOTE_NONE is the named
# form of the literal 3 used previously: quote characters inside the review
# text are treated as ordinary characters instead of field delimiters.
df = pd.read_csv(DATA_PATH, header=0, delimiter="\t", quoting=csv.QUOTE_NONE)

# Sanity check: (number of reviews, number of columns)
print(df.shape)

Split Data into Training and Test Data

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing. The fixed random_state keeps the
# split identical from one run to the next.
reviews = df['review']
labels = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    reviews, labels, test_size=0.2, random_state=42
)

In [0]:
# Shape of the training portion of the 'review' column after the split
X_train.shape

## Prepare Data

1. Convert reviews to number sequences using the Tokenizer

In [0]:
# Import from the public tf.keras namespace; tensorflow.python.* is a private
# implementation path that is not part of the supported API.
from tensorflow.keras.preprocessing.text import Tokenizer

# Vocabulary size handed to the Tokenizer
top_words = 5000
t = Tokenizer(num_words=top_words)

# Raw review strings as plain Python lists
train_texts = X_train.tolist()
test_texts = X_test.tolist()

# Fit the tokenizer on the training reviews only, so the word index is built
# without looking at the test set
t.fit_on_texts(train_texts)

# Replace every review with its sequence of word indices.
# NOTE: this rebinds X_train / X_test from text to lists of integers, so this
# cell cannot be re-run without first re-running the train/test split cell.
X_train = t.texts_to_sequences(train_texts)
X_test = t.texts_to_sequences(test_texts)

In [0]:
# The reviews are tokenized but not yet padded, so their lengths differ
length_of_32 = len(X_train[32])
print('Length of review# 32 is: ', length_of_32)
length_of_1208 = len(X_train[1208])
print('Length of review# 1208 is: ', length_of_1208)

2. Pad the sequences so that every review has the same length

In [0]:
# Import from the public tf.keras namespace; tensorflow.python.* is a private
# implementation path that is not part of the supported API.
from tensorflow.keras.preprocessing import sequence

# Number of word indices kept for each review
max_review_length = 300

# padding='post' appends the fill value to the end of reviews shorter than
# max_review_length. The truncating argument is left at its default, which
# per the Keras documentation is 'pre': reviews longer than max_review_length
# lose tokens from their beginning.
X_train = sequence.pad_sequences(
    X_train, maxlen=max_review_length, padding='post'
)
X_test = sequence.pad_sequences(
    X_test, maxlen=max_review_length, padding='post'
)

In [0]:
# After padding, every review should report the same length
padded_len_32 = len(X_train[32])
print('Length of review# 32 is: ', padded_len_32)
padded_len_1208 = len(X_train[1208])
print('Length of review# 1208 is: ', padded_len_1208)

In [0]:
# Inspect one padded review: a fixed-length row of word indices
X_train[1208]

## Build the Graph

In [0]:
# Use the public tf.keras namespace rather than the private
# tensorflow.python.keras implementation path. The model class and all layers
# are imported from the same namespace so they are guaranteed to be compatible.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense, Embedding, Flatten

In [0]:
# Start from an empty layer stack; layers are appended in the following cells
model = Sequential()

# Length of the vector the Embedding layer produces for each word
embedding_vector_length = 50

Add Embedding layer

In [0]:
# Map every word index of a review to a vector of embedding_vector_length numbers.
# NOTE(review): the +1 on the vocabulary size presumably leaves room for
# index 0, which the padded positions use - confirm whether it is still needed
# given Tokenizer(num_words=top_words).
embedding_layer = Embedding(
    input_dim=top_words + 1,             # vocabulary size
    output_dim=embedding_vector_length,  # how many numbers per word
    input_length=max_review_length,      # words in each review
)
model.add(embedding_layer)

The output of the Embedding layer is 3-dimensional:
- batch_size x max_review_length x embedding_vector_length

We need to flatten this output before passing it to the Dense layers.

In [0]:
# Collapse the per-word embedding vectors of a review into one flat vector
model.add(Flatten())

# Hidden layers: four ReLU Dense layers of decreasing width
for hidden_units in (200, 100, 60, 30):
    model.add(Dense(hidden_units, activation='relu'))

# Output layer: a single sigmoid unit for the binary sentiment label
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is reported
# alongside the loss during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

## Execute the graph

In [0]:
# Train the model; the held-out split is evaluated at the end of each epoch.
# Increase `epochs` as appropriate.
held_out = (X_test, y_test)
model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=1,
    shuffle=True,
    validation_data=held_out,
)

In [0]:
# Predicted sentiment scores (sigmoid outputs) for the first two test reviews
first_two_reviews = X_test[:2]
model.predict(first_two_reviews)