In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the labelled reviews from the zipped, tab-separated file.
df = pd.read_csv(
    'labeledTrainData.tsv.zip',
    sep="\t",    # fields are tab-separated
    header=0,    # column names come from the first row
    quoting=3,   # 3 == csv.QUOTE_NONE: double quotes are kept as literal text
)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(25000, 3)

In [None]:
# Peek at the first rows to check that the id, sentiment and review columns parsed.
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [None]:
# Full raw text of the review in the row labelled 0; the text still carries
# its opening double quote and <br /> markup.
df.loc[0, 'review']

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

Split Data into Training and Test Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'],   # inputs: review text, targets: sentiment label
    random_state=42,
    test_size=0.2,
)

In [None]:
# Number of training reviews.
X_train.shape

(20000,)

In [None]:
# Number of held-out test reviews.
X_test.shape

(5000,)

# Build the Tokenizer

In [None]:
import tensorflow as tf

In [None]:
# Vocabulary size handed to the tokenizer.
top_words = 10_000
# num_words caps the vocabulary (most frequent words first) that is used when
# texts are later converted to index sequences.
t = tf.keras.preprocessing.text.Tokenizer(num_words=top_words)

In [None]:
# Fit the tokenizer on the training reviews only, so the vocabulary is built
# without looking at the test split.
# NOTE(review): the raw reviews contain <br /> markup, which is why 'br' ends
# up as one of the most frequent tokens; consider stripping HTML first.
t.fit_on_texts(X_train.tolist())

In [None]:
# Preview the start of the vocabulary only. word_index maps each word to its
# integer index (1 = most frequent word), and displaying the whole dictionary
# floods the notebook with output.
dict(list(t.word_index.items())[:20])

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'his': 23,
 'are': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'has': 44,
 'if': 45,
 'there': 46,
 'some': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'even': 55,
 'time': 56,
 'she': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'had': 65,
 'their': 66,
 'can': 67,
 'me': 68,
 'were': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'get': 74,
 'been': 75,
 'bad': 76,
 'will': 77,
 'also': 78,
 'do': 79,
 'into': 80,
 'other': 81,
 'great': 82,
 'f

# Prepare Training and Test Data

Get the word index for each word in the review

In [None]:
# First training review as raw text (positional slice of the Series).
X_train[0:1]

23311    "This movie is just plain dumb.<br /><br />Fro...
Name: review, dtype: object

In [None]:
# Replace each training review with its list of word indices from the fitted
# tokenizer.
# NOTE(review): this rebinds X_train from a Series of strings to a list of
# integer lists, so the cell cannot simply be re-run; re-execute the
# train/test split cell first.
X_train = t.texts_to_sequences(X_train.tolist())

In [None]:
# The same first review after tokenisation: now a list of word indices.
X_train[0:1]

[[11,
  17,
  6,
  40,
  1058,
  973,
  7,
  7,
  36,
  1,
  976,
  4,
  2844,
  14,
  1855,
  4246,
  5,
  1,
  1352,
  1,
  19,
  6,
  32,
  3232,
  8,
  1604,
  8652,
  7,
  7,
  1855,
  4246,
  6,
  28,
  4,
  1233,
  278,
  1016,
  2,
  7945,
  34,
  269,
  1279,
  290,
  5,
  3351,
  1,
  730,
  4246,
  2000,
  181,
  989,
  5,
  74,
  5,
  1,
  904,
  11,
  19,
  500,
  4246,
  80,
  3,
  9824,
  31,
  3467,
  86,
  16,
  76,
  477,
  34,
  24,
  70,
  97,
  973,
  5,
  74,
  241,
  16,
  230,
  28,
  6,
  35,
  842,
  1275,
  12,
  26,
  5,
  3,
  3693,
  1237,
  7,
  7,
  81,
  541,
  4,
  1,
  17,
  24,
  205,
  43,
  4,
  1,
  292,
  4877,
  297,
  267,
  837,
  30,
  1,
  15,
  1844,
  39,
  1,
  76,
  226,
  34,
  2553,
  8104,
  23,
  1426,
  8,
  1,
  142,
  2,
  1,
  2348,
  2000,
  24,
  8105,
  6,
  624,
  179,
  1,
  4,
  1,
  1301,
  51,
  1,
  369,
  6,
  2636,
  20,
  38,
  2367,
  2975,
  18,
  441,
  94,
  118,
  1,
  497,
  51,
  1,
  369,
  2602,
  142,
  15,


In [None]:
# Convert the test reviews with the tokenizer that was fitted on the training
# reviews, so both splits share one word-to-index mapping.
# NOTE(review): this rebinds X_test from a Series of strings to a list of
# integer lists, so the cell cannot simply be re-run; re-execute the
# train/test split cell first.
X_test = t.texts_to_sequences(X_test.tolist())

How many words in each review?

# Pad Sequences - Important

In [None]:
# Fixed sequence length: the number of word indices kept per review once the
# sequences are padded.
max_review_length = 300

In [None]:
# Bring every review in both splits to exactly max_review_length entries;
# padding='pre' puts the filler zeros in front of the word indices.
X_train, X_test = (
    tf.keras.preprocessing.sequence.pad_sequences(
        sequences,
        maxlen=max_review_length,
        padding='pre',
    )
    for sequences in (X_train, X_test)
)

In [None]:
# First training row after padding: leading zeros followed by the word indices.
X_train[0:1]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,   11,   17,    6,   40, 1058,
         973,    7,    7,   36,    1,  976,    4, 2844,   14, 1855, 4246,
           5,    1, 1352,    1,   19,    6,   32, 3232,    8, 1604, 8652,
           7,    7, 1855, 4246,    6,   28,    4, 1233,  278, 1016,    2,
        7945,   34,  269, 1279,  290,    5, 3351,    1,  730, 4246, 2000,
         181,  989,    5,   74,    5,    1,  904,   11,   19,  500, 4246,
          80,    3, 9824,   31, 3467,   86,   16,   76,  477,   34,   24,
          70,   97,  973,    5,   74, 

In [None]:
# Padded training matrix: one row per review, max_review_length columns.
X_train.shape

(20000, 300)

In [None]:
# View the first padded rows as a table, one column per token position.
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0,0,0,0,0,0,0,0,0,0,...,61,273,12,9,215,104,742,43,4,154
1,0,0,0,0,0,0,0,0,0,0,...,2,10,409,4426,51,9,1096,1066,7,7
2,0,0,0,0,0,0,0,0,0,0,...,2,567,20,4247,68,11,6,3419,7,7
3,9,2,211,3,224,158,180,5,1,19,...,2,367,2,1,2455,22,1545,290,480,21
4,0,0,0,0,0,0,0,0,0,0,...,82,14,1,330,291,146,15,1,1297,359


In [None]:
# Padded test matrix: same number of columns as the training matrix.
X_test.shape

(5000, 300)

# Build the Graph

In [None]:
# Initialize model.
# Drop any Keras state left over from earlier runs of this notebook.
tf.keras.backend.clear_session()
# Seed TensorFlow's global random generator (same seed as the train/test split)
# so weight initialisation and dropout start from the same state on every run.
# Some GPU kernels may still introduce small run-to-run differences.
tf.random.set_seed(42)
model = tf.keras.Sequential()

Add Embedding layer
 - Embedding layer input shape: [Batch_Size, Review Length]

In [None]:
# Embedding layer: turns every word index of a review into a trainable
# 50-dimensional vector.
model.add(
    tf.keras.layers.Embedding(
        input_dim=top_words + 1,          # vocabulary size
        output_dim=50,                    # embedding size
        input_length=max_review_length,   # number of words in each review
    )
)

In [None]:
# Symbolic output of the model so far; its shape is
# (batch, review length, embedding size).
model.output

<KerasTensor: shape=(None, 300, 50) dtype=float32 (created by layer 'embedding')>

Embedding Layer Output -
[Batch_Size , Review Length , Embedding_Size]

Add BatchNormalization and Dropout, then an LSTM layer with 256 as the RNN state size

In [None]:
# Normalise the embedded sequence, then randomly drop half of the activations
# during training before they reach the recurrent layer.
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(rate=0.5))

In [None]:
# LSTM with an RNN state (cell state and hidden state) of size 256 and an
# input dropout of 0.4.
model.add(tf.keras.layers.LSTM(units=256, dropout=0.4))

In [None]:
# After the LSTM the sequence axis is gone: one 256-value vector per review.
model.output

<KerasTensor: shape=(None, 256) dtype=float32 (created by layer 'lstm')>

Add BatchNormalization and Dropout, then use a Dense layer as the output layer

In [None]:
# Normalise the LSTM output, then randomly drop half of it during training
# before the output layer.
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.Dropout(rate=0.5))

In [None]:
# Output layer: a single sigmoid unit giving a score between 0 and 1 for the
# binary sentiment label.
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimiser, binary cross-entropy to match the single
# sigmoid output, and accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Layer-by-layer overview with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 50)           500050    
_________________________________________________________________
batch_normalization (BatchNo (None, 300, 50)           200       
_________________________________________________________________
dropout (Dropout)            (None, 300, 50)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 256)               314368    
_________________________________________________________________
batch_normalization_1 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 2

# Execute the graph

In [None]:
# Train for 10 passes over the training reviews in mini-batches of 64.
# NOTE(review): the test split doubles as the validation set here, so the
# per-epoch validation metrics are measured on the same data that is named
# "test"; any tuning based on them would need a separate hold-out set.
model.fit(
    X_train,
    y_train,
    batch_size=64,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1d6ccdbe190>