In [1]:
import numpy as np 
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/imdb-reviews/dataset.csv


In [2]:
def ingest_train():
    data = pd.read_csv('../input/imdb-reviews/dataset.csv', encoding='latin-1')
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map(int)
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [3]:
data = ingest_train()

In [4]:
data.describe()

Unnamed: 0,Sentiment
count,25000.0
mean,0.5
std,0.50001
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [5]:
data.head()

Unnamed: 0,SentimentText,Sentiment
0,"first think another Disney movie, might good, ...",1
1,"Put aside Dr. House repeat missed, Desperate H...",0
2,"big fan Stephen King's work, film made even gr...",1
3,watched horrid thing TV. Needless say one movi...,0
4,truly enjoyed film. acting terrific plot. Jeff...,1


In [6]:
from sklearn.model_selection import train_test_split

SEED = 2000

x_train, x_validation, y_train, y_validation = train_test_split(data.SentimentText, data.Sentiment, test_size=.2, random_state=SEED)

In [7]:
# Using Keras Tokenizer to split each word in a sentence
# Using exts_to_sequences method in order to get a sequential representation of each row
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=100000)
tokenizer.fit_on_texts(x_train)
sequences = tokenizer.texts_to_sequences(x_train)

Using TensorFlow backend.


In [8]:
# Checking the max lenght of rows in the corpus for padding.
length = []
for x in x_train:
    length.append(len(x.split()))
max(length)

981

In [9]:
# Maximum lenght will be 1000
x_train_seq = pad_sequences(sequences, maxlen=1000)
x_train_seq[:5]

array([[   0,    0,    0, ..., 1292,   15,    4],
       [   0,    0,    0, ..., 1491,  211,  235],
       [   0,    0,    0, ...,   53,   93,    7],
       [   0,    0,    0, ...,   41,  675,  139],
       [   0,    0,    0, ...,  231,    6,  168]], dtype=int32)

In [10]:
# All the data transformed having the same length of 1000
# Doing the same thing to the validation set.
sequences_val = tokenizer.texts_to_sequences(x_validation)
x_val_seq = pad_sequences(sequences_val, maxlen=1000)

In [11]:
# CNN
# Defining a CNN using an embedding layer of 200x1000 dimension as an input with 100000 as max feature
# Adding to the 1D Convolutional layer 100x2000 filters
# Adding Global Max Pooling layer which will extract the maximum value from each filter
# The output will be a one dimensional vector with length equal to the number of the filters.
from keras.models import Sequential
from keras.layers import Dense, Conv1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from time import time

acc = []
times = []

model_cnn = Sequential()

e = Embedding(100000, 100, input_length=1000)
model_cnn.add(e)
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(256, activation='relu'))
model_cnn.add(Dense(1, activation='sigmoid'))
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

t0 = time()
model_cnn.fit(x_train_seq, y_train, validation_data=(x_val_seq, y_validation), epochs=5, batch_size=32, verbose=2)
score,accu = model_cnn.evaluate(x_val_seq, y_validation, verbose = 2, batch_size = 32)
tv_time = time()-t0

acc.append(accu*100)
times.append(tv_time*0.0166667)

print("score: %.2f" % (score))
print("acc: %.2f" % (accu))

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
 - 10s - loss: 0.3748 - accuracy: 0.8223 - val_loss: 0.2664 - val_accuracy: 0.8910
Epoch 2/5
 - 6s - loss: 0.1182 - accuracy: 0.9592 - val_loss: 0.2864 - val_accuracy: 0.8926
Epoch 3/5
 - 6s - loss: 0.0139 - accuracy: 0.9972 - val_loss: 0.3468 - val_accuracy: 0.8978
Epoch 4/5
 - 6s - loss: 9.3380e-04 - accuracy: 0.9999 - val_loss: 0.3650 - val_accuracy: 0.8992
Epoch 5/5
 - 6s - loss: 2.0707e-04 - accuracy: 1.0000 - val_loss: 0.3819 - val_accuracy: 0.9004
score: 0.38
acc: 0.90


In [12]:
# LSTM
# Using a single LSTM layer preceded by an embedding layer with 100000 as max feature and 128 dimension of each word in a sequence
# Following with a dense layer with softmax function.
from keras.layers import SpatialDropout1D, LSTM, Dropout

model_lstm = Sequential()

model_lstm.add(Embedding(100000, 128))
model_lstm.add(SpatialDropout1D(0.4))
model_lstm.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(1,activation='softmax'))
model_lstm.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model_lstm.summary())

t0 = time()
model_lstm.fit(x_train_seq, y_train, epochs = 7, batch_size=32, verbose = 2)
score,accu = model_lstm.evaluate(x_val_seq, y_validation, verbose = 2, batch_size = 32)
tv_time = time()-t0

acc.append(accu*100)
times.append(tv_time*0.0166667)
print("score: %.2f" % (score))
print("acc: %.2f" % (accu))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         12800000  
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, None, 128)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 12,931,713
Trainable params: 12,931,713
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
 - 507s - loss: 7.6551 - accuracy: 0.5008
Epoch 2/7
 - 504s - loss: 7.6551 - accuracy: 0.5008
Epoch 3/7
 - 506s - loss: 7.6551 - accuracy: 0.5008
Epoch 4/7
 - 506s - loss: 7.6551 - accuracy: 0.5008
Epoch 5/7
 - 510s - loss: 7

In [13]:
# Plotting the summary of the comparison between the CNN and LSTM models
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import numpy as np


names = ["CNN","LSTM"]

trace1 = go.Bar(
    x=names,
    y=acc,
    name='Accuracy (%)'
)
trace2 = go.Bar(
    x=names,
    y=times,
    name='Train & Test Time (Min)'
)

data = [trace1, trace2]
layout = go.Layout(
    barmode='group'
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='grouped-bar')