# IMBD movie review sentiment analysis by using Conv1D & LSTM

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import tensorflow as tf
seed = 2021
np.random.seed(seed)
tf.random.set_seed(seed)
import warnings
warnings.filterwarnings('ignore')

In [4]:
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = None)
X_train.shape, X_test.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,))

In [5]:
index_dict = {}
for key, value in imdb.get_word_index().items() :
    index_dict[value] = key
len(index_dict)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


88584

In [6]:
print(X_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [7]:
' '.join(index_dict[i] for i in X_train[0])

"the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but wh

In [8]:
print('Length of the longest news :', max(len(i) for i in X_train))
print('Average length of news :', sum(map(len, X_train)) / len(X_train))

Length of the longest news : 2494
Average length of news : 238.71364


## Preprocessing

### Sentiment classification by using Conv1D & LSTM
- Words frequency : 5,000
- Number of words in sentence : 200
- 10000 of test data for validation

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, Dropout, MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [10]:
from tensorflow.keras.datasets import imdb
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 5000)

In [11]:
max_len = 500
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [12]:
from sklearn.model_selection import train_test_split
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, stratify = y_test, test_size = 0.4, random_state = seed
)
X_test.shape, X_val.shape, y_test.shape, y_val.shape

((15000, 500), (10000, 500), (15000,), (10000,))

## Processing

In [13]:
model = Sequential([
    Embedding(5000, 120),
    Dropout(0.5),
    Conv1D(64, 5, activation = 'relu'),
    MaxPooling1D(pool_size = 4),
    LSTM(60),
    Dense(1, activation = 'sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 120)         600000    
_________________________________________________________________
dropout (Dropout)            (None, None, 120)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, None, 64)          0         
_________________________________________________________________
lstm (LSTM)                  (None, 60)                30000     
_________________________________________________________________
dense (Dense)                (None, 1)                 61        
Total params: 668,525
Trainable params: 668,525
Non-trainable params: 0
__________________________________________________

In [14]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [15]:
model_file = 'Models/best_imdb_conv1d_lstm.h5'
mc = ModelCheckpoint(model_file, save_best_only = True, verbose = 1)
es = EarlyStopping(patience = 10)

In [16]:
history = model.fit(X_train, y_train, batch_size = 100, epochs = 50,
                    validation_data = (X_val, y_val), verbose = 0, callbacks = [mc, es])


Epoch 00001: val_loss improved from inf to 0.31111, saving model to Models/best_imdb_conv1d_lstm.h5

Epoch 00002: val_loss improved from 0.31111 to 0.27005, saving model to Models/best_imdb_conv1d_lstm.h5

Epoch 00003: val_loss did not improve from 0.27005

Epoch 00004: val_loss did not improve from 0.27005

Epoch 00005: val_loss did not improve from 0.27005

Epoch 00006: val_loss did not improve from 0.27005

Epoch 00007: val_loss did not improve from 0.27005

Epoch 00008: val_loss did not improve from 0.27005

Epoch 00009: val_loss did not improve from 0.27005

Epoch 00010: val_loss did not improve from 0.27005

Epoch 00011: val_loss did not improve from 0.27005

Epoch 00012: val_loss did not improve from 0.27005


In [18]:
best_model = load_model(model_file)
best_model.evaluate(X_test, y_test)



[0.26832544803619385, 0.8895999789237976]

# How does it in original?

In [22]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words = 5000)

In [23]:
max_len = 500
X_train = pad_sequences(X_train, maxlen = max_len)
X_test = pad_sequences(X_test, maxlen = max_len)

In [24]:
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, stratify = y_test, test_size = 0.4, random_state = seed
)
X_test.shape, X_val.shape, y_test.shape, y_val.shape

((15000, 500), (10000, 500), (15000,), (10000,))

In [25]:
model = Sequential([
    Embedding(5000, 120),
    LSTM(120),
    Dense(1, activation = 'sigmoid')
])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 120)               115680    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 121       
Total params: 715,801
Trainable params: 715,801
Non-trainable params: 0
_________________________________________________________________


In [26]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

In [27]:
model_file = 'Models/best_imdb_lstm.h5'
mc = ModelCheckpoint(model_file, save_best_only = True, verbose = 1)
es = EarlyStopping(patience = 10)

In [28]:
history = model.fit(X_train, y_train, batch_size = 100, epochs = 50,
                    validation_data = (X_val, y_val), verbose = 0, callbacks = [mc, es])


Epoch 00001: val_loss improved from inf to 0.34968, saving model to Models/best_imdb_lstm.h5

Epoch 00002: val_loss improved from 0.34968 to 0.32394, saving model to Models/best_imdb_lstm.h5

Epoch 00003: val_loss improved from 0.32394 to 0.31474, saving model to Models/best_imdb_lstm.h5

Epoch 00004: val_loss did not improve from 0.31474

Epoch 00005: val_loss did not improve from 0.31474

Epoch 00006: val_loss did not improve from 0.31474

Epoch 00007: val_loss did not improve from 0.31474

Epoch 00008: val_loss did not improve from 0.31474

Epoch 00009: val_loss did not improve from 0.31474

Epoch 00010: val_loss did not improve from 0.31474

Epoch 00011: val_loss did not improve from 0.31474

Epoch 00012: val_loss did not improve from 0.31474

Epoch 00013: val_loss did not improve from 0.31474


In [29]:
best_model = load_model(model_file)
best_model.evaluate(X_test, y_test)



[0.3128144145011902, 0.8719333410263062]