In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, GRU
from tensorflow.keras.preprocessing import sequence
import tensorflow_datasets as tfds

# Seed Python's `random`, NumPy and TensorFlow in one call. Seeding only
# TensorFlow (tf.random.set_seed) is not enough for reproducible runs: Keras
# also draws from Python's `random` / NumPy when it creates default seeds for
# weight initialisers, dropout masks and data shuffling.
tf.keras.utils.set_random_seed(123)

# Make modules that live in the parent directory importable from this notebook.
sys.path.append(os.path.abspath(".."))

### Load the Dataset

In [2]:
# Vocabulary lookup shipped with the Keras IMDB dataset: word -> integer index.
word_to_index_map = tf.keras.datasets.imdb.get_word_index()

# Reverse lookup (index -> word), built by pairing every value with its key.
index_to_word_map = dict(zip(word_to_index_map.values(), word_to_index_map.keys()))

print(f"Total word count: {len(word_to_index_map)}")

# Preview ten (word, index) entries of the forward lookup.
list(word_to_index_map.items())[:10]

Total word count: 88584


[('fawn', 34701),
 ('tsukino', 52006),
 ('nunnery', 52007),
 ('sonja', 16816),
 ('vani', 63951),
 ('woods', 1408),
 ('spiders', 16115),
 ('hanging', 2345),
 ('woody', 2289),
 ('trawling', 52008)]

In [3]:
# Preview ten (index, word) entries of the reversed lookup.
list(index_to_word_map.items())[:10]

[(34701, 'fawn'),
 (52006, 'tsukino'),
 (52007, 'nunnery'),
 (16816, 'sonja'),
 (63951, 'vani'),
 (1408, 'woods'),
 (16115, 'spiders'),
 (2345, 'hanging'),
 (2289, 'woody'),
 (52008, 'trawling')]

In [4]:
max_features = 10000   # vocabulary size passed to load_data() as num_words
max_seq_len = 200      # fixed length every review is padded / truncated to

# Load the integer-encoded reviews together with their 0/1 labels.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=max_features
)

# Bring every review to exactly max_seq_len tokens. pad_sequences defaults to
# padding='pre' and truncating='pre': shorter reviews are left-padded with
# zeros and longer ones keep only their LAST max_seq_len tokens.
X_train, X_test = (
    sequence.pad_sequences(reviews, maxlen=max_seq_len)
    for reviews in (X_train, X_test)
)

print(f"Train set shape: {X_train.shape} {y_train.shape}")
print(f"Test set shape : {X_test.shape} {y_test.shape}")

Train set shape: (25000, 200) (25000,)
Test set shape : (25000, 200) (25000,)


### Visualize the Dataset

In [5]:
# Index layout produced by tf.keras.datasets.imdb.load_data() with its default
# arguments: 0 is padding, 1 marks the start of a review, 2 replaces
# out-of-vocabulary words, and every index from get_word_index() is shifted
# up by 3 to make room for those reserved values.
PAD_INDEX = 0
START_INDEX = 1
OOV_INDEX = 2
INDEX_OFFSET = 3


def decode_to_text(num_sequence, index_to_word=None):
    """Turn a sequence of dataset token ids back into a readable sentence.

    Args:
        num_sequence: Iterable of integer token ids in the layout used by
            ``load_data()`` (reserved ids 0/1/2, words shifted by 3).
        index_to_word: Optional ``{index: word}`` mapping in the *unshifted*
            ``get_word_index()`` numbering. Defaults to the notebook-level
            ``index_to_word_map``.

    Returns:
        The decoded words joined by single spaces. Padding and start markers
        are skipped; out-of-vocabulary or unknown ids are rendered as ``'?'``.
    """
    if index_to_word is None:
        index_to_word = index_to_word_map

    words = []
    for index in num_sequence:
        if index in (PAD_INDEX, START_INDEX):
            continue  # structural markers, not words
        if index == OOV_INDEX:
            words.append('?')
            continue
        # Undo the shift applied by load_data() before looking the word up.
        words.append(index_to_word.get(index - INDEX_OFFSET, '?'))
    return ' '.join(words)


def encode_to_numseq(sentence, word_to_index=None, num_words=None):
    """Encode raw text into the token-id layout the model was trained on.

    Args:
        sentence: Review text; it is lower-cased and split on whitespace.
        word_to_index: Optional ``{word: index}`` mapping in the *unshifted*
            ``get_word_index()`` numbering. Defaults to the notebook-level
            ``word_to_index_map``.
        num_words: Vocabulary size used when loading the dataset. Ids at or
            above this value are replaced by the out-of-vocabulary id, which
            also keeps every id within range of the Embedding layer.
            Defaults to the notebook-level ``max_features``.

    Returns:
        A list of ints that starts with the start-of-sequence marker, followed
        by one id per whitespace-separated token.
    """
    if word_to_index is None:
        word_to_index = word_to_index_map
    if num_words is None:
        num_words = max_features

    # Training reviews begin with the start marker, so mirror that here.
    encoded = [START_INDEX]
    for word in sentence.lower().split():
        index = word_to_index.get(word)
        if index is None or index + INDEX_OFFSET >= num_words:
            # Unknown word, or a word outside the top-`num_words` vocabulary.
            encoded.append(OOV_INDEX)
        else:
            encoded.append(index + INDEX_OFFSET)
    return encoded

In [6]:
# Show the first training review as raw token ids, then decoded back to words.
print(X_train[0], "\n")
print(decode_to_text(X_train[0]))

[   5   25  100   43  838  112   50  670    2    9   35  480  284    5
  150    4  172  112  167    2  336  385   39    4  172 4536 1111   17
  546   38   13  447    4  192   50   16    6  147 2025   19   14   22
    4 1920 4613  469    4   22   71   87   12   16   43  530   38   76
   15   13 1247    4   22   17  515   17   12   16  626   18    2    5
   62  386   12    8  316    8  106    5    4 2223 5244   16  480   66
 3785   33    4  130   12   16   38  619    5   25  124   51   36  135
   48   25 1415   33    6   22   12  215   28   77   52    5   14  407
   16   82    2    8    4  107  117 5952   15  256    4    2    7 3766
    5  723   36   71   43  530  476   26  400  317   46    7    4    2
 1029   13  104   88    4  381   15  297   98   32 2071   56   26  141
    6  194 7486   18    4  226   22   21  134  476   26  480    5  144
   30 5535   18   51   36   28  224   92   25  104    4  226   65   16
   38 1334   88   12   16  283    5   16 4472  113  103   32   15   16
 5345 

In [7]:
# Peek at the first ten training labels.
y_train[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

### Design the Model

In [8]:
# Simple recurrent classifier: embedding -> SimpleRNN -> single sigmoid unit.
model_rnn = Sequential()
model_rnn.add(Input(shape=(max_seq_len,)))
model_rnn.add(Embedding(input_dim=max_features, output_dim=128))
model_rnn.add(SimpleRNN(units=128, dropout=0.2, recurrent_dropout=0.2))
model_rnn.add(Dense(units=1, activation='sigmoid'))
model_rnn.summary()

In [9]:
# Same architecture as the other candidates, with an LSTM as the recurrent layer.
model_lstm = Sequential()
model_lstm.add(Input(shape=(max_seq_len,)))
model_lstm.add(Embedding(input_dim=max_features, output_dim=128))
model_lstm.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(units=1, activation='sigmoid'))
model_lstm.summary()

In [10]:
# Same architecture as the other candidates, with a GRU as the recurrent layer.
model_gru = Sequential()
model_gru.add(Input(shape=(max_seq_len,)))
model_gru.add(Embedding(input_dim=max_features, output_dim=128))
model_gru.add(GRU(units=128, dropout=0.2, recurrent_dropout=0.2))
model_gru.add(Dense(units=1, activation='sigmoid'))
model_gru.summary()

In [11]:
# Choose which of the networks defined above gets trained. The LSTM is used
# here; assign model_rnn or model_gru instead to try the other variants.
model = model_lstm

# Single sigmoid output with 0/1 labels -> binary cross-entropy loss,
# optimised with Adam and monitored through accuracy.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Train the Model

In [12]:
# Fit the selected model for three epochs in mini-batches of 64, holding out
# 20% of the training data for validation. The returned History object is kept
# so the final-epoch metrics can be reported later.
training = model.fit(
    x=X_train,
    y=y_train,
    epochs=3,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

Epoch 1/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 243ms/step - accuracy: 0.7612 - loss: 0.4819 - val_accuracy: 0.8126 - val_loss: 0.4143
Epoch 2/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 245ms/step - accuracy: 0.8450 - loss: 0.3653 - val_accuracy: 0.8036 - val_loss: 0.4474
Epoch 3/3
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 252ms/step - accuracy: 0.8622 - loss: 0.3310 - val_accuracy: 0.8410 - val_loss: 0.3815


### Evaluate the Model

In [13]:
# Score the trained model on the held-out test set; the result is indexed in
# the same order as scoring_metric_names below (loss first, then accuracy).
test_scores = model.evaluate(X_test, y_test, batch_size=64, verbose=1)

scoring_metric_names = ['Loss', 'Accuracy']
history = training.history

# For each metric, report the last-epoch train / validation values next to the
# test-set value.
for metric_name, test_score in zip(scoring_metric_names, test_scores):
    history_key = metric_name.lower()
    report = (
        f"Train {metric_name}: {history[history_key][-1]:.2f}\n"
        f"Val {metric_name}  : {history['val_' + history_key][-1]:.2f}\n"
        f"Test {metric_name} : {test_score:.2f}\n"
    )
    # print() appends the final newline, leaving a blank line between metrics.
    print(report)

[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 46ms/step - accuracy: 0.8411 - loss: 0.3886
Train Loss: 0.33
Val Loss  : 0.38
Test Loss : 0.39

Train Accuracy: 0.86
Val Accuracy  : 0.84
Test Accuracy : 0.84



### Make Prediction

In [14]:
def predict(review_text):
    """Classify a free-text review with the trained model and print the verdict.

    The text is encoded with ``encode_to_numseq`` and echoed back through
    ``decode_to_text`` so the reader can see what the model actually receives.
    The encoded review is then padded to ``max_seq_len`` and passed to
    ``model.predict``; a score above 0.5 is reported as "Positive", anything
    else as "Negative".

    Args:
        review_text: The review to classify, as a plain string.

    Returns:
        None. The echoed input and the verdict are printed.
    """
    token_ids = encode_to_numseq(review_text)
    print("Input:", decode_to_text(token_ids))

    # The model expects a batch, so wrap the single review in a list.
    model_input = sequence.pad_sequences([token_ids], maxlen=max_seq_len)
    score = model.predict(model_input)[0][0]

    if score > 0.5:
        verdict = "Positive"
    else:
        verdict = "Negative"
    print("Result:", verdict)

In [15]:
# Try the classifier on a short, clearly positive review.
predict("This movie was fantastic and I really loved the story and the acting")

Input: this movie was fantastic and i really loved the story and the acting
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 401ms/step
Result: Positive
