# Sentiment Analysis using the IMDb reviews dataset - RNN-based model with attention/transformers mechanisms

Ref.

Kaggle

[Sentiment Analysis of IMDB Movie Reviews - gold](https://www.kaggle.com/code/lakshmi25npathi/sentiment-analysis-of-imdb-movie-reviews)

[Sentiment Analysis of IMDB Movie Reviews - cooper](https://www.kaggle.com/code/bhavikjikadara/sentiment-analysis-of-imdb-movie-reviews)

[IMDB Dataset Sentiment Analysis using RNN](https://www.kaggle.com/code/tanyildizderya/imdb-dataset-sentiment-analysis-using-rnn)

Keras

[Keras - IMDB movie review sentiment classification dataset](https://keras.io/api/datasets/imdb/)

這是一個包含 25,000 部電影評論的 IMDB 數據集，按情感（正面/負面）進行標記。評論已經過預處理，每個評論都被編碼為一個詞索引列表（整數）。為了方便起見，詞彙按照數據集中的整體頻率進行索引，因此例如整數 "3" 編碼了數據中第三個最常見的詞。這使得可以快速進行過濾操作，例如："只考慮前 10,000 個最常見的詞，但排除前 20 個最常見的詞"。

按照慣例，"0" 不代表特定的詞，而是用於編碼填充標記。

Benchmark

[Sentiment Analysis on IMDb](https://paperswithcode.com/sota/sentiment-analysis-on-imdb)

In [5]:
# Python version: 3.10.6


import os

import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import pickle
from sklearn import metrics
import numpy as np
import pandas as pd

import keras
from keras import layers, losses, optimizers, regularizers
from keras.utils import pad_sequences, plot_model
from keras.datasets import imdb
import matplotlib.pyplot as plt
from tqdm.keras import TqdmCallback

os.environ["KERAS_BACKEND"] = "tensorflow"

# from tensorflow.python.client import device_lib
# print(device_lib.list_local_devices())

tf.config.list_physical_devices("GPU")

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [6]:
# sequence preprocessing parameters
start_char = 1
oov_char = 2
index_from = 3
max_num_words = 5000
max_sequence_length = 240

# training parameters
training = True
use_cuDNN = True
embedding_output_dim = 64
num_epochs = 20
batch_size = 64
validation_split = 0.1
validation_steps = 30
learning_rate = 1e-4

## Load dataset

In [7]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=max_num_words,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from,
)

## EDA

In [8]:
print("Training data shape:", x_train.shape)
print("Training label shape:", y_train.shape)
print("Test data shape:", x_test.shape)
print("Test label shape:", y_test)

Training data shape: (25000,)
Training label shape: (25000,)
Test data shape: (25000,)
Test label shape: [0 1 1 ... 0 0 0]


In [9]:
unique, counts = np.unique(y_train, return_counts=True)
print("y train distribution: ", dict(zip(unique, counts)))

unique, counts = np.unique(y_test, return_counts=True)
print("y test distribution: ", dict(zip(unique, counts)))

y train distribution:  {0: 12500, 1: 12500}
y test distribution:  {0: 12500, 1: 12500}


In [10]:
def print_decoded_sequence(data, index=0):
    # 取得字典 mapping => { word: index ...}
    word_index = imdb.get_word_index()

    # key, value => word, index
    inverted_word_index = dict((index + index_from, word) for (word, index) in word_index.items())

    # Update `inverted_word_index` to include `start_char` and `oov_char`
    inverted_word_index[0] = "[MASK]"
    inverted_word_index[start_char] = "[START]"
    inverted_word_index[oov_char] = "[OOV]"

    # X data (word sequence)

    print(data[index])
    decoded_sequence = " ".join(inverted_word_index[i] for i in data[index])
    print(decoded_sequence)

    # y data (labels: positive or negative)

    print(y_train[index])

In [11]:
if max_num_words is None:
    max_num_words = len(imdb.get_word_index().items())

print(max_num_words)

print_decoded_sequence(x_train)

88584
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
[START] this film was just bri

In [None]:
# training label distribution

plt.figure()
sns.countplot(pd.DataFrame(y_train, columns=["class"]), x="class")
plt.xlabel("Classes")
plt.ylabel("Frequency")
plt.title("y train")

In [None]:
# test label distribution

plt.figure()
sns.countplot(pd.DataFrame(y_test, columns=["class"]), x="class")
plt.xlabel("Classes")
plt.ylabel("Frequency")
plt.title("y test")

In [None]:
# words distribution


def visualize_sequence_distribution(x_train, x_test):
    review_len_train = []
    review_len_test = []
    for i, j in zip(x_train, x_test):
        review_len_train.append(len(i))
        review_len_test.append(len(j))

    print("min:", min(review_len_train), "max:", max(review_len_train))
    print("min:", min(review_len_test), "max:", max(review_len_test))

    sns.displot(review_len_train, rug_kws={"alpha": 0.3})
    plt.xlabel("review length")
    plt.title("review train")
    sns.displot(review_len_test, rug_kws={"alpha": 0.3})
    plt.xlabel("review length")
    plt.title("review test")

In [None]:
visualize_sequence_distribution(x_train, x_test)

In [None]:
mean_sequence_len = np.mean([len(seq) for seq in x_train])
mean_sequence_len

## Data preprocessing

Keras's IMDB

X data : 資料已經預處理過，包括
* normalization => setting English stopwords
* removing html strips and noise text
* removing special characters
* segmentation (斷詞)
* removing stopwords
* encoding (編碼)

y data : 代表正向 (positive) 或負向 (negative) 的評論

In [None]:
# 使每個 sequence 有相同的長度

x_train = pad_sequences(x_train, maxlen=max_sequence_length, padding="pre")
x_test = pad_sequences(x_test, maxlen=max_sequence_length, padding="pre")

In [None]:
print("Training data shape:", x_train.shape)
print("Training label shape:", y_train.shape)
print("Test data shape:", x_test.shape)
print("Test label shape:", y_test.shape)

In [None]:
visualize_sequence_distribution(x_train, x_test)

In [None]:
print_decoded_sequence(x_train)

## RNN mechanism

Ref.

[一文搞懂RNN（循环神经网络）基础篇](https://zhuanlan.zhihu.com/p/30844905)

![rnn](./images/rnn.png)

S<sub>t</sub> 代表神經元在 t 時刻上，同時接收
* 輸入 X 與其輸入權重 U
* 前一次 S 在 t-1 時刻的 S<sub>t-1</sub> 與其 W (S<sub>t-1</sub> --> S<sub>t</sub> 的權重)

展開

![rnn-expand](./images/rnn-expand.png)

## Attention mechanism

Ref.

[完全解析RNN, Seq2Seq, Attention注意力机制](https://zhuanlan.zhihu.com/p/51383402)

[A simple overview of RNN, LSTM and Attention Mechanism](https://medium.com/swlh/a-simple-overview-of-rnn-lstm-and-attention-mechanism-9e844763d07b)

[注意力機制 (Attention Mechanism) 的理解與實作](https://www.kaggle.com/code/lianghsunhuang/attention-mechanism)

Attention 的架構

![attention-mechanism](./images/attention-mechanism.png)

## Build model

Ref.

[Keras 中的循环神经网络 (RNN)](https://tensorflow.google.cn/guide/keras/rnn?hl=zh-cn)

[Keras实现CNN、RNN（基于attention 的双向RNN）及两者的融合](https://blog.csdn.net/xwd18280820053/article/details/80060544)

[用LSTM模型分類IMDB電影資料集評論](https://dysonma.github.io/2020/11/21/LSTM_IMDB/)

very useful

[Text classification with an RNN](https://www.tensorflow.org/text/tutorials/text_classification_rnn)

In [None]:
rnn_activation = "selu" if not use_cuDNN else "tanh"

model = keras.Sequential(name="BidirectionalRnn")
model.add(layers.Embedding(max_num_words, embedding_output_dim, input_length=max_sequence_length, mask_zero=True))
model.add(layers.Bidirectional(layers.LSTM(64, activation=rnn_activation, return_sequences=True)))
model.add(layers.Bidirectional(layers.LSTM(32, activation=rnn_activation, return_sequences=False)))
model.add(layers.Dense(64, activation="relu"))
model.add(layers.Dropout(0.5))
model.add(keras.layers.Dense(1, activation="sigmoid"))

## Preview model

In [None]:
model.build((None, max_sequence_length))
model.summary()

In [None]:
model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer=optimizers.Adam(learning_rate=learning_rate),
    metrics=["accuracy"],
)

In [None]:
plot_model(model, show_shapes=True, show_layer_names=False)

## Train model

In [None]:
train_history = model.fit(
    x_train,
    y_train,
    epochs=num_epochs,
    # verbose=0,
    batch_size=batch_size,
    validation_split=validation_split,
    validation_steps=validation_steps,
    # callbacks=[TqdmCallback(verbose=0)],
)

## Evaluation

In [None]:
if training:
    plt.figure(figsize=[6, 4])
    plt.plot(train_history.history["loss"], "black", linewidth=2.0)
    plt.plot(train_history.history["val_loss"], "green", linewidth=2.0)
    plt.legend(["Training Loss", "Validation Loss"], fontsize=14)
    plt.xlabel("Epochs", fontsize=10)
    plt.ylabel("Loss", fontsize=10)
    plt.title("Loss Curves", fontsize=12)

In [None]:
if training:
    plt.figure(figsize=[6, 4])
    plt.plot(train_history.history["accuracy"], "black", linewidth=2.0)
    plt.plot(train_history.history["val_accuracy"], "blue", linewidth=2.0)
    plt.legend(["Training Accuracy", "Validation Accuracy"], fontsize=14)
    plt.xlabel("Epochs", fontsize=10)
    plt.ylabel("Accuracy", fontsize=10)
    plt.title("Accuracy Curves", fontsize=12)

In [None]:
# on training 7: found best index is 40

np.argmin(train_history.history["val_loss"])

In [None]:
if not training:
    test_result = model.evaluate(x_test, y_test, batch_size=batch_size)

## Save result

## Final prediction

## Test result insights