# Import dependencies

In [None]:
# To mute annoying warnings in notebook
import warnings
import time
from datetime import datetime

# For data science
import pandas as pd
import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, TerminateOnNaN, ProgbarLogger, TensorBoard, LearningRateScheduler

# For graph
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress every warning from this point on: no category, module or message filter is given,
# so this applies to all libraries used in the notebook.
warnings.filterwarnings("ignore")

# Getting data, observations
Get dataset

In [None]:
# Load the reviews CSV (path is relative to the notebook's working directory).
# A plain string is used: the path has no placeholders, so an f-string prefix is unnecessary.
df = pd.read_csv("../data/IMDB_Dataset.csv")

# Preview the first rows
df.head()

Tokenize text.

In [None]:
# Build the vocabulary from the review texts, keeping at most 5000 words
review_texts = df['review']
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(review_texts)

# Encode every review as a sequence of integer word indices
sequences = tokenizer.texts_to_sequences(review_texts)

Set the vocabulary size and the maximum sequence length.

In [None]:
# Vocabulary size used as the embedding's input dimension. It is taken from the tokenizer
# so it always matches the number of words the tokenizer actually keeps; a separately
# hard-coded, larger value would only add embedding rows that no input can index.
max_features = tokenizer.num_words
# Every review is padded or truncated to this many tokens
max_length = 200

Ensure that all sequences in a dataset have the same length by padding or truncating them as needed.

In [None]:
# Bring every encoded review to `max_length` tokens; padding is added after the sequence ('post')
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

Encode the sentiment labels as integers (positive → 1, negative → 0).

In [None]:
# Encode the labels as integers. Values that are not keys of the mapping (including labels
# already encoded as 0/1) pass through unchanged, so re-running this cell no longer
# replaces the whole column with NaN.
label_codes = {'positive': 1, 'negative': 0}
df['sentiment'] = df['sentiment'].map(lambda label: label_codes.get(label, label))

Split the data into training and test sets.

In [None]:
# Hold out 30% of the samples as the test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, df["sentiment"], test_size=0.3, random_state=42
)

Instantiate a Keras tensor.

In [None]:
# Model input: integer token indices, with the sequence length left unspecified
inputs = keras.Input(shape=(None,), dtype="int32")

Turn positive integers (indexes) into dense vectors of fixed size.

In [None]:
# Look up a 128-dimensional vector for each token index (vocabulary size = max_features)
x = layers.Embedding(input_dim=max_features, output_dim=128)(inputs)

Stack two bidirectional LSTM layers and a sigmoid output layer, then build the model.

In [None]:
# The intermediate tensors get their own names instead of overwriting `x`, so `x` keeps
# pointing at the embedding output and re-running this cell rebuilds the same architecture
# rather than stacking extra LSTM layers on top of the previous run's output.

# First bidirectional LSTM: returns the full sequence so the next LSTM can consume it
lstm_sequence = layers.Bidirectional(
    layers.LSTM(
        units=64,
        return_sequences=True,
    )
)(x)

# Second bidirectional LSTM: returns only its final output
lstm_summary = layers.Bidirectional(layers.LSTM(64))(lstm_sequence)

# Dense (fully connected) output layer: one sigmoid unit for the binary label
outputs = layers.Dense(
    units=1,
    activation="sigmoid",
)(lstm_summary)

model = keras.Model(inputs, outputs)

model.summary()

Set callbacks.

In [None]:
# Timestamped run directory under logs/, so every run writes to its own folder
logs = f"logs/{datetime.now():%Y%m%d-%H%M%S}"

# TensorBoard callback writing to that directory
tboard_callback = TensorBoard(log_dir=logs, histogram_freq=1, profile_batch='500,520')

# Define early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=2,          # Number of epochs with no improvement after which training will be stopped
    # Restore weights from the epoch with the best validation loss. This was False, which
    # contradicted the stated intent and left the model with the weights of the last epoch,
    # i.e. up to `patience` epochs after the best one, when it is evaluated after training.
    restore_best_weights=True
)
# Stop training as soon as a NaN loss is encountered
terminate_on_nan = TerminateOnNaN()

# Define progress bar with metrics.
# The stateful metric name matches the metric the model is compiled with ("accuracy");
# the previous name 'acc' did not correspond to any compiled metric.
progbar_logger = ProgbarLogger(
    count_mode="samples",
    stateful_metrics=['accuracy'],
)

# Define a learning rate scheduler function
def lr_scheduler(epoch, lr, decay_rate=0.01, decay_step=1):
    """Return the learning rate to use for the given epoch.

    The rate is left unchanged at epoch 0 and multiplied by ``decay_rate``
    on every later epoch that is a multiple of ``decay_step``.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate.
        decay_rate: Factor the learning rate is multiplied by when it decays.
        decay_step: Decay every ``decay_step`` epochs; must be non-zero.

    Returns:
        The (possibly decayed) learning rate.

    NOTE(review): with the defaults the rate is multiplied by 0.01 on every
    epoch after the first, i.e. it shrinks 100x per epoch -- confirm that
    such an aggressive decay is intended.
    """
    # `epoch` is checked first so epoch 0 never decays
    if epoch and epoch % decay_step == 0:
        return lr * decay_rate
    return lr

# Callback through which `lr_scheduler` sets the learning rate during training
lr_scheduler_callback = LearningRateScheduler(schedule=lr_scheduler)

Compile model.

In [None]:
# Reference timestamp, read again when the elapsed time is printed after training
start_time = time.time()

# Adam optimizer, binary cross-entropy loss, accuracy tracked as a metric
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

Fit model.

In [None]:
# Start the clock in the same cell that stops it: the printed duration then covers only
# the training run and stays correct when this cell is re-run on its own.
fit_start = time.time()

# Train the model and keep the returned history for plotting.
# NOTE(review): the test split is also passed as validation data, so early stopping
# is driven by the same samples the model is evaluated on later -- consider a
# separate validation split.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_data=(X_test, y_test),
    callbacks=[
        early_stopping,
        terminate_on_nan,
        progbar_logger,
        lr_scheduler_callback,
        tboard_callback,
    ],
)

print(f"--- {(time.time() - fit_start):.2f} seconds ---")

# Result visualization
## Get tensorboard

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic below is available
%load_ext tensorboard

# Launch TensorBoard on the "logs" directory (the parent of the timestamped run folder used
# by the TensorBoard callback) and navigate to the Profile tab to view performance profile
# NOTE(review): --port=0 presumably lets TensorBoard pick a free port -- confirm
%tensorboard --logdir=logs --port=0


The scalar trends are quite representative.

# Get graph
## We can plot the loss as a trend over the epochs.

In [None]:
# Draw the training and validation loss per epoch on the same axes (epochs numbered from 1)
for history_key, curve_label in (('loss', 'Train'), ('val_loss', 'Test')):
    epoch_losses = history.history[history_key]
    sns.lineplot(
        x=range(1, len(epoch_losses) + 1),
        y=epoch_losses,
        label=curve_label
    )

plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(loc='upper right')

# Evaluate the model on the test split and report its accuracy.
# The format spec is `.2f` (no space flag), so the number is not preceded by an extra blank.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc:.2f}')

It looks like the model is trained well.

# Summary
1. An LSTM RNN was used for classification.
2. The model shows good performance.
3. The model hyperparameters could be optimized further.