# Import dependencies

In [None]:
# To mute annoying warnings in notebook
import warnings
import time

# For data science
import numpy as np
import pandas as pd
import keras
from keras import layers
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

warnings.filterwarnings("ignore")

# Getting the data and first observations
Get dataset

In [None]:
# Load the reviews dataset from a CSV file; the path is resolved relative to
# the current working directory. Later cells read the 'review' and
# 'sentiment' columns of this frame.
# (Plain string literal: the path has no placeholders, so no f-prefix.)
df = pd.read_csv("../data/IMDB_Dataset.csv")

# Preview the first rows (shown as the cell output when run in a notebook)
df.head()

In [None]:
# Vocabulary / sequence-length settings, defined before their first use.
# `max_features` is shared by the tokenizer below and by the Embedding layer's
# `input_dim`, so the vocabulary the tokenizer keeps and the size of the
# embedding table can no longer drift apart (previously 5000 vs 20000).
max_features = 20000  # Only consider the top 20k words
max_length = 200  # Number of token ids kept per review after padding

# Tokenization
tokenizer = Tokenizer(num_words=max_features)  # Cap the vocabulary at max_features
tokenizer.fit_on_texts(df['review'])  # Build the word index from the reviews
sequences = tokenizer.texts_to_sequences(df['review'])  # Reviews -> lists of word indices

In [None]:
# Bring every index sequence to `max_length` entries; padding is appended at
# the end of the sequence ('post') so all reviews share one fixed length.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

In [None]:
# Prepare the targets. The model is trained with binary cross-entropy on a
# single sigmoid unit, which needs numeric 0/1 targets, so a text-valued
# 'sentiment' column is encoded here; an already numeric column is used as-is.
# NOTE(review): assumes text labels are 'positive' / 'negative' — confirm
# against the CSV.
labels = df['sentiment']
if not pd.api.types.is_numeric_dtype(labels):
    labels = labels.str.strip().str.lower().map({'negative': 0, 'positive': 1})
    if labels.isna().any():
        # Fail loudly instead of silently training on NaN targets.
        raise ValueError(
            "Unexpected values in 'sentiment'; expected 'positive' or 'negative'"
        )
    labels = labels.astype('int32')

# Get split subsets: 70% of the rows for training, 30% held out; the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Model definition: token ids -> embedding -> two stacked bidirectional LSTMs
# -> one sigmoid unit.

# Variable-length sequences of integer token ids
inputs = keras.Input(shape=(None,), dtype="int32")

# Layers are created in the order they are applied.
embedding = layers.Embedding(input_dim=max_features, output_dim=128)
# First recurrent layer returns the full sequence so a second one can be stacked on top
sequence_encoder = layers.Bidirectional(
    layers.LSTM(units=64, return_sequences=True)
)
# Second recurrent layer returns only its final output
summary_encoder = layers.Bidirectional(layers.LSTM(64))
# Single sigmoid unit as the classifier head
classifier = layers.Dense(units=1, activation="sigmoid")

# Wire the layers together
x = embedding(inputs)
x = sequence_encoder(x)
x = summary_encoder(x)
outputs = classifier(x)

model = keras.Model(inputs, outputs)

model.summary()

In [None]:
# Time compilation + training. perf_counter() is a monotonic clock, so the
# measured duration cannot be distorted by system clock adjustments the way
# a time.time() difference can.
start_time = time.perf_counter()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Keep the object returned by fit() so the training run can be inspected
# afterwards instead of being discarded.
# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

elapsed_seconds = time.perf_counter() - start_time
print(f"--- {elapsed_seconds:.2f} seconds ---")