# Import dependencies

In [None]:
# To mute annoying warnings in notebook
import warnings
import time

import numpy as np
import pandas as pd
import keras
from keras import layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Silence every warning for the rest of the session to keep the notebook
# output readable. NOTE(review): this also hides deprecation notices from
# the imported libraries - consider narrowing it with `category=` if
# something starts misbehaving.
warnings.filterwarnings("ignore")

# Loading the data and first observations
Load the IMDB reviews dataset

In [None]:
# Load the reviews dataset from CSV. The path is relative to the notebook's
# working directory; a plain string is used because nothing is interpolated.
df = pd.read_csv("../data/IMDB_Dataset.csv")

# Preview the first rows to confirm the file parsed as expected
df.head()

In [None]:
# Separate the model input (the review column) from the label column
features = df["review"]
target = df["sentiment"]

In [None]:
# The model defined further down feeds its int32 input through an Embedding
# layer, which looks up *integer token ids* arranged as a sequence. A TF-IDF
# matrix (one float weight per vocabulary term) has the wrong dtype and the
# wrong meaning for that layer, so the vectorizer is fitted only to obtain a
# tokenizer and a frequency-capped vocabulary; each review is then encoded as
# a fixed-length sequence of token ids.
vocab_size = 20000       # keep equal to the Embedding input_dim (max_features)
sequence_length = 200    # number of tokens kept per review

# Id 0 is reserved for padding, so only vocab_size - 1 real terms are kept.
tf_idf = TfidfVectorizer(stop_words='english', max_features=vocab_size - 1)
tf_idf.fit(features)

tokenize = tf_idf.build_analyzer()   # same preprocessing/stop-word rules as fit
term_to_id = tf_idf.vocabulary_      # term -> column index in [0, vocab_size - 2]

# Rows start as all-padding (0); each review overwrites the leading positions.
feature_vector = np.zeros((len(features), sequence_length), dtype="int32")
for row, review in enumerate(features):
    # Shift ids by one so that 0 never collides with a real term;
    # out-of-vocabulary terms are dropped.
    token_ids = [
        term_to_id[term] + 1
        for term in tokenize(review)
        if term in term_to_id
    ]
    token_ids = token_ids[:sequence_length]  # truncate long reviews
    feature_vector[row, :len(token_ids)] = token_ids

# Show the encoded reviews (one row per review, one column per position)
pd.DataFrame(feature_vector, index=features.index)

In [None]:
# Encode the sentiment labels as integers; any label missing from the
# mapping becomes NaN.
label_to_id = {'positive': 1, 'negative': 0}
target_vector = target.map(label_to_id)

# Display the encoded labels
target_vector

In [None]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the
# partition reproducible between runs.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(
    feature_vector, target_vector, **split_options
)

In [None]:
# Vocabulary size used as the Embedding input dimension (top 20k words only)
max_features = 20_000
# NOTE(review): presumably the per-review token cap; it is not referenced by
# any of the visible cells - confirm whether it is still needed.
max_length = 200

In [None]:
# Variable-length sequences of integer values
inputs = keras.Input(shape=(None,), dtype="int32")

# Feature extractor, applied in order:
#   1. embed every id into a 128-dimensional vector,
#   2. bidirectional LSTM that returns the full sequence,
#   3. bidirectional LSTM that returns only its final output.
encoder_stack = [
    layers.Embedding(max_features, 128),
    layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(64)),
]

x = inputs
for layer in encoder_stack:
    x = layer(x)

# Single sigmoid unit for the binary output
outputs = layers.Dense(units=1, activation="sigmoid")(x)

model = keras.Model(inputs, outputs)

model.summary()

In [None]:
# Time compilation + training. perf_counter() is a monotonic, high-resolution
# clock, so the measured interval cannot be skewed by system clock adjustments
# the way time.time() can.
start_time = time.perf_counter()

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# NOTE(review): the held-out test split doubles as validation data here, so
# the reported validation metrics are also the only test metrics.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=2,
    validation_data=(X_test, y_test),
)

elapsed = time.perf_counter() - start_time
print(f"--- {elapsed:.2f} seconds ---")