In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from utils.helper import find_project_root

In [None]:
# Locate the NLP dataset folder relative to the project root so the notebook
# does not depend on the current working directory.
project_root = find_project_root()
DATASET_PATH = project_root / Path("datasets/nlp/")

# The CSV is read straight from the zip archive; pandas infers the
# compression from the ".zip" suffix.
imdb_archive = DATASET_PATH / "IMDB Dataset.csv.zip"
df = pd.read_csv(imdb_archive)

In [None]:
from nlp.data.preprocessing import TextPreprocessor

In [None]:
# Encode the string sentiment labels as integers (positive -> 1, negative -> 0).
SENTIMENT_TO_ID = {"positive": 1, "negative": 0}

# Series.map turns every value that is missing from the mapping into NaN, so
# re-running this cell on an already-encoded column would wipe out all labels.
# Only apply the mapping while the column is still non-numeric.
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    encoded_sentiment = df["sentiment"].map(SENTIMENT_TO_ID)
    if encoded_sentiment.isna().any():
        # Fail loudly instead of training on NaN labels.
        unexpected = df.loc[encoded_sentiment.isna(), "sentiment"].unique()
        raise ValueError(f"Unexpected sentiment labels: {unexpected!r}")
    df["sentiment"] = encoded_sentiment

In [None]:
from torch.utils.data import random_split
import torch

# Partition sizes: 70% train, 10% validation, and the remainder as test.
train_size = int(0.7 * len(df))
val_size = int(0.10 * len(df))
test_size = len(df) - train_size - val_size
print(train_size, val_size, test_size)

# random_split returns Subset objects; the fixed generator seed makes the
# shuffled partition reproducible across runs.
train_dataset, val_dataset, test_dataset = random_split(
    df,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)
)


def _reviews_and_labels(subset):
    """Return the (review, sentiment) Series for the rows belonging to `subset`.

    `subset.dataset` is the *entire* original DataFrame, so reading columns
    from it directly would hand the same full data to train, validation and
    test. The rows that belong to a split are given by `subset.indices`
    (positional indices), hence the `iloc` selection.
    """
    rows = df.iloc[subset.indices].reset_index(drop=True)
    return rows["review"], rows["sentiment"]


X_train, y_train = _reviews_and_labels(train_dataset)
X_val, y_val = _reviews_and_labels(val_dataset)
X_test, y_test = _reviews_and_labels(test_dataset)

# Sanity check: every split must have exactly its requested number of rows.
assert (len(X_train), len(X_val), len(X_test)) == (train_size, val_size, test_size)

# Model Training

In [None]:
from nlp.features.rnn_dataset import Vocabulary, RNNDataset, SequenceEncoder

In [None]:
# Tokenize each split with one shared preprocessor instance, in the order
# train -> validation -> test.
preprocessor = TextPreprocessor()
X_train_tokens, X_val_tokens, X_test_tokens = [
    preprocessor.tokenize_batch(split_texts)
    for split_texts in (X_train, X_val, X_test)
]

In [None]:
# Frequency threshold handed to Vocabulary (presumably tokens seen fewer
# times than this are left out — confirm in Vocabulary).
MIN_TOKEN_FREQ = 5

# The vocabulary is built from the training tokens only, so validation and
# test text does not influence it.
vocab = Vocabulary(min_freq=MIN_TOKEN_FREQ)
vocab.build(X_train_tokens)

In [None]:
from nlp.pipelines.rnn_pipeline import RNNPipeline
from nlp.models.rnn_model import LSTMClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Encoder configuration: the vocabulary, a tokenizer callable and max_len=200
# (presumably the sequence length cap — confirm in SequenceEncoder).
# NOTE(review): `vocab` was built from TextPreprocessor.tokenize_batch output,
# while the encoder tokenizes with a plain lowercase/whitespace split. If the
# two tokenizations differ, many tokens will miss the vocabulary — confirm
# whether the encoder should reuse the preprocessor's tokenizer.
encoder = SequenceEncoder(
    vocab=vocab,
    tokenizer=lambda x: x.lower().split(),
    max_len=200
)

# Seed torch's global RNG before the model is created so that the run is
# repeatable (assuming LSTMClassifier and RNNPipeline draw their randomness
# from torch's default generator).
torch.manual_seed(42)

# Two-layer bidirectional LSTM classifier sized to the vocabulary.
model = LSTMClassifier(
    vocab_size=len(vocab),
    embedding_dim=100,
    hidden_dim=256,
    num_layers=2,
    bidirectional=True
)

pipeline = RNNPipeline(model, encoder, device="cpu")

# Train for 3 epochs, passing the validation split alongside the training data.
pipeline.fit(X_train, y_train, epochs=3, X_val=X_val, y_val=y_val)

Training took about 40 minutes (3 epochs on CPU).

Epoch 0 | Loss 0.6526 | Train Acc 0.7655 | Val Acc 0.7655
Epoch 1 | Loss 0.4414 | Train Acc 0.8618 | Val Acc 0.8618
Epoch 2 | Loss 0.3191 | Train Acc 0.9031 | Val Acc 0.9031

In [None]:
# Score the fitted pipeline on the held-out test split and report it.
test_accuracy = pipeline.score(X_test, y_test)
print("Test accuracy:", test_accuracy)

In [None]:
# Predictions for the test reviews; reused below by the classification report,
# the confusion matrix and the misclassification inspection.
y_pred = pipeline.predict(X_test)

In [None]:
# Text summary of the classification metrics on the test split.
test_report = classification_report(y_true=y_test, y_pred=y_pred)
print(test_report)

In [None]:
# Confusion matrix of true vs. predicted labels (shown as the cell output).
confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Positions in the test split where the prediction disagrees with the label.
is_misclassified = y_pred != y_test
mis_idx = np.where(is_misclassified)[0]

# Number of misclassified test reviews (shown as the cell output).
n_misclassified = len(mis_idx)
n_misclassified

In [None]:
# Print the first ten misclassified reviews with their true and predicted labels.
first_mistakes = mis_idx[:10]
for position in first_mistakes:
    print("\n\n----------")
    review_text = X_test.iloc[position]
    print("TEXT:", review_text)
    true_label = y_test.iloc[position]
    print("TRUE:", true_label)
    predicted_label = y_pred[position]
    print("PRED:", predicted_label)