In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from utils.helper import find_project_root

In [None]:
# Directory holding the NLP datasets, resolved relative to the project root.
DATASET_PATH = find_project_root() / Path("datasets", "nlp")

# The CSV is read straight from the zip archive; pandas infers the
# compression from the file extension.
df = pd.read_csv(DATASET_PATH.joinpath("IMDB Dataset.csv.zip"))

In [None]:
from nlp.data.preprocessing import TextPreprocessor

In [None]:
# Encode the string sentiment labels as integers (positive -> 1, negative -> 0).
# The column is overwritten in place, so the mapping is guarded: re-running
# this cell on an already-encoded (numeric) column would otherwise map every
# value to NaN, because 0/1 are not keys of the mapping.
sentiment_to_label = {"positive": 1, "negative": 0}
if not pd.api.types.is_numeric_dtype(df["sentiment"]):
    df["sentiment"] = df["sentiment"].map(sentiment_to_label)

In [None]:
# Display the frame as the cell output to inspect the loaded reviews and labels.
df

In [None]:
from torch.utils.data import random_split
import torch

# 70 / 15 / 15 split; the test split absorbs the rounding remainder so the
# three sizes always add up to len(df).
train_size = int(0.7 * len(df))
val_size = int(0.15 * len(df))
test_size = len(df) - train_size - val_size
print(train_size, val_size, test_size)

train_dataset, val_dataset, test_dataset = random_split(
    df,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)
)

# random_split returns Subset objects. `Subset.dataset` is the *whole*
# original frame, so reading columns from it would hand every split the full
# dataset. The rows belonging to a split are given by `Subset.indices`
# (positional), hence the .iloc selection below.
train_df = df.iloc[list(train_dataset.indices)].reset_index(drop=True)
val_df = df.iloc[list(val_dataset.indices)].reset_index(drop=True)
test_df = df.iloc[list(test_dataset.indices)].reset_index(drop=True)

# Sanity check: the splits partition the data.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

X_train, y_train = train_df["review"], train_df["sentiment"]
X_val, y_val = val_df["review"], val_df["sentiment"]
X_test, y_test = test_df["review"], test_df["sentiment"]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the data as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

# Carve the validation set out of the training portion so that X_val / y_val
# are always disjoint from X_train and X_test. Without this, the validation
# variables keep whatever an earlier cell assigned and overlap the new
# training set. 0.125 of the remaining 80% is 10% of the full data
# (70 / 10 / 20 overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.125, random_state=42
)

# train_test_split keeps the original (shuffled) index labels. Reset them so
# positional and label-based lookups agree, matching the reset_index(drop=True)
# convention used by the random_split cell.
# NOTE(review): RNNDataset presumably indexes labels by position - confirm.
X_train, X_val, X_test = (
    part.reset_index(drop=True) for part in (X_train, X_val, X_test)
)
y_train, y_val, y_test = (
    part.reset_index(drop=True) for part in (y_train, y_val, y_test)
)

In [None]:
# Tokenise every split with one shared preprocessor instance, in
# train -> val -> test order.
preprocessor = TextPreprocessor()
X_train_tokens, X_val_tokens, X_test_tokens = (
    preprocessor.tokenize_batch(texts) for texts in (X_train, X_val, X_test)
)

In [None]:
from torch.utils.data import DataLoader
from nlp.features.rnn_dataset import Vocabulary, RNNDataset
from nlp.models.rnn_model import LSTMClassifier
from nlp.pipelines.rnn_trainer import Trainer

In [None]:
# Build the vocabulary from the training tokens only; validation and test
# tokens are not passed to build().
# NOTE(review): min_freq=5 presumably drops tokens seen fewer than 5 times -
# confirm against the Vocabulary implementation.
vocab = Vocabulary(min_freq=5)
vocab.build(X_train_tokens)

In [None]:
# Wrap each tokenised split (with its labels and the shared vocabulary) in an
# RNNDataset. These names rebind train_dataset / val_dataset / test_dataset.
train_dataset = RNNDataset(X_train_tokens, y_train, vocab)
val_dataset = RNNDataset(X_val_tokens, y_val, vocab)
test_dataset = RNNDataset(X_test_tokens, y_test, vocab)

# Only the training loader shuffles. The evaluation loaders (validation and
# test) iterate in a fixed order so evaluation runs are repeatable and do not
# consume random state.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64)

# Model Training

In [None]:
# Hyperparameters for the classifier, gathered in one place; the vocabulary
# size comes from the vocabulary that was just built.
lstm_config = {
    "vocab_size": len(vocab),
    "embedding_dim": 100,
    "hidden_dim": 128,
    "bidirectional": True,
}
model = LSTMClassifier(**lstm_config)

In [None]:
# Train on CPU for two epochs, reporting the training loss and the accuracy
# returned by trainer.evaluate() after each pass.
# NOTE(review): evaluate() presumably scores the val_loader given to the
# Trainer - confirm.
trainer = Trainer(model, train_loader, val_loader, device="cpu")

for epoch in range(2):
    loss, acc = trainer.train_epoch(), trainer.evaluate()
    print(f"Epoch {epoch} | Loss {loss:.4f} | Acc {acc:.4f}")

# Evaluate

In [None]:
# Final evaluation on the held-out test set. The training-time trainer was
# built with val_loader, so calling its evaluate() again would only repeat the
# validation accuracy and test_loader would never be used. A second Trainer
# is built around the same trained model with test_loader in the evaluation
# slot.
# NOTE(review): assumes constructing a Trainer does not re-initialise the
# model's weights and that evaluate() scores the second loader - confirm
# against nlp.pipelines.rnn_trainer.
test_trainer = Trainer(model, train_loader, test_loader, device="cpu")
test_trainer.evaluate()