In [None]:
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from torch.optim import AdamW
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    get_linear_schedule_with_warmup,
)

# ------------------------------------------------------------------
# Reproducibility & device selection
# ------------------------------------------------------------------
SEED = 38

# Seed every CPU-side random number generator this notebook imports.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when one is available; the CUDA generators are seeded only then.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    torch.cuda.manual_seed_all(SEED)
else:
    DEVICE = torch.device("cpu")

print(f"Using device: {DEVICE}")

Using device: cuda


In [11]:
DATA_PATH = Path("data") / "imdb_data.csv"

# Rename the CSV columns to the generic names used by the rest of the notebook.
df = pd.read_csv(DATA_PATH)
df = df.rename(columns={"review": "text", "sentiment": "label"})

# Hold out 20% as the test set, then 20% of the remainder as the validation
# set. Both splits are stratified on the label column and seeded with SEED.
train_df, test_df = train_test_split(
    df, stratify=df["label"], test_size=0.20, random_state=SEED
)
train_df, val_df = train_test_split(
    train_df, stratify=train_df["label"], test_size=0.20, random_state=SEED
)

print(
    f"Train size: {len(train_df)}, "
    f"Validation size: {len(val_df)}, "
    f"Test size: {len(test_df)}"
)

Train size: 32000, Validation size: 8000, Test size: 10000


In [None]:
# Pretrained tokenizer for the "bert-base-uncased" checkpoint; encode() below
# calls it, and the classifier cell loads the model under the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def encode(texts, max_length: int = 256):
    """Tokenise a collection of strings into fixed-length tensors (on CPU).

    Args:
        texts: Iterable of raw text strings. It is materialised with ``list``
            before being passed to the module-level ``tokenizer``.
        max_length: Length every example is truncated or padded to. Defaults
            to 256, the value that used to be hard-coded here, so existing
            one-argument calls behave exactly as before.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output,
        which is requested as PyTorch tensors (``return_tensors="pt"``).
    """
    enc = tokenizer(
        list(texts),
        padding="max_length",  # pad every row to max_length, not to the longest row
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return enc["input_ids"], enc["attention_mask"]


# String sentiment -> integer class id. build_dataset() applies this to the
# "label" column; the ids line up with target_names = ["negative", "positive"]
# used in the test-set report.
label_map = {"positive": 1, "negative": 0}



In [None]:
def build_dataset(frame: pd.DataFrame) -> TensorDataset:
    """Turn a dataframe with ``text`` and ``label`` columns into a TensorDataset.

    Args:
        frame: Dataframe whose ``label`` values are keys of the module-level
            ``label_map`` and whose ``text`` column holds the raw strings.

    Returns:
        ``TensorDataset(input_ids, attention_mask, labels)`` where ``labels``
        is an int64 tensor of class ids.

    Raises:
        ValueError: If any value in ``frame["label"]`` is not a key of
            ``label_map``. ``Series.map`` would otherwise turn it into NaN and
            produce a float label tensor that only fails later, inside the
            loss computation, with an unrelated-looking error.
    """
    # Validate labels first: it is cheap, whereas tokenising is the slow part.
    mapped = frame["label"].map(label_map)
    unmapped = mapped.isna()
    if unmapped.any():
        unknown = sorted(set(frame.loc[unmapped, "label"].astype(str)))
        raise ValueError(
            f"Unmapped value(s) in 'label' column: {unknown}; "
            f"expected one of {sorted(label_map)}"
        )

    ids, masks = encode(frame["text"].values)
    labels = torch.tensor(mapped.to_numpy(dtype="int64"), dtype=torch.long)
    return TensorDataset(ids, masks, labels)


# Tokenise the training split first, then the validation split.
train_ds, val_ds = (build_dataset(split) for split in (train_df, val_df))

# Only the training batches are reshuffled; validation order stays fixed.
train_loader = DataLoader(dataset=train_ds, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=16, shuffle=False)

In [None]:
# Sequence classifier loaded from the "bert-base-uncased" checkpoint with two
# output labels, then moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)
model = model.to(DEVICE)

optimiser = AdamW(params=model.parameters(), lr=2e-5)

# The scheduler is stepped once per training batch, so its horizon is
# (batches per epoch) x (number of epochs); the training loop runs 3 epochs.
total_steps = 3 * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer=optimiser,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def epoch_metrics(logits_list, labels_list):
    """Compute accuracy and macro-F1 from per-batch logits and labels.

    Args:
        logits_list: List of logit tensors, one per batch; they are
            concatenated and the argmax over dim 1 is the predicted class.
        labels_list: List of label tensors, one per batch, in the same order.

    Returns:
        Tuple ``(accuracy, macro_f1)`` as computed by scikit-learn.
    """
    stacked_logits = torch.cat(logits_list)
    stacked_labels = torch.cat(labels_list)

    predicted = torch.argmax(stacked_logits, dim=1).cpu().numpy()
    expected = stacked_labels.cpu().numpy()

    return (
        accuracy_score(expected, predicted),
        f1_score(expected, predicted, average="macro"),
    )


def save_checkpoint(model, optimiser, path: "Path | str"):
    """Write the model and optimiser state dicts to ``path`` with ``torch.save``.

    Args:
        model: Object exposing ``state_dict()`` (e.g. a ``torch.nn.Module``).
        optimiser: Object exposing ``state_dict()`` (e.g. a torch optimiser).
        path: Destination file. A ``Path`` or anything ``Path()`` accepts,
            such as a plain string; missing parent directories are created.

    The saved object is a dict with the keys ``"model_state_dict"`` and
    ``"optimizer_state_dict"``. A confirmation line is printed afterwards.
    """
    # Coerce first so that a str argument does not crash on `.parent` below.
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(
        {
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimiser.state_dict(),
        },
        path,
    )
    print(f"Model saved to {path}")

In [15]:
EPOCHS = 3
# Additional checkpoint holding the weights with the lowest validation loss seen
# so far. Later epochs can overfit (validation loss rising while training loss
# keeps falling), in which case the final-epoch checkpoint is not the best one.
BEST_CHECKPOINT_PATH = Path("models/imdb_bert_best.pth")

# The LR schedule horizon (`total_steps`) was fixed when the scheduler was
# built. Fail fast if it no longer matches the number of optimiser steps this
# loop will take, instead of silently training on a mis-sized schedule.
expected_steps = len(train_loader) * EPOCHS
if total_steps != expected_steps:
    raise RuntimeError(
        f"Scheduler was built for {total_steps} steps but this loop will run "
        f"{expected_steps}; rebuild the optimiser/scheduler with a matching epoch count."
    )

best_val_loss = float("inf")
for epoch in range(1, EPOCHS + 1):
    print(f"\nEpoch {epoch}/{EPOCHS}")
    # ---- train ----------------------------------------------------
    model.train()
    running_loss = 0.0
    for ids, masks, labels in tqdm(train_loader, desc="Training"):
        ids, masks, labels = ids.to(DEVICE), masks.to(DEVICE), labels.to(DEVICE)

        optimiser.zero_grad()
        # Passing `labels` makes the model return the loss alongside the logits.
        outputs = model(
            input_ids=ids, attention_mask=masks, labels=labels, return_dict=True
        )
        loss = outputs.loss
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimiser.step()
        scheduler.step()  # stepped per batch, not per epoch

        running_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    train_loss = running_loss / len(train_loader)

    # ---- validate -------------------------------------------------
    model.eval()
    val_loss = 0.0
    all_logits, all_labels = [], []
    with torch.no_grad():
        for ids, masks, labels in tqdm(val_loader, desc="Validating"):
            ids, masks, labels = ids.to(DEVICE), masks.to(DEVICE), labels.to(DEVICE)
            outputs = model(
                input_ids=ids, attention_mask=masks, labels=labels, return_dict=True
            )
            val_loss += outputs.loss.item()
            # Move results to the CPU right away so they do not pile up on the device.
            all_logits.append(outputs.logits.cpu())
            all_labels.append(labels.cpu())

    val_loss /= len(val_loader)
    acc, f1 = epoch_metrics(all_logits, all_labels)

    print(
        f"Train loss: {train_loss:.4f} | "
        f"Val loss: {val_loss:.4f} | "
        f"Accuracy: {acc:.4f} | "
        f"F1: {f1:.4f}"
    )

    # Keep a copy of the best-generalising weights (model selection on the
    # validation split only; the test split is never consulted here).
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        save_checkpoint(model, optimiser, BEST_CHECKPOINT_PATH)

# ------------------------------------------------------------------
# Save final checkpoint
# ------------------------------------------------------------------
save_checkpoint(model, optimiser, Path("models/imdb_bert.pth"))


Epoch 1/3


Training: 100%|██████████| 2000/2000 [05:36<00:00,  5.94it/s]
Validating: 100%|██████████| 500/500 [00:26<00:00, 18.57it/s]


Train loss: 0.2661 | Val loss: 0.1957 | Accuracy: 0.9276 | F1: 0.9276

Epoch 2/3


Training: 100%|██████████| 2000/2000 [05:36<00:00,  5.94it/s]
Validating: 100%|██████████| 500/500 [00:26<00:00, 18.70it/s]


Train loss: 0.1485 | Val loss: 0.2639 | Accuracy: 0.9267 | F1: 0.9267

Epoch 3/3


Training: 100%|██████████| 2000/2000 [05:37<00:00,  5.92it/s]
Validating: 100%|██████████| 500/500 [00:27<00:00, 18.35it/s]


Train loss: 0.0770 | Val loss: 0.3291 | Accuracy: 0.9264 | F1: 0.9264
Model saved to models/imdb_bert.pth


In [21]:
# Tokenise the held-out test split and wrap it in an unshuffled DataLoader.
test_ds = build_dataset(test_df)
test_loader = DataLoader(dataset=test_ds, batch_size=16, shuffle=False)

# Switch the model to inference mode; gradients are disabled in the loop below.
model.eval()
logits_list = []
labels_list = []

with torch.no_grad():
    for ids, masks, labels in tqdm(test_loader, desc="Testing"):
        # Labels are not fed to the model here, so they are not moved to DEVICE.
        ids = ids.to(DEVICE)
        masks = masks.to(DEVICE)
        outputs = model(input_ids=ids, attention_mask=masks, return_dict=True)
        logits_list.append(outputs.logits.cpu())
        labels_list.append(labels)

# Collapse the per-batch pieces into flat arrays for scikit-learn.
logits = torch.cat(logits_list)
y_pred = torch.argmax(logits, dim=1).numpy()
y_true = torch.cat(labels_list).numpy()

# Headline numbers are macro-averaged over the two classes.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")

print("\n=== Test-set metrics ===")
print(
    f"Accuracy : {accuracy:.4f}\n"
    f"Precision: {precision:.4f}\n"
    f"Recall   : {recall:.4f}\n"
    f"F1-score : {f1:.4f}"
)

print("\n--- Per-class report ---")
# Order follows the class ids in label_map: 0 -> negative, 1 -> positive.
target_names = ["negative", "positive"]
print(classification_report(y_true, y_pred, target_names=target_names))

print("\n--- Confusion matrix ---")
print(confusion_matrix(y_true, y_pred))

Testing: 100%|██████████| 625/625 [00:33<00:00, 18.94it/s]


=== Test-set metrics ===
Accuracy : 0.9239
Precision: 0.9241
Recall   : 0.9239
F1-score : 0.9239

--- Per-class report ---
              precision    recall  f1-score   support

    negative       0.93      0.91      0.92      5000
    positive       0.92      0.93      0.92      5000

    accuracy                           0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000


--- Confusion matrix ---
[[4569  431]
 [ 330 4670]]



