# DI725 Final Assignment Notebook
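This notebook orchestrates training and evaluation of two sentiment classifiers:
- `nanoGPT` from scratch
- `GPT-2` fine-tuning

Training is delegated to modular `.py` scripts that log metrics to Weights & Biases (W&B); the final cell evaluates both saved models on `data/test.csv`.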

## 1. Install Required Packages

In [1]:
# Run once on a fresh environment (uncomment the line below). `%pip` installs into the
# running kernel's environment. The logs recorded in this notebook were produced with wandb 0.19.9.
# %pip install torch transformers datasets evaluate wandb pandas scikit-learn seaborn

## 2. Login to Weights & Biases

In [2]:
import wandb
# Authenticate this kernel with Weights & Biases before any training script is launched.
# The recorded output below shows the call returning True for an already logged-in user.
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mesrasekerci[0m ([33mesrasekerci-metu-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## 3. Train `nanoGPT` from Scratch

In [3]:
# Train the from-scratch model by running the script in a shell subprocess; its console
# log (W&B run links, per-epoch loss / validation accuracy / F1) is captured below.
# NOTE(review): `python3` is resolved via PATH and may not be the kernel's interpreter — confirm.
!python3 train_sentiment_nanogpt.py

[34m[1mwandb[0m: Currently logged in as: [33mesrasekerci[0m ([33mesrasekerci-metu-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/esrasekerci/Desktop/DI725/assignments/assignment_1/wandb/run-20250406_225321-2yfgtvbg[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdefiant-spot-19[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/esrasekerci-metu-middle-east-technical-university/di725-sentiment-transformer-restore[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/esrasekerci-metu-middle-east-technical-university/di725-sentiment-transformer-restore/runs/2yfgtvbg[0m
  output = torch._nested_tensor_from_mask(
Epoch 1: Loss = 1.0271, Val Acc = 0.7048, F1 = 0.6416
✅ Model saved (new best F1)
Epo

## 4. Fine-Tune Pretrained GPT-2

In [4]:
# Fine-tune GPT-2 by running the script in a shell subprocess; the captured log below shows
# the W&B run and dataset `Map` progress (1082 and 271 examples — presumably the
# train/validation splits; confirm against the script).
# NOTE(review): `python3` is resolved via PATH and may not be the kernel's interpreter — confirm.
!python3 train_gpt2.py

[34m[1mwandb[0m: Currently logged in as: [33mesrasekerci[0m ([33mesrasekerci-metu-middle-east-technical-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/Users/esrasekerci/Desktop/DI725/assignments/assignment_1/wandb/run-20250406_225537-2xuynldk[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mgpt2-final[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/esrasekerci-metu-middle-east-technical-university/di725-sentiment-gpt2[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/esrasekerci-metu-middle-east-technical-university/di725-sentiment-gpt2/runs/2xuynldk[0m
Map: 100%|█████████████████████████| 1082/1082 [00:00<00:00, 1207.17 examples/s]
Map: 100%|███████████████████████████| 271/271 [00:00<00:00, 1152.45 examples/s]
S

In [5]:
import pandas as pd
import re
import string
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from sklearn.metrics import classification_report

from model import SentimentTransformer

# === Device
# Use the GPU when CUDA is usable; otherwise everything below runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# === Label mapping
# sentiment name -> class id, plus the reverse lookup (class id -> name) used for reports.
sentiment_map = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}
inv_map = dict(zip(sentiment_map.values(), sentiment_map.keys()))

# === Load test data
# Header names are lower-cased and stripped so later column lookups do not depend on
# the casing or padding used in the CSV header.
df_test = pd.read_csv("data/test.csv").rename(columns=lambda name: name.lower().strip())

# === Preprocessing (same as training)
# Courtesy/boilerplate phrases removed from every transcript. The patterns are written
# without apostrophes ("youre", "thats") because they are applied after punctuation
# has already been stripped from the text.
custom_phrases = [
    r'\bis there anything else i can (assist|help) you with\b',
    r'\bthank you for choosing brownbox\b',
    r'\bthank you\b',
    r'\byoure welcome\b',
    r'\btake care\b',
    r'\bgoodbye\b',
    r'\bplease\b',
    r'\bthanks\b',
    r'\bsure\b',
    r'\bno thats all\b',
    r'\bhave a (nice|great|good) day\b',
    r'\bappreciate\b',
    r'\bfor contacting brownbox customer support\b',
]

# Ordered *substring* rewrites (plain str.replace, no word boundaries). Order matters:
# 'ts' -> 'this' turns "thats" into "thathis" and "its" into "ithis", and the later
# 'ithis' / 'thathis' entries then rewrite those to "it has" / "that is".
# NOTE(review): this also touches any other word containing these fragments; it is kept
# as-is because the cleaning is stated to be the same as at training time.
misspellings = {
    'ts': 'this',
    'witn': 'within',
    'anytng': 'anything',
    'ithis': 'it has',
    'thathis': 'that is',
    'as you': 'assure you',
    'en that': 'ensure that',
}

# Removed in this order: e-mail addresses, phone-like digit runs, URLs.
_CONTACT_PATTERNS = (
    r'\S+@\S+',
    r'\+?\d[\d\s\-().]{8,}\d',
    r'http\S+|www\S+',
)

# Substrings that mark an opening agent turn as a greeting (substring match, so e.g.
# "hi" also matches inside "this").
_GREETING_MARKERS = (
    "thank you for calling", "hi", "hello", "this is", "my name is",
    "how can i help you", "how may i assist you",
)

# Translation table deleting every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_conversation(text):
    """Normalise one raw conversation transcript into a single cleaned string.

    Steps, in order: lower-case; drop e-mails, phone-like numbers and URLs; drop the
    first line when it is an ``agent:`` turn containing a greeting marker; join the
    remaining lines with spaces; delete punctuation; collapse whitespace; remove the
    ``custom_phrases`` patterns; apply the ``misspellings`` substring rewrites; finally
    strip a dangling trailing ``customer``/``agent`` speaker tag.

    Args:
        text: Raw transcript string, one speaker turn per line.

    Returns:
        The cleaned string. Whitespace is not re-collapsed after phrase removal, so
        double or trailing spaces can remain.
    """
    lowered = text.lower()
    for contact_pattern in _CONTACT_PATTERNS:
        lowered = re.sub(contact_pattern, '', lowered)

    turns = lowered.strip().split('\n')
    opens_with_agent = bool(turns) and turns[0].startswith("agent:")
    if opens_with_agent and any(marker in turns[0] for marker in _GREETING_MARKERS):
        turns = turns[1:]

    cleaned = ' '.join(turns).translate(_PUNCTUATION_TABLE)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    for phrase_pattern in custom_phrases:
        cleaned = re.sub(phrase_pattern, '', cleaned)

    # dict order is significant here (see the note on `misspellings`).
    for fragment, replacement in misspellings.items():
        cleaned = cleaned.replace(fragment, replacement)

    return re.sub(r'\b(customer|agent)\b\s*$', '', cleaned)

# Clean every transcript with the same routine that is documented as matching training.
df_test["text"] = df_test["conversation"].astype(str).apply(clean_conversation)

# Map sentiment names to class ids. Series.map yields NaN for anything not in
# sentiment_map (typos, new classes, missing cells); fail here with a clear message
# instead of letting NaN labels reach tensor creation or the metric code.
normalized_sentiment = df_test["customer_sentiment"].str.strip().str.lower()
mapped_labels = normalized_sentiment.map(sentiment_map)
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unknown_values = sorted(
        df_test.loc[unmapped_rows, "customer_sentiment"].astype(str).unique()
    )
    raise ValueError(
        f"{int(unmapped_rows.sum())} test row(s) have a customer_sentiment outside "
        f"{sorted(sentiment_map)}: {unknown_values}"
    )
# Explicit integer dtype: the labels are class indices.
df_test["label"] = mapped_labels.astype("int64")

# === Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# The tokenizer is given the end-of-sequence token as its padding token so that
# padding="max_length" works in TextDataset.
tokenizer.pad_token = tokenizer.eos_token
# Maximum sequence length in tokens; also passed as max_len to the from-scratch model.
# NOTE(review): presumably must equal the value used in the training scripts — confirm.
BLOCK_SIZE = 128

# === Dataset
class TextDataset(Dataset):
    """Text-classification dataset that tokenizes all texts once, up front.

    Args:
        texts: Iterable of strings (e.g. a pandas Series or a list).
        labels: Iterable of integer class ids, one per text (pandas Series, list, ...).
        tokenizer: Callable invoked as ``tokenizer(list_of_texts, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")`` whose
            result supports ``["input_ids"]`` and ``["attention_mask"]`` lookups.
        max_len: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length, or if any label is
            missing (None/NaN) — e.g. a sentiment string that failed to map to an id.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        text_list = list(texts)
        # Iterating (rather than reading `.values`) lets plain lists/tuples work too.
        label_list = list(labels)

        if len(text_list) != len(label_list):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(text_list)} and {len(label_list)})"
            )
        missing_positions = [i for i, value in enumerate(label_list) if pd.isna(value)]
        if missing_positions:
            raise ValueError(
                f"labels contain missing values at positions {missing_positions[:10]}"
            )

        self.encodings = tokenizer(
            text_list,
            truncation=True,
            padding="max_length",
            max_length=max_len,
            return_tensors="pt"
        )
        # Class indices are always stored as int64, whatever the input dtype was.
        self.labels = torch.tensor(label_list, dtype=torch.long)

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` entries for ``idx``."""
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "labels": self.labels[idx],
        }

# === DataLoader
# No shuffling: the gold labels gathered during the first pass are reused to score the
# second model, so both passes must visit the examples in the same order.
test_dataset = TextDataset(df_test["text"], df_test["label"], tokenizer, BLOCK_SIZE)
test_loader = DataLoader(test_dataset, batch_size=32)

# Fixed class ids and display names for the reports. Passing `labels=` explicitly keeps
# classification_report working (and the rows aligned with the names) even when a class
# is absent from both the gold labels and the predictions.
class_ids = sorted(inv_map)
class_names = [inv_map[i] for i in class_ids]


def collect_predictions(logits_fn, loader):
    """Run ``logits_fn`` over every batch of ``loader`` and gather class predictions.

    Args:
        logits_fn: Callable ``(input_ids, attention_mask) -> logits``; the predicted
            class is the argmax over ``dim=1`` of the returned tensor.
        loader: Iterable of batch dicts with "input_ids", "attention_mask" and
            "labels" tensors (the items produced by ``TextDataset``).

    Returns:
        ``(predictions, gold_labels)`` — two lists of Python ints in loader order.
    """
    predictions, gold_labels = [], []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(device)
            attn_mask = batch["attention_mask"].to(device)
            logits = logits_fn(input_ids, attn_mask)
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            gold_labels.extend(batch["labels"].tolist())
    return predictions, gold_labels


# === NanoGPT Evaluation
model_nano = SentimentTransformer(
    vocab_size=tokenizer.vocab_size,
    emb_dim=256,
    max_len=BLOCK_SIZE
).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, so loading the
# checkpoint cannot execute arbitrary code; a state_dict needs nothing more than that.
model_nano.load_state_dict(
    torch.load("sentiment_transformer_best.pt", map_location=device, weights_only=True)
)
model_nano.eval()

all_preds_nano, all_labels = collect_predictions(model_nano, test_loader)

print("📊 NanoGPT Evaluation:")
print(classification_report(all_labels, all_preds_nano,
                            labels=class_ids, target_names=class_names, digits=4))

# === GPT-2 Evaluation
model_gpt2 = GPT2ForSequenceClassification.from_pretrained("gpt2_sentiment_model").to(device)
model_gpt2.eval()

all_preds_gpt2, labels_second_pass = collect_predictions(
    lambda input_ids, attn_mask: model_gpt2(
        input_ids=input_ids, attention_mask=attn_mask
    ).logits,
    test_loader,
)
# Guard the assumption that both passes saw the examples in the same order.
assert labels_second_pass == all_labels, "test_loader order changed between evaluation passes"

print("📊 GPT-2 Evaluation:")
print(classification_report(all_labels, all_preds_gpt2,
                            labels=class_ids, target_names=class_names, digits=4))

  model_nano.load_state_dict(torch.load("sentiment_transformer_best.pt", map_location=device))
  output = torch._nested_tensor_from_mask(


📊 NanoGPT Evaluation:
              precision    recall  f1-score   support

    negative     0.8889    0.8000    0.8421        10
     neutral     0.5000    0.9000    0.6429        10
    positive     0.6667    0.2000    0.3077        10

    accuracy                         0.6333        30
   macro avg     0.6852    0.6333    0.5976        30
weighted avg     0.6852    0.6333    0.5976        30

📊 GPT-2 Evaluation:
              precision    recall  f1-score   support

    negative     0.9091    1.0000    0.9524        10
     neutral     0.6000    0.9000    0.7200        10
    positive     1.0000    0.4000    0.5714        10

    accuracy                         0.7667        30
   macro avg     0.8364    0.7667    0.7479        30
weighted avg     0.8364    0.7667    0.7479        30

