# ***תרגיל 6 של הפרוייקט***

**Setup & Install (Colab)**

In [1]:
# ============================================================
# Block 1 — Setup & Install (Colab)
# ============================================================
!pip -q install transformers datasets accelerate scikit-learn

import os
import random
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from dataclasses import dataclass
from typing import Dict, List, Tuple

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup
)

# Make plots appear in notebook
# %matplotlib inline

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", DEVICE)


ImportError: cannot import name 'AdamW' from 'transformers' (/usr/local/lib/python3.12/dist-packages/transformers/__init__.py)

**Reproducibility (Seeds)**

In [None]:
# ============================================================
# Block 2 — Reproducibility (Seeds)
# ============================================================
def set_seed(seed: int = 42) -> None:
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)


**Config (Paths, Labels, Hyperparams)**

In [None]:
# ============================================================
# Block 3 — Config (Paths, Labels, Hyperparams)
# ============================================================
# Path to your CSV (upload it to Colab or mount Google Drive)
CSV_PATH = "/content/train-filtered_question_level.csv"  # <-- change if needed

# Labels mapping (must match your project definition)
label2id = {"easy": 0, "medium": 1, "hard": 2}
id2label = {v: k for k, v in label2id.items()}
NUM_LABELS = 3

# According to your dataset analysis:
# 95th percentile max length = 44 tokens
MAX_LEN = 44

# Baseline training hyperparams (you can change later for tuning runs)
BASE_LR = 3e-5
BASE_BATCH_SIZE = 16
BASE_EPOCHS = 3
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.1  # warmup steps = 10% of total steps


**Load Dataset (CSV) + Minimal Cleaning**

In [None]:
# ============================================================
# Block 4 — Load Dataset (CSV) + Minimal Cleaning
# ============================================================
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(f"CSV not found at: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

# Expected columns: question, level
required_cols = {"question", "level"}
if not required_cols.issubset(set(df.columns)):
    raise ValueError(f"CSV must contain columns: {required_cols}. Found: {df.columns}")

# Minimal safety cleanup (keep it simple; in Transformer stage we avoid heavy preprocessing)
df["question"] = df["question"].astype(str)
df["level"] = df["level"].astype(str)

# Remove exact duplicates by question (if not already done)
df = df.drop_duplicates(subset=["question"]).reset_index(drop=True)

# Keep only the 3 known labels
df = df[df["level"].isin(label2id.keys())].reset_index(drop=True)

print("Dataset size:", len(df))
print(df["level"].value_counts())


In [None]:
# ============================================================
# Block 4.5 — Balance Dataset (7000 per label) + Shuffle
# Insert between Block 4 and Block 5
# ============================================================

SAMPLES_PER_CLASS = 7000

# Sanity check: make sure each label has at least 7000 samples
counts = df["level"].value_counts()
missing = [lbl for lbl in label2id.keys() if counts.get(lbl, 0) < SAMPLES_PER_CLASS]
if missing:
    raise ValueError(
        f"Not enough samples for labels: {missing}. "
        f"Counts: {counts.to_dict()}"
    )

# Undersample each class to exactly 7000
balanced_df = (
    df.groupby("level", group_keys=False)
      .apply(lambda g: g.sample(n=SAMPLES_PER_CLASS, random_state=42))
      .reset_index(drop=True)
)

# Full shuffle after balancing (important)
balanced_df = balanced_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

df = balanced_df  # overwrite df so the next blocks use the balanced dataset

print("Balanced dataset size:", len(df))
print(df["level"].value_counts())


**Stratified Split (Train / Val / Test)**

In [None]:
# ============================================================
# Block 5 — Stratified Split (Train / Val / Test)
# ============================================================
# 70% train, 15% val, 15% test
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    random_state=42,
    stratify=df["level"]
)

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=temp_df["level"]
)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))
print("Test distribution:\n", test_df["level"].value_counts())


**PyTorch Dataset (Tokenizer-based)**

In [None]:
# ============================================================
# Block 6 — PyTorch Dataset (Tokenizer-based)
# ============================================================
class TriviaDataset(torch.utils.data.Dataset):
    """
    A simple dataset wrapper that tokenizes each question using the model tokenizer.
    """
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_len: int):
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        text = self.df.loc[idx, "question"]
        label_str = self.df.loc[idx, "level"]
        label_id = label2id[label_str]

        # Tokenizer creates input_ids + attention_mask (+ token_type_ids for some models)
        encoded = self.tokenizer(
            text,
            padding="max_length",       # explicit padding
            truncation=True,            # explicit truncation
            max_length=self.max_len,
            return_tensors="pt"
        )

        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(label_id, dtype=torch.long)
        }

        # Some models return token_type_ids (BERT); DistilBERT usually does not use it
        if "token_type_ids" in encoded:
            item["token_type_ids"] = encoded["token_type_ids"].squeeze(0)

        return item


**Training Utilities (Accuracy, Loops, Plots)**

In [None]:
# ============================================================
# Block 7 — Training Utilities (Accuracy, Loops, Plots)
# ============================================================
def batch_accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    preds = torch.argmax(logits, dim=1)
    correct = (preds == labels).sum().item()
    return correct / labels.size(0)

@dataclass
class TrainHistory:
    train_loss: List[float]
    val_loss: List[float]
    train_acc: List[float]
    val_acc: List[float]

def train_one_epoch(model, loader, optimizer, scheduler) -> Tuple[float, float]:
    model.train()
    total_loss, total_acc, n_batches = 0.0, 0.0, 0

    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total_acc += batch_accuracy(logits, batch["labels"])
        n_batches += 1

    return total_loss / n_batches, total_acc / n_batches

@torch.no_grad()
def eval_one_epoch(model, loader) -> Tuple[float, float]:
    model.eval()
    total_loss, total_acc, n_batches = 0.0, 0.0, 0

    for batch in loader:
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        logits = outputs.logits

        total_loss += loss.item()
        total_acc += batch_accuracy(logits, batch["labels"])
        n_batches += 1

    return total_loss / n_batches, total_acc / n_batches

def plot_history(history: TrainHistory, title: str) -> None:
    epochs = range(1, len(history.train_loss) + 1)

    plt.figure()
    plt.plot(epochs, history.train_loss, label="train_loss")
    plt.plot(epochs, history.val_loss, label="val_loss")
    plt.title(title + " — Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

    plt.figure()
    plt.plot(epochs, history.train_acc, label="train_acc")
    plt.plot(epochs, history.val_acc, label="val_acc")
    plt.title(title + " — Accuracy")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()


**Evaluation Utilities (Report + Confusion Matrix)**

In [None]:
# ============================================================
# Block 8 — Evaluation Utilities (Report + Confusion Matrix)
# ============================================================
@torch.no_grad()
def predict(model, loader) -> Tuple[List[int], List[int]]:
    model.eval()
    all_preds, all_labels = [], []

    for batch in loader:
        labels = batch["labels"].numpy().tolist()
        batch = {k: v.to(DEVICE) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy().tolist()

        all_preds.extend(preds)
        all_labels.extend(labels)

    return all_preds, all_labels

def show_metrics(y_true: List[int], y_pred: List[int], title: str) -> None:
    print("\n" + "="*70)
    print(title)
    print("="*70)

    print("\nClassification Report:")
    print(classification_report(
        y_true, y_pred,
        target_names=[id2label[i] for i in range(NUM_LABELS)],
        digits=4
    ))

    cm = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix (rows=true, cols=pred):")
    print(cm)


**One Experiment Runner (Model + Tokenizer + Fine-Tuning)**

In [None]:
# ============================================================
# Block 9 — One Experiment Runner (Model + Tokenizer + Fine-Tuning)
# ============================================================
def run_experiment(
    model_name: str,
    run_name: str,
    lr: float = BASE_LR,
    batch_size: int = BASE_BATCH_SIZE,
    epochs: int = BASE_EPOCHS
) -> Dict:
    print("\n" + "#"*80)
    print(f"Running: {run_name}")
    print(f"Model: {model_name}")
    print(f"LR={lr}, Batch={batch_size}, Epochs={epochs}, MAX_LEN={MAX_LEN}")
    print("#"*80)

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_ds = TriviaDataset(train_df, tokenizer, MAX_LEN)
    val_ds   = TriviaDataset(val_df, tokenizer, MAX_LEN)
    test_ds  = TriviaDataset(test_df, tokenizer, MAX_LEN)

    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader   = torch.utils.data.DataLoader(val_ds, batch_size=batch_size, shuffle=False)
    test_loader  = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=NUM_LABELS,
        label2id=label2id,
        id2label=id2label
    ).to(DEVICE)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=WEIGHT_DECAY)

    total_steps = len(train_loader) * epochs
    warmup_steps = int(total_steps * WARMUP_RATIO)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    history = TrainHistory(train_loss=[], val_loss=[], train_acc=[], val_acc=[])

    for ep in range(1, epochs + 1):
        tr_loss, tr_acc = train_one_epoch(model, train_loader, optimizer, scheduler)
        va_loss, va_acc = eval_one_epoch(model, val_loader)

        history.train_loss.append(tr_loss)
        history.train_acc.append(tr_acc)
        history.val_loss.append(va_loss)
        history.val_acc.append(va_acc)

        print(f"Epoch {ep}/{epochs} | train_loss={tr_loss:.4f} train_acc={tr_acc:.4f} | val_loss={va_loss:.4f} val_acc={va_acc:.4f}")

    plot_history(history, title=run_name)

    # Final evaluation on TEST
    y_pred, y_true = predict(model, test_loader)
    show_metrics(y_true, y_pred, title=f"{run_name} — Test Evaluation")

    return {
        "run_name": run_name,
        "model_name": model_name,
        "lr": lr,
        "batch_size": batch_size,
        "epochs": epochs,
        "history": history,
        "y_true": y_true,
        "y_pred": y_pred,
        "model": model,
        "tokenizer": tokenizer
    }


** Run Stage 1 (Two Models)**

In [None]:
# ============================================================
# Block 10 — Run Stage 1 (Two Models)
# ============================================================
# 1) Small baseline model (works fast)
distilbert_result = run_experiment(
    model_name="distilbert-base-uncased",
    run_name="Stage1_DistilBERT_uncased",
    lr=3e-5,
    batch_size=16,
    epochs=3
)

# 2) Larger model for comparison (after baseline works)
bert_cased_result = run_experiment(
    model_name="bert-base-cased",
    run_name="Stage1_BERT_cased",
    lr=2e-5,          # often a bit safer for larger models; feel free to tune
    batch_size=16,
    epochs=3
)


**Quick Side-by-Side Summary (Accuracy Only)**

In [None]:
# ============================================================
# Block 11 — Quick Side-by-Side Summary (Accuracy Only)
# ============================================================
from sklearn.metrics import accuracy_score

distil_acc = accuracy_score(distilbert_result["y_true"], distilbert_result["y_pred"])
bert_acc = accuracy_score(bert_cased_result["y_true"], bert_cased_result["y_pred"])

print("\n" + "="*70)
print("Stage 1 Summary (Test Accuracy)")
print("="*70)
print(f"DistilBERT-uncased: {distil_acc:.4f}")
print(f"BERT-base-cased:    {bert_acc:.4f}")


In [None]:
# ============================================================
# Block 12 — (Optional) Save Models to Drive / Local
# ============================================================
# You can save each fine-tuned model for later stages
# Example:
# distilbert_result["model"].save_pretrained("/content/distilbert_stage1")
# distilbert_result["tokenizer"].save_pretrained("/content/distilbert_stage1")
#
# bert_cased_result["model"].save_pretrained("/content/bert_cased_stage1")
# bert_cased_result["tokenizer"].save_pretrained("/content/bert_cased_stage1")
