In [5]:
!pip install -U datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m24.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [1]:
import json, time, random, numpy as np, pandas as pd, torch, seaborn as sns, matplotlib.pyplot as plt
from pathlib import Path
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, classification_report, f1_score, mean_absolute_error, confusion_matrix

# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [5]:
########## EDITABLE PARAMS ##########
# Everything a reader is expected to tune for one evaluation run lives here.
RUN_ID = "deberta_tuned_eda"                # names the results sub-folder
MODEL_PATH = "model/deberta_tuned_eda"      # checkpoint used for tokenizer and model
DATA_PATH = "data/validation_split.csv"     # CSV that gets evaluated
SEEDS = [13, 21, 42]                        # one full evaluation per seed
BATCH_SIZE = 8
MAX_LEN = 128
#####################################

# All artefacts of this run are written below results/<RUN_ID>.
OUT_ROOT = Path(f"results/{RUN_ID}")
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# String label <-> integer class id, in both directions.
label2id = dict(negative=0, neutral=1, positive=2)
id2label = {class_id: name for name, class_id in label2id.items()}

# ---- one-time data prep ---------------------------------------------
# Read the evaluation CSV and replace the string labels with int64 class ids
# (the cast fails loudly if a label is missing from label2id, since map() yields NaN for it).
df = pd.read_csv(DATA_PATH).assign(
    label=lambda frame: frame["label"].map(label2id).astype("int64")
)
ds = Dataset.from_pandas(df)

# Tokenizer is loaded from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize(batch):
    """Tokenize the ``sentence`` entries of a batch, truncating to ``MAX_LEN`` tokens.

    Args:
        batch: Mapping with a ``"sentence"`` entry, as handed over by
            ``ds.map(tokenize, batched=True)``.

    Returns:
        The tokenizer's output for those sentences. No padding is requested
        here; padding is left to the collator at batch time.
    """
    return tokenizer(
        batch["sentence"],
        truncation=True,
        # Honour the configured limit instead of a literal 128, so the "max_len"
        # written to the aggregate report is the value that was actually applied.
        max_length=MAX_LEN,
    )

# Tokenize all rows in batches, then drop the raw text column.
ds = ds.map(tokenize, batched=True).remove_columns(["sentence"])

# Expose only the model inputs and the target, as torch tensors.
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Pad per batch (rounded up to a multiple of 8) instead of to one global length.
collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# No shuffle argument is passed; the misclassification export later indexes `df`
# by position, so it relies on batches arriving in dataset order.
dl = DataLoader(ds, collate_fn=collator, batch_size=BATCH_SIZE)

# ---- load model once; re-seed & eval for each pass ------------------
# The same weights are reused for every seed; only the RNG state changes between passes.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH).to(device)

# Number of stochastic forward passes averaged per seed.
MC_SAMPLES = 3

# ---- Monte-Carlo dropout evaluation, one full run per seed ----------
# For every seed: run MC_SAMPLES stochastic forward passes over the whole loader,
# average the class probabilities, derive the metrics, and write per-seed artefacts
# (metrics.json, confusion_matrix.png, misclassified.csv) under OUT_ROOT/seed_<seed>.
records = []  # one metrics dict per seed; consumed by the aggregation step

# Class ids in ascending order plus their display names, derived from the label
# maps instead of being hard-coded to three classes.
class_ids = sorted(id2label)
class_names = [id2label[i] for i in class_ids]

for seed in SEEDS:
    print('current seed: ', seed)

    tic = time.perf_counter()        # monotonic clock; latency covers all MC passes
    model.train()                    # <<< DROPOUT ACTIVE
    preds_prob_runs = []             # one (N, C) probability tensor per MC pass
    labels_all = []

    for mc in range(MC_SAMPLES):
        print('current mc: ', mc)
        # Deterministic but different dropout mask per pass. Pass 0 seeds with
        # `seed` itself, so no separate per-seed seeding is needed before this loop.
        torch.manual_seed(seed + mc)
        np.random.seed(seed + mc)

        probs_run = []                # collect this pass
        with torch.no_grad():         # no gradients needed, dropout stays active
            for batch in dl:
                batch = {k: v.to(device) for k, v in batch.items()}
                labels = batch.pop("labels")           # (B,) targets; not fed to the model
                logits = model(**batch).logits         # (B, C)
                probs = torch.softmax(logits, dim=-1)  # (B, C)

                probs_run.append(probs.cpu())
                if mc == 0:                            # labels are identical in every pass
                    labels_all.append(labels.cpu())

        preds_prob_runs.append(torch.cat(probs_run))   # (N, C)

    latency = time.perf_counter() - tic
    probs_all = torch.stack(preds_prob_runs, dim=0)    # (MC, N, C)
    probs_mean = probs_all.mean(0)                     # (N, C)

    # Predictive entropy of the averaged distribution. The clamp only affects
    # probabilities that are exactly 0, where 0 * log(0) would otherwise yield NaN.
    tiny = torch.finfo(probs_mean.dtype).tiny
    probs_entropy = -(probs_mean * probs_mean.clamp_min(tiny).log()).sum(-1)  # (N,)

    y_int = torch.cat(labels_all).numpy().astype(int)  # (N,) true class ids
    p_int = probs_mean.argmax(-1).numpy()              # (N,) hard class
    # Expected class index under the averaged distribution (soft prediction).
    p_raw = probs_mean.numpy().dot(np.arange(probs_mean.shape[-1]))

    # ----- metrics -----
    mae = np.abs(p_raw - y_int).mean()
    score = 0.5 * (2 - mae)                      # competition metric
    acc = accuracy_score(y_int, p_int)
    rec = {
        "seed": seed,
        "score":       float(score),
        "mae":         float(mae),
        "accuracy":    float(acc),
        "latency_sec": float(latency),
        "entropy_mean": float(probs_entropy.mean())
    }
    records.append(rec)

    # ----- artefacts -----
    out_dir = OUT_ROOT / f"seed_{seed}"
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "metrics.json", "w") as fh:    # close the handle deterministically
        json.dump(rec, fh, indent=2)

    cm = confusion_matrix(y_int, p_int, labels=class_ids, normalize="true")
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(cm, annot=True, fmt=".2f",
                xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_title(f"{RUN_ID} | seed {seed}")
    fig.tight_layout()
    fig.savefig(out_dir / "confusion_matrix.png", dpi=200)
    plt.close(fig)                                     # avoid piling up open figures

    # Rows whose hard prediction differs from the label. Positional indexing into
    # `df` assumes the loader yielded the rows in their original order.
    wrong = np.where(p_int != y_int)[0]
    # .copy() so the column assignments below write to an independent frame, not a slice of df.
    mis = df.iloc[wrong][["id", "sentence", "label"]].copy()
    mis["pred"] = [id2label[i] for i in p_int[wrong]]
    mis["label"] = mis["label"].map(id2label)
    mis.to_csv(out_dir / "misclassified.csv", index=False)

# ---- aggregate across seeds -----------------------------------------
# Summarise the per-seed records into one dict, save it next to the per-seed
# folders as aggregate.json, and print it.
df_rec = pd.DataFrame(records)

agg = {}
for metric in ("score", "mae", "latency_sec"):
    # Population std (ddof=0) across seeds. float() keeps numpy scalar types
    # out of the printed and serialised summary.
    agg[f"{metric}_mean"] = float(df_rec[metric].mean())
    agg[f"{metric}_std"] = float(df_rec[metric].std(ddof=0))

# Parameter counts in millions, rounded to one decimal.
n_params_total = sum(t.numel() for t in model.parameters())
n_params_trainable = sum(t.numel() for t in model.parameters() if t.requires_grad)
agg.update({
    "params_M_total": round(n_params_total / 1e6, 1),
    "params_M_trainable": round(n_params_trainable / 1e6, 1),
    "batch_size": BATCH_SIZE,
    "max_len": MAX_LEN,
})

with open(OUT_ROOT / "aggregate.json", "w") as fh:   # close the handle deterministically
    json.dump(agg, fh, indent=2)
print(agg)

Map:   0%|          | 0/10210 [00:00<?, ? examples/s]

current seed:  13
current mc:  0
current mc:  1
current mc:  2
current seed:  21
current mc:  0
current mc:  1
current mc:  2
current seed:  42
current mc:  0
current mc:  1
current mc:  2
{'score_mean': np.float64(0.8898685281757358), 'score_std': 0.0009868185118183427, 'mae_mean': np.float64(0.22026294364852841), 'mae_std': 0.001973637023636662, 'latency_sec_mean': np.float64(97.97735110918681), 'latency_sec_std': 0.18685182565637093, 'params_M_total': 184.4, 'params_M_trainable': 184.4, 'batch_size': 8, 'max_len': 128}
