In [5]:
import os
import torch
import random
import pandas as pd
import csv
import warnings
import numpy as np
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel

# =============================
# Config
# =============================
# Identifier of the pretrained encoder passed to the Hugging Face loaders
MODEL_PATH = "atipiqal/bert-italian-cased-civil"

# Training hyper-parameters
EPOCHS, BATCH_SIZE = 15, 16
LR = 2e-5
VAL_SPLIT, SEED = 0.2, 42

# Use the GPU when torch can see one, otherwise stay on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One output directory per classifier; both are created up front
MODEL_DIR_SUP, MODEL_DIR_CAT = "model_out_super", "model_out_cat"
for _out_dir in (MODEL_DIR_SUP, MODEL_DIR_CAT):
    os.makedirs(_out_dir, exist_ok=True)

# Silence FutureWarning messages for the rest of the run
warnings.filterwarnings("ignore", category=FutureWarning)

# =============================
# Load dataset
# =============================
df = pd.read_csv(
    "data/dataset.csv",
    sep=";",
    quoting=csv.QUOTE_MINIMAL,
    encoding="utf-8",
)
# Normalise header names (trim whitespace, upper-case) before selecting columns.
df.columns = [col.strip().upper() for col in df.columns]
# Keep only the text and the two label columns, dropping rows with a missing value.
df = df.loc[:, ["DESCRIZIONE", "SUPERCATEGORIA", "CATEGORIA"]].dropna()

# Label vocabularies in sorted order, plus label<->index lookup tables.
supers = sorted(df["SUPERCATEGORIA"].unique())
cats = sorted(df["CATEGORIA"].unique())
sup2idx = dict(zip(supers, range(len(supers))))
cat2idx = dict(zip(cats, range(len(cats))))
idx2sup = dict(enumerate(supers))
idx2cat = dict(enumerate(cats))

# Dataset classes
class SupDataset(torch.utils.data.Dataset):
    """Dataset yielding (description text, SUPERCATEGORIA index) pairs."""

    def __init__(self, records):
        # Indexable collection of row mappings with "DESCRIZIONE" and
        # "SUPERCATEGORIA" keys.
        self.records = records

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        row = self.records[idx]
        text = row["DESCRIZIONE"]
        # Translate the label string into its integer class index.
        label = sup2idx[row["SUPERCATEGORIA"]]
        return text, label

class CatDataset(torch.utils.data.Dataset):
    """Dataset yielding (description text, CATEGORIA index) pairs."""

    def __init__(self, records):
        # Indexable collection of row mappings with "DESCRIZIONE" and
        # "CATEGORIA" keys.
        self.records = records

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        row = self.records[idx]
        text = row["DESCRIZIONE"]
        # Translate the label string into its integer class index.
        label = cat2idx[row["CATEGORIA"]]
        return text, label

# Seed Python's RNG first so the shuffled record order is the same on every run
random.seed(SEED)
records = df.to_dict(orient="records")
random.shuffle(records)

# =============================
# Model class
# =============================
class ClassifierModel(nn.Module):
    """Pretrained transformer encoder followed by a small MLP classification head.

    The module owns its tokenizer, so ``forward`` takes raw strings and
    returns one row of logits (``nlabels`` values) per input text.

    Args:
        model_path: Identifier or path handed to ``AutoTokenizer`` and
            ``AutoModel`` ``from_pretrained``.
        nlabels: Number of output classes of the head.
    """

    def __init__(self, model_path, nlabels):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.encoder = AutoModel.from_pretrained(model_path, add_pooling_layer=False)
        dim = self.encoder.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(dim, 256), nn.ReLU(), nn.Dropout(0.2), nn.Linear(256, nlabels)
        )

    def forward(self, texts):
        """Tokenize ``texts``, encode them and return the head's logits.

        Args:
            texts: Iterable of strings; a single ``str`` is treated as a
                one-element batch instead of being iterated per character.

        Returns:
            Tensor of logits produced by the head for each input text.
        """
        if isinstance(texts, str):
            texts = [texts]
        # Every input gets the same fixed prefix before tokenization.
        texts = ["descrizione tecnica: " + t for t in texts]
        # Place the token tensors wherever the weights currently live, so the
        # model keeps working after being moved to another device.
        device = next(self.parameters()).device
        tokens = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
        output = self.encoder(**tokens)
        # Hidden state at sequence position 0 (presumably the [CLS] token).
        emb = output.last_hidden_state[:, 0]
        return self.head(emb)

# =============================
# Generic training function
# =============================
def _run_epoch(model, loader, loss_fn=None, opt=None):
    """Run one pass over ``loader`` and return the accuracy of the predictions.

    With ``opt`` given, the pass trains the model: ``loss_fn`` is applied to
    every batch, followed by a backward pass and an optimizer step. Without
    it, the model is switched to eval mode and gradients are disabled.

    Returns:
        Fraction of correctly predicted labels, or 0.0 when the loader
        yields no samples (instead of raising ``ZeroDivisionError``).
    """
    training = opt is not None
    model.train(training)
    seen, hits = 0, 0
    with torch.set_grad_enabled(training):
        for texts, labels in loader:
            if training:
                opt.zero_grad()
            logits = model(texts)
            # Compare on the device the model actually produced its output on.
            labels = labels.to(logits.device)
            if training:
                loss = loss_fn(logits, labels)
                loss.backward()
                opt.step()
            seen += labels.size(0)
            hits += (logits.argmax(dim=1) == labels).sum().item()
    return hits / seen if seen else 0.0


def train_model(model, train_dl, val_dl, loss_fn, model_dir, idx2label):
    """Train ``model`` for ``EPOCHS`` epochs, keeping the best checkpoint.

    After every epoch the train and validation accuracy are printed, and the
    model's ``state_dict`` is saved to ``<model_dir>/best_model.pt`` whenever
    the validation accuracy improves on the best value seen so far.

    Args:
        model: Module called as ``model(texts)`` and returning class logits.
        train_dl: Iterable of ``(texts, labels)`` batches used for training.
        val_dl: Iterable of ``(texts, labels)`` batches used for validation.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning the loss.
        model_dir: Directory receiving ``best_model.pt`` and
            ``label_mappings.txt``; created if missing.
        idx2label: Mapping from class index to label, written one
            ``index: label`` pair per line.
    """
    os.makedirs(model_dir, exist_ok=True)

    # Write the label mappings up front: an interrupted run then still leaves
    # every saved checkpoint with the mapping needed to decode its outputs.
    with open(os.path.join(model_dir, "label_mappings.txt"), "w", encoding="utf-8") as f:
        f.writelines(f"{i}: {lbl}\n" for i, lbl in idx2label.items())

    opt = torch.optim.AdamW(model.parameters(), lr=LR)
    # Start below any reachable accuracy so the first epoch always produces a
    # checkpoint, even when validation accuracy is 0.
    best_acc = -1.0
    for epoch in range(EPOCHS):
        acc_train = _run_epoch(model, train_dl, loss_fn, opt)
        acc_val = _run_epoch(model, val_dl)
        print(f"{model_dir} | Epoch {epoch+1}: Train Acc={acc_train:.4f} | Val Acc={acc_val:.4f}")

        if acc_val > best_acc:
            best_acc = acc_val
            torch.save(model.state_dict(), os.path.join(model_dir, "best_model.pt"))
            print(f"✅ Model saved to {model_dir}")

# =============================
# Train SUPERCATEGORIA model
# =============================
sup_ds = SupDataset(records)
val_size = int(len(sup_ds) * VAL_SPLIT)
train_size = len(sup_ds) - val_size
# random.seed(SEED) does not touch torch's RNG, so the split gets its own
# seeded generator; otherwise the train/val partition differs on every run.
train_sup, val_sup = random_split(
    sup_ds,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)
train_sup_dl = DataLoader(train_sup, batch_size=BATCH_SIZE, shuffle=True)
val_sup_dl = DataLoader(val_sup, batch_size=BATCH_SIZE)

# Inverse-frequency class weights so rare supercategories weigh more in the loss.
sup_counts = torch.tensor(np.bincount([sup2idx[r["SUPERCATEGORIA"]] for r in records]), dtype=torch.float)
weight_sup = 1.0 / sup_counts
loss_sup = nn.CrossEntropyLoss(weight=weight_sup.to(DEVICE))

sup_model = ClassifierModel(MODEL_PATH, len(supers)).to(DEVICE)
train_model(sup_model, train_sup_dl, val_sup_dl, loss_sup, MODEL_DIR_SUP, idx2sup)

# =============================
# Train CATEGORIA model
# =============================
cat_ds = CatDataset(records)
val_size = int(len(cat_ds) * VAL_SPLIT)
train_size = len(cat_ds) - val_size
# random.seed(SEED) does not touch torch's RNG, so the split gets its own
# seeded generator; otherwise the train/val partition differs on every run.
train_cat, val_cat = random_split(
    cat_ds,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SEED),
)
train_cat_dl = DataLoader(train_cat, batch_size=BATCH_SIZE, shuffle=True)
val_cat_dl = DataLoader(val_cat, batch_size=BATCH_SIZE)

# Inverse-frequency class weights so rare categories weigh more in the loss.
cat_counts = torch.tensor(np.bincount([cat2idx[r["CATEGORIA"]] for r in records]), dtype=torch.float)
weight_cat = 1.0 / cat_counts
loss_cat = nn.CrossEntropyLoss(weight=weight_cat.to(DEVICE))

cat_model = ClassifierModel(MODEL_PATH, len(cats)).to(DEVICE)
train_model(cat_model, train_cat_dl, val_cat_dl, loss_cat, MODEL_DIR_CAT, idx2cat)


model_out_super | Epoch 1: Train Acc=0.9131 | Val Acc=0.9883
✅ Model saved to model_out_super
model_out_super | Epoch 2: Train Acc=0.9945 | Val Acc=0.9903
✅ Model saved to model_out_super
model_out_super | Epoch 3: Train Acc=0.9960 | Val Acc=0.9901
model_out_super | Epoch 4: Train Acc=0.9967 | Val Acc=0.9923
✅ Model saved to model_out_super
model_out_super | Epoch 5: Train Acc=0.9989 | Val Acc=0.9909
model_out_super | Epoch 6: Train Acc=0.9979 | Val Acc=0.9896
model_out_super | Epoch 7: Train Acc=0.9972 | Val Acc=0.9916
model_out_super | Epoch 8: Train Acc=0.9994 | Val Acc=0.9929
✅ Model saved to model_out_super
model_out_super | Epoch 9: Train Acc=0.9988 | Val Acc=0.9874
model_out_super | Epoch 10: Train Acc=0.9990 | Val Acc=0.9912
model_out_super | Epoch 11: Train Acc=0.9991 | Val Acc=0.9916
model_out_super | Epoch 12: Train Acc=0.9990 | Val Acc=0.9912
model_out_super | Epoch 13: Train Acc=0.9995 | Val Acc=0.9909
model_out_super | Epoch 14: Train Acc=0.9996 | Val Acc=0.9876
model_out