In [None]:
# -*- coding: utf-8 -*-
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split

import numpy as np
import scipy.sparse as sp
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from src.config import *
from src.utils import load_data
from src.preprocessing import *
from src.nn import *  # CSRDataset, csr_collate


# ------------------------------------------------------------
# Model
# ------------------------------------------------------------
class SparseDeepMLP(nn.Module):
    """
    Sparse input -> dense hidden via torch.sparse.mm, then deeper dense stack.
    AMP OFF (sparse.mm on CUDA doesn't support fp16 addmm_sparse_cuda).
    """
    def __init__(self, in_dim: int, hidden: int, num_classes: int, dropout: float = 0.30):
        super().__init__()
        self.W1 = nn.Parameter(torch.empty(hidden, in_dim))
        self.b1 = nn.Parameter(torch.zeros(hidden))

        self.fc2 = nn.Linear(hidden, hidden)
        self.ln2 = nn.LayerNorm(hidden)

        self.fc3 = nn.Linear(hidden, hidden)
        self.ln3 = nn.LayerNorm(hidden)

        self.fc4 = nn.Linear(hidden, hidden)
        self.ln4 = nn.LayerNorm(hidden)

        self.out = nn.Linear(hidden, num_classes)

        self.drop = nn.Dropout(dropout)
        self.act = nn.GELU()

        nn.init.kaiming_uniform_(self.W1, a=np.sqrt(5))

    def forward(self, X_sparse):
        if X_sparse.is_cuda and X_sparse.dtype != torch.float32:
            X_sparse = X_sparse.float()

        h = torch.sparse.mm(X_sparse, self.W1.t()) + self.b1
        h = self.act(h)
        h = self.drop(h)

        h = self.fc2(h); h = self.ln2(h); h = self.act(h); h = self.drop(h)
        h = self.fc3(h); h = self.ln3(h); h = self.act(h); h = self.drop(h)
        h = self.fc4(h); h = self.ln4(h); h = self.act(h); h = self.drop(h)

        return self.out(h)


# ------------------------------------------------------------
# SoftF1 (macro) + CE (optional, aligns training to macro-F1)
# ------------------------------------------------------------
class SoftF1Loss(nn.Module):
    def __init__(self, num_classes: int, eps: float = 1e-9):
        super().__init__()
        self.C = num_classes
        self.eps = eps

    def forward(self, logits: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        # logits: (B,C), y: (B,)
        p = torch.softmax(logits, dim=1)
        y_one = torch.nn.functional.one_hot(y, num_classes=self.C).float()

        tp = (p * y_one).sum(dim=0)
        fp = (p * (1 - y_one)).sum(dim=0)
        fn = ((1 - p) * y_one).sum(dim=0)

        soft_f1 = (2 * tp + self.eps) / (2 * tp + fp + fn + self.eps)
        return 1.0 - soft_f1.mean()


class CombinedLoss(nn.Module):
    def __init__(self, ce: nn.Module, softf1: nn.Module, alpha: float = 0.8):
        super().__init__()
        self.ce = ce
        self.softf1 = softf1
        self.alpha = alpha

    def forward(self, logits, y):
        return self.alpha * self.ce(logits, y) + (1.0 - self.alpha) * self.softf1(logits, y)


# ----------------------------
# Utils: predict
# ----------------------------
def predict_proba_csr(model: nn.Module, X_csr: sp.csr_matrix, device, batch_size=4096,
                     log_priors: torch.Tensor | None = None, tau: float = 0.0):
    model.eval()
    dummy_y = np.zeros(X_csr.shape[0], dtype=np.int64)
    ds = CSRDataset(X_csr, dummy_y)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False,
                    collate_fn=lambda b: csr_collate(b, device=device))

    probs = []
    with torch.no_grad():
        for Xb, _ in dl:
            Xb = Xb.float()
            logits = model(Xb)
            if log_priors is not None and tau != 0.0:
                logits = logits + tau * log_priors
            probs.append(torch.softmax(logits, dim=1).detach().cpu().numpy())
    return np.vstack(probs)


def predict_labels_from_df(model: nn.Module, preprocess, X_df, device, batch_size=4096,
                           log_priors: torch.Tensor | None = None, tau: float = 0.0):
    X_csr = preprocess.transform(X_df)
    proba = predict_proba_csr(model, X_csr, device=device, batch_size=batch_size, log_priors=log_priors, tau=tau)
    pred = proba.argmax(axis=1)
    return pred, proba


# ----------------------------
# Main
# ----------------------------
news_df = load_data(DEVELOPMENT_PATH)
X = news_df.drop(columns=["y"])
y = news_df["y"]

X_train_df, X_val_df, y_train_s, y_val_s = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SEED
)

y_train_np = y_train_s.to_numpy(dtype=np.int64)
y_val_np   = y_val_s.to_numpy(dtype=np.int64)

preprocess = build_preprocess("nn")
X_train_csr = preprocess.fit_transform(X_train_df)
X_val_csr   = preprocess.transform(X_val_df)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

train_ds = CSRDataset(X_train_csr, y_train_np)
val_ds   = CSRDataset(X_val_csr,   y_val_np)

# ---- memory-safe training (accumulation) ----
batch_size_train = 256
accum_steps = 4  # effective batch = 1024
train_dl = DataLoader(
    train_ds,
    batch_size=batch_size_train,
    shuffle=True,
    collate_fn=lambda b: csr_collate(b, device=device),
)
val_dl = DataLoader(
    val_ds,
    batch_size=1024,
    shuffle=False,
    collate_fn=lambda b: csr_collate(b, device=device),
)

in_dim = X_train_csr.shape[1]
num_classes = 7
u = np.unique(y_train_np)
assert u.min() == 0 and u.max() == num_classes - 1 and len(u) == num_classes, u

# ---- class weights (clipped) ----
counts = np.bincount(y_train_np, minlength=num_classes)
class_weights = counts.sum() / (num_classes * np.maximum(counts, 1))
class_weights = np.clip(class_weights, 0.5, 5.0)
class_weights = torch.tensor(class_weights, dtype=torch.float32, device=device)

# ---- logit adjustment ----
priors = counts / counts.sum()
log_priors = torch.log(torch.tensor(priors, device=device, dtype=torch.float32) + 1e-12)
tau = 0.7  # 0.5 / 0.7 / 1.0 (0.7 spesso piÃ¹ stabile di 1.0)

# ---- model ----
hidden = 1536
dropout = 0.30  # more regularization (you were overfitting)
model = SparseDeepMLP(in_dim=in_dim, hidden=hidden, num_classes=num_classes, dropout=dropout).to(device)

# ---- optimizer + schedule (less aggressive) ----
epochs = 60
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=5e-4)

scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=3e-4,
    epochs=epochs,
    steps_per_epoch=max(1, len(train_dl) // accum_steps),
    pct_start=0.10,
    div_factor=40.0,
    final_div_factor=300.0,
)

# ---- loss aligned to macro-F1 ----
USE_SOFTF1 = True
if USE_SOFTF1:
    ce = nn.CrossEntropyLoss(weight=class_weights)
    softf1 = SoftF1Loss(num_classes=num_classes)
    loss_fn = CombinedLoss(ce, softf1, alpha=0.85)  # closer to CE but nudges macro-F1
else:
    loss_fn = nn.CrossEntropyLoss(weight=class_weights)

best_f1 = -1.0
best_state = None

for epoch in range(1, epochs + 1):
    model.train()
    total_loss = 0.0
    optimizer.zero_grad(set_to_none=True)

    for step, (Xb, yb) in enumerate(train_dl, start=1):
        Xb = Xb.float()

        logits = model(Xb)
        logits = logits + tau * log_priors

        loss = loss_fn(logits, yb) / accum_steps
        loss.backward()
        total_loss += loss.item() * accum_steps

        if step % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad(set_to_none=True)

    if len(train_dl) % accum_steps != 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad(set_to_none=True)

    # ---- validation ----
    model.eval()
    all_pred, all_true = [], []
    with torch.no_grad():
        for Xb, yb in val_dl:
            Xb = Xb.float()
            logits = model(Xb)
            logits = logits + tau * log_priors
            pred = torch.argmax(logits, dim=1)
            all_pred.append(pred.detach().cpu().numpy())
            all_true.append(yb.detach().cpu().numpy())

    all_pred = np.concatenate(all_pred)
    all_true = np.concatenate(all_true)

    f1m = f1_score(all_true, all_pred, average="macro")
    f1_per_class = f1_score(all_true, all_pred, average=None)
    avg_loss = total_loss / max(1, len(train_dl))

    if f1m > best_f1:
        best_f1 = f1m
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    print(
        f"Epoch {epoch:02d}/{epochs} | loss={avg_loss:.4f} | "
        f"val_f1_macro={f1m:.4f} | best={best_f1:.4f} | "
        f"per_class={np.round(f1_per_class, 3)}"
    )

# restore best
if best_state is not None:
    model.load_state_dict(best_state)
    model.to(device)
    model.eval()

print(f"\nBest val_f1_macro={best_f1:.4f}\n")

# More detailed report on best model (optional)
with torch.no_grad():
    proba_val = predict_proba_csr(model, X_val_csr, device=device, batch_size=4096,
                                 log_priors=log_priors, tau=tau)
pred_val = proba_val.argmax(axis=1)
print(classification_report(y_val_np, pred_val, digits=4))

# ----------------------------
# Predict on another dataset
# ----------------------------
# other_df = load_data(OTHER_PATH)
# X_other = other_df.drop(columns=["y"], errors="ignore")
# pred_other, proba_other = predict_labels_from_df(
#     model, preprocess, X_other, device=device, batch_size=4096,
#     log_priors=log_priors, tau=tau
# )
# print(pred_other[:20], proba_other.shape)


Device: cuda
Epoch 01/60 | loss=1.7662 | val_f1_macro=0.3355 | best=0.3355 | per_class=[0.408 0.4   0.592 0.018 0.34  0.362 0.228]
Epoch 02/60 | loss=1.1964 | val_f1_macro=0.6968 | best=0.6968 | per_class=[0.687 0.783 0.838 0.53  0.862 0.541 0.636]
Epoch 03/60 | loss=0.5394 | val_f1_macro=0.7037 | best=0.7037 | per_class=[0.718 0.787 0.834 0.571 0.863 0.544 0.609]
Epoch 04/60 | loss=0.3247 | val_f1_macro=0.7021 | best=0.7037 | per_class=[0.736 0.773 0.827 0.529 0.846 0.537 0.666]
Epoch 05/60 | loss=0.2347 | val_f1_macro=0.6839 | best=0.7037 | per_class=[0.71  0.752 0.817 0.53  0.833 0.532 0.612]
Epoch 06/60 | loss=0.1856 | val_f1_macro=0.6773 | best=0.7037 | per_class=[0.722 0.741 0.812 0.519 0.821 0.513 0.612]
Epoch 07/60 | loss=0.1408 | val_f1_macro=0.6805 | best=0.7037 | per_class=[0.734 0.742 0.818 0.513 0.828 0.518 0.611]
Epoch 08/60 | loss=0.1005 | val_f1_macro=0.6751 | best=0.7037 | per_class=[0.717 0.737 0.796 0.519 0.829 0.524 0.605]
Epoch 09/60 | loss=0.0824 | val_f1_macro=0.

In [8]:
X_test = pd.read_csv(EVALUATION_PATH, index_col=0, na_values='\\N')
X_test = initial_prep(X_test, False)
X_other = X_test
pred_other, proba_other = predict_labels_from_df(
    model, preprocess, X_other, device=device, batch_size=4096,
    log_priors=log_priors, tau=tau
)
print(pred_other[:20], proba_other.shape)

[5 2 5 5 5 2 3 2 4 5 0 1 4 2 5 2 1 5 0 6] (20000, 7)


In [4]:
pred_other

array([5, 5, 0, ..., 0, 5, 0], shape=(76611,))

In [9]:
submission_df = pd.DataFrame(
{
    "Id": range(0,20000),
    "Predicted": pred_other
},
index=range(0,20000)
)
submission_df.to_csv('prova.csv', index=False)

