# 0. Imports & Setup

In [None]:
###############################################################################
# 0) Imports & Setup
###############################################################################
import random
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.nn.utils.rnn import pad_sequence

import optuna

import warnings
warnings.filterwarnings("ignore")

# Seed the Python, NumPy and PyTorch random generators with one shared value
# so that shuffles, splits and weight initialisation are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [2]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    For each sample the loss is ``alpha[t] * (1 - p_t) ** gamma * (-log p_t)``
    where ``p_t`` is the softmax probability of the true class ``t``.

    Args:
        alpha: Optional 1-D tensor of per-class weights (one entry per class).
            ``None`` disables class weighting.
        gamma: Focusing exponent; ``0`` reduces the loss to (weighted)
            cross-entropy.
        reduction: ``'mean'`` (plain average over samples), ``'sum'``, or any
            other value to get the unreduced per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha  # Class weights (Tensor of size V)
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for logits ``inputs`` and class ids ``targets``."""
        # The cross-entropy must be UNWEIGHTED here: exp(-ce) is only the
        # true-class probability when ce == -log(p_t). Applying the class
        # weight before the exp would distort the focusing factor.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = ((1 - pt) ** self.gamma) * ce_loss

        if self.alpha is not None:
            # Follow the logits' device/dtype so a CPU weight tensor still
            # works when the model runs on GPU.
            alpha = self.alpha.to(device=inputs.device, dtype=focal_loss.dtype)
            focal_loss = alpha[targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# 1. Global Parameters

In [2]:
###############################################################################
# 1) Global Parameters
###############################################################################
# Path of the CSV file read by the loading and inference cells.
PATH_CSV = '*****************'

MANDATORY_FEATURES = ['*****************', '*****************']  # ➔ Mandatory features (always loaded from the CSV)

CAT_FEATURES = ['*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************']         # ➔ Categorical features (label-encoded, then embedded)

NUM_FEATURES = ['*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************']         # ➔ Numerical features (standard-scaled); mutated in place by the preprocessing cell

# Single-element list holding the target column name; it is rewritten to the
# encoded column name ('<name>_enc') after label encoding.
TARGET       = ['*****************']

# NOTE(review): not referenced anywhere in the visible part of the file.
NOT_CONSIDERED = ['*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
                '*****************', '*****************', '*****************']

MIN_SEQ = 3         # Minimum history length: clients with fewer rows are dropped and shorter prefixes produce no sample
SEQ_LEN = 15        # Length of a sequence (for padding) — NOTE(review): unused in the visible code
MAX_SEQ = 20        # Maximum number of most recent steps kept per sample

SPLIT = 0.9         # Share of clients assigned to the training set (the rest go to test)

BATCH_SIZE = 32
EMB_DIM = 32        # NOTE(review): unused in the visible code; embedding sizes come from the vocabulary-size heuristic

N_EPOCHS = 1000     # NOTE(review): unused in the visible code

EARLY_STOPPING = 5  # Number of consecutive epochs without Hit@3 improvement before training stops

# 2. Loading and Preprocessing

In [4]:
###############################################################################
# 2) Loading and Preprocessing
###############################################################################

# Load the dataset (only the columns declared in the global feature lists)
df = pd.read_csv(PATH_CSV, usecols=CAT_FEATURES+NUM_FEATURES+MANDATORY_FEATURES+TARGET)

# Delete accessories
df = df[df['*****************']!='*****************']

# Delete not considered machines
other = ['*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
         '*****************', '*****************', '*****************', '*****************', '*****************', '*****************',
         '*****************', '*****************', '*****************']
df = df[~df['*****************'].isin(other)]


# Replace rare machines with 'Other': every code seen fewer than min_freq
# times is collapsed into one bucket.
min_freq = 100
code_counts = df['*****************'].value_counts()
rare_codes = code_counts[code_counts < min_freq].index
df['*****************'] = df['*****************'].apply(lambda x: x if x not in rare_codes else 'Other')

# Delete clients with fewer than MIN_SEQ rows ('Count' is a temporary helper column)
df['Count'] = df.groupby('*****************')['*****************'].transform('count')
df = df[df['Count'] >= MIN_SEQ].drop(columns=['Count'])

# Date conversion and sorting
df['*****************'] = pd.to_datetime(df['*****************'])
df = df.sort_values(['*****************', '*****************'])

# ----------------PREPROCESSING----------------
if '*****************' in df.columns:
    # Create an age_machine feature: difference between two dates expressed in
    # years (days / 365.25); unparseable dates become NaT and the resulting
    # gaps are filled with the column median.
    df['*****************'] = pd.to_datetime(df['*****************'], errors='coerce')
    df['*****************'] = (df['*****************'] - df['*****************']).dt.days / 365.25
    df['*****************'] = df['*****************'].fillna(df['*****************'].median())
    # NOTE(review): NUM_FEATURES is mutated in place; re-running this cell
    # appends again and list.remove() raises ValueError once the name is gone.
    NUM_FEATURES.append('*****************')
    NUM_FEATURES.remove('*****************')

if '*****************' in df.columns:
    # Binary missing-value indicator (1 where the source value is null)
    df['*****************'] = df['*****************'].isnull().astype(int)
    NUM_FEATURES.append('*****************')

# Columns compressed with log1p after replacing missing values by 0
l = ['*****************', '*****************', '*****************', '*****************']

for col in l:
    if col in df.columns:
        df[col] = np.log1p(df[col].fillna(0))

# Two more log1p-transformed columns, each registered as a numerical feature
if '*****************' in df.columns:
    df['*****************'] = np.log1p(df['*****************'].fillna(0))
    NUM_FEATURES += ['*****************']
if '*****************' in df.columns:
    df['*****************'] = np.log1p(df['*****************'].fillna(0))
    NUM_FEATURES += ['*****************']
# ------------------------------------------------------------------

In [6]:
# Label encoding: every categorical feature and the target get integer codes
# starting at 1, so that 0 stays free for sequence padding.
LabelToIdx = {}
for feat in CAT_FEATURES + TARGET:
    enc = LabelEncoder()
    df[f"{feat}_enc"] = enc.fit_transform(df[feat]) + 1
    # Same 1-based mapping as the encoded column, kept to decode predictions later
    LabelToIdx[feat] = {cls: idx+1 for idx, cls in enumerate(enc.classes_)}
    df.drop(columns=[feat], inplace=True)

vocab_sizes = {
    feat: len(LabelToIdx[feat]) + 1  # +1 for padding
    for feat in CAT_FEATURES + TARGET
}

# Embedding width chosen from the vocabulary size (thresholds listed below)
emb_dims = {}
for feature in CAT_FEATURES + TARGET:
    if vocab_sizes[feature] < 100:
        emb_dims[feature] = 8
    elif vocab_sizes[feature] < 1000:
        emb_dims[feature] = 32
    else:
        emb_dims[feature] = 64

# Number of output classes, including the reserved index 0
target_vocab_size = vocab_sizes[TARGET[0]]

# From here on TARGET refers to the encoded column
TARGET = [TARGET[0] + '_enc']

# Embedding sizes applied by the loop above:
#   - vocabulary size < 100          -> 8 dimensions
#   - 100 <= vocabulary size < 1000  -> 32 dimensions
#   - vocabulary size >= 1000        -> 64 dimensions

# Scaling Numerical Features (one scaler per column, result stored in '<feat>_enc')
# NOTE(review): the scalers are fitted on the full dataframe before the
# train/test split, so test-set statistics leak into the scaled features.
for feat in NUM_FEATURES:
    scaler = StandardScaler()
    df[f"{feat}_enc"] = scaler.fit_transform(df[[feat]])

# 3. Dataset and DataLoader

In [7]:
###############################################################################
# 3) Dataset and DataLoader
###############################################################################

# a) Preparing the DataFrame for LSTM
# Work on a copy, sort by the two key columns, then collapse each group into a
# single row whose cells are Python lists holding that group's values in
# sorted order. One column is dropped afterwards because it is not a model
# input.
df_lstm = df.copy()
df_lstm.sort_values(['*****************', '*****************'], inplace=True)
df_lstm = df_lstm.groupby('*****************').agg(list).reset_index()
df_lstm.drop(columns=['*****************'], inplace=True)

# b) Custom dataset with trimming at MIN_SEQ and padding to MAX_SEQ
class MachineDataset(Dataset):
    """Next-item samples built from per-client list columns.

    For each row of ``df_lstm`` and each cut position ``i`` with
    ``max(1, min_seq) <= i < len(row[target_col])`` one sample is created:
    the inputs are the first ``i`` values of every ``<feature>_enc`` column,
    truncated to the last ``max_seq`` steps, and the label is the target
    value at position ``i``.

    Args:
        df_lstm: DataFrame whose cells are lists (one row per client).
        cat_features: Categorical feature names (stored as ``torch.long``).
        num_features: Numerical feature names (stored as ``torch.float``).
        target_col: Name of the column holding the encoded target list.
        max_seq: Maximum number of most recent steps kept per sample.
        min_seq: Minimum history length required to emit a sample.
    """

    def __init__(self, df_lstm, cat_features, num_features, target_col, max_seq, min_seq):
        self.samples = []
        self.cat_features = cat_features
        self.num_features = num_features
        self.target_col = target_col
        self.max_seq = max_seq
        self.min_seq = min_seq

        # Categorical features first, numerical second: same fill order as
        # the feature order returned by __getitem__.
        typed_features = [(name, torch.long) for name in self.cat_features]
        typed_features += [(name, torch.float) for name in self.num_features]

        for _, client in df_lstm.iterrows():
            labels = client[target_col]
            for cut in range(1, len(labels)):
                if cut >= min_seq:
                    self.samples.append(
                        self._make_sample(client, labels, cut, typed_features)
                    )

    def _make_sample(self, client, labels, cut, typed_features):
        """Build one sample: history before ``cut`` and the label at ``cut``."""
        sample = {}
        for name, dtype in typed_features:
            history = client[name + '_enc'][:cut]
            sample[name] = torch.tensor(history[-self.max_seq:], dtype=dtype)
        sample['target'] = torch.tensor(labels[cut], dtype=torch.long)
        return sample

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        record = self.samples[idx]
        features = {}
        for name in self.cat_features + self.num_features:
            features[name] = record[name]
        return features, record['target']

# c) Collate Function - to deal with variable-length sequences
def collate_fn(batch):
    """Collate ``(features, target)`` pairs of variable length into a batch.

    Every feature is right-padded to the longest sequence in the batch
    (0 for integer tensors, 0.0 otherwise) and the targets are stacked.

    Args:
        batch: List of ``(dict[str, Tensor], Tensor)`` items.

    Returns:
        ``(features, targets)`` where ``features`` maps each feature name to a
        ``[batch, max_len]`` tensor and ``targets`` is a ``[batch]`` tensor.
    """
    feature_names = batch[0][0].keys()
    inputs = [x for x, _ in batch]
    labels = [y for _, y in batch]

    padded = {}
    for name in feature_names:
        sequences = [x[name] for x in inputs]
        fill = 0 if sequences[0].dtype == torch.long else 0.0
        padded[name] = pad_sequence(sequences, batch_first=True, padding_value=fill)

    return padded, torch.stack(labels)


# d) Train/Test Split
# The split is done on client ids, not on rows, so all sequences of a client
# end up on the same side. The shuffle is in place and uses NumPy's global
# RNG (seeded in the setup cell).
# NOTE(review): the 'IdClient' column name is hard-coded here — confirm it is
# the key produced by the groupby above.
clients = df_lstm['IdClient'].unique()
np.random.shuffle(clients)
n_train = int(SPLIT * len(clients))
train_clients = set(clients[:n_train])
test_clients  = set(clients[n_train:])

# e) Creating train and test datasets
df_train = df_lstm[df_lstm['IdClient'].isin(train_clients)].reset_index(drop=True)
df_test  = df_lstm[df_lstm['IdClient'].isin(test_clients)].reset_index(drop=True)

train_dataset = MachineDataset(df_train, cat_features=CAT_FEATURES, num_features=NUM_FEATURES, target_col=TARGET[0], max_seq=MAX_SEQ, min_seq=MIN_SEQ)
test_dataset  = MachineDataset(df_test,  cat_features=CAT_FEATURES, num_features=NUM_FEATURES, target_col=TARGET[0], max_seq=MAX_SEQ, min_seq=MIN_SEQ)

# f) Creating DataLoaders (default BATCH_SIZE / MAX_SEQ; the tuned versions
# are rebuilt later with the best hyperparameters)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# 4.Models

In [8]:
###############################################################################
# 4.1) LSTM
###############################################################################
class LSTMRec(nn.Module):
    """Bidirectional LSTM with attention pooling for next-item classification.

    Categorical features are embedded and concatenated with the numerical
    features along the last axis, then fed to a bidirectional LSTM. The
    per-step outputs are pooled with a learned softmax attention,
    layer-normalised, passed through dropout and projected to
    ``target_size`` logits.

    Padding handling: sequences are expected to be right-padded, with
    categorical code 0 used for padding only (this is how ``collate_fn`` and
    the label encoding in this file build them). Padded steps are excluded
    from the recurrence (packed sequences) and from the attention softmax, so
    the output does not depend on how much padding a batch added. Without
    categorical features there is no padding marker and every step is treated
    as real.

    Args:
        cat_features: Names of the categorical inputs (keys of ``x``).
        num_features: Names of the numerical inputs (keys of ``x``).
        emb_dims: ``{feature: (vocab_size, embedding_dim)}`` for each
            categorical feature.
        hidden_dim: Hidden size of each LSTM direction.
        target_size: Number of output classes.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout rate of the head; also used between LSTM layers when
            ``num_layers > 1``.
    """

    def __init__(self,
                 cat_features,
                 num_features,
                 emb_dims,          # dict {feat: (vocab_size, emb_dim)}
                 hidden_dim,
                 target_size,       # |V_target|
                 num_layers=1,
                 dropout=0.5):
        super().__init__()
        self.cat_features = cat_features
        self.num_features = num_features

        # ── 1) Embedding for categorical features ───────────────────
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(
                num_embeddings=emb_dims[feat][0],
                embedding_dim=emb_dims[feat][1]
            )
            for feat in cat_features
        })

        # ── 2) Input size of the LSTM ───────────────────────────────
        total_emb_dim = sum(emb_dims[f][1] for f in cat_features)
        num_feat_dim  = len(num_features)    # one scalar per numerical feature and step
        lstm_input_dim = total_emb_dim + num_feat_dim

        # ── 3) Bidirectional LSTM ───────────────────────────────────
        self.lstm = nn.LSTM(
            input_size   = lstm_input_dim,
            hidden_size  = hidden_dim,
            num_layers   = num_layers,
            batch_first  = True,
            dropout      = dropout if num_layers > 1 else 0.0,  # inter-layer dropout needs >1 layer
            bidirectional= True
        )

        # ── 4) Attention layer ──────────────────────────────────────
        self.attn_proj = nn.Linear(hidden_dim * 2, 1, bias=False)

        # ── 5) Classification head ──────────────────────────────────
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)
        self.dropout    = nn.Dropout(dropout)
        self.fc         = nn.Linear(hidden_dim * 2, target_size)

    # --------------------------------------------------------------------- #
    def forward(self, x):
        """Return ``[B, target_size]`` logits for a dict of ``[B, T]`` tensors."""
        # 1. Embedded categorical features and stacked numerical features
        parts = []
        if self.cat_features:
            cat_embeds = [self.embeddings[f](x[f]) for f in self.cat_features]
            parts.append(torch.cat(cat_embeds, dim=-1))                     # [B, T, Σemb]
        if self.num_features:
            parts.append(torch.stack([x[f] for f in self.num_features], dim=-1))  # [B, T, n_num]
        lstm_in = torch.cat(parts, dim=-1)                                  # [B, T, D_in]

        # 2. True length of each right-padded sequence
        batch_size, steps = lstm_in.shape[:2]
        if self.cat_features:
            # clamp keeps an all-padding row packable and its softmax finite
            lengths = (x[self.cat_features[0]] != 0).sum(dim=1).clamp(min=1)
        else:
            lengths = torch.full((batch_size,), steps, dtype=torch.long,
                                 device=lstm_in.device)
        valid = torch.arange(steps, device=lstm_in.device).unsqueeze(0) < lengths.unsqueeze(1)

        # 3. LSTM on packed sequences so the backward direction starts at the
        #    last real step instead of running through the padding first
        packed = nn.utils.rnn.pack_padded_sequence(
            lstm_in, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.lstm(packed)
        H, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=steps)               # [B, T, 2*H]

        # 4. Attention : score → weight → context (padded steps get weight 0)
        scores  = self.attn_proj(H)                                         # [B, T, 1]
        scores  = scores.masked_fill(~valid.unsqueeze(-1), float('-inf'))
        weights = torch.softmax(scores, dim=1)                              # [B, T, 1]
        ctx     = (weights * H).sum(dim=1)                                  # [B, 2*H]

        # 5. Norm + Dropout + FC
        ctx = self.layer_norm(ctx)
        out = self.dropout(ctx)
        return self.fc(out)                                                 # [B, |V|]

In [9]:
###############################################################################
# 4.2) GRU4Rec
###############################################################################
class GRU4Rec(nn.Module):
    """Bidirectional GRU with attention pooling for next-item classification.

    Categorical features are embedded and concatenated with the numerical
    features along the last axis, then fed to a bidirectional GRU. The
    per-step outputs are pooled with a learned softmax attention,
    layer-normalised, passed through dropout and projected to
    ``target_size`` logits.

    Padding handling: sequences are expected to be right-padded, with
    categorical code 0 used for padding only (this is how ``collate_fn`` and
    the label encoding in this file build them). Padded steps are excluded
    from the recurrence (packed sequences) and from the attention softmax, so
    the output does not depend on how much padding a batch added. Without
    categorical features there is no padding marker and every step is treated
    as real.

    Args:
        cat_features: Names of the categorical inputs (keys of ``x``).
        num_features: Names of the numerical inputs (keys of ``x``).
        emb_dims: ``{feature: (vocab_size, embedding_dim)}`` for each
            categorical feature.
        hidden_dim: Hidden size of each GRU direction.
        target_size: Number of output classes.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout rate of the head; also used between GRU layers when
            ``num_layers > 1``.
    """

    def __init__(self, cat_features, num_features, emb_dims, hidden_dim, target_size, num_layers, dropout):
        super().__init__()
        self.cat_features = cat_features
        self.num_features = num_features

        # Embeddings for categorical features
        self.embeddings = nn.ModuleDict({
            feat: nn.Embedding(
                num_embeddings=emb_dims[feat][0],
                embedding_dim=emb_dims[feat][1]
            ) for feat in cat_features
        })

        # Dimensions: all embeddings side by side plus one scalar per numerical feature
        total_emb_dim = sum(emb_dims[f][1] for f in cat_features)
        num_feat_dim = len(num_features)
        input_dim = total_emb_dim + num_feat_dim

        # Bidirectional GRU
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0,  # inter-layer dropout needs >1 layer
            bidirectional=True
        )

        # Attention projection layer
        self.attn_proj = nn.Linear(hidden_dim * 2, 1, bias=False)

        # Dropout & final classifier
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, target_size)

        # Layer normalization applied to the context vector
        self.layer_norm = nn.LayerNorm(hidden_dim * 2)

    def forward(self, x):
        """Return ``[batch, target_size]`` logits for a dict of ``[batch, seq_len]`` tensors."""
        # Embedded categorical inputs and stacked numerical features
        parts = []
        if self.cat_features:
            cat_embeds = [self.embeddings[feat](x[feat]) for feat in self.cat_features]
            parts.append(torch.cat(cat_embeds, dim=-1))
        if self.num_features:
            parts.append(torch.stack([x[feat] for feat in self.num_features], dim=-1))
        rnn_input = torch.cat(parts, dim=-1)  # [batch, seq_len, input_dim]

        # True length of each right-padded sequence
        batch_size, steps = rnn_input.shape[:2]
        if self.cat_features:
            # clamp keeps an all-padding row packable and its softmax finite
            lengths = (x[self.cat_features[0]] != 0).sum(dim=1).clamp(min=1)
        else:
            lengths = torch.full((batch_size,), steps, dtype=torch.long,
                                 device=rnn_input.device)
        valid = torch.arange(steps, device=rnn_input.device).unsqueeze(0) < lengths.unsqueeze(1)

        # GRU on packed sequences so the backward direction starts at the last
        # real step instead of running through the padding first
        packed = nn.utils.rnn.pack_padded_sequence(
            rnn_input, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.gru(packed)
        H, _ = nn.utils.rnn.pad_packed_sequence(
            packed_out, batch_first=True, total_length=steps)  # [batch, seq_len, hidden_dim*2]

        # Attention weights (padded steps get weight 0)
        scores = self.attn_proj(H)                                       # [batch, seq_len, 1]
        scores = scores.masked_fill(~valid.unsqueeze(-1), float('-inf'))
        weights = torch.softmax(scores, dim=1)                           # [batch, seq_len, 1]

        # Weighted sum to compute the context vector
        context = (weights * H).sum(dim=1)    # [batch, hidden_dim*2]
        context = self.layer_norm(context)

        # Dropout and final classification
        out = self.dropout(context)
        return self.fc(out)

# 5.Hyperparameters (KFold CrossVal)

In [10]:
def cv_hit3(model_class, model_kwargs,
            df_lstm, cat_features, num_features, target_col,
            max_seq, min_seq, k_folds, batch_size, num_epochs, lr):
    """Estimate Hit@3 with client-level K-fold cross-validation.

    Clients are shuffled once and split into ``k_folds`` folds. For every fold
    a fresh model is trained with class-weighted sampling and focal loss, and
    evaluated on the held-out clients after each epoch. Training stops after
    ``EARLY_STOPPING`` consecutive epochs without Hit@3 improvement.

    Args:
        model_class: Model constructor (e.g. ``GRU4Rec``).
        model_kwargs: Keyword arguments for ``model_class``; ``target_size``
            (when present) fixes the length of the class-weight vector.
        df_lstm: One row per client, list-valued columns.
        cat_features, num_features: Feature names given to ``MachineDataset``.
        target_col: Encoded target column name.
        max_seq, min_seq: Sequence bounds given to ``MachineDataset``.
        k_folds: Number of folds.
        batch_size: Batch size of both loaders.
        num_epochs: Maximum number of epochs per fold.
        lr: AdamW learning rate.

    Returns:
        Mean over folds of the best validation Hit@3, as a float.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pin = device.type == "cuda"  # pinned host memory only helps CUDA transfers

    clients = np.random.permutation(df_lstm["*****************"].unique())
    kf = KFold(n_splits=k_folds, shuffle=False)  # clients are already shuffled above

    hit3_scores = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(clients), start=1):

        print(f"\n---- Fold: {fold}/{k_folds}")

        train_cli, val_cli = set(clients[train_idx]), set(clients[val_idx])
        df_tr = df_lstm[df_lstm["*****************"].isin(train_cli)]
        df_va = df_lstm[df_lstm["*****************"].isin(val_cli)]

        # 2. Class Weights (inverse log-frequency of the training labels)
        all_labels = np.fromiter((lbl for seq in df_tr[target_col] for lbl in seq[min_seq:]), dtype=np.int64)
        # The weight vector needs one entry per output class, even when the
        # highest class ids do not occur in this fold; otherwise the loss
        # rejects it (or indexing it fails).
        n_classes = int(model_kwargs.get("target_size", 0))
        if all_labels.size:
            n_classes = max(n_classes, int(all_labels.max()) + 1)
        counts = np.bincount(all_labels, minlength=n_classes)
        weights = 1.0 / np.log1p(np.where(counts == 0, 1, counts))
        class_w = torch.tensor(weights, dtype=torch.float32)

        # 3. Datasets & DataLoaders
        tr_ds = MachineDataset(df_tr, cat_features, num_features, target_col, max_seq, min_seq)
        va_ds = MachineDataset(df_va, cat_features, num_features, target_col, max_seq, min_seq)

        # Explicit index tensor: one class weight per training sample
        train_targets = torch.tensor([int(s['target']) for s in tr_ds.samples], dtype=torch.long)
        samp_w = class_w[train_targets].double()
        train_ld = DataLoader(tr_ds, batch_size=batch_size, sampler=WeightedRandomSampler(samp_w, len(samp_w), replacement=True), collate_fn=collate_fn, num_workers=0, pin_memory=pin)
        val_ld = DataLoader(va_ds, batch_size=batch_size, shuffle=False, collate_fn=collate_fn, num_workers=0, pin_memory=pin)

        # 4. Model and Training Setup
        model = model_class(**model_kwargs).to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
        # mode='max' because the monitored quantity (Hit@3) should increase
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optim, mode='max', factor=0.2, patience=1, min_lr=1e-6)
        criterion = FocalLoss(alpha=class_w.to(device), gamma=2.0)

        best_hit, no_improve = 0.0, 0  # early stopping variables

        for epoch in range(num_epochs):

            # --- Training ---
            model.train()
            for feats, tgt in train_ld:
                feats = {k: v.to(device) for k, v in feats.items()}
                tgt = tgt.to(device)

                optim.zero_grad()
                loss = criterion(model(feats), tgt)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optim.step()

            # --- Validation ---
            model.eval()
            correct, total = 0, 0
            with torch.no_grad():
                for feats, tgt in val_ld:
                    feats = {k: v.to(device) for k, v in feats.items()}
                    tgt = tgt.to(device)
                    logits = model(feats)
                    # topk cannot ask for more entries than there are classes
                    top3 = logits.topk(min(3, logits.size(1)), dim=1).indices
                    correct += (top3 == tgt.unsqueeze(1)).any(1).sum().item()
                    total += tgt.size(0)

            # An empty validation fold scores 0 instead of dividing by zero
            hit_val = correct / total if total else 0.0
            scheduler.step(hit_val)

            if hit_val > best_hit:
                best_hit = hit_val
                no_improve = 0
                print(f"Epoch: {epoch + 1}/{num_epochs}")
            else:
                no_improve += 1
                print(f"Epoch: {epoch + 1}/{num_epochs} - No Improve: {no_improve}")
                if no_improve >= EARLY_STOPPING:
                    break  # early stop

        hit3_scores.append(best_hit)

    return float(np.mean(hit3_scores))

In [11]:
###############################################################################
# 5) Tuning with Optuna
###############################################################################
# ── 1)  Utility to set emb_dims ───────────────────────────────────────
def suggest_emb_dims(vocab_sizes):
    """Map each feature to an embedding width chosen from its vocabulary size.

    Sizes below 100 get 8 dimensions, sizes below 1000 get 32, anything else
    gets 64.

    Args:
        vocab_sizes: ``{feature: vocabulary size}``.

    Returns:
        ``{feature: embedding dimension}`` with the same keys and order.
    """
    def _width(size):
        if size < 100:
            return 8
        if size < 1000:
            return 32
        return 64

    return {feat: _width(size) for feat, size in vocab_sizes.items()}


# ── 2) tuning with Optuna ────────────────────────────────────────────────────
def tune_model_with_optuna(model_tag, model_class, df_lstm, vocab_sizes, cat_features, num_features, target_vocab_size, n_trials, k_folds, num_epochs):
    """Search hyperparameters with Optuna, maximising cross-validated Hit@3.

    Args:
        model_tag: Label of the model being tuned (accepted for the caller's
            bookkeeping; not used by the search itself).
        model_class: Model constructor passed on to ``cv_hit3``.
        df_lstm: One row per client, list-valued columns.
        vocab_sizes: ``{feature: vocabulary size}``.
        cat_features, num_features: Feature names.
        target_vocab_size: Number of output classes.
        n_trials: Number of Optuna trials.
        k_folds: Number of cross-validation folds per trial.
        num_epochs: Maximum number of epochs per fold.

    Returns:
        The finished ``optuna.Study``.
    """
    # Embedding widths depend only on the vocabularies, so compute them once
    emb_dims_const = suggest_emb_dims(vocab_sizes)

    def objective(trial: optuna.trial.Trial):
        # Hyperparameters shared by all model classes
        max_seq    = trial.suggest_int("max_seq", 20, 50)
        hidden_dim = trial.suggest_categorical("hidden_dim", [32, 64, 128])
        num_layers = trial.suggest_int("num_layers", 1, 2)
        dropout    = trial.suggest_float("dropout", 0.3, 0.6)
        # log=True samples the learning rate log-uniformly; this is the
        # supported replacement for the deprecated suggest_loguniform
        lr         = trial.suggest_float("lr", 1e-5, 5e-3, log=True)
        batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])

        model_kwargs = {
                "cat_features":      cat_features,
                "num_features":      num_features,
                "emb_dims":          {feat: (vocab_sizes[feat], emb_dims_const[feat]) for feat in cat_features},
                "hidden_dim":        hidden_dim,
                "target_size":       target_vocab_size,
                "num_layers":        num_layers,
                "dropout":           dropout,
            }

        # Cross-Validation Hit@3
        return cv_hit3(
            model_class   = model_class,
            model_kwargs  = model_kwargs,
            df_lstm       = df_lstm,
            cat_features  = cat_features,
            num_features  = num_features,
            target_col    = TARGET[0],
            max_seq       = max_seq,
            min_seq       = MIN_SEQ,
            k_folds       = k_folds,
            batch_size    = batch_size,
            num_epochs    = num_epochs,
            lr            = lr,
        )

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print("Best Hit@3 =", study.best_value)
    print("Best params:", study.best_params)

    return study

In [12]:
# study_gru = tune_model_with_optuna(
#     model_tag="gru",
#     model_class=GRU4Rec,
#     df_lstm=df_lstm,
#     vocab_sizes=vocab_sizes,
#     cat_features=CAT_FEATURES,
#     num_features=NUM_FEATURES,
#     target_vocab_size=target_vocab_size,
#     n_trials=20,
#     k_folds=3,
#     num_epochs=100,
# )

# BEST_PARAMS = study_gru.best_params

In [13]:
# Record of the tuning result, with the actual values masked out.
# NOTE(review): as written this line is not valid Python (masked values and
# doubled commas); fill in real values or remove it. The training cell below
# reassigns BEST_PARAMS with concrete values.
# Best Hit@3 = *************************
BEST_PARAMS = {'max_seq': ****, 'hidden_dim': ****,, 'num_layers': ****,, 'dropout': ****, 'lr': ****,, 'batch_size': ****}

# 6.Train Best Model

In [None]:
###############################################################################
# 6) Training the Best Model
###############################################################################

# Hyperparameters used for the final GRU4Rec runs. The first six keys match
# the names sampled in tune_model_with_optuna; "n_epochs" and "patience" are
# training-loop settings added by hand.
# NOTE(review): dropout ≈ 0.78 lies outside the 0.3–0.6 range sampled by the
# Optuna objective in this file — presumably from an earlier search space;
# confirm.
BEST_PARAMS = {
    "max_seq":   23,
    "hidden_dim": 32,
    "num_layers": 1,
    "dropout":   0.7848729433262691,
    "lr":        0.0049013565781703,
    "batch_size": 64,
    "n_epochs":  200,
    "patience":  EARLY_STOPPING,
}

###############################################################################
# 1) Rebuilding datasets / dataloaders with the best hyperparameters
###############################################################################
# Datasets rebuilt with the tuned maximum sequence length
train_dataset_best = MachineDataset(
    df_train,
    cat_features = CAT_FEATURES,
    num_features = NUM_FEATURES,
    target_col   = TARGET[0],
    max_seq      = BEST_PARAMS["max_seq"],
    min_seq      = MIN_SEQ,
)
test_dataset_best = MachineDataset(
    df_test,
    cat_features = CAT_FEATURES,
    num_features = NUM_FEATURES,
    target_col   = TARGET[0],
    max_seq      = BEST_PARAMS["max_seq"],
    min_seq      = MIN_SEQ,
)

# Class weights (calculated on the TRAIN set only): inverse log-frequency of
# the labels that can actually be targets (positions >= MIN_SEQ).
all_labels = np.fromiter(
    (lbl for seq in df_train[TARGET[0]] for lbl in seq[MIN_SEQ:]),
    dtype=np.int64,
)
# One weight per output class: sizing the vector by the largest label seen in
# the training targets would make it too short whenever the highest class ids
# are missing from them, and the loss would then reject it.
counts = np.bincount(all_labels, minlength=target_vocab_size)
class_weights = 1.0 / np.log1p(np.where(counts == 0, 1, counts))
class_w_tensor = torch.tensor(class_weights, dtype=torch.float32)

# Weighted sampler + DataLoaders
# Explicit LongTensor index (one class weight per training sample) instead of
# indexing with a Python list of 0-dim tensors.
train_targets_best = torch.tensor(
    [int(sample["target"]) for sample in train_dataset_best.samples],
    dtype=torch.long,
)
sample_weights = class_w_tensor[train_targets_best].double()
train_loader_best = DataLoader(
    train_dataset_best,
    batch_size = BEST_PARAMS["batch_size"],
    sampler    = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True),
    collate_fn = collate_fn,
    num_workers = 0,
    pin_memory  = True,
)
test_loader_best = DataLoader(
    test_dataset_best,
    batch_size = BEST_PARAMS["batch_size"],
    shuffle    = False,
    collate_fn = collate_fn,
    num_workers = 0,
    pin_memory  = True,
)

###############################################################################
# 2) Model + optimizers
###############################################################################
# (vocabulary size, embedding width) per categorical feature, in the format
# expected by the model constructors.
EMB_SPECS = {
    feat: (vocab_sizes[feat], emb_dims[feat])  # emb_dims = dict[int]
    for feat in CAT_FEATURES
}

# Train on the GPU when one is available, like the evaluation and final
# training cells do.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_best = GRU4Rec(
    cat_features = CAT_FEATURES,
    num_features = NUM_FEATURES,
    emb_dims     = EMB_SPECS,
    hidden_dim   = BEST_PARAMS["hidden_dim"],
    target_size  = target_vocab_size,
    num_layers   = BEST_PARAMS["num_layers"],
    dropout      = BEST_PARAMS["dropout"],
).to(device)

optimizer  = torch.optim.AdamW(model_best.parameters(), lr=BEST_PARAMS["lr"], weight_decay=1e-4)
# mode="max" because the monitored quantity (Hit@3) should increase. The
# deprecated `verbose` argument is no longer passed; its default is the same.
scheduler  = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="max", factor=0.2,
                                                        patience=1, min_lr=1e-6)
criterion   = FocalLoss(alpha=class_w_tensor.to(device), gamma=2.0)

###############################################################################
# 3) Training loop with Early-Stopping
###############################################################################
# NOTE(review): the test loader drives the LR schedule, early stopping and
# checkpoint selection, so Hit@k later reported on the same test set is
# optimistically biased. A separate validation split would avoid that.
best_hit, epochs_no_improve = 0.0, 0
for epoch in range(1, BEST_PARAMS["n_epochs"] + 1):

    # ---------- Training ----------
    model_best.train()
    for feats, tgt in train_loader_best:
        feats = {k: v.to(device) for k, v in feats.items()}
        tgt = tgt.to(device)
        optimizer.zero_grad()
        loss = criterion(model_best(feats), tgt)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model_best.parameters(), 1.0)
        optimizer.step()

    # ---------- Validation (here on test_loader_best *during* training) ----------
    model_best.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for feats, tgt in test_loader_best:
            feats = {k: v.to(device) for k, v in feats.items()}
            tgt = tgt.to(device)
            top3 = model_best(feats).topk(3, dim=1).indices
            correct += (top3 == tgt.unsqueeze(1)).any(1).sum().item()
            total   += tgt.size(0)

    # An empty evaluation set scores 0 instead of dividing by zero
    hit3 = correct / total if total else 0.0
    scheduler.step(hit3)

    print(f"Epoch {epoch:03d} | Hit@3 = {hit3:.4f}")

    # ---------- Early-Stopping ----------
    if hit3 > best_hit:
        best_hit = hit3
        epochs_no_improve = 0
        torch.save(model_best.state_dict(), "gru4rec_best.pt")   # keep the best checkpoint only
    else:
        epochs_no_improve += 1
        print(f"Epoch {epoch:03d} | No Improvement: {epochs_no_improve}/{BEST_PARAMS['patience']}")
        if epochs_no_improve >= BEST_PARAMS["patience"]:
            print("Early stop")
            break


In [None]:
# 1) Recreate test_loader_best if necessary
train_dataset_best = MachineDataset(
    df_train, cat_features=CAT_FEATURES, num_features=NUM_FEATURES,
    target_col=TARGET[0], max_seq=BEST_PARAMS["max_seq"], min_seq=MIN_SEQ
)
test_dataset_best = MachineDataset(
    df_test, cat_features=CAT_FEATURES, num_features=NUM_FEATURES,
    target_col=TARGET[0], max_seq=BEST_PARAMS["max_seq"], min_seq=MIN_SEQ
)

# Compute class weights on df_train (inverse log-frequency of the labels at
# positions >= MIN_SEQ, i.e. the ones that can be targets).
all_labels = np.fromiter((lbl for seq in df_train[TARGET[0]] for lbl in seq[MIN_SEQ:]), dtype=np.int64)
# One weight per output class: sizing the vector by the largest label seen in
# the training targets would make it too short whenever the highest class ids
# are missing from them.
counts = np.bincount(all_labels, minlength=target_vocab_size)
class_weights = 1.0 / np.log1p(np.where(counts == 0, 1, counts))
class_w_tensor = torch.tensor(class_weights, dtype=torch.float32)

# WeightedRandomSampler for training (not required for evaluation, but shows full pipeline)
# Explicit LongTensor index (one class weight per training sample) instead of
# indexing with a Python list of 0-dim tensors.
train_targets_best = torch.tensor(
    [int(sample["target"]) for sample in train_dataset_best.samples],
    dtype=torch.long,
)
sample_weights = class_w_tensor[train_targets_best].double()
train_loader_best = torch.utils.data.DataLoader(
    train_dataset_best, batch_size=BEST_PARAMS["batch_size"],
    sampler=WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True),
    collate_fn=collate_fn, num_workers=0, pin_memory=True
)
test_loader_best = torch.utils.data.DataLoader(
    test_dataset_best, batch_size=BEST_PARAMS["batch_size"],
    shuffle=False, collate_fn=collate_fn, num_workers=0, pin_memory=True
)

# 2) Load/reinstantiate the model and switch to eval mode
# Rebuild the network with the tuned architecture on the available device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_best = GRU4Rec(
    cat_features=CAT_FEATURES,
    num_features=NUM_FEATURES,
    emb_dims={name: (vocab_sizes[name], emb_dims[name]) for name in CAT_FEATURES},
    hidden_dim=BEST_PARAMS["hidden_dim"],
    target_size=target_vocab_size,
    num_layers=BEST_PARAMS["num_layers"],
    dropout=BEST_PARAMS["dropout"],
)
model_best = model_best.to(device)

# Restore the checkpoint written by the training cell, then freeze dropout
model_best.load_state_dict(torch.load("gru4rec_best.pt", map_location=device))
model_best.eval()

# 3) Run the test set once and keep every batch's logits and labels on the CPU
all_logits, all_targets = [], []
with torch.no_grad():
    for feats, tgt in test_loader_best:
        feats = {name: tensor.to(device) for name, tensor in feats.items()}
        tgt = tgt.to(device)
        output = model_best(feats)  # one row of class scores per sample
        all_logits.append(output.cpu())
        all_targets.append(tgt.cpu())

# Concatenate the per-batch results into two NumPy arrays
logits = torch.cat(all_logits, dim=0).numpy()
targets = torch.cat(all_targets, dim=0).numpy()
num_samples = len(targets)
num_samples = targets.shape[0]

# 4) Compute Hit@k for k = 1..5
# Hit@k = share of samples whose true class is among the k highest logits.
max_k = 5
hit_at_k = []
ranked = np.argsort(logits, axis=1)  # ascending, so the best classes sit in the last columns
for k in range(1, max_k + 1):
    topk_preds = ranked[:, -k:]
    hits = (topk_preds == targets[:, None]).any(axis=1)
    hit_at_k.append(np.mean(hits))

# 5) Plot Hit@k
ks = range(1, max_k + 1)
plt.figure(figsize=(6, 4))
plt.plot(ks, hit_at_k, marker='o')
plt.xlabel("k")
plt.ylabel("Hit@k")
plt.title("Hit@k for k = 1 to 5")
plt.xticks(ks)
plt.grid(True)
plt.show()

# 6) Classification report and per-class metrics (top-1)
# classification_report is not imported by the setup cell, so import it here
from sklearn.metrics import classification_report

top1_preds = np.argmax(logits, axis=1)

# Reverse LabelToIdx to retrieve class names
inv_LabelToIdx = {v: k for k, v in LabelToIdx['***********'].items()}
target_names = [inv_LabelToIdx[i] for i in sorted(inv_LabelToIdx.keys())]
y_true_str = [inv_LabelToIdx[t] for t in targets]
# The output layer also has a unit for the reserved padding index 0, which
# has no class name; map such predictions to a placeholder instead of raising
# KeyError. The placeholder is not in `labels`, so it gets no row of its own
# and simply counts as a miss for the true class.
PAD_LABEL = "<PAD>"
y_pred_str = [inv_LabelToIdx.get(int(p), PAD_LABEL) for p in top1_preds]

report = classification_report(y_true_str, y_pred_str, labels=target_names, output_dict=True)
print("Classification Report (per class):")
for cls in target_names:
    pr = report[cls]['precision']
    rc = report[cls]['recall']
    f1 = report[cls]['f1-score']
    sup = report[cls]['support']
    print(f"{cls:<10} | Precision = {pr:.3f} | Recall = {rc:.3f} | F1 = {f1:.3f} | Support = {sup}")

# 7) Plot F1-score per class
class_f1 = [report[cls]['f1-score'] for cls in target_names]
plt.figure(figsize=(8, 6))
plt.bar(range(len(target_names)), class_f1)
plt.xlabel("Class (************)")
plt.ylabel("F1-score")
plt.title("F1-score per class")
plt.xticks(range(len(target_names)), target_names, rotation=90)
plt.tight_layout()
plt.show()

# 7.Final Model

In [None]:
###############################################################################
# 7) Training the Final Model on the Entire Dataset
###############################################################################

# a) Re‑create the full dataset
full_dataset = MachineDataset(
    df_lstm,               # Use the entire dataset
    cat_features=CAT_FEATURES,
    num_features=NUM_FEATURES,
    target_col=TARGET[0],
    max_seq=BEST_PARAMS["max_seq"],
    min_seq=MIN_SEQ
)

# b) Compute class weights on the full set (inverse log-frequency of the
#    labels at positions >= MIN_SEQ, i.e. the ones that can be targets)
all_labels = np.fromiter(
    (lbl for seq in df_lstm[TARGET[0]] for lbl in seq[MIN_SEQ:]),
    dtype=np.int64
)
# One weight per output class: sizing the vector by the largest label seen in
# the targets would make it too short whenever the highest class ids never
# appear as a target.
counts = np.bincount(all_labels, minlength=target_vocab_size)
class_weights = 1.0 / np.log1p(np.where(counts == 0, 1, counts))
class_w_tensor = torch.tensor(class_weights, dtype=torch.float32)

# c) Create the DataLoader with a WeightedSampler
# Explicit LongTensor index (one class weight per sample) instead of indexing
# with a Python list of 0-dim tensors.
full_targets = torch.tensor(
    [int(sample["target"]) for sample in full_dataset.samples],
    dtype=torch.long,
)
sample_weights = class_w_tensor[full_targets].double()
full_loader = DataLoader(
    full_dataset,
    batch_size=BEST_PARAMS["batch_size"],
    sampler=WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True),
    collate_fn=collate_fn,
    num_workers=0,
    pin_memory=True
)

# d) Initialize the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
final_model = GRU4Rec(
    cat_features=CAT_FEATURES,
    num_features=NUM_FEATURES,
    emb_dims={feat: (vocab_sizes[feat], emb_dims[feat]) for feat in CAT_FEATURES},
    hidden_dim=BEST_PARAMS["hidden_dim"],
    target_size=target_vocab_size,
    num_layers=BEST_PARAMS["num_layers"],
    dropout=BEST_PARAMS["dropout"],
).to(device)

# e) Optimizer and Loss
optimizer = torch.optim.AdamW(final_model.parameters(), lr=BEST_PARAMS["lr"], weight_decay=1e-4)
criterion = FocalLoss(alpha=class_w_tensor.to(device), gamma=2.0)

# f) Full training loop
# Fixed training budget: there is no validation set or early stopping in this
# loop, so the number of epochs has to be chosen beforehand.
# NOTE(review): the value below is masked and must be replaced by an integer.
num_epochs = *********   # Number of epochs observed during the best run

for epoch in range(1, num_epochs + 1):
    final_model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch
    
    for feats, tgt in full_loader:
        # Move the batch to the same device as the model
        feats = {k: v.to(device) for k, v in feats.items()}
        tgt   = tgt.to(device)
        
        optimizer.zero_grad()
        outputs = final_model(feats)
        loss = criterion(outputs, tgt)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(final_model.parameters(), 1.0)
        optimizer.step()
        
        total_loss += loss.item()
    
    # Mean of the batch losses (batches are weighted equally regardless of size)
    avg_loss = total_loss / len(full_loader)
    print(f"Epoch {epoch}/{num_epochs} | Loss: {avg_loss:.4f}")

# 8.Inference

In [None]:
###############################################################################
# g) Inference + Client-Centered DataFrame (Top-k)
###############################################################################
k = 5  # ← Desired Top-k

# 1. Dataset that also returns *****************
class InferDataset(Dataset):
    """Wrap ``MachineDataset`` so every sample also carries its client id.

    Each item is ``(features, target, client_id)``. The id array is built by
    repeating each row's id ``len(sequence) - 1`` times, where the sequence is
    the row's ``TARGET[0]`` entry.

    NOTE(review): this presumes ``MachineDataset`` yields exactly
    ``len(sequence) - 1`` samples per row, in row order — confirm against its
    handling of ``max_seq`` / ``min_seq``.
    """

    def __init__(self, df_lstm, *args, **kwargs):
        # Extra arguments are forwarded unchanged to the wrapped dataset.
        self.base = MachineDataset(df_lstm, *args, **kwargs)
        client_ids = df_lstm['*****************'].values
        samples_per_row = [len(seq) - 1 for seq in df_lstm[TARGET[0]]]
        self.ids = np.repeat(client_ids, samples_per_row)

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.base)

    def __getitem__(self, idx):
        """Return ``(features, target, client_id)`` for sample ``idx``."""
        features, target = self.base[idx]
        return features, target, self.ids[idx]

# Inference dataset built from the same dataframe and feature configuration.
infer_ds = InferDataset(
    df_lstm,
    cat_features=CAT_FEATURES,
    num_features=NUM_FEATURES,
    target_col=TARGET[0],
    max_seq=BEST_PARAMS["max_seq"],
    min_seq=MIN_SEQ,
)


def _infer_collate(batch):
    """Collate ``(features, target, client_id)`` samples into one batch.

    Returns a tuple ``(padded_features, targets, client_ids)``: every feature
    is zero-padded to the longest sequence in the batch, targets are stacked
    into one tensor, and client ids are returned as a plain list.
    """
    reference_feats = batch[0][0]
    padded_features = {}
    for name in reference_feats:
        # Integer features are padded with int 0, all others with float 0.
        pad_value = 0 if reference_feats[name].dtype == torch.long else 0.
        padded_features[name] = pad_sequence(
            [sample[0][name] for sample in batch],
            batch_first=True,
            padding_value=pad_value,
        )
    targets = torch.stack([sample[1] for sample in batch])
    client_ids = [sample[2] for sample in batch]
    return padded_features, targets, client_ids


# shuffle=False keeps the dataset order, which the per-client overwrite in the
# prediction loop relies on.
infer_loader = DataLoader(
    infer_ds,
    batch_size=256,
    shuffle=False,
    collate_fn=_infer_collate,
)

# 2. Top-k Predictions
# Reverse lookup: encoded index -> original label.
inv_simma = {
    idx: label
    for label, idx in LabelToIdx['*****************'].items()
}
topk_by_cli = {}  # client id -> array of k encoded indices, best one last

final_model.eval()
with torch.no_grad():
    for feats, _, cli_ids in infer_loader:
        feats = {name: tensor.to(device) for name, tensor in feats.items()}
        logits = final_model(feats)             # [B, |V|]
        # argsort is ascending, so the last k columns are the k highest logits.
        topk = torch.argsort(logits, dim=1)[:, -k:]  # [B, k] (encoded)

        # Later entries overwrite earlier ones: keep only the last sequence per client.
        topk_by_cli.update(zip(cli_ids, topk.cpu().numpy()))

# 3. Metadata: Name, Department, Family ↔ CodeSimma
# Only the columns needed for the lookups are read from the CSV.
meta_columns = [
    '*****************', '*****************', '*****************',
    '*****************', '*****************',
]
meta = pd.read_csv(PATH_CSV, usecols=meta_columns)
meta = meta.drop_duplicates()

# Two-column table turned into a dict: first column -> second column.
fam_pairs = meta[['*****************', '*****************']].drop_duplicates()
map_fam = dict(fam_pairs.values)  # ***************** -> *****************

# One row per distinct combination of the client-level columns.
client_columns = ['*****************', '*****************',
                  '*****************']
client_info = meta[client_columns].drop_duplicates()

# 4. Build the final DataFrame
# One output row per client: client metadata followed by the ranked predictions.
records = []
for cid, preds in topk_by_cli.items():
    # Rows of client_info belonging to this client (first match is used).
    is_client = client_info.IdClient == cid
    row = {
        '*****************': cid,
        '*****************': client_info.loc[is_client, '*****************'].iat[0],
        '*****************': client_info.loc[is_client, '*****************'].iat[0],
    }
    # Top-k: preds is stored best-last, so walk it backwards (rank 1 = best).
    for rank, idx in enumerate(reversed(preds), start=1):
        code = inv_simma[idx]
        row[f'pred_{rank}_*****************'] = code
        # Codes without a known family fall back to 'NA'.
        row[f'pred_{rank}_*****************'] = map_fam.get(code, 'NA')
    records.append(row)

pred_df = pd.DataFrame(records)
pred_df = pred_df.sort_values('*****************')
pred_df = pred_df.reset_index(drop=True)