In [1]:
!nvidia-smi

Thu Nov  6 15:21:25 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P0             47W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
import os
import torch
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from datasets import Dataset
from torch.utils.data import TensorDataset, DataLoader, random_split
from torch import nn
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast, BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer, AutoConfig



In [None]:
print('Working dir:', os.getcwd())

Working dir: /content


Data Preprocessing

In [18]:
"""
Module to transform data to be consumable by model
"""

class DatabaseToBertDataset():
    def __init__(self, model_name):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name) # ("bert-base-multilingual-cased")
        self.tokenizerChunkLen = self.tokenizer.model_max_length

        config = AutoConfig.from_pretrained(model_name)
        self.maxTokensLength = config.max_position_embeddings

        # "allenai/longformer-base-4096" has no max_length
        if model_name == "abazoge/DrLongformer":
            self.maxTokensLength = 4096

    def _tokenize(self, df: str) -> pd.DataFrame:
        """Tokenizer function"""
        tokenized = self.tokenizer(
            df["email"].to_list(),
            df["bill"].to_list(),
            padding='max_length',
            max_length=self.maxTokensLength,
            truncation=True,
            stride=128,                        # overlap between chunks
            return_overflowing_tokens=True,    # keep extra chunks
            return_offsets_mapping=True,       # optional: track positions in original text
            return_tensors="pt"                # PyTorch tensors
        )

        # input(tokenized.keys())

        # print(type(tokenized))
        # print(tokenized['input_ids'].shape)
        # print(tokenized['token_type_ids'].shape)
        # print(tokenized['attention_mask'].shape)
        # print(tokenized['offset_mapping'].shape)
        # print(tokenized['overflow_to_sample_mapping'].shape)
        # input('MMMM')

        return tokenized

    def _encode_labels(self, df: pd.DataFrame) -> tuple[pd.DataFrame, np.ndarray]:
        """Encode labels"""

        df = df.copy()
        # Get unique labels from labels description
        with open('labels.json', 'r') as file:
            labels = json.load(file)['etiquettes']
        labelIds = [label['id'] for label in labels]

        # Format original labels
        df['labels'] = df['labels'].apply(
            lambda x: x.split('|') if isinstance(x, str) else []
        )

        # Encode
        mlb = MultiLabelBinarizer(classes=labelIds)
        encoded = mlb.fit_transform(df['labels'])
        df = pd.concat(
            [df, pd.DataFrame(encoded, columns=mlb.classes_)],
            axis=1
        )
        df = df.drop(columns='labels')

        return df, mlb.classes_


    def execute(self, df : pd.DataFrame) -> tuple[dict[torch.Tensor], np.ndarray]:
        """
        Transform from database to dataset consumable by model
        """

        # Tokenize ('input_ids' 'attention_mask''token_type_ids' 'overflow_to_sample_mapping')
        data = self._tokenize(df)

        labelCols = None
        if 'labels' in df.columns:
            df, labelCols = self._encode_labels(df)

            labels_tensor = torch.tensor(df[labelCols].to_numpy())
            expanded_labels = labels_tensor[data['overflow_to_sample_mapping']]

            data['labels'] = expanded_labels

        return data, labelCols

Models

In [4]:
BERT = "bert-base-multilingual-cased"
LONGFORMER = "abazoge/DrLongformer"
BIGBIRD = "google/bigbird-roberta-base"

class BertMeanClassifier(nn.Module):
    def __init__(self, model_name, num_labels, freeze_bert=False):
        super().__init__()
        self.modelName = BERT
        self.bert = AutoModel.from_pretrained(self.modelName)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, batch, device):
        input_ids, attention_mask, token_type_ids = [x.to(device) for x in batch]

        # BERT outputs all hidden states
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # The BERT model returns:
        # - last_hidden_state: (batch_size, seq_len, hidden_dim)
        # - pooler_output: (batch_size, hidden_dim)
        last_hidden_state = outputs.last_hidden_state # (batch_size, seq_len, hidden_dim)
        pooled_output = outputs.pooler_output  # [CLS] embedding after tanh layer

        # Apply dropout + classification on the pooled [CLS] representation
        out = self.classifier(self.dropout(pooled_output))

        return out


class LongformerClassifier(nn.Module):
    def __init__(self, model_name, num_labels, freeze_bert=False):
        super().__init__()
        # "allenai/longformer-base-4096" NOT IN FRENCH... :(
        self.modelName = LONGFORMER
        self.bert = AutoModel.from_pretrained(self.modelName)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, batch, device):
        input_ids, attention_mask = [x.to(device) for x in batch]

        # BERT outputs all hidden states
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # The BERT model returns:
        # - last_hidden_state: (batch_size, seq_len, hidden_dim)
        # - pooler_output: (batch_size, hidden_dim)
        last_hidden_state = outputs.last_hidden_state # (batch_size, seq_len, hidden_dim)
        pooled_output = outputs.pooler_output  # [CLS] embedding after tanh layer

        # Apply dropout + classification on the pooled [CLS] representation
        out = self.classifier(self.dropout(pooled_output))

        return out

class BigbirdClassifier(nn.Module):
    def __init__(self, model_name, num_labels, freeze_bert=False):
        super().__init__()
        self.modelName = BIGBIRD
        self.bert = AutoModel.from_pretrained(self.modelName)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, batch, device):
        input_ids, attention_mask = [x.to(device) for x in batch]

        # BERT outputs all hidden states
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # The BERT model returns:
        # - last_hidden_state: (batch_size, seq_len, hidden_dim)
        # - pooler_output: (batch_size, hidden_dim)
        last_hidden_state = outputs.last_hidden_state # (batch_size, seq_len, hidden_dim)
        pooled_output = outputs.pooler_output  # [CLS] embedding after tanh layer

        # Apply dropout + classification on the pooled [CLS] representation
        out = self.classifier(self.dropout(pooled_output))

        return out


Evaluate

In [19]:
def init_eval_metrics() -> dict[list[float|dict]]:
    """
    Init of the evaluation metrics
    """
    return {
        'loss': [],
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1_micro': [],
        'f1_macro_avg': [],
        'f1_weighted_avg': [],
        'f1_macro': []
    }

def compute_eval_metrics(loss, all_labels, all_preds, labels_list) -> dict[list[float]]:
  """
  Computes evaluation metrics
  """
  f1Report = classification_report(all_labels, all_preds, target_names=labels_list, output_dict=True, zero_division=0)


  evalMetric = init_eval_metrics()
  evalMetric['loss'].append(loss)
  evalMetric['accuracy'].append(accuracy_score(all_labels, all_preds))
  evalMetric['precision'].append(f1Report["micro avg"]["precision"])
  evalMetric['recall'].append(f1Report["micro avg"]["recall"])
  evalMetric['f1_micro'].append(f1Report["micro avg"]["f1-score"])
  evalMetric['f1_macro_avg'].append(f1Report["macro avg"]["f1-score"])
  evalMetric['f1_weighted_avg'].append(f1Report["weighted avg"]["f1-score"])
  evalMetric['f1_macro'].append(f1Report)

  return evalMetric


def save_eval_metrics(eval_metrics, evalMetric):
    """
    Saves metrics
    """
    for metricName, val in eval_metrics.items():
        eval_metrics[metricName] = eval_metrics[metricName] + evalMetric[metricName]

def print_eval_metrics(eval_metrics):
    """
    Prints mean metrics
    """
    loss = sum(eval_metrics['loss'])/len(eval_metrics['loss'])
    accuracy = sum(eval_metrics['accuracy'])/len(eval_metrics['accuracy'])
    precision = sum(eval_metrics['precision'])/len(eval_metrics['precision'])
    recall = sum(eval_metrics['recall'])/len(eval_metrics['recall'])
    f1Micro = sum(eval_metrics['f1_micro'])/len(eval_metrics['f1_micro'])
    f1MacroAvg = sum(eval_metrics['f1_macro_avg'])/len(eval_metrics['f1_macro_avg'])


    print(f"Loss: {loss:.4f} F1-Micro: {f1Micro:.4f} F1-Macro-Avg {f1MacroAvg:.4f} Precision: {precision:.4f} Recall: {recall:.4f}")


def eval_metrics_to_df(eval_metrics) -> tuple[pd.DataFrame]:
    """
    Full display all metrics
    """

    # Golbal metrics
    globalMetrics = {}
    for metricName, val in eval_metrics.items():
        if 'f1_macro' == metricName:
          continue
        valMean = sum(eval_metrics[metricName])/len(eval_metrics[metricName])
        globalMetrics[metricName] = valMean
    globalMetricsDf = pd.DataFrame([globalMetrics])


    # Label metrics
    f1MacrosDfs = [pd.DataFrame(r).transpose() for r in eval_metrics['f1_macro']]
    f1MacrosDf = pd.concat(f1MacrosDfs).groupby(level=0).mean(numeric_only=True)
    f1MacrosDf = f1MacrosDf.drop(index=["micro avg", "macro avg", "weighted avg", "samples avg"])

    return globalMetricsDf, f1MacrosDf




Train

In [20]:
# Read data
df = pd.read_csv(os.path.join('data', 'parsed_synthetic', 'synth_data.csv'))
trainValDf, testDf = train_test_split(df, test_size=0.05, random_state=42)

# === CONFIG ===
MODEL_NAME =  LONGFORMER # BERT
epochs = 20
nKf = 5
batch_size = 2
threshold = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'CONFIG: batch_size={batch_size}, epochs={epochs}, device={device}')


# === Eval Variables ===
evalMetrics = init_eval_metrics()

# === Cross Validation ===
kf = KFold(n_splits=nKf, shuffle=True, random_state=42)
crossVal = kf.split(trainValDf)
for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n===== Fold {fold+1} =====")
    trainDf = df.iloc[train_idx].copy().reset_index()
    valDf   = df.iloc[val_idx].copy().reset_index()

    # === Tokenize ===
    databaseToBertDataset = DatabaseToBertDataset(model_name=MODEL_NAME)
    train_data, labels_list = databaseToBertDataset.execute(trainDf)
    val_data, _ = databaseToBertDataset.execute(valDf)

    if MODEL_NAME == BERT:
        # === Datasets ===
        train_dataset = TensorDataset(
            train_data["input_ids"],
            train_data["attention_mask"],
            train_data["token_type_ids"],
            train_data["labels"].float()
        )
        val_dataset = TensorDataset(
            val_data["input_ids"],
            val_data["attention_mask"],
            val_data["token_type_ids"],
            val_data["labels"].float()
        )

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # === Model ===
        model = BertMeanClassifier(MODEL_NAME, num_labels=len(labels_list))


    elif MODEL_NAME == LONGFORMER:

        # === Datasets ===
        train_dataset = TensorDataset(
            train_data["input_ids"],
            train_data["attention_mask"],
            train_data["labels"].float()
        )
        val_dataset = TensorDataset(
            val_data["input_ids"],
            val_data["attention_mask"],
            val_data["labels"].float()
        )

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # === Model ===
        model = LongformerClassifier(MODEL_NAME, num_labels=len(labels_list))

    elif MODEL_NAME == BIGBIRD:

        # === Datasets ===
        train_dataset = TensorDataset(
            train_data["input_ids"],
            train_data["attention_mask"],
            train_data["labels"].float()
        )
        val_dataset = TensorDataset(
            val_data["input_ids"],
            val_data["attention_mask"],
            val_data["labels"].float()
        )

        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        # === Model ===
        model = BigbirdClassifier(MODEL_NAME, num_labels=len(labels_list))



    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCEWithLogitsLoss()

    # === TRAINING LOOP ===
    for epoch in range(epochs):
        print(f"\n===== EPOCH {epoch+1}/{epochs} =====")

        model.train()
        total_loss = 0.0

        for batch_idx, batch in enumerate(train_loader):
            labels = batch[-1].to(device)
            optimizer.zero_grad()

            # Forward
            outputs = model(batch[:-1], device)

            # Compute loss
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if (batch_idx + 1) % 10 == 0 or batch_idx == 0:
                print(f"  Batch {batch_idx+1}/{len(train_loader)} - Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)

        # === EVALUATION LOOP ===
        model.eval()
        eval_loss = 0.0
        all_labels = []
        all_preds = []

        with torch.no_grad():
            for batch in val_loader:
                labels = batch[-1].to(device)
                outputs = model(batch[:-1], device)

                loss = criterion(outputs, labels)
                eval_loss += loss.item()

                # --- Predictions ---
                preds = torch.sigmoid(outputs) > threshold  # (batch_size, num_labels)

                # Move to CPU for sklearn
                all_labels.append(labels.cpu().numpy())
                all_preds.append(preds.cpu().numpy())

        # === Combine batches ===
        all_labels = np.concatenate(all_labels, axis=0)
        all_preds = np.concatenate(all_preds, axis=0)

        # === Metrics ===
        avg_eval_loss = eval_loss / len(val_loader)
        evalMetric = compute_eval_metrics(avg_eval_loss, all_labels, all_preds, labels_list)
        print_eval_metrics(evalMetric)

    save_eval_metrics(evalMetrics, evalMetric)


print_eval_metrics(evalMetrics)



CONFIG: batch_size=2, epochs=20, device=cuda

===== Fold 1 =====
4098
4098
4098
4098
4098
4098
4098

===== EPOCH 1/20 =====
  Batch 1/61 - Loss: 0.8321
  Batch 10/61 - Loss: 0.6434
  Batch 20/61 - Loss: 0.6763
  Batch 30/61 - Loss: 0.5856
  Batch 40/61 - Loss: 0.4937
  Batch 50/61 - Loss: 0.5267
  Batch 60/61 - Loss: 0.4717
Loss: 0.5284 F1-Micro: 0.0000 F1-Macro-Avg 0.0000 Precision: 0.0000 Recall: 0.0000

===== EPOCH 2/20 =====
  Batch 1/61 - Loss: 0.4755
  Batch 10/61 - Loss: 0.6179
  Batch 20/61 - Loss: 0.4472
  Batch 30/61 - Loss: 0.4903
  Batch 40/61 - Loss: 0.5575
  Batch 50/61 - Loss: 0.3684
  Batch 60/61 - Loss: 0.5855
Loss: 0.5288 F1-Micro: 0.0000 F1-Macro-Avg 0.0000 Precision: 0.0000 Recall: 0.0000

===== EPOCH 3/20 =====
  Batch 1/61 - Loss: 0.5898
  Batch 10/61 - Loss: 0.5337
  Batch 20/61 - Loss: 0.7546
  Batch 30/61 - Loss: 0.5000
  Batch 40/61 - Loss: 0.5496
  Batch 50/61 - Loss: 0.5138
  Batch 60/61 - Loss: 0.4625
Loss: 0.5287 F1-Micro: 0.0000 F1-Macro-Avg 0.0000 Precis

In [21]:
globalMetricsDf, labelMetricsDf = eval_metrics_to_df(evalMetrics)

In [22]:
globalMetricsDf

Unnamed: 0,loss,accuracy,precision,recall,f1_micro,f1_macro_avg,f1_weighted_avg
0,0.419626,0.151828,0.464892,0.1703,0.243599,0.189893,0.19136


In [23]:
labelMetricsDf

Unnamed: 0,precision,recall,f1-score,support
aide_hygiene_personnelle,0.4,0.188889,0.232143,6.4
commission_faire_marche,0.1,0.057143,0.072727,6.0
cuisiner,0.516667,0.33,0.363377,6.8
logistique_medicale,0.0,0.0,0.0,5.4
menage_epoussetage,0.4,0.32,0.35,4.6
menage_lessive,0.333333,0.283333,0.304762,5.0
menage_plancher,0.0,0.0,0.0,3.8
mobilite_confort,0.0,0.0,0.0,5.4
mobilite_transport_accompagnement_rv,0.4,0.25,0.304762,6.2
nourrir_personne,0.303333,0.32381,0.287143,5.6
