In [1]:
!nvidia-smi

Tue Oct 21 16:43:52 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   68C    P8             14W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import torch
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from torch import nn
from transformers import AutoTokenizer, AutoModel

Data Preprocessing

In [None]:
"""
Module to transform data to be consumable by model
"""


from transformers import BertTokenizerFast, AutoTokenizer



class DatabaseToBertDataset():
    """Turn a DataFrame of (email, bill) text pairs into tokenized model inputs.

    The frame must provide an ``email`` and a ``bill`` column. When it also has
    a ``labels`` column (``'|'``-separated label ids per row), the labels are
    multi-hot encoded against the ids declared in the labels description file
    and repeated for every chunk the tokenizer produced from that row.
    """

    def __init__(self, model_name, labels_path='labels.json'):
        """
        Args:
            model_name: Name or path handed to ``AutoTokenizer.from_pretrained``
                (e.g. "bert-base-multilingual-cased").
            labels_path: JSON file whose ``'etiquettes'`` entry is a list of
                objects carrying an ``'id'`` key. Defaults to the path that was
                previously hard-coded.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.tokenizerChunkLen = self.tokenizer.model_max_length
        self.labels_path = labels_path

    def _tokenize(self, df: pd.DataFrame):
        """Tokenize the ``email``/``bill`` columns as sentence pairs.

        Every sequence is padded to the tokenizer's maximum length; longer
        inputs are truncated and the overflow is requested as extra chunks with
        a stride of 128 tokens.

        Returns:
            Whatever the tokenizer returns for this call (a mapping of PyTorch
            tensors). ``execute`` relies on its ``'overflow_to_sample_mapping'``
            entry to link each chunk back to its source row.
        """
        return self.tokenizer(
            df["email"].to_list(),
            df["bill"].to_list(),
            padding='max_length',
            truncation=True,
            stride=128,                        # overlap between chunks
            return_overflowing_tokens=True,    # keep extra chunks
            return_offsets_mapping=True,       # optional: track positions in original text
            return_tensors="pt"                # PyTorch tensors
        )

    def _encode_labels(self, df: pd.DataFrame) -> tuple[pd.DataFrame, np.ndarray]:
        """Multi-hot encode the ``labels`` column.

        The caller's frame is left untouched, so encoding the same frame twice
        yields the same result.

        Args:
            df: Frame with a ``labels`` column. String cells are split on
                ``'|'``; list/tuple/set cells are used as they are; anything
                else (None, NaN, ...) counts as "no label".

        Returns:
            ``(encoded_df, classes)``: a copy of ``df`` without the ``labels``
            column and with one 0/1 column per known label id, plus the array
            of label ids in column order.
        """
        # getattr keeps instances built without __init__ working with the
        # historical default path.
        labels_path = getattr(self, 'labels_path', 'labels.json')

        # Get unique labels from labels description
        with open(labels_path, 'r', encoding='utf-8') as file:
            labels = json.load(file)['etiquettes']
        labelIds = [label['id'] for label in labels]

        def _to_label_list(value):
            if isinstance(value, str):
                return value.split('|')
            if isinstance(value, (list, tuple, set)):
                return list(value)
            return []

        # Work on a separate Series: writing back into df['labels'] would
        # mutate the caller's frame.
        label_lists = df['labels'].apply(_to_label_list)

        # Encode
        mlb = MultiLabelBinarizer(classes=labelIds)
        encoded = mlb.fit_transform(label_lists)

        # Reuse df's index: concat(axis=1) aligns on index labels, so a default
        # RangeIndex here would misalign any filtered or shuffled frame.
        encoded_df = pd.DataFrame(encoded, columns=mlb.classes_, index=df.index)
        df = pd.concat([df.drop(columns='labels'), encoded_df], axis=1)

        return df, mlb.classes_


    def execute(self, df : pd.DataFrame) -> "tuple[dict[str, torch.Tensor], np.ndarray | None]":
        """
        Transform from database to dataset consumable by model

        Args:
            df: Frame with ``email`` and ``bill`` columns and, optionally, a
                ``labels`` column.

        Returns:
            ``(data, labelCols)``. ``data`` is the tokenizer output; when labels
            are present it gains a ``'labels'`` tensor with one row per chunk.
            ``labelCols`` is the array of label ids, or ``None`` when ``df`` has
            no ``labels`` column.
        """

        # Tokenize ('input_ids' 'attention_mask''token_type_ids' 'overflow_to_sample_mapping')
        data = self._tokenize(df)

        labelCols = None
        if 'labels' in df.columns:
            df, labelCols = self._encode_labels(df)

            labels_tensor = torch.tensor(df[labelCols].to_numpy())
            # One label row per chunk: index the per-sample labels with the
            # chunk -> sample mapping returned by the tokenizer.
            expanded_labels = labels_tensor[data['overflow_to_sample_mapping']]

            data['labels'] = expanded_labels

        return data, labelCols

Models

In [None]:
class BertMeanClassifier(nn.Module):
    """Encoder + dropout + linear head producing one logit per label.

    NOTE: despite the class name, no mean pooling is performed. The head is fed
    the encoder's ``pooler_output`` (for BERT: the [CLS] embedding after the
    dense + tanh pooler layer). When the encoder provides no pooled output, the
    hidden state of the first token is used instead.
    """

    def __init__(self, model_name, num_labels, freeze_bert=False):
        """
        Args:
            model_name: Name or path handed to ``AutoModel.from_pretrained``.
            num_labels: Number of output logits.
            freeze_bert: When True, the encoder parameters are excluded from
                gradient updates; only the linear head is trained.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.2)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return raw logits of shape (batch_size, num_labels).

        Args:
            input_ids: Token ids, (batch_size, seq_len).
            attention_mask: Padding mask, same shape as ``input_ids``.
            token_type_ids: Optional segment ids. Only forwarded to the encoder
                when given, so encoders without segment embeddings also work.
        """
        encoder_inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        if token_type_ids is not None:
            encoder_inputs['token_type_ids'] = token_type_ids
        outputs = self.bert(**encoder_inputs)

        # The encoder returns:
        # - last_hidden_state: (batch_size, seq_len, hidden_dim)
        # - pooler_output: (batch_size, hidden_dim), absent/None for encoders
        #   without a pooler
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            # No pooler available: fall back to the first-token representation.
            pooled_output = outputs.last_hidden_state[:, 0]

        # Apply dropout + classification on the pooled representation
        return self.classifier(self.dropout(pooled_output))
