In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and edits to them are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import warnings

import fireducks.pandas as pd
import numpy as np
import s3fs
import torch
from dotenv import load_dotenv
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from transformers import (
                        CamembertTokenizer,
                        CamembertForSequenceClassification,
                        Trainer,
                        TrainingArguments
                        )

sys.path.append("../src")
from ml_utils import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting fr-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Load environment variables from a .env file; the S3 credentials read below
# are expected to come from there.
load_dotenv()

# Notebook-wide behaviour: hide every warning and never truncate DataFrame columns.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

# Authenticated handle on the S3-compatible object store. A missing environment
# variable raises KeyError here rather than failing later on first access.
fs = s3fs.S3FileSystem(
    key=os.environ["Accesskey"],
    secret=os.environ["Secretkey"],
    token=os.environ["Token"],
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
)

In [4]:
# Read the raw votes file from the object store.
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as votes_file:
    df = pd.read_json(votes_file)

# Collapse to one row per sentence, labelled with its most frequent vote
# (when several values tie, the first one returned by mode() is kept).
df = df.groupby("phrase_text", as_index=False)["sol"].apply(
    lambda votes: votes.mode().iloc[0]
)

# Binary target: 1 for "ok", 0 for anything else.
df["sol"] = (df["sol"] == "ok").astype(int)

# Text normalisation is delegated to the project helper nltk_text_preprocessing.
df["clean_phrase_text"] = df["phrase_text"].apply(
    lambda text: nltk_text_preprocessing(text, True)
)

# Drop sentences that are empty once cleaned.
df = df[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


# Dataset creation

In [5]:
# Stratified 80/20 train/validation split on the binary label; the fixed
# random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df["clean_phrase_text"].tolist(),
    df["sol"].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=df["sol"],
)

# CamemBERT tokenizer (SentencePiece model). Each split is truncated to at most
# 512 tokens and padded to the longest sequence of that split.
# NOTE(review): do_lower_case is forwarded as an extra keyword argument;
# confirm that CamembertTokenizer actually honours it.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base", do_lower_case=True)
train_encodings, val_encodings = (
    tokenizer(split_texts, truncation=True, padding=True, max_length=512)
    for split_texts in (X_train, X_val)
)

# We define the dataset class for PyTorch model fine-tuning
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their class labels.

    Args:
        encodings: Mapping from feature name to a per-sample indexable
            collection (the tokenizer output is used here).
        labels: Indexable collection holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoded feature for sample idx, then the target under "labels".
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split with its labels; these are the datasets handed to
# the Trainer for training and evaluation.
train_dataset = ClassificationDataset(train_encodings, y_train)
val_dataset = ClassificationDataset(val_encodings, y_val)

# Hyperparameter tuning

In [7]:
def compute_metrics(p):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        p: Object exposing ``predictions`` (per-class scores, or a tuple whose
            first element holds them) and ``label_ids`` (true class indices).

    Returns:
        dict: ``{"accuracy": <float>}``.
    """
    # Models that return extra outputs deliver predictions as a tuple with the
    # logits first; plain arrays are used as-is.
    logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    # Predicted class = index of the highest score along the last (class) axis.
    preds = np.argmax(logits, axis=-1)
    # Plain float keeps the metric trivially loggable/serialisable.
    return {"accuracy": float(accuracy_score(p.label_ids, preds))}

# Search space sampled by the hyperparameter search
def model_hp_space(trial):
    """Draw one hyperparameter configuration from an Optuna-style trial.

    Args:
        trial: Object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        dict: Sampled values keyed by ``learning_rate``, ``num_train_epochs``,
        ``per_device_train_batch_size``, ``weight_decay`` and ``warmup_steps``.
    """
    # NOTE(review): up to 500 warm-up steps may exceed the total number of
    # optimisation steps of a short trial -- confirm this range is intended.
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16, 32]
    )
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, 500)
    return space

# Base training arguments shared by every trial of the search.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",  # no checkpoints are written during search runs
    logging_dir="./logs",
    load_best_model_at_end=False,
)

# model_init (instead of a fixed model) lets the Trainer build a freshly
# initialised classifier for each trial.
trainer = Trainer(
    model_init=lambda: CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the search. The backend and the objective are pinned explicitly:
# - backend="optuna" avoids depending on which HPO library is auto-detected;
# - compute_objective selects validation accuracy, instead of relying on the
#   default objective that sums every reported metric other than the loss.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=model_hp_space,
    compute_objective=lambda metrics: metrics["eval_accuracy"],
    n_trials=10,
)

print("Best trial:")
print(best_trial)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-05-01 16:29:44,362] A new study created in memory with name: no-name-1994bfd8-037c-4b25-8573-910cd664aa9a
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:30:26,507] Trial 0 finished with value: 0.6804511278195489 and parameters: {'learning_rate': 1.5280571463102804e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.11198410887080655, 'warmup_steps': 147}. Best is trial 0 with value: 0.6804511278195489.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:31:29,266] Trial 1 finished with value: 0.7030075187969925 and parameters: {'learning_rate': 1.079115015424602e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'weight_decay': 0.25810209532699663, 'warmup_steps': 293}. Best is trial 1 with value: 0.7030075187969925.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.5516


[I 2025-05-01 16:33:05,523] Trial 2 finished with value: 0.6954887218045113 and parameters: {'learning_rate': 3.090742533042018e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'weight_decay': 0.042243791459208535, 'warmup_steps': 85}. Best is trial 1 with value: 0.7030075187969925.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:34:05,753] Trial 3 finished with value: 0.6390977443609023 and parameters: {'learning_rate': 2.743744229102663e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.27501968177635955, 'warmup_steps': 495}. Best is trial 1 with value: 0.7030075187969925.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.6196


[I 2025-05-01 16:35:43,930] Trial 4 finished with value: 0.706766917293233 and parameters: {'learning_rate': 1.7542609935697248e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'weight_decay': 0.16110440354272643, 'warmup_steps': 291}. Best is trial 4 with value: 0.706766917293233.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:36:44,515] Trial 5 finished with value: 0.6992481203007519 and parameters: {'learning_rate': 4.082475406459598e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.153120858048974, 'warmup_steps': 179}. Best is trial 4 with value: 0.706766917293233.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:37:45,117] Trial 6 finished with value: 0.7180451127819549 and parameters: {'learning_rate': 3.312003575478027e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.01075471901811119, 'warmup_steps': 428}. Best is trial 6 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:38:57,878] Trial 7 finished with value: 0.650375939849624 and parameters: {'learning_rate': 1.655329997089462e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.11417892351149324, 'warmup_steps': 317}. Best is trial 6 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 16:39:48,677] Trial 8 finished with value: 0.6879699248120301 and parameters: {'learning_rate': 2.649836506254842e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.236259292771691, 'warmup_steps': 471}. Best is trial 6 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.63


[I 2025-05-01 16:41:08,826] Trial 9 finished with value: 0.7105263157894737 and parameters: {'learning_rate': 1.335590798374518e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'weight_decay': 0.18541575128731658, 'warmup_steps': 415}. Best is trial 6 with value: 0.7180451127819549.


Best trial:
BestRun(run_id='6', objective=0.7180451127819549, hyperparameters={'learning_rate': 3.312003575478027e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.01075471901811119, 'warmup_steps': 428}, run_summary=None)


# Final model

In [10]:
# Fresh CamemBERT encoder topped with a 2-output sequence-classification head.
# With integer class labels the model's built-in loss is cross-entropy.
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2)

# Place the model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Hyperparameters selected by the search
best_params = best_trial.hyperparameters

# Fixed settings plus the tuned values, forwarded under their own names.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    per_device_eval_batch_size=8,
    **{
        tuned_name: best_params[tuned_name]
        for tuned_name in (
            "learning_rate",
            "per_device_train_batch_size",
            "num_train_epochs",
            "weight_decay",
            "warmup_steps",
        )
    },
)

# Fine-tune on the training split; the validation split is attached for evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7004
20,0.6936
30,0.6974
40,0.6959
50,0.6948
60,0.6897
70,0.6856
80,0.6872
90,0.6805
100,0.6655


Predicted labels: [0 1 0 0 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 1 1 1 1 0 0 0
 1 0 1 0 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 0 0 0 1 0 0 1 0 1
 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 1 0 0 0 0 1 1
 1 0 1 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 1
 1 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 1 0 1 0 0 1 0 1 0 1 1 1 1 0 1 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 1 0
 1 1 1 0 1 0 1 1 0 0 0 0 1 1 0 1 1 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0 1 1
 0 0 0 0 0 0 0]


In [14]:
# Score the fine-tuned model on the validation split; the returned dict holds
# the evaluation loss, the accuracy from compute_metrics and timing figures.
trainer.evaluate(eval_dataset=val_dataset)

{'eval_loss': 0.563167154788971,
 'eval_accuracy': 0.7142857142857143,
 'eval_runtime': 0.9588,
 'eval_samples_per_second': 277.434,
 'eval_steps_per_second': 35.461,
 'epoch': 3.0}

# Saving the model

In [12]:
# Persist the fine-tuned model and its tokenizer in the same directory so both
# can be reloaded from a single path.
trainer.save_model("../models/camembert_model")
tokenizer.save_pretrained("../models/camembert_model")

('../models/camembert_model/tokenizer_config.json',
 '../models/camembert_model/special_tokens_map.json',
 '../models/camembert_model/sentencepiece.bpe.model',
 '../models/camembert_model/added_tokens.json')