In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. ml_utils under ../src) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import s3fs
import sys
import warnings
from dotenv import load_dotenv
import torch
import fireducks.pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (
                        CamembertTokenizer,
                        CamembertForSequenceClassification,
                        Trainer,
                        TrainingArguments
                        )

sys.path.append("../src")
from ml_utils import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Load variables from a local .env file into the process environment; the
# storage credentials below are read from there.
load_dotenv()

# Notebook-wide settings: silence warnings and show every DataFrame column.
warnings.simplefilter("ignore")
pd.set_option("display.max_columns", None)

# Filesystem handle for the S3-compatible MinIO endpoint. All three credential
# variables must be set: a missing one raises KeyError here.
fs = s3fs.S3FileSystem(
    key=os.environ["Accesskey"],
    secret=os.environ["Secretkey"],
    token=os.environ["Token"],
    client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
)

In [4]:
# Read the raw vote records from the bucket into a DataFrame.
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as file:
    df = pd.read_json(file)

# Collapse to one row per distinct sentence, keeping the most frequent "sol"
# value among its votes (the first mode is taken when several values tie).
df = df.groupby("phrase_text", as_index = False)["sol"].apply(lambda x: x.mode().iloc[0])
# Binary target: 1 when the majority vote is "ok", 0 for any other value.
df["sol"]=df["sol"].apply(lambda x: 1 if x == "ok" else 0)
# nltk_text_preprocessing is not imported by name; presumably it is provided by
# the `from ml_utils import *` star import — TODO confirm, and document what
# the second positional argument (True) toggles.
# NOTE(review): the displayed sample looks stemmed and stop-word-stripped;
# CamemBERT is normally fed raw text, so it may be worth comparing against the
# unprocessed "phrase_text" column.
df["clean_phrase_text"] = df["phrase_text"].apply(lambda x: nltk_text_preprocessing(x, True))
# Drop sentences that became empty after preprocessing.
df = df[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


# Dataset creation

In [5]:
# Hold out 20% of the sentences for validation, stratified on the label so
# both splits keep the same class balance; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    df["clean_phrase_text"].tolist(),
    df["sol"].tolist(),
    stratify=df["sol"],
    test_size=.2,
    random_state=42,
)

# CamemBERT's own tokenizer (SentencePiece sub-word vocabulary).
# NOTE(review): do_lower_case is passed through unchanged from the original
# call; confirm that CamembertTokenizer actually honours this option.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base", do_lower_case=True)

# Sequences longer than 512 tokens are truncated; shorter ones are padded with
# the tokenizer's padding token up to the longest sequence of each call.
encode_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(X_train, **encode_kwargs)
val_encodings = tokenizer(X_val, **encode_kwargs)

# We define the dataset class for PyTorch model fine-tuning
class ClassificationDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            sequence holding one entry per example.
        labels: Sequence of labels, aligned with the encodings.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with a ``labels`` entry."""
        example = {}
        for feature_name, feature_values in self.encodings.items():
            example[feature_name] = torch.tensor(feature_values[idx])
        # Set last so it ends up after the encoding features (and wins over
        # any encoding feature that happens to be called "labels").
        example["labels"] = torch.tensor(self.labels[idx])
        return example

# Wrap each tokenized split together with its labels so it can be indexed
# example by example as a PyTorch dataset.
train_dataset = ClassificationDataset(train_encodings, y_train)
val_dataset = ClassificationDataset(val_encodings, y_val)

# Hyperparameter tuning

In [6]:
def compute_metrics(p):
    """Compute classification accuracy from an evaluation result.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference class ids).

    Returns:
        dict: ``{"accuracy": <fraction of examples predicted correctly>}``.
    """
    # Imported locally because the notebook never imports numpy by name: the
    # function previously only worked if `np` leaked in through the
    # `from ml_utils import *` star import.
    import numpy as np

    # The predicted class is the index of the highest score in each row.
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Define the search space for Optuna
def model_hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Trial object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical``.

    Returns:
        dict: Sampled values keyed by the training-argument name they set.
    """
    space = {}
    # Learning rate is sampled on a log scale between 1e-5 and 5e-5.
    space["learning_rate"] = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    space["num_train_epochs"] = trial.suggest_int("num_train_epochs", 3, 5)
    space["per_device_train_batch_size"] = trial.suggest_categorical(
        "per_device_train_batch_size", [8, 16, 32]
    )
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.3)
    space["warmup_steps"] = trial.suggest_int("warmup_steps", 0, 500)
    return space

# Base training arguments shared by every trial; the sampled hyperparameters
# override the corresponding fields for each run.
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="no",  # Disable saving for search runs
    logging_dir="./logs",
    load_best_model_at_end=False,
    report_to="none",  # Same as the final-training arguments: no external loggers
)

# A Trainer built with `model_init` (instead of `model`) so that every trial
# starts from freshly loaded pretrained weights.
trainer = Trainer(
    model_init=lambda: CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the search.
# - backend is named explicitly rather than left to auto-detection of whichever
#   search library happens to be installed.
# - compute_objective pins the optimised value to validation accuracy; the
#   default objective sums every non-loss metric, so it would silently change
#   meaning if compute_metrics ever returned more than one metric.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=model_hp_space,
    compute_objective=lambda metrics: metrics["eval_accuracy"],
    n_trials=10,
)

print("Best trial:")
print(best_trial)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-05-02 11:02:42,532] A new study created in memory with name: no-name-ef1883d9-5005-4cd0-a046-a0da1d516948
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:04:02,101] Trial 0 finished with value: 0.6992481203007519 and parameters: {'learning_rate': 1.1829987925713312e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.10529155358099171, 'warmup_steps': 38}. Best is trial 0 with value: 0.6992481203007519.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.604


[I 2025-05-02 11:05:22,531] Trial 1 finished with value: 0.706766917293233 and parameters: {'learning_rate': 2.5526459319370905e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 8, 'weight_decay': 0.03400533690760766, 'warmup_steps': 352}. Best is trial 1 with value: 0.706766917293233.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:06:13,761] Trial 2 finished with value: 0.631578947368421 and parameters: {'learning_rate': 3.5590103952229653e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.1221796142950218, 'warmup_steps': 375}. Best is trial 1 with value: 0.706766917293233.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:07:37,688] Trial 3 finished with value: 0.6917293233082706 and parameters: {'learning_rate': 2.7851416451301224e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.134319158579845, 'warmup_steps': 407}. Best is trial 1 with value: 0.706766917293233.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:08:38,995] Trial 4 finished with value: 0.7180451127819549 and parameters: {'learning_rate': 3.508041813680323e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.07105827435264765, 'warmup_steps': 209}. Best is trial 4 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:09:54,403] Trial 5 finished with value: 0.6616541353383458 and parameters: {'learning_rate': 2.3975468903234254e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.2832071864568592, 'warmup_steps': 429}. Best is trial 4 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:10:41,419] Trial 6 finished with value: 0.6052631578947368 and parameters: {'learning_rate': 2.5025259904991174e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.28207299911384626, 'warmup_steps': 335}. Best is trial 4 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:11:42,730] Trial 7 finished with value: 0.6691729323308271 and parameters: {'learning_rate': 2.451159948130985e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.14231984381092444, 'warmup_steps': 349}. Best is trial 4 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:12:58,001] Trial 8 finished with value: 0.6992481203007519 and parameters: {'learning_rate': 1.6638563488023205e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.27853336038394677, 'warmup_steps': 234}. Best is trial 4 with value: 0.7180451127819549.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-02 11:13:59,207] Trial 9 finished with value: 0.6917293233082706 and parameters: {'learning_rate': 4.890531391464383e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.1426052713647916, 'warmup_steps': 187}. Best is trial 4 with value: 0.7180451127819549.


Best trial:
BestRun(run_id='4', objective=0.7180451127819549, hyperparameters={'learning_rate': 3.508041813680323e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.07105827435264765, 'warmup_steps': 209}, run_summary=None)


# Final model

In [7]:
# Final model: pretrained CamemBERT encoder with a 2-output classification
# head on top. With two labels the model's default loss is cross-entropy.
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", num_labels=2
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Hyperparameters selected by the search above.
best_params = best_trial.hyperparameters

# Fixed settings first, then the tuned values copied over by name.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    per_device_eval_batch_size=8,
    **{
        name: best_params[name]
        for name in (
            "learning_rate",
            "per_device_train_batch_size",
            "num_train_epochs",
            "weight_decay",
            "warmup_steps",
        )
    },
)

# Fine-tune on the training split; the validation split is attached for
# later evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6901
20,0.6865
30,0.6854
40,0.6647
50,0.6728
60,0.677
70,0.6568
80,0.6455
90,0.6529
100,0.6432


TrainOutput(global_step=136, training_loss=0.6513011560720556, metrics={'train_runtime': 60.666, 'train_samples_per_second': 69.891, 'train_steps_per_second': 2.242, 'total_flos': 169953297321600.0, 'train_loss': 0.6513011560720556, 'epoch': 4.0})

In [8]:
# Score the fine-tuned model on the validation split.
# NOTE(review): this same split was the eval_dataset of the hyperparameter
# search, so the figure is not an unbiased estimate; a separate held-out test
# split would be needed for that.
trainer.evaluate(eval_dataset=val_dataset)

{'eval_loss': 0.5859740972518921,
 'eval_accuracy': 0.6729323308270677,
 'eval_runtime': 1.0704,
 'eval_samples_per_second': 248.512,
 'eval_steps_per_second': 31.765,
 'epoch': 4.0}

# Saving the model

In [9]:
# Write the fine-tuned model and the tokenizer files into the same directory
# so both can later be reloaded from a single path.
model_dir = "../models/camembert_model"
trainer.save_model(model_dir)
tokenizer.save_pretrained(model_dir)

('../models/camembert_model/tokenizer_config.json',
 '../models/camembert_model/special_tokens_map.json',
 '../models/camembert_model/sentencepiece.bpe.model',
 '../models/camembert_model/added_tokens.json')