In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import s3fs
import sys
import warnings
from dotenv import load_dotenv
import torch
import fireducks.pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import (
                        CamembertTokenizer,
                        CamembertForSequenceClassification,
                        Trainer,
                        TrainingArguments
                        )

sys.path.append("../src")
from ml_utils import *

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home/onyxia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/onyxia/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/onyxia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m88.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
load_dotenv()
pd.set_option("display.max_columns", None)
warnings.simplefilter("ignore")
fs = s3fs.S3FileSystem(
            client_kwargs={"endpoint_url": "https://minio.lab.sspcloud.fr"},
            key=os.environ["Accesskey"],
            secret=os.environ["Secretkey"],
            token=os.environ["Token"]
)

In [4]:
with fs.open("elissamim/text_classification_men/data/stages-votes.json", "r") as file:
    df = pd.read_json(file)

df = df.groupby("phrase_text", as_index = False)["sol"].apply(lambda x: x.mode().iloc[0])
df["sol"]=df["sol"].apply(lambda x: 1 if x == "ok" else 0)
df["clean_phrase_text"] = df["phrase_text"].apply(lambda x: nltk_text_preprocessing(x, True))
df = df[df["clean_phrase_text"] != ""]
df.head()

Unnamed: 0,phrase_text,sol,clean_phrase_text
0,* Aider à la mise en place de l évènement Shar...,0,aider mise place évènemer shareplan envoi rapp...
1,* Comprendre le métier des achats * Comment or...,0,comprendre métier achat comment organiser appe...
2,* Fendre du bois en forêt au merlin manuelleme...,0,fendre boi forêt merlin manuellemer débarder b...
4,"2 jours au CDI , 1 jour en arts plastiques , 1...",0,2 jour cdi 1 jour art plastique 1 jour musiqu ...
5,4 jours au sein du Bureau des affaires institu...,1,4 jour sein bureau affaire institutionnel fina...


# Dataset creation

In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    df["clean_phrase_text"].tolist(),
    df["sol"].tolist(),
    test_size=.2,
    stratify=df["sol"],
    random_state=42
)

# We use CamemBERT Tokenizer (Byte-Level BPE) with truncation to 512 tokens and add [PAD] tokens for sequences that are too short
tokenizer = CamembertTokenizer.from_pretrained("camembert-base",
                                              do_lower_case=True)
train_encodings = tokenizer(X_train, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(X_val, truncation=True, padding=True, max_length=512)

# We define the dataset class for PyTorch model fine-tuning
class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        return {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        } | {"labels": torch.tensor(self.labels[idx])}
    
    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, y_train)
val_dataset = ClassificationDataset(val_encodings, y_val)

# Hyperparameter tuning

In [6]:
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

# Define the search space for Optuna
def model_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3),
        "warmup_steps": trial.suggest_int("warmup_steps", 0, 500),
    }

# Define basic training args (used as a base)
training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    save_strategy="no",  # Disable saving for search runs
    logging_dir="./logs",
    load_best_model_at_end=False,
)

# Use the Trainer to run hyperparameter search
trainer = Trainer(
    model_init=lambda: CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=2),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the search
best_trial = trainer.hyperparameter_search(
    direction="maximize",              
    hp_space=model_hp_space,
    n_trials=10                       
)

print("Best trial:")
print(best_trial)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-05-01 17:09:44,693] A new study created in memory with name: no-name-27fcd94f-b6ec-40c4-a192-71fa81c41ded
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:10:46,822] Trial 0 finished with value: 0.6804511278195489 and parameters: {'learning_rate': 2.7615572930070498e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'weight_decay': 0.2683816465299079, 'warmup_steps': 244}. Best is trial 0 with value: 0.6804511278195489.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:11:56,198] Trial 1 finished with value: 0.7142857142857143 and parameters: {'learning_rate': 3.068008638790823e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.16797127363433026, 'warmup_steps': 361}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:12:45,346] Trial 2 finished with value: 0.6917293233082706 and parameters: {'learning_rate': 2.459905599444873e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.10464445333449184, 'warmup_steps': 289}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:13:35,208] Trial 3 finished with value: 0.6954887218045113 and parameters: {'learning_rate': 2.9825308697925627e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.195748161503588, 'warmup_steps': 403}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:14:42,709] Trial 4 finished with value: 0.6804511278195489 and parameters: {'learning_rate': 1.3701211332195358e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'weight_decay': 0.14752180925925465, 'warmup_steps': 50}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:15:35,740] Trial 5 finished with value: 0.7030075187969925 and parameters: {'learning_rate': 3.60728807007218e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.08821973552823019, 'warmup_steps': 162}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:17:00,659] Trial 6 finished with value: 0.6954887218045113 and parameters: {'learning_rate': 1.0928983199146314e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 16, 'weight_decay': 0.28650267123385836, 'warmup_steps': 466}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:18:00,975] Trial 7 finished with value: 0.6616541353383458 and parameters: {'learning_rate': 2.2676744042386718e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.17075978411540554, 'warmup_steps': 455}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:19:00,989] Trial 8 finished with value: 0.6766917293233082 and parameters: {'learning_rate': 2.8341253640375073e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.06653482352299632, 'warmup_steps': 323}. Best is trial 1 with value: 0.7142857142857143.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


[I 2025-05-01 17:20:02,340] Trial 9 finished with value: 0.706766917293233 and parameters: {'learning_rate': 2.1648356288774424e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.20207847005448648, 'warmup_steps': 245}. Best is trial 1 with value: 0.7142857142857143.


Best trial:
BestRun(run_id='1', objective=0.7142857142857143, hyperparameters={'learning_rate': 3.068008638790823e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.16797127363433026, 'warmup_steps': 361}, run_summary=None)


# Final model

In [7]:
# We load the classifier : CamemBERT Head with a classification head with the number of outputs for the final layer of 2
# The loss is then by default CrossEntropyLoss()
model = CamembertForSequenceClassification.from_pretrained("camembert-base", 
                                                           num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# We use the previous hyperparameters
best_params = best_trial.hyperparameters

# We define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none",
    learning_rate=best_params["learning_rate"],
    per_device_train_batch_size=best_params["per_device_train_batch_size"],
    num_train_epochs=best_params["num_train_epochs"],
    weight_decay=best_params["weight_decay"],
    warmup_steps=best_params["warmup_steps"],
)

# We train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7075
20,0.7023
30,0.6989
40,0.6936
50,0.6844
60,0.6851
70,0.6713
80,0.6646
90,0.6699
100,0.6625


TrainOutput(global_step=170, training_loss=0.6617711151347441, metrics={'train_runtime': 71.7565, 'train_samples_per_second': 73.861, 'train_steps_per_second': 2.369, 'total_flos': 212441621652000.0, 'train_loss': 0.6617711151347441, 'epoch': 5.0})

In [8]:
trainer.evaluate(eval_dataset=val_dataset)

{'eval_loss': 0.5719053745269775,
 'eval_accuracy': 0.7218045112781954,
 'eval_runtime': 1.0047,
 'eval_samples_per_second': 264.748,
 'eval_steps_per_second': 33.84,
 'epoch': 5.0}

# Stockage du modèle

In [9]:
trainer.save_model("../models/camembert_model")
tokenizer.save_pretrained("../models/camembert_model")

('../models/camembert_model/tokenizer_config.json',
 '../models/camembert_model/special_tokens_map.json',
 '../models/camembert_model/sentencepiece.bpe.model',
 '../models/camembert_model/added_tokens.json')