# Tokenizers à base de sous-mots

WordPiece: BERT, DistilBERT

Unigram: XLNet, ALBERT

Byte-Pair Encoding : GPT-2, RoBERTa

# La librairie Datasets par HF

https://huggingface.co/docs/datasets/index

# Installations et imports

In [None]:
# Mount Google Drive at /content/drive; the CSV and font paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the third-party packages imported below. numpy is pinned to 1.26.4
# (NOTE(review): presumably for compatibility with the Colab runtime — confirm).
!pip install numpy==1.26.4 transformers datasets mlflow dagshub evaluate tweet-preprocessor

In [None]:
from datasets import load_dataset

import time
import os
from google.colab import userdata

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from matplotlib import rcParams
import matplotlib.font_manager as fm

import dagshub
import mlflow

import preprocessor as p
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorWithPadding, TrainingArguments, AutoModelForSequenceClassification, Trainer, EarlyStoppingCallback
from inspect import signature

import evaluate

import torch

from scipy.special import softmax

from itertools import product

In [None]:
# Fetch the DagsHub token from the Colab user secrets
dagshub_token = userdata.get('DAGSHUB_TOKEN')

# Register the token with the DagsHub client
dagshub.auth.add_app_token(dagshub_token)

# Connect MLflow to the DagsHub repository
dagshub.init(repo_owner='fabiencappelli', repo_name='Projet_07', mlflow=True)

In [None]:
# Register the custom "Exo 2" font file with matplotlib's font manager
font_path = os.path.expanduser("/content/drive/MyDrive/Colab Notebooks/fonts/Exo2-VariableFont_wght.ttf")  # Replace with the exact path to the font file
fm.fontManager.addfont(font_path)

# Set the global font using the font's family name
rcParams["font.family"] = "Exo 2"
# Two colours chosen to match the presentation
bleuclair = (0.15, 0.55, 0.82)
# RGB complement of bleuclair (each channel is 1 - value)
couleur_complementaire = (1 - bleuclair[0], 1 - bleuclair[1], 1 - bleuclair[2])
bleufonce = "#073642"

In [None]:
# Path of the cleaned CSV that is loaded into the dataset below
csvPath = '/content/drive/MyDrive/Colab Notebooks/Projet_07/df_cleaned.csv'
# NOTE(review): presumably the output folder for presentation figures; not used in the visible cells
imgPrezPath = '/content/drive/MyDrive/Colab Notebooks/Projet_07/presentationimg'
# Group the MLflow runs started by this notebook under the "DISTILBERT" experiment
mlflow.set_experiment("DISTILBERT")
# Hugging Face checkpoint used for both the tokenizer and the classifier
checkpoint = "distilbert-base-uncased"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
import pandas as pd
from datasets import Dataset

# Load the data using pandas.
# The rest of the notebook reads a "text" column (tweets) and a "target"
# column (labels) from this table.
df = pd.read_csv(csvPath)

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Fonctions

In [None]:
def clean_and_tokenize(examples):
    """Clean a batch of tweets and tokenize them.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the raw tweets.

    Returns:
        The tokenizer output for the cleaned tweets, truncated to at most
        128 tokens and left unpadded.
    """
    # Clean every tweet with tweet-preprocessor
    cleaned_texts = [p.clean(t) for t in examples["text"]]
    # Do not pad here: padding inside a batched map pads every row to the
    # longest sequence of its map batch, which defeats the per-batch dynamic
    # padding performed by DataCollatorWithPadding at training time.
    return tokenizer(
        cleaned_texts,
        truncation=True,
        max_length=128,
    )

In [None]:
# Metric objects are loaded once and reused on every evaluation
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
roc_auc_metric = evaluate.load("roc_auc")


def compute_metrics(eval_pred):
    """Compute accuracy, binary F1 and ROC AUC for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        Dict with the keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``.
    """
    logits, labels = eval_pred
    # Hard predictions: index of the largest logit
    predicted_classes = np.argmax(logits, axis=-1)
    # Score used for ROC AUC: softmax probability of class 1
    positive_scores = softmax(logits, axis=-1)[:, 1]

    accuracy = accuracy_metric.compute(predictions=predicted_classes, references=labels)
    f1 = f1_metric.compute(predictions=predicted_classes, references=labels, average="binary")
    roc_auc = roc_auc_metric.compute(prediction_scores=positive_scores, references=labels)

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"],
        "roc_auc": roc_auc["roc_auc"],
    }

In [None]:
def stratified_sample_hf(dataset, label_col="target", frac=0.2, seed=34):
    """Return a subset of ``dataset`` sampled with stratification on a label column.

    Args:
        dataset: Dataset supporting ``dataset[label_col]`` and ``dataset.select``.
        label_col: Name of the column holding the labels to stratify on.
        frac: Passed as ``train_size`` to ``train_test_split``.
        seed: Random state of the split, for reproducibility.

    Returns:
        ``dataset.select`` applied to the sampled row positions.
    """
    # Labels of every row, as a numpy array
    label_values = np.array(dataset[label_col])
    # Row positions 0..N-1; the split is done on positions, not on the rows themselves
    all_positions = np.arange(len(label_values))
    # Keep only the "train" side of a stratified split
    kept_positions = train_test_split(
        all_positions,
        train_size=frac,
        stratify=label_values,
        random_state=seed,
    )[0]
    return dataset.select(kept_positions)


# Sampling et GridSearch

In [None]:
# Stratified sample of the full dataset with the function's defaults
# (label_col="target", frac=0.2, seed=34); used for the grid search below.
sampled_ds = stratified_sample_hf(dataset)

In [None]:
# Shuffle the sample, then clean and tokenize it batch by batch
tokenized_ds = sampled_ds.shuffle(seed=34).map(
    clean_and_tokenize,
    batched=True,
)
# Rename the label column from "target" to "labels"
tokenized_ds = tokenized_ds.rename_column("target", "labels")
# Drop the raw text column (only "text" is removed here)
tokenized_ds = tokenized_ds.remove_columns(["text"])

# Switch to torch format so that Trainer knows how to cast the columns
tokenized_ds.set_format("torch")

# Collator that pads the examples of each batch with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Split: 80% train+validation / 20% test, then 90% / 10% of the former,
# i.e. 72% train, 8% validation and 20% test of the sampled data.
# No stratify_by_column is passed, so these splits are not stratified.
split1 = tokenized_ds.train_test_split(test_size=0.2, seed=34)
train_val_ds = split1["train"]
test_ds = split1["test"]
split2 = train_val_ds.train_test_split(test_size=0.1, seed=34)
train_ds = split2["train"]
val_ds   = split2["test"]

# Hyperparameters to explore (3 learning rates x 2 batch sizes = 6 combinations)
learning_rates = [2e-5, 3e-5, 5e-5]
batch_sizes = [32, 64]
# Best validation accuracy seen so far, with the matching hyperparameters and model
best_acc = 0
best_params = None
best_model = None

In [None]:
# Grid search: one MLflow run per (learning rate, batch size) combination.
# The combination with the highest validation accuracy is kept in
# best_acc / best_params / best_model.
for lr, bs in product(learning_rates, batch_sizes):
    with mlflow.start_run():
        # Log the hyperparameters before training so that a run which crashes
        # or is interrupted can still be identified in MLflow.
        mlflow.log_params({"learning_rate": lr, "batch_size": bs})

        training_args = TrainingArguments(
            "test-trainer",
            report_to=[],
            load_best_model_at_end=True,
            eval_strategy="epoch",
            per_device_train_batch_size=bs,
            per_device_eval_batch_size=bs,
            dataloader_num_workers=2,
            num_train_epochs=15,
            learning_rate=lr,
            dataloader_pin_memory=True,
            fp16=True,
            bf16=False,
            save_strategy="epoch",
            logging_steps=50,
            remove_unused_columns=True,
        )
        # Every combination starts again from the pretrained checkpoint
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

        trainer = Trainer(
            model,
            training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            # Early stopping with a patience of 2 evaluations (one per epoch)
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        )
        trainer.train()

        metrics = trainer.evaluate(val_ds)
        mlflow.log_metrics({
            "val_f1": metrics["eval_f1"],
            "val_acc": metrics["eval_accuracy"],
            # Also logged so these runs expose the same metrics as the final refit run
            "val_roc_auc": metrics["eval_roc_auc"],
        })

        # Model selection is done on validation accuracy
        if metrics["eval_accuracy"] > best_acc:
            best_acc = metrics["eval_accuracy"]
            best_params = {"learning_rate": lr, "batch_size": bs}
            best_model = model

# Refit sur l'ensemble des données, avec les meilleurs hyperparamètres

In [None]:
# Same preparation as for the sample, this time on the full dataset:
# shuffle, then clean and tokenize batch by batch
tokenized_ds = dataset.shuffle(seed=34).map(
    clean_and_tokenize,
    batched=True,
)

# Rename the label column from "target" to "labels"
tokenized_ds = tokenized_ds.rename_column("target", "labels")

# Drop the raw text column (only "text" is removed here)
tokenized_ds = tokenized_ds.remove_columns(["text"])

# Switch to torch format so that Trainer knows how to cast the columns
tokenized_ds.set_format("torch")

In [None]:
# Collator that pads the examples of each batch with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Split the full tokenized dataset: 80% train+validation / 20% test, then
# 90% / 10% of the former, i.e. 72% train, 8% validation and 20% test overall.
# These names replace the splits of the sampled data used by the grid search.
split1 = tokenized_ds.train_test_split(test_size=0.2, seed=34)
train_val_ds = split1["train"]
test_ds = split1["test"]
split2 = train_val_ds.train_test_split(test_size=0.1, seed=34)
train_ds = split2["train"]
val_ds   = split2["test"]

In [None]:
# Reuse the hyperparameters selected by the grid search.
# NOTE(review): best_params is still None if the grid-search cell found no
# run above best_acc = 0; the next cell also overwrites lr and bs with
# hard-coded values.
lr = best_params["learning_rate"]
bs = best_params["batch_size"]

In [None]:
# Hard-coded hyperparameters for the refit.
# NOTE(review): running this cell overrides the values read from best_params
# in the previous cell — presumably kept so the refit can run without
# repeating the grid search; confirm these match the selected combination.
lr, bs = 2e-05, 32

In [None]:
# Training configuration for the refit on the full dataset: evaluation and
# checkpointing once per epoch, up to 15 epochs, best model reloaded at the end.
# NOTE(review): fp16 is False here whereas the grid search trained with
# fp16=True — confirm the difference is intentional.
training_args = TrainingArguments("test-trainer",
                                  report_to=[],
                                  load_best_model_at_end=True,
                                  eval_strategy="epoch",
                                  per_device_train_batch_size=bs,
                                  per_device_eval_batch_size=bs,
                                  dataloader_num_workers=2,
                                  num_train_epochs=15,
                                  learning_rate=lr,
                                  dataloader_pin_memory=True,
                                  fp16=False,
                                  bf16=False,
                                  save_strategy="epoch",
                                  logging_steps=50,
                                  remove_unused_columns=True)

In [None]:
# Fresh classifier with 2 output labels, initialised from the pretrained checkpoint
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Trainer for the refit: trains on train_ds, evaluates on val_ds after each
# epoch, and uses early stopping with a patience of 2 evaluations.
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics  = compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)])

In [None]:
# Final run: train on the full-data split, register the model, then log the
# test metrics and the average inference latency to MLflow.
with mlflow.start_run() as run:
    # Record the hyperparameters used for the refit, as the grid-search runs do
    mlflow.log_params({"learning_rate": lr, "batch_size": bs})

    trainer.train()

    # Log the trained model as an artifact and register it in the model registry
    mlflow.pytorch.log_model(
        pytorch_model=trainer.model,
        artifact_path="model",
        registered_model_name="BERT_Classification_Model"
    )

    # Metrics on the held-out test split
    eval_metrics = trainer.evaluate(test_ds)
    mlflow.log_metrics({
        "accuracy": eval_metrics["eval_accuracy"],
        "f1_score": eval_metrics["eval_f1"],
        "roc_auc": eval_metrics["eval_roc_auc"],
    })

    # Average time per test sample, in milliseconds. perf_counter() is a
    # monotonic high-resolution clock, unlike time.time(), which can jump
    # if the system clock is adjusted.
    # NOTE(review): the timing covers the whole predict() call, not only the
    # model forward pass.
    start = time.perf_counter()
    _ = trainer.predict(test_ds)
    elapsed = time.perf_counter() - start
    infer_time_ms = 1000 * elapsed / len(test_ds)
    mlflow.log_metric("inference_time_ms_per_sample", infer_time_ms)