In [None]:
# Cellule 1 : Installation des dépendances

# Installer les outils nécessaires
!pip install transformers datasets accelerate -U
!pip install evaluate scikit-learn numpy pandas sentencepiece
!pip install peft # Pour la technique LoRA (nécessaire pour la mémoire)

print("Installation terminée. Passez à la cellule 2 pour préparer les données.")

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
[31mERROR: pip's dependency resolver does

In [None]:
# Cellule 2 : Préparation des données pour la Classification et la Génération

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

FILE_NAME = 'dataset-tickets-multi-lang3-4k.csv'
CLASSIFIER_MODEL_NAME = "xlm-roberta-base"

# 1. Load the CSV, rename the columns used downstream, then drop incomplete rows
df = pd.read_csv(FILE_NAME).rename(
    columns={'body': 'text', 'queue': 'category_queue', 'answer': 'agent_answer'}
)
df_clean = df.dropna(subset=['text', 'category_queue', 'agent_answer']).copy()

# 2. Integer ids for the queue names (e.g. 'Technical Support' -> 0),
#    numbered in the order returned by `unique()`
id_to_label = dict(enumerate(df_clean['category_queue'].unique()))
label_to_id = {queue_name: queue_id for queue_id, queue_name in id_to_label.items()}
df_clean['label'] = df_clean['category_queue'].map(label_to_id)

# 3. Stratified split: 20% is held out, then halved into validation and test
train_df, temp_df = train_test_split(
    df_clean,
    test_size=0.2,
    random_state=42,
    stratify=df_clean['label'],
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
    stratify=temp_df['label'],
)

# 4. Classification inputs for XLM-R: one Dataset per split, keeping only text and label
raw_datasets_cls = DatasetDict({
    split_name: Dataset.from_pandas(split_df[['text', 'label']])
    for split_name, split_df in (
        ('train', train_df),
        ('validation', val_df),
        ('test', test_df),
    )
})

tokenizer_cls = AutoTokenizer.from_pretrained(CLASSIFIER_MODEL_NAME)
def tokenize_function_cls(examples):
    """Tokenize the "text" field of a batch with `tokenizer_cls`, truncation enabled."""
    batch_texts = examples["text"]
    return tokenizer_cls(batch_texts, truncation=True)

# Tokenize every split in batches, then drop the raw text and the pandas index column
tokenized_datasets_cls = (
    raw_datasets_cls
    .map(tokenize_function_cls, batched=True)
    .remove_columns(["text", "__index_level_0__"])
)
# Return torch tensors when the datasets are indexed
tokenized_datasets_cls.set_format("torch")

print(f"Préparation des données de Classification terminée. Nombre de classes : {len(id_to_label)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Map:   0%|          | 0/3199 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Préparation des données de Classification terminée. Nombre de classes : 10


In [None]:
# Cellule 3 : Fine-Tuning et Sauvegarde du Classifieur

import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding

FINAL_CLASSIFIER_DIR = "final_queue_classifier"

# 1. Load XLM-RoBERTa with a sequence-classification head.
#    The head gets one output per queue, and both label maps are passed to the
#    model loader alongside the label count.
num_labels = len(id_to_label)
model_cls = AutoModelForSequenceClassification.from_pretrained(
    CLASSIFIER_MODEL_NAME,
    label2id=label_to_id,
    id2label=id_to_label,
    num_labels=num_labels,
)

# 2. F1 metric (weighted average across classes)
metric_cls = evaluate.load("f1")


def compute_metrics_cls(eval_pred):
    """Return the weighted F1 of the argmax predictions for a (logits, labels) pair."""
    scores, gold_labels = eval_pred
    return metric_cls.compute(
        predictions=np.argmax(scores, axis=-1),
        references=gold_labels,
        average="weighted",
    )


# Collator built from the classifier tokenizer; handles padding at batch time
data_collator_cls = DataCollatorWithPadding(tokenizer=tokenizer_cls)

# 3. Training arguments.
#    Without an evaluation strategy the Trainer never runs the validation set or
#    `compute_metrics`, so evaluation is enabled once per epoch (matching the save
#    strategy) and the checkpoint with the best validation F1 is reloaded at the end.
training_args_cls = TrainingArguments(
    output_dir="./xlm_roberta_classification_queue",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key returned by compute_metrics_cls
    fp16=True,  # mixed precision; requires a CUDA GPU
    report_to="none",
)

# 4. Build the Trainer for the classifier.
#    `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer_cls = Trainer(
    model=model_cls,
    args=training_args_cls,
    train_dataset=tokenized_datasets_cls["train"],
    eval_dataset=tokenized_datasets_cls["validation"],
    processing_class=tokenizer_cls,
    data_collator=data_collator_cls,
    compute_metrics=compute_metrics_cls,
)

print(f"Début du Fine-Tuning pour {num_labels} classes de départements...")
trainer_cls.train()

# Score the held-out test split, which is otherwise created but never used.
test_metrics_cls = trainer_cls.evaluate(
    tokenized_datasets_cls["test"], metric_key_prefix="test"
)
print(f"Métriques sur le jeu de test : {test_metrics_cls}")

# 5. Save the fine-tuned model and its tokenizer side by side
trainer_cls.save_model(FINAL_CLASSIFIER_DIR)
tokenizer_cls.save_pretrained(FINAL_CLASSIFIER_DIR)
print(f"\nModèle de classification sauvegardé dans : {FINAL_CLASSIFIER_DIR}")

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer_cls = Trainer(


Début du Fine-Tuning pour 10 classes de départements...


Step,Training Loss
500,1.5289



Modèle de classification sauvegardé dans : final_queue_classifier


In [None]:
# Cellule 4 : Préparation des données pour la Génération (T5-Small)

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from datasets import DatasetDict

# T5 checkpoint (noted by the author as monolingual and the lightest variant)
MODEL_T5 = "t5-small"
t5_tokenizer = AutoTokenizer.from_pretrained(MODEL_T5)

# Raw frames for T5 (input: text, target: agent_answer), limited to the rows of the
# train / validation splits, with missing values dropped and only answers longer
# than 10 characters kept
t5_train_df = (
    df_clean.loc[train_df.index, ['text', 'agent_answer']]
    .dropna()
    .loc[lambda frame: frame['agent_answer'].str.len() > 10]
)
t5_val_df = (
    df_clean.loc[val_df.index, ['text', 'agent_answer']]
    .dropna()
    .loc[lambda frame: frame['agent_answer'].str.len() > 10]
)

t5_raw_datasets = DatasetDict({
    split_name: Dataset.from_pandas(split_df)
    for split_name, split_df in (('train', t5_train_df), ('validation', t5_val_df))
})

# Tokenization function for seq2seq training
def preprocess_for_t5(examples):
    """Tokenize a batch of tickets and their agent answers for T5 training.

    Args:
        examples: batch mapping with "text" (model input) and "agent_answer"
            (generation target) columns.

    Returns:
        The tokenizer output for the inputs (truncated to 512 tokens) with an extra
        "labels" entry holding the target token ids (truncated to 128 tokens).
    """
    inputs = list(examples["text"])
    targets = list(examples["agent_answer"])

    model_inputs = t5_tokenizer(inputs, max_length=512, truncation=True)

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = t5_tokenizer(text_target=targets, max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches, then drop the raw columns and the pandas index column
tokenized_t5_datasets = (
    t5_raw_datasets
    .map(preprocess_for_t5, batched=True)
    .remove_columns(["text", "agent_answer", "__index_level_0__"])
)
# Return torch tensors when the datasets are indexed
tokenized_t5_datasets.set_format("torch")

print("Préparation des données pour la Génération (T5-Small) terminée.")

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/3199 [00:00<?, ? examples/s]



Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Préparation des données pour la Génération (T5-Small) terminée.


In [None]:
# Cellule 5 : Fine-Tuning du Générateur (T5-Small + LoRA)

from transformers import TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

FINAL_GENERATOR_DIR = "final_response_generator"

# 1. Load the T5-Small seq2seq model, then build the seq2seq collator from the
#    T5 tokenizer and that model
MODEL_T5 = "t5-small"
t5_model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_T5)
t5_data_collator = DataCollatorForSeq2Seq(
    t5_tokenizer,
    model=t5_model,
)

# --- 2. LoRA configuration ---
# Rank-8 adapters on the modules named "q" and "v"
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)
# Wrap the base model with the adapters and report how many parameters are trainable
t5_model = get_peft_model(t5_model, lora_config)
t5_model.print_trainable_parameters()

# 3. T5 training arguments.
#    An evaluation strategy is set so the validation split handed to the Trainer is
#    actually scored (validation loss) once per epoch; without it no evaluation runs.
t5_training_args = TrainingArguments(
    output_dir="./t5_response_generator_lora",
    num_train_epochs=3,

    # Smallest possible batch size to avoid out-of-memory errors
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,

    learning_rate=5e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # mixed precision; requires a CUDA GPU
    report_to="none",
    remove_unused_columns=False,  # for PEFT
)

# 4. Build the Trainer for the T5 generator.
#    `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
t5_trainer = Trainer(
    model=t5_model,
    args=t5_training_args,
    train_dataset=tokenized_t5_datasets["train"],
    eval_dataset=tokenized_t5_datasets["validation"],
    processing_class=t5_tokenizer,
    data_collator=t5_data_collator,
)

print("\nDébut du Fine-Tuning pour la Génération de Réponses (T5-Small + LoRA)...")
t5_trainer.train()

# 5. Save the model first, then the tokenizer, into the same directory
for artifact in (t5_model, t5_tokenizer):
    artifact.save_pretrained(FINAL_GENERATOR_DIR)
print(f"Modèle de génération sauvegardé dans : {FINAL_GENERATOR_DIR}")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 294,912 || all params: 60,801,536 || trainable%: 0.4850


  t5_trainer = Trainer(



Début du Fine-Tuning pour la Génération de Réponses (T5-Small + LoRA)...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
500,3.5708
1000,3.0945
1500,2.9578
2000,2.9457
2500,2.9393
3000,2.8462
3500,2.8445
4000,2.8081
4500,2.815
5000,2.7431


Modèle de génération sauvegardé dans : final_response_generator


In [None]:
# Cellule 6 : Démonstration du Pipeline Intégré (FIN DU PROJET)

from transformers import pipeline

# --- 1. Load the saved models ---
# A. Classifier (XLM-RoBERTa), loaded from the directory written after fine-tuning
classifier = pipeline(
    "text-classification",
    model=FINAL_CLASSIFIER_DIR,
    tokenizer=FINAL_CLASSIFIER_DIR
)
# B. Generator (T5-Small + LoRA), loaded from the saved generator directory
generator = pipeline(
    "text2text-generation",
    model=FINAL_GENERATOR_DIR,
    tokenizer=FINAL_GENERATOR_DIR,
    device=0  # run on the first GPU
)
# C. NER (entity extraction) with a pre-trained XLM-R checkpoint.
#    `aggregation_strategy="simple"` replaces the deprecated `grouped_entities=True`.
#    NOTE(review): this checkpoint is fine-tuned on English CoNLL-03 while the demo
#    ticket is French — confirm it is the intended model.
ner_pipeline = pipeline(
    "ner",
    model="xlm-roberta-large-finetuned-conll03-english",
    aggregation_strategy="simple",
)


# --- 2. New ticket (in French, for the demo) ---
nouveau_ticket = "Bonjour, mon compte est bloqué depuis ce matin et je n'arrive pas à me connecter. C'est urgent ! Mon identifiant client est 123456."

# Banner followed by the raw input
for banner_line in (
    "\n\n#####################################################",
    "############# DÉMONSTRATION DU PIPELINE FINAL #############",
    "#####################################################",
):
    print(banner_line)
print(f"**INPUT :** {nouveau_ticket}\n")

# A. Classification (triage): keep the label and score of the first returned prediction
classification_result = classifier(nouveau_ticket)[0]
categorie, confiance_class = (
    classification_result[field] for field in ('label', 'score')
)
print(f"1. CLASSIFICATION (Département) : {categorie} (Confiance: {confiance_class:.2f})")

# B. Entity extraction: run the NER pipeline and list each returned entity
#    with its group, surface form and score
entites_result = ner_pipeline(nouveau_ticket)
print("\n2. EXTRACTION D'ENTITÉS (NER) :")
for entite in entites_result:
    print(
        f"  - {entite['entity_group']} : {entite['word']} (Score: {entite['score']:.2f})"
    )

# C. Response generation
# The generator was fine-tuned on the raw ticket text only (no instruction prefix),
# so the ticket is passed as-is; an unseen prefix is out of distribution and tends
# to be echoed back in the output.
prompt = nouveau_ticket
generated_output = generator(
    prompt,
    # `max_new_tokens` is used instead of `max_length`: the pipeline already sets a
    # default `max_new_tokens`, which takes precedence and silently overrides `max_length`.
    max_new_tokens=150,
    do_sample=True,
    top_k=50,
    num_return_sequences=1
)[0]['generated_text']

print("\n3. PROPOSITION DE RÉPONSE (T5-Small Générateur) :")
print("--------------------------------------------------")
print(generated_output)
print("--------------------------------------------------")

Device set to use cuda:0
Device set to use cuda:0


config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Device set to use cuda:0




#####################################################
############# DÉMONSTRATION DU PIPELINE FINAL #############
#####################################################
**INPUT :** Bonjour, mon compte est bloqué depuis ce matin et je n'arrive pas à me connecter. C'est urgent ! Mon identifiant client est 123456.

1. CLASSIFICATION (Département) : Technical Support (Confiance: 0.38)

2. EXTRACTION D'ENTITÉS (NER) :


Both `max_new_tokens` (=256) and `max_length`(=150) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



3. PROPOSITION DE RÉPONSE (T5-Small Générateur) :
--------------------------------------------------
GÉNÉRER RÉPONSE POUR Technical Support: Bonjour, je n'arrive pas à me connecter. C'est urgent !
--------------------------------------------------


In [None]:
# Cellule Finale : Compression et Téléchargement des Modèles

# 1. Compresser les dossiers de modèles entraînés
!zip -r final_classifier.zip final_queue_classifier
!zip -r final_generator.zip final_response_generator

print("\nFichiers ZIP créés. Vous pouvez maintenant les télécharger.")
print("Allez dans la barre latérale des dossiers de Colab pour télécharger:")
print(" - final_classifier.zip")
print(" - final_generator.zip")

  adding: final_queue_classifier/ (stored 0%)
  adding: final_queue_classifier/tokenizer.json (deflated 76%)
  adding: final_queue_classifier/config.json (deflated 56%)
  adding: final_queue_classifier/sentencepiece.bpe.model (deflated 49%)
  adding: final_queue_classifier/model.safetensors (deflated 26%)
  adding: final_queue_classifier/tokenizer_config.json (deflated 76%)
  adding: final_queue_classifier/special_tokens_map.json (deflated 52%)
  adding: final_queue_classifier/training_args.bin (deflated 53%)
  adding: final_response_generator/ (stored 0%)
  adding: final_response_generator/tokenizer.json (deflated 74%)
  adding: final_response_generator/spiece.model (deflated 48%)
  adding: final_response_generator/README.md (deflated 66%)
  adding: final_response_generator/tokenizer_config.json (deflated 95%)
  adding: final_response_generator/adapter_model.safetensors (deflated 7%)
  adding: final_response_generator/special_tokens_map.json (deflated 85%)
  adding: final_response_gen