In [None]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Charger et préparer les données
data = pd.read_csv('../data/train_data.csv', sep=';')
X_train, X_val, y_train, y_val = train_test_split(data['text'], data['label'], test_size=0.2, random_state=2003)


In [18]:
# Charger le tokenizer CamemBERT
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
unique_labels = list(set(y_train))
print(unique_labels)
# Fonction pour tokeniser les données et créer un Dataset compatible
def tokenize_and_create_dataset(texts, labels, tokenizer, max_length=128):
    tokens = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=max_length)
    labels_encoded = [unique_labels.index(label) for label in labels]
    tokens['labels'] = labels_encoded
    return Dataset.from_dict(tokens)

# Créer les Datasets pour l'entraînement et la validation
train_dataset = tokenize_and_create_dataset(X_train, y_train, tokenizer)
val_dataset = tokenize_and_create_dataset(X_val, y_val, tokenizer)



['out_of_scope', 'book_flight', 'carry_on', 'flight_status', 'book_hotel', 'lost_luggage', 'translate', 'travel_alert', 'travel_suggestion']


In [10]:

# Charger le modèle CamemBERT pour la classification
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=len(set(y_train)))

# Configurer les arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",   # Évaluation par époque pour surveiller les performances
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,  # Garder uniquement le dernier checkpoint pour réduire l'espace de stockage
    save_strategy="epoch"  # Sauvegarde du modèle à chaque fin d'époque
)

# Préparer le Trainer avec les données et les labels
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Entraîner le modèle
trainer.train()


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.7302500009536743, 'eval_runtime': 21.0367, 'eval_samples_per_second': 11.979, 'eval_steps_per_second': 0.761, 'epoch': 1.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.3672679662704468, 'eval_runtime': 17.9699, 'eval_samples_per_second': 14.023, 'eval_steps_per_second': 0.89, 'epoch': 2.0}


  0%|          | 0/16 [00:00<?, ?it/s]

{'eval_loss': 1.2680690288543701, 'eval_runtime': 23.6255, 'eval_samples_per_second': 10.666, 'eval_steps_per_second': 0.677, 'epoch': 3.0}
{'train_runtime': 1068.4368, 'train_samples_per_second': 2.819, 'train_steps_per_second': 0.177, 'train_loss': 1.6887061709449405, 'epoch': 3.0}


TrainOutput(global_step=189, training_loss=1.6887061709449405, metrics={'train_runtime': 1068.4368, 'train_samples_per_second': 2.819, 'train_steps_per_second': 0.177, 'total_flos': 52629629755824.0, 'train_loss': 1.6887061709449405, 'epoch': 3.0})

In [17]:
# New example texts for prediction
import pandas as pd
new_texts = pd.read_csv('../data/test_data.csv', sep=';')['text'].tolist()

# Tokenize the new texts
new_tokens = tokenizer(new_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Run the model to get logits
outputs = trainer.model(**new_tokens)
logits = outputs.logits

# Get the predicted labels
predicted_class_indices = torch.argmax(logits, dim=1)
unique_labels = list(set(y_train))  # Ensure this matches the original training labels
predicted_labels = [unique_labels[idx] for idx in predicted_class_indices]

print(predicted_labels)  # Display predicted labels


['lost_luggage', 'carry_on', 'book_hotel', 'travel_alert', 'book_hotel', 'book_flight', 'flight_status', 'translate', 'lost_luggage', 'travel_alert', 'travel_alert', 'travel_suggestion', 'book_hotel', 'carry_on', 'book_hotel', 'translate', 'travel_suggestion', 'out_of_scope', 'lost_luggage', 'flight_status', 'lost_luggage', 'travel_suggestion', 'flight_status', 'translate', 'book_flight', 'book_hotel', 'out_of_scope', 'travel_alert', 'book_flight', 'travel_suggestion', 'translate', 'lost_luggage', 'book_flight', 'flight_status', 'book_flight', 'translate', 'translate', 'travel_suggestion', 'book_flight', 'lost_luggage', 'carry_on', 'book_hotel', 'book_hotel', 'book_hotel', 'travel_suggestion', 'out_of_scope', 'flight_status', 'travel_alert', 'book_hotel', 'carry_on', 'book_hotel', 'travel_alert', 'travel_alert', 'book_hotel', 'book_hotel', 'out_of_scope', 'flight_status', 'lost_luggage', 'carry_on', 'lost_luggage', 'carry_on', 'lost_luggage', 'lost_luggage', 'flight_status', 'translate

In [14]:
model.save_pretrained("fine_tuned_camembert_model")
tokenizer.save_pretrained("fine_tuned_camembert_model")
print("Modèle et tokenizer fine-tunés enregistrés avec succès.")

Modèle et tokenizer fine-tunés enregistrés avec succès.
