In [1]:
#Import des librairies
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CamembertTokenizer, CamembertForSequenceClassification
from torch.optim import AdamW  # AdamW est maintenant dans torch.optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Données d'expérimentation

data = [
    # Exporter notation
    ("export de la notation 123", "export_notation"),
    ("exporter notation 456", "export_notation"),
    ("je veux exporter la notation 789", "export_notation"),
    ("export notation numéro 12", "export_notation"),
    ("exporte moi la notation 45", "export_notation"),
    ("j'ai besoin d'exporter la notation 67", "export_notation"),
    ("peux-tu exporter notation 890", "export_notation"),
    ("export de notation 111", "export_notation"),
    ("télécharger notation 222", "export_notation"),
    ("obtenir export notation 333", "export_notation"),
    
    # Créer notation
    ("je veux créer une notation", "creer_notation"),
    ("créer une notation", "creer_notation"),
    ("créer notation", "creer_notation"),
    ("nouvelle notation", "creer_notation"),
    ("ajouter une notation", "creer_notation"),
    ("faire une nouvelle notation", "creer_notation"),
    ("je souhaite créer une notation", "creer_notation"),
    ("commencer une notation", "creer_notation"),
    ("démarrer nouvelle notation", "creer_notation"),
    ("initier une notation", "creer_notation"),
    
    # Créer essai
    ("je veux créer un essai", "creer_essai"),
    ("créer un essai", "creer_essai"),
    ("créer essai", "creer_essai"),
    ("nouvel essai", "creer_essai"),
    ("ajouter un essai", "creer_essai"),
    ("faire un nouvel essai", "creer_essai"),
    ("je souhaite créer un essai", "creer_essai"),
    ("commencer un essai", "creer_essai"),
    ("démarrer nouvel essai", "creer_essai"),
    ("initier un essai", "creer_essai"),
]

In [3]:
# Mapping des labels vers des indices
label_to_id = {
    "export_notation": 0,
    "creer_notation": 1,
    "creer_essai": 2
}
id_to_label = {v: k for k, v in label_to_id.items()}

In [4]:
# Dataset personnalisé
class IntentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=64):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [5]:
def evaluate_model(dataloader, model, device):
    """Évalue le modèle"""
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
    
    return predictions, true_labels

In [6]:
def train_model(train_dataloader, model, optimizer, device, epochs=3):
    """Entraîne le modèle"""
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

In [7]:
def predict_intent(text, model, tokenizer, device):
    """Prédit l'intention pour une phrase"""
    model.eval()
    
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1)
        confidence = probs[0][pred].item()
    
    return id_to_label[pred.item()], confidence

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Utilisation de: {device}")
    
# Préparation des données
texts = [item[0] for item in data]
labels = [label_to_id[item[1]] for item in data]
    
# Split train/test
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
    
print(f"Données d'entraînement: {len(train_texts)}")
print(f"Données de test: {len(test_texts)}")

# Chargement du tokenizer et du modèle
print("\nChargement de CamemBERT...")
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=len(label_to_id)
)
model.to(device)

Utilisation de: cpu
Données d'entraînement: 24
Données de test: 6

Chargement de CamemBERT...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias

In [9]:
# Création des datasets
train_dataset = IntentDataset(train_texts, train_labels, tokenizer)
test_dataset = IntentDataset(test_texts, test_labels, tokenizer)
    
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

In [10]:
# Entraînement
optimizer = AdamW(model.parameters(), lr=2e-5)
    
print("\nEntraînement du modèle...")
train_model(train_dataloader, model, optimizer, device, epochs=5)


Entraînement du modèle...
Epoch 1/5, Loss: 1.1020
Epoch 2/5, Loss: 1.0898
Epoch 3/5, Loss: 1.0709
Epoch 4/5, Loss: 1.0612
Epoch 5/5, Loss: 1.0467


In [11]:
# Évaluation
print("\nÉvaluation sur le jeu de test...")
predictions, true_labels = evaluate_model(test_dataloader, model, device)
    
print("\nRapport de classification:")
print(classification_report(
    true_labels,
    predictions,
    target_names=list(label_to_id.keys())
))


Évaluation sur le jeu de test...

Rapport de classification:
                 precision    recall  f1-score   support

export_notation       1.00      1.00      1.00         2
 creer_notation       1.00      1.00      1.00         2
    creer_essai       1.00      1.00      1.00         2

       accuracy                           1.00         6
      macro avg       1.00      1.00      1.00         6
   weighted avg       1.00      1.00      1.00         6



In [12]:
# Test sur de nouvelles phrases
print("\n" + "="*50)
print("Tests sur de nouvelles phrases:")
print("="*50)
    
test_phrases = [
    "je voudrais exporter la notation 555",
    "créer une nouvelle notation s'il te plaît",
    "faire un essai",
    "export notation 999",
    "ajouter essai",
    "Créer une notation depuis l'essai 43"
]
    
for phrase in test_phrases:
    intent, confidence = predict_intent(phrase, model, tokenizer, device)
    print(f"\nPhrase: '{phrase}'")
    print(f"  → Intention: {intent}")
    print(f"  → Confiance: {confidence:.2%}")
    


Tests sur de nouvelles phrases:

Phrase: 'je voudrais exporter la notation 555'
  → Intention: export_notation
  → Confiance: 37.17%

Phrase: 'créer une nouvelle notation s'il te plaît'
  → Intention: creer_notation
  → Confiance: 34.41%

Phrase: 'faire un essai'
  → Intention: creer_essai
  → Confiance: 35.62%

Phrase: 'export notation 999'
  → Intention: export_notation
  → Confiance: 37.62%

Phrase: 'ajouter essai'
  → Intention: creer_essai
  → Confiance: 34.43%

Phrase: 'Créer une notation depuis l'essai 43'
  → Intention: export_notation
  → Confiance: 34.26%


In [13]:
# Sauvegarde du modèle
print("\nSauvegarde du modèle...")
model.save_pretrained('../models/hermes-v1')
tokenizer.save_pretrained('../models/hermes-v1')
print("Modèle sauvegardé")


Sauvegarde du modèle...
Modèle sauvegardé
