In [32]:
!pip install pandas transformers torch datasets protobuf sentencepiece evaluate scikit-learn evaluate accelerate>=0.26.0 tensorboard


[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, pipeline
import torch
from torch import nn
from datasets import Dataset, DatasetDict
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import evaluate
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from huggingface_hub import login
login("votre_token_d_huggingface")  # Remplacez par votre token Hugging Face


In [4]:
df = pd.read_csv('../data/raw/bitext-retail-ecommerce-llm-chatbot-training-dataset.csv')
df.head()

Unnamed: 0,instruction,intent,category,tags,response
0,I got to add an item to the cart,add_product,CART,BL,I'll get right on it! I'm here to assist you i...
1,wanna add fucking products to the basket can h...,add_product,CART,BCIMQWZ,I sincerely apologize if you've encountered an...
2,i have to add products to the basket i ned help,add_product,CART,BCMQZ,You bet! I'm here to assist you in adding prod...
3,di like to add products to the cart could i ge...,add_product,CART,BCILMPQZ,Indeed! I'm here to assist you in adding produ...
4,"I need to add an item to the cart , where do I...",add_product,CART,BCILZ,I'll take care of it! I'm here to help you wit...


In [7]:
df['category'].unique()

array(['CART', 'PRODUCT', 'ORDER', 'ACCOUNT', 'CONTACT', 'DELIVERY',
       'PAYMENT', 'RETURNS', 'USER', 'SALES', 'STORE', 'FEEDBACK',
       'APP_WEBSITE'], dtype=object)

In [9]:
df[df['category'] == 'PRODUCT'].head(1)['instruction']

957    i got to see the availability of an item i nee...
Name: instruction, dtype: object

In [6]:
# je selectionne uniquement les trois premieres categories
small_df = df[df['category'].isin(df['category'].unique()[:3])]
small_df['category'].unique()

array(['CART', 'PRODUCT', 'ORDER'], dtype=object)

In [7]:
small_df.head()

Unnamed: 0,instruction,intent,category,tags,response
0,I got to add an item to the cart,add_product,CART,BL,I'll get right on it! I'm here to assist you i...
1,wanna add fucking products to the basket can h...,add_product,CART,BCIMQWZ,I sincerely apologize if you've encountered an...
2,i have to add products to the basket i ned help,add_product,CART,BCMQZ,You bet! I'm here to assist you in adding prod...
3,di like to add products to the cart could i ge...,add_product,CART,BCILMPQZ,Indeed! I'm here to assist you in adding produ...
4,"I need to add an item to the cart , where do I...",add_product,CART,BCILZ,I'll take care of it! I'm here to help you wit...


In [8]:
# je mantien que les colonnes instruction et category
small_df = small_df[['instruction', 'category']].rename(columns={'instruction': 'text', 'category': 'label'})
small_df.head()

Unnamed: 0,text,label
0,I got to add an item to the cart,CART
1,wanna add fucking products to the basket can h...,CART
2,i have to add products to the basket i ned help,CART
3,di like to add products to the cart could i ge...,CART
4,"I need to add an item to the cart , where do I...",CART


In [9]:
# je verifie que ma version de CUDA est bien detectee pour utiliser le GPU
torch.cuda.is_available()

True

In [None]:
MODEL_NAME = 'cross-encoder/ms-marco-TinyBERT-L2-v2' # 'meta-llama/Llama-3.2-1B" #"meta-llama/Meta-Llama-3-8B-Instruct"

# Prepare DataFrame: 'text' and 'label' columns
dataset = Dataset.from_pandas(small_df)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
num_labels = small_df['label'].nunique()

In [None]:
# 1. Assume you already have a DataFrame `df` with columns: "text" (str) and 'label' (str or int)
# Example:
# df = pd.read_csv("your_data.csv")  # must contain 'text' and 'label'

# Encode string labels to integers if needed
if small_df['label'].dtype == "O":
    le = LabelEncoder()
    small_df['label'] = le.fit_transform(small_df['label'])
else:
    le = None  # labels already numeric

num_labels = small_df['label'].nunique()

In [None]:
random_seed = 42
# # 2. Train/validation split
# train_df, val_df = train_test_split(small_df, test_size=0.2, random_state=random_seed, stratify=small_df['label'])

# # 3. Convert pandas -> HF Datasets
# train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
# val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))
# datasets = DatasetDict({"train": train_dataset, "validation": val_dataset})


# First split: train vs temp (test+validation)
train_df, t_and_v_df = train_test_split(small_df, random_state=random_seed, test_size=0.2)
# Second split: temp -> test vs validation (10% + 10%)
test_df, val_df = train_test_split(t_and_v_df, random_state=random_seed, test_size=0.5)

train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

datasets = DatasetDict(
    {
        "train": train_dataset,      # 80%
        "test": test_dataset,       # 10%
        "val": val_dataset,  # 10% (held-out final validation)
    }
)


In [None]:
# 4. Load TinyBERT tokenizer and model (you can swap to any TinyBERT checkpoint)
model_name = "prajjwal1/bert-tiny"  # a common TinyBERT-like model on HF Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

tokenized_datasets = datasets.map(tokenize_function, batched=True)

# Set format for PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(
    [c for c in tokenized_datasets["train"].column_names if c not in ["input_ids", "attention_mask", 'label']]
)
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", 'label'])

Map: 100%|██████████| 10059/10059 [00:01<00:00, 8425.96 examples/s]
Map: 100%|██████████| 1257/1257 [00:00<00:00, 10130.74 examples/s]
Map: 100%|██████████| 1258/1258 [00:00<00:00, 8938.69 examples/s]


In [54]:
# 5. Metrics: accuracy and loss will be tracked; we compute accuracy explicitly
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_metric.compute(predictions=preds, references=labels)["accuracy"]
    return {"accuracy": acc}


In [63]:
# 6. Load TinyBERT with a new classification head sized to num_labels
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

# 6a. Print all layers / parameter names before freezing
# print("Model parameters before freezing:")
# for name, param in model.named_parameters():
#     print(name) #print(name, param.shape)


# 6b. Freeze all layers except the classification head
# for name, param in model.named_parameters():
#     # keep only the classifier (and optionally dropout/biases) trainable
#     if "classifier" in name:
#         param.requires_grad = True
#     else:
#         param.requires_grad = False

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
run_id = datetime.now().strftime("%Y%m%d-%H%M%S")
run_name = f"tinybert_e5_lr1e-5_{run_id}"

# 7. Training configuration
training_args = TrainingArguments(
    output_dir="./tinybert_cls",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=50,

    # >>> TensorBoard-related args <<<
    logging_dir=f"../logs/{run_name}",   # where TensorBoard will read logs
    report_to=["tensorboard"],           # force logging to TensorBoard
    run_name=run_name
)

# 8. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 9. Train
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6844,0.566056,0.920446
2,0.2383,0.18408,0.996818
3,0.093,0.073251,0.998409
4,0.0541,0.04255,0.998409
5,0.0326,0.028881,0.998409
6,0.0248,0.021313,0.998409
7,0.021,0.018055,0.998409
8,0.0159,0.013677,0.998409
9,0.0197,0.013141,0.998409
10,0.0051,0.012466,0.998409


TrainOutput(global_step=18870, training_loss=0.05728635616564978, metrics={'train_runtime': 458.2667, 'train_samples_per_second': 658.503, 'train_steps_per_second': 41.177, 'total_flos': 95878615472640.0, 'train_loss': 0.05728635616564978, 'epoch': 30.0})

In [67]:
# final evaluation on the held-out validation set
final_metrics = trainer.evaluate(
    eval_dataset=tokenized_datasets["val"]
)
print("Final validation metrics:", final_metrics)

Final validation metrics: {'eval_loss': 0.007263979408890009, 'eval_accuracy': 0.9992050874403816, 'eval_runtime': 1.0204, 'eval_samples_per_second': 1232.884, 'eval_steps_per_second': 77.423, 'epoch': 30.0}


In [66]:
# 10. Final evaluation: accuracy and validation loss
eval_results = trainer.evaluate()
print("Validation results:", eval_results)
# eval_results contains keys like: 'eval_loss', 'eval_accuracy', 'eval_runtime', etc.

# 11. Optional: invert label encoding for predictions
def predict_texts(texts):
    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    with torch.no_grad():
        outputs = model(**{k: v.to(model.device) for k, v in enc.items()})
        preds = outputs.logits.argmax(dim=-1).cpu().numpy()
    if le is not None:
        preds = le.inverse_transform(preds)
    return preds

# Example:
# print(predict_texts(["some example text", "another text"]))

Validation results: {'eval_loss': 0.0026745139621198177, 'eval_accuracy': 0.9992044550517104, 'eval_runtime': 0.8962, 'eval_samples_per_second': 1402.638, 'eval_steps_per_second': 88.153, 'epoch': 30.0}


In [None]:
df = pd.read_csv('../data/raw/bitext-retail-ecommerce-llm-chatbot-training-dataset.csv')
df.head()

model_id = "gpasiniesgi/esgi_nlp_project_1"  # example HF repo with safetensors weights

clf = pipeline(
    task="text-classification",
    model=model_id,
    tokenizer=model_id,
    # use_safetensors is True by default for modern models, but you can be explicit:
    model_kwargs={"torch_dtype": "auto"},
)

text_pro = df[df['category'] == 'PRODUCT']['instruction'].sample(n=1).iloc[0]
type(text_pro)

pred_pro = clf(text_pro)

text_car = df[df['category'] == 'CART']['instruction'].sample(n=1).iloc[0]
pred_car = clf(text_car)

text_ord = df[df['category'] == 'ORDER']['instruction'].sample(n=1).iloc[0]
pred_ord = clf(text_ord)

print("product : " + text_pro)
print(f"product : {pred_pro}")

print(f'car : {text_car}')
print(f'car : {pred_car}')

print(f"ord : {text_ord}")
print(f"ord : {pred_ord}")

Device set to use cuda:0


product : I would like to exchange a fucking item I purchased, could I get some help ?
product : [{'label': 'LABEL_2', 'score': 0.9998375177383423}]
car : I need to add a fucking product to thd cart, can you help me?
car : [{'label': 'LABEL_0', 'score': 0.9996371269226074}]
ord : I have to track my replacement iitem, could I get some help?
ord : [{'label': 'LABEL_1', 'score': 0.9997499585151672}]
