In [6]:
!pip install transformers datasets torch scikit-learn --quiet

In [7]:
!pip install accelerate --quiet

In [19]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw records from the JSON file (read as UTF-8 text, then parsed)
with open("data/dechets_1000.json", mode="r", encoding="utf-8") as fichier:
    data = json.loads(fichier.read())

# Build the working DataFrame from the parsed records
df = pd.DataFrame(data)

# Nettoyage & regroupement
import unicodedata

def nettoyer(texte):
    """Lower-case a string and strip its diacritics.

    The text is lower-cased, decomposed with Unicode NFD normalisation, and
    every character whose Unicode category is 'Mn' (non-spacing mark) is
    dropped. The result stays in decomposed (NFD) form minus those marks.

    Args:
        texte: The string to clean.

    Returns:
        The cleaned string.
    """
    decompose = unicodedata.normalize('NFD', texte.lower())
    conserves = []
    for caractere in decompose:
        # 'Mn' marks are the accents split off from their base letter by NFD
        if unicodedata.category(caractere) == 'Mn':
            continue
        conserves.append(caractere)
    return ''.join(conserves)

def regrouper(label):
    """Map a raw label onto its merged class.

    "papier" and "carton" become "papier_carton"; "textile" and
    "électronique" become "autres". Any other value is returned unchanged.
    Matching is exact (case- and accent-sensitive).

    Args:
        label: The raw label value.

    Returns:
        The merged class name, or ``label`` itself when no group matches.
    """
    groupes = (
        ("papier_carton", ("papier", "carton")),
        ("autres", ("textile", "électronique")),
    )
    for cible, membres in groupes:
        if label in membres:
            return cible
    return label

# Clean the descriptions, then merge the raw labels into their grouped classes
for colonne, transformation in (("description", nettoyer), ("label", regrouper)):
    df[colonne] = df[colonne].apply(transformation)

# Sanity check: class distribution after grouping
print(df["label"].value_counts())

# Stratified 80/20 train/validation split with a fixed seed
descriptions, etiquettes = df["description"], df["label"]
train_texts, val_texts, train_labels, val_labels = train_test_split(
    descriptions,
    etiquettes,
    test_size=0.2,
    stratify=etiquettes,
    random_state=42,
)


label
plastique         170
verre             166
métal             166
papier_carton     166
compost           166
non-recyclable    166
Name: count, dtype: int64


In [20]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize both splits with the same options (truncation and padding enabled)
options_tokenisation = dict(truncation=True, padding=True)
train_encodings = tokenizer(list(train_texts), **options_tokenisation)
val_encodings = tokenizer(list(val_texts), **options_tokenisation)

# Encoder les labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then apply that same
# mapping to both splits so class ids agree between train and validation.
le = LabelEncoder()
le.fit(train_labels)
train_labels_enc = le.transform(train_labels)
val_labels_enc = le.transform(val_labels)

# Dataset Torch
import torch

class DechetDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with class labels.

    Each sample is a dict holding one tensor per encoding field plus a
    ``"labels"`` tensor of dtype ``torch.long``.
    """

    def __init__(self, encodings, labels):
        """Store the inputs without copying them.

        Args:
            encodings: Mapping (anything exposing ``.items()``) from field
                name to a sequence indexable by sample position.
            labels: Sequence of class ids, indexable by sample position.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Build the sample at position ``idx`` as a dict of tensors."""
        sample = {}
        for champ, valeurs in self.encodings.items():
            sample[champ] = torch.tensor(valeurs[idx])
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and encoded labels in a torch dataset
train_dataset = DechetDataset(encodings=train_encodings, labels=train_labels_enc)
val_dataset = DechetDataset(encodings=val_encodings, labels=val_labels_enc)


In [21]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Size the classification head from the classes learned by the label encoder
num_labels = len(le.classes_)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=num_labels,
)

training_args = TrainingArguments(
    # Output and log locations
    output_dir="./results",
    logging_dir="./logs",
    # Training length and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    # Evaluate and log once per epoch; checkpoint saving is turned off
    eval_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="no",
    load_best_model_at_end=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Kept as the last expression so the cell displays the returned TrainOutput
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,1.6004,0.876003
2,0.5559,0.434258
3,0.2579,0.111775


TrainOutput(global_step=600, training_loss=0.8047530492146809, metrics={'train_runtime': 4285.5454, 'train_samples_per_second': 0.56, 'train_steps_per_second': 0.14, 'total_flos': 16033905763200.0, 'train_loss': 0.8047530492146809, 'epoch': 3.0})

In [22]:
# Run inference on the validation split; argmax over the last axis selects
# the highest-scoring class id for each sample (presumably over logits).
preds = trainer.predict(val_dataset)
pred_labels = preds.predictions.argmax(axis=-1)

from sklearn.metrics import classification_report

# Pass `labels` explicitly so the report always has one row per encoder class.
# Without it, a class absent from both y_true and y_pred would make
# `target_names` disagree in length with the labels actually reported.
print(classification_report(
    val_labels_enc,
    pred_labels,
    labels=list(range(len(le.classes_))),
    target_names=le.classes_,
))



                precision    recall  f1-score   support

       compost       1.00      1.00      1.00        33
         métal       1.00      1.00      1.00        33
non-recyclable       1.00      1.00      1.00        33
 papier_carton       1.00      1.00      1.00        34
     plastique       0.87      1.00      0.93        34
         verre       1.00      0.85      0.92        33

      accuracy                           0.97       200
     macro avg       0.98      0.97      0.97       200
  weighted avg       0.98      0.97      0.97       200

