In [15]:
import json
import pandas as pd
import numpy as np

data_path = "../data/dechets.json"

with open(data_path, "r", encoding="utf-8") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)
df.head()


Unnamed: 0,description,label
0,emballage papier,papier
1,enveloppe blanche,papier
2,litière de chat souillée,non-recyclable
3,peau de banane,organique
4,ampoule halogène,non-recyclable


In [24]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


texts = df["description"].values
labels = df["label"].values

def nettoyer_texte(texte):
    import unicodedata
    texte = texte.lower()
    texte = ''.join(
        c for c in unicodedata.normalize('NFD', texte)
        if unicodedata.category(c) != 'Mn'  # supprime les accents
    )
    return texte

def regrouper_classes(label):
    if label in ["papier", "carton"]:
        return "papier_carton"
    elif label in ["textile", "électronique"]:
        return "autres"
    else:
        return label

# Nettoyage et regroupement
df["description"] = df["description"].apply(nettoyer_texte)
df["label"] = df["label"].apply(regrouper_classes)

label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)


X_train, X_test, y_train, y_test = train_test_split(
    texts, encoded_labels, test_size=0.2, random_state=42, stratify=encoded_labels
)


print(f"Exemple : '{X_train[0]}' → {y_train[0]} ({label_encoder.inverse_transform([y_train[0]])[0]})")
print(f"\nNombre de classes : {len(label_encoder.classes_)} → {label_encoder.classes_}")


Exemple : 'papier aluminium usage' → 2 (métal)

Nombre de classes : 8 → ['autres' 'compost' 'métal' 'non-recyclable' 'organique' 'papier_carton'
 'plastique' 'verre']


In [25]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization, Embedding, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential

# ⚙️ Paramètres
max_tokens = 5000   # taille du vocabulaire
output_dim = 64     # dimension des embeddings
sequence_length = 50  # nombre max de mots par texte
num_classes = len(label_encoder.classes_)

# 🔠 Text vectorizer
vectorizer = TextVectorization(max_tokens=max_tokens, output_sequence_length=sequence_length)
vectorizer.adapt(X_train)

# ✅ Modèle simple
model = Sequential([
    vectorizer,
    Embedding(input_dim=max_tokens, output_dim=output_dim),
    GlobalAveragePooling1D(),
    Dense(64, activation="relu"),
    Dense(num_classes, activation="softmax")
])

# ⚙️ Compilation
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

# 👀 Aperçu
model.summary()



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization_4 (Text  (None, 50)                0         
 Vectorization)                                                  
                                                                 
 embedding_4 (Embedding)     (None, 50, 64)            320000    
                                                                 
 global_average_pooling1d_4  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_8 (Dense)             (None, 64)                4160      
                                                                 
 dense_9 (Dense)             (None, 8)                 520       
                                                                 
Total params: 324680 (1.24 MB)
Trainable params: 32468

In [26]:
# Entraînement
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [27]:
# Évaluation sur test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest accuracy: {accuracy:.2f}")

# Exemple de prédiction
sample_texts = [
    "bouteille en plastique transparente",
    "journal froissé",
    "reste de nourriture organique"
]

pred_probs = model.predict(sample_texts)
pred_labels = np.argmax(pred_probs, axis=1)
pred_classes = label_encoder.inverse_transform(pred_labels)

for text, label in zip(sample_texts, pred_classes):
    print(f"Texte : '{text}' → Prédit : {label}")



Test accuracy: 0.25
Texte : 'bouteille en plastique transparente' → Prédit : papier_carton
Texte : 'journal froissé' → Prédit : papier_carton
Texte : 'reste de nourriture organique' → Prédit : papier_carton
