## 📦 1. Installation et Imports

In [None]:
# Installation des packages
!pip install -q datasets transformers

# T√©l√©charger NLTK data
import nltk
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

print("‚úÖ Installation termin√©e!")

In [None]:
# Imports
import pickle
import re
import shutil
from collections import Counter
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Global plotting configuration: seaborn 'whitegrid' style and a 12x6 default
# figure size for every matplotlib figure that does not set its own.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

print("‚úÖ Imports r√©ussis!")

## 📂 2. Créer la Structure de Dossiers

In [None]:
# Cr√©er les dossiers n√©cessaires
folders = [
    'data/raw',
    'data/processed',
    'models/lstm',
    'models/bilstm',
    'models/cnn_bilstm',
    'models/bert',
    'results/figures',
    'results/metrics'
]

for folder in folders:
    Path(folder).mkdir(parents=True, exist_ok=True)

print("‚úÖ Structure de dossiers cr√©√©e!")
!ls -la

## 🔗 3. Téléchargement du Dataset GoEmotions

In [None]:
# Download the 'simplified' GoEmotions configuration from Hugging Face.
print("T√©l√©chargement du dataset GoEmotions...")
print("Cela peut prendre quelques minutes...\n")

dataset = load_dataset('go_emotions', 'simplified')

# One pandas DataFrame per split, in train / validation / test order
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ('train', 'validation', 'test')
)

print(f"\n‚úÖ Dataset t√©l√©charg√©!")
print(f"\nüìä Statistiques:")
print(f"  Train:      {len(train_df):,} samples")
print(f"  Validation: {len(val_df):,} samples")
print(f"  Test:       {len(test_df):,} samples")
print(f"  Total:      {len(train_df) + len(val_df) + len(test_df):,} samples")

In [None]:
# Load the raw dataset from data/raw/ when the CSV export has been placed there.
raw_csv_path = Path('data/raw/goemotions.csv')

if raw_csv_path.exists():
    df = pd.read_csv(raw_csv_path)
else:
    # Nothing in this notebook writes this CSV, so on a fresh run it is usually
    # absent. Fall back to the Hugging Face splits loaded by the previous cell
    # so that "Restart & Run All" does not stop on a FileNotFoundError.
    print(f"Fichier {raw_csv_path} introuvable : utilisation des splits Hugging Face déjà chargés.")
    df = pd.concat([train_df, val_df, test_df], ignore_index=True)

print("üìã Exploration du dataset:")
print(f"  Shape: {df.shape}")
print(f"\nColonnes: {df.columns.tolist()}")
print(f"\nPremi√®res lignes:")
display(df.head(10))

# Report which label layout the frame uses: a single 'emotions' column,
# or one column per emotion (probed with a few well-known emotion names).
if 'emotions' in df.columns:
    print(f"\nüìä Exemples d'√©motions:")
    print(df['emotions'].value_counts().head(10))
elif any(col in df.columns for col in ['anger', 'joy', 'sadness']):
    print("\n‚úÖ Dataset d√©j√† au format multi-colonnes!")
else:
    print("\n‚ö†Ô∏è V√©rifier le format du dataset")

In [None]:
# Prepare the columns used for training.
# The CSV export can come in different layouts, so detect what is available.

# Canonical GoEmotions label order. A label's position in this list is the
# class index stored in the 'labels' column built below.
GOEMOTIONS_LABELS = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
    'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
    'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
    'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
    'relief', 'remorse', 'sadness', 'surprise', 'neutral'
]

# Identify the text column (first match wins)
text_col = None
for col in ['text', 'comment_text', 'sentence']:
    if col in df.columns:
        text_col = col
        break

if text_col is None:
    print("‚ö†Ô∏è Colonne de texte non trouv√©e. Colonnes disponibles:", df.columns.tolist())
else:
    print(f"‚úÖ Colonne de texte: '{text_col}'")

# Emotion columns present in df, listed in canonical order (not in the order
# they happen to appear in the CSV).
emotion_cols = [name for name in GOEMOTIONS_LABELS if name in df.columns]

print(f"\nüé≠ Colonnes d'√©motions trouv√©es: {len(emotion_cols)}")
print(emotion_cols[:10])

# One binary column per emotion (at least 27 of the 28): use the CSV directly.
if len(emotion_cols) >= 27:
    if text_col is None:
        # Without a text column the split below would only fail later with an
        # opaque KeyError(None); stop here with an explicit message instead.
        raise ValueError(
            "Aucune colonne de texte trouvée dans le dataset : "
            f"colonnes disponibles = {df.columns.tolist()}"
        )

    print("\n‚úÖ Dataset au format multi-label! Pr√™t pour l'entra√Ænement.")

    # Split: 80% train, 10% val, 10% test (fixed seed for reproducibility)
    train_data, temp_data = train_test_split(df, test_size=0.2, random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

    print(f"\nüìä Splits:")
    print(f"  Train:      {len(train_data):,} samples")
    print(f"  Validation: {len(val_data):,} samples")
    print(f"  Test:       {len(test_data):,} samples")

    # Work on copies so that adding columns does not touch df
    train_df = train_data.copy()
    val_df = val_data.copy()
    test_df = test_data.copy()

    # Expose the text under the 'text' column name expected by later cells
    if text_col != 'text':
        for frame in (train_df, val_df, test_df):
            frame['text'] = frame[text_col]

    # Map each emotion name to its canonical class index, so that the indices
    # stay correct even when a column is missing or the CSV order differs.
    label_index = {name: idx for idx, name in enumerate(GOEMOTIONS_LABELS)}

    def get_label_indices(row):
        """Return the canonical class indices of the emotions set to 1 in `row`."""
        return [label_index[col] for col in emotion_cols if row[col] == 1]

    for frame in (train_df, val_df, test_df):
        frame['labels'] = frame.apply(get_label_indices, axis=1)

    print("\n‚úÖ Format adapt√© pour l'entra√Ænement!")

else:
    print("\n‚ö†Ô∏è Format non standard. Utilisation de HuggingFace comme alternative...")
    # Fall back to the Hugging Face 'simplified' configuration and its own splits
    dataset = load_dataset('go_emotions', 'simplified')
    train_df = pd.DataFrame(dataset['train'])
    val_df = pd.DataFrame(dataset['validation'])
    test_df = pd.DataFrame(dataset['test'])
    print(f"\n‚úÖ Dataset HuggingFace charg√©!")
    print(f"  Train:      {len(train_df):,} samples")
    print(f"  Validation: {len(val_df):,} samples")
    print(f"  Test:       {len(test_df):,} samples")

### 📊 Préparation du Dataset pour le Projet

Le dataset Kaggle GoEmotions peut contenir toutes les émotions dans une seule colonne.
Pour l'entraînement, nous avons besoin d'un format multi-label (28 colonnes binaires, une par émotion).

In [None]:
# Show a preview of the training split (first rows, rendered as the cell output)
print("\nüìù Aper√ßu des donn√©es:")
train_df.head()

## 🏷️ 4. Labels des Émotions

In [None]:
# GoEmotions emotion names; a name's position in the list is its class index.
emotion_labels = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

NUM_CLASSES = len(emotion_labels)

print(f"\nüé≠ Nombre d'√©motions: {NUM_CLASSES}")
print(f"\nListe des √©motions:")
# Display numbering starts at 1 (class indices themselves start at 0).
for i, emotion in enumerate(emotion_labels, start=1):
    print(f"  {i:2d}. {emotion}")

## 📊 5. Analyse Exploratoire

In [None]:
# Emotion distribution: flatten the per-sample label lists and count each index.
all_labels = [label for sample_labels in train_df['labels'] for label in sample_labels]

label_counts = Counter(all_labels)
label_counts_sorted = sorted(label_counts.items())  # ordered by label index

# Parallel lists for the chart: label index, occurrences, display name
indices = [pair[0] for pair in label_counts_sorted]
counts = [pair[1] for pair in label_counts_sorted]
labels_names = [emotion_labels[i] if i < len(emotion_labels) else f"Label {i}" for i in indices]

# Bar chart
axis_label_style = dict(fontsize=12, fontweight='bold')
fig, ax = plt.subplots(figsize=(16, 6))
bars = ax.bar(labels_names, counts, color='steelblue', edgecolor='navy')

# Recolor each bar with a viridis shade chosen by bar position (label-index
# order, not frequency); set_color replaces both the face and the edge color.
colors = plt.cm.viridis(np.linspace(0, 1, len(counts)))
for bar, color in zip(bars, colors):
    bar.set_color(color)

ax.set_xlabel('√âmotion', **axis_label_style)
ax.set_ylabel('Fr√©quence', **axis_label_style)
ax.set_title('Distribution des √âmotions dans le Training Set', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')
fig.tight_layout()
fig.savefig('results/figures/emotion_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nüìà Statistiques:")
print(f"  Total de labels: {sum(counts):,}")
print(f"  √âmotion la plus fr√©quente: {labels_names[counts.index(max(counts))]} ({max(counts):,} occurrences)")
print(f"  √âmotion la moins fr√©quente: {labels_names[counts.index(min(counts))]} ({min(counts):,} occurrences)")
print(f"  Ratio d√©s√©quilibre: {max(counts)/min(counts):.2f}")

In [None]:
# Number of labels attached to each training sample
labels_per_sample = [len(labels) for labels in train_df['labels']]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Histogram with one unit-wide bin per label count. The first bin starts at 1
# unless some samples carry no label at all, in which case it starts at 0 so
# those samples are drawn instead of falling outside the bin range (the
# statistics printed below already include them).
first_bin = min(1, min(labels_per_sample))
axes[0].hist(labels_per_sample,
             bins=range(first_bin, max(labels_per_sample) + 2),
             edgecolor='black', color='coral', alpha=0.7)
axes[0].set_xlabel('Nombre de Labels par √âchantillon', fontweight='bold')
axes[0].set_ylabel('Fr√©quence', fontweight='bold')
axes[0].set_title('Distribution du Nombre de Labels', fontweight='bold')
axes[0].grid(True, alpha=0.3)

# Boxplot, left at its default vertical orientation (the explicit `vert`
# keyword is deprecated in recent Matplotlib releases).
axes[1].boxplot(labels_per_sample)
axes[1].set_ylabel('Nombre de Labels', fontweight='bold')
axes[1].set_title('Boxplot du Nombre de Labels', fontweight='bold')
axes[1].grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('results/figures/labels_per_sample.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nüìä Statistiques Multi-Label:")
print(f"  Moyenne: {np.mean(labels_per_sample):.2f} labels/√©chantillon")
print(f"  M√©diane: {np.median(labels_per_sample):.0f} labels/√©chantillon")
print(f"  Maximum: {max(labels_per_sample)} labels/√©chantillon")
print(f"  Minimum: {min(labels_per_sample)} labels/√©chantillon")

In [None]:
# Text length in characters and in whitespace-separated words.
# Non-string cells (e.g. NaN read from a CSV) count as empty text: a bare
# len() would raise TypeError on them.
train_df['text_length'] = train_df['text'].apply(
    lambda x: len(x) if isinstance(x, str) else 0
)
train_df['word_count'] = train_df['text'].apply(
    lambda x: len(x.split()) if isinstance(x, str) else 0
)

# Means are reused for the dashed marker, the legend and the summary below
mean_chars = train_df['text_length'].mean()
mean_words = train_df['word_count'].mean()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One histogram per metric: (axes, column, bar color, mean, legend text, x label, title)
panels = [
    (axes[0], 'text_length', 'lightblue', mean_chars, f'Moyenne: {mean_chars:.0f}',
     'Longueur du Texte (caract√®res)', 'Distribution de la Longueur des Textes'),
    (axes[1], 'word_count', 'lightgreen', mean_words, f'Moyenne: {mean_words:.1f}',
     'Nombre de Mots', 'Distribution du Nombre de Mots'),
]

for panel_ax, column, bar_color, mean_value, mean_label, x_label, title in panels:
    panel_ax.hist(train_df[column], bins=50, edgecolor='black', color=bar_color)
    # Dashed red line marking the mean
    panel_ax.axvline(mean_value, color='red', linestyle='--',
                     linewidth=2, label=mean_label)
    panel_ax.set_xlabel(x_label, fontweight='bold')
    panel_ax.set_ylabel('Fr√©quence', fontweight='bold')
    panel_ax.set_title(title, fontweight='bold')
    panel_ax.legend()
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('results/figures/text_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nüìè Statistiques de Longueur:")
print(f"  Longueur moyenne: {mean_chars:.1f} caract√®res")
print(f"  Nombre de mots moyen: {mean_words:.1f} mots")
print(f"  Percentile 95: {train_df['word_count'].quantile(0.95):.0f} mots")

## 🧹 6. Prétraitement des Textes

In [None]:
# Patterns removed from the text, applied in this order.
_URL_RE = re.compile(r'http\S+|www\S+|https\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
# Anything other than letters, digits, whitespace and basic punctuation
_DISALLOWED_RE = re.compile(r'[^a-zA-Z0-9\s.,!?\'\-]')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw text for tokenization.

    The text is lower-cased; URLs, @mentions and #hashtags are removed;
    characters other than letters, digits, whitespace and . , ! ? ' - are
    dropped; whitespace runs collapse to one space and the ends are stripped.

    Args:
        text: The raw text. Any non-string value yields an empty string.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.lower()
    for pattern in (_URL_RE, _MENTION_RE, _HASHTAG_RE, _DISALLOWED_RE):
        cleaned = pattern.sub('', cleaned)

    return _WHITESPACE_RE.sub(' ', cleaned).strip()

# Add a cleaned copy of the text ('text_clean') to each split
print("Nettoyage des textes...")
for split_frame in (train_df, val_df, test_df):
    split_frame['text_clean'] = split_frame['text'].apply(clean_text)

# Show the first training sample before and after cleaning
print("\n‚úÖ Nettoyage termin√©!")
print("\nExemple de transformation:")
print(f"Avant: {train_df['text'].iloc[0]}")
print(f"Apr√®s: {train_df['text_clean'].iloc[0]}")

## 🔢 7. Tokenization et Séquences

In [None]:
# Tokenization parameters: vocabulary cap passed to the tokenizer, and the
# fixed sequence length used for padding/truncation in the next cell.
MAX_VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 128

print(f"Configuration:")
print(f"  Taille du vocabulaire: {MAX_VOCAB_SIZE:,}")
print(f"  Longueur max des s√©quences: {MAX_SEQUENCE_LENGTH}")

# Build the tokenizer and fit it on the cleaned training texts only, so the
# validation and test splits do not contribute to the vocabulary.
print("\nCr√©ation du tokenizer...")
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['text_clean'])

print(f"\n‚úÖ Tokenizer cr√©√©!")
print(f"  Vocabulaire complet: {len(tokenizer.word_index):,} mots")
# NOTE(review): this line prints the configured cap itself, not a count derived
# from the fitted tokenizer — confirm it matches the effective vocabulary.
print(f"  Vocabulaire utilis√©: {MAX_VOCAB_SIZE:,} mots")

In [None]:
# Turn the cleaned texts of each split into sequences of token ids
print("Conversion en s√©quences...")
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_frame['text_clean'])
    for split_frame in (train_df, val_df, test_df)
)

# Pad or truncate every sequence to MAX_SEQUENCE_LENGTH, both at the end
print("Application du padding...")
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train = pad_sequences(X_train_seq, **pad_options)
X_val = pad_sequences(X_val_seq, **pad_options)
X_test = pad_sequences(X_test_seq, **pad_options)

print(f"\n‚úÖ S√©quences cr√©√©es!")
print(f"  X_train: {X_train.shape}")
print(f"  X_val:   {X_val.shape}")
print(f"  X_test:  {X_test.shape}")

## 🏷️ 8. Préparer les Labels Multi-Label

In [None]:
def prepare_labels(labels_list, num_classes=28):
    """Convert per-sample lists of label indices into a binary indicator matrix.

    Args:
        labels_list: Sequence with one iterable of integer label indices per
            sample.
        num_classes: Number of columns of the returned matrix.

    Returns:
        A float32 array of shape (len(labels_list), num_classes) holding 1.0
        at [i, k] when k is one of sample i's labels, and 0.0 elsewhere.

    Indices outside [0, num_classes) are ignored. The lower bound is checked
    explicitly because a negative index would otherwise wrap around and set a
    column counted from the end.
    """
    label_matrix = np.zeros((len(labels_list), num_classes), dtype=np.float32)

    for row, labels in enumerate(labels_list):
        for label_idx in labels:
            if 0 <= label_idx < num_classes:
                label_matrix[row, label_idx] = 1.0

    return label_matrix

# Build one binary label matrix per split
print("Pr√©paration des labels...")
y_train, y_val, y_test = (
    prepare_labels(split_frame['labels'].tolist(), NUM_CLASSES)
    for split_frame in (train_df, val_df, test_df)
)

print(f"\n‚úÖ Labels pr√©par√©s!")
print(f"  y_train: {y_train.shape}")
print(f"  y_val:   {y_val.shape}")
print(f"  y_test:  {y_test.shape}")

## 💾 9. Sauvegarder les Données Préparées

In [None]:
def _dump_pickle(obj, path):
    """Pickle `obj` to the file at `path`."""
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle)


print("Sauvegarde des donn√©es pr√©par√©es...\n")

# Padded token-id sequences
for npy_path, array in (
    ('data/processed/X_train.npy', X_train),
    ('data/processed/X_val.npy', X_val),
    ('data/processed/X_test.npy', X_test),
):
    np.save(npy_path, array)
print("‚úÖ S√©quences sauvegard√©es")

# Binary label matrices
for npy_path, array in (
    ('data/processed/y_train.npy', y_train),
    ('data/processed/y_val.npy', y_val),
    ('data/processed/y_test.npy', y_test),
):
    np.save(npy_path, array)
print("‚úÖ Labels sauvegard√©s")

# Fitted tokenizer
_dump_pickle(tokenizer, 'data/processed/tokenizer.pkl')
print("‚úÖ Tokenizer sauvegard√©")

# Metadata describing the prepared data
metadata = {
    'emotion_labels': emotion_labels,
    'num_classes': NUM_CLASSES,
    'max_vocab_size': MAX_VOCAB_SIZE,
    'max_sequence_length': MAX_SEQUENCE_LENGTH,
    'vocab_size': len(tokenizer.word_index),
    'train_size': len(X_train),
    'val_size': len(X_val),
    'test_size': len(X_test)
}
_dump_pickle(metadata, 'data/processed/metadata.pkl')
print("‚úÖ M√©tadonn√©es sauvegard√©es")

# Original (uncleaned) test texts, kept for later analysis
_dump_pickle(test_df['text'].tolist(), 'data/processed/test_texts.pkl')
print("‚úÖ Textes de test sauvegard√©s")

# Closing banner and pointers to the follow-up notebooks
for line in (
    "\n" + "="*60,
    "üéâ PR√âPARATION DES DONN√âES TERMIN√âE !",
    "="*60,
    "\nVous pouvez maintenant ex√©cuter les notebooks d'entra√Ænement:",
    "  üìì Notebook_1_LSTM.ipynb",
    "  üìì Notebook_2_BiLSTM_Attention.ipynb",
    "  üìì Notebook_3_CNN_BiLSTM.ipynb",
    "  üìì Notebook_4_BERT.ipynb",
    "\nPuis pour la comparaison:",
    "  üìì Notebook_5_Comparaison_Finale.ipynb",
):
    print(line)

In [None]:
# List the files written to data/processed/ (runs the shell `ls` command, so
# this line needs a Unix-like environment)
print("\nüìÅ Fichiers cr√©√©s dans data/processed/:")
!ls -lh data/processed/

## 📤 10. [Optionnel] Sauvegarder vers Google Drive

In [None]:
# Optional: back up the prepared data and the results to Google Drive.
# The google.colab package only exists inside Colab, so the import is guarded
# and the whole step is skipped elsewhere instead of aborting "Run All".
try:
    from google.colab import drive
except ImportError:
    drive = None
    print("Google Colab non détecté : sauvegarde vers Google Drive ignorée.")

if drive is not None:
    drive.mount('/content/drive')

    # Project folder inside the mounted Drive
    drive_project_dir = Path('/content/drive/MyDrive/emotion_detection_project')
    drive_project_dir.mkdir(parents=True, exist_ok=True)

    # Copy each local directory into the project folder under its own name
    # (data/processed -> .../processed, results -> .../results).
    # dirs_exist_ok=True lets a re-run overwrite a previous backup.
    for local_dir in ('data/processed', 'results'):
        shutil.copytree(local_dir, drive_project_dir / Path(local_dir).name,
                        dirs_exist_ok=True)

    print("\n‚úÖ Donn√©es sauvegard√©es dans Google Drive!")