# <a id='toc1_'></a>[Modèle simple](#toc0_)

**Table of contents**<a id='toc0_'></a>    
- [Modèle simple](#toc1_)    
- [Telechargements & imports des données](#toc2_)    
- [Preprocessing des données](#toc3_)    
  - [Renommer les colonnes](#toc3_1_)    
  - [Text cleaning](#toc3_2_)    
  - [Fonctions de split](#toc3_3_)    
    - [Lemmatisation](#toc3_3_1_)    
    - [Stemmatisation](#toc3_3_2_)    
    - [Tokenisation](#toc3_3_3_)    
      - [ Exemple de resultat sur 1 sample](#toc3_3_3_1_)    
  - [Embedding et vectorisation](#toc3_4_)    
    - [TFIDF](#toc3_4_1_)    
    - [Sentence Transformers](#toc3_4_2_)    
      - [ Exemple de resultat sur 1 sample](#toc3_4_2_1_)    
  - [Comparatif global](#toc3_5_)    
- [Log des models avec MLflow](#toc4_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

# <a id='toc2_'></a>[Téléchargements & imports des données](#toc0_)

In [None]:
# !pip install uv
# !uv pip install pandas gensim numpy matplotlib scikit-learn wordcloud tqdm sentence_transformers ipykernel tensorflow spacy
# !python -m spacy download en_core_web_sm

In [None]:
import os
import re

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

tqdm.pandas()

import string
import warnings

warnings.filterwarnings("ignore")
import nltk
import spacy
from nltk.stem.snowball import PorterStemmer
from sentence_transformers import SentenceTransformer

nltk.download('punkt')
nltk.download('punkt_tab')


# Charger le modèle anglais
nlp = spacy.load("en_core_web_sm")

# Charger le stemmer anglais
stemmer = PorterStemmer()


import mlflow
import mlflow.sklearn
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    average_precision_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.preprocessing import label_binarize

In [None]:
# Download the Sentiment140 archive (wget writes it into the current working directory)
!wget https://s3-eu-west-1.amazonaws.com/static.oc-static.com/prod/courses/files/AI+Engineer/Project+7%C2%A0-+D%C3%A9tectez+les+Bad+Buzz+gr%C3%A2ce+au+Deep+Learning/sentiment140.zip

# Extract the archive
# NOTE(review): the hard-coded /content path presumably targets Google Colab, where the
# working directory is /content — confirm before running in another environment.
ZIP_PATH = '/content/sentiment140.zip'
!unzip $ZIP_PATH

In [None]:
# Load the raw CSV into a DataFrame; header=None because the first line is
# read as data, so the columns get positional integer labels.
DATASET_PATH = '/content/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(
    DATASET_PATH,
    header=None,
    encoding="ISO-8859-1",
    sep=',',
)
df.head()

# <a id='toc3_'></a>[Preprocessing des données](#toc0_)

## <a id='toc3_1_'></a>[Renommer les colonnes](#toc0_)

In [None]:
# Give the six positional columns the field names listed on the dataset card.
# The mapping pairs each existing column label (by position) with its new name.
df = df.rename(columns={
    df.columns[position]: label
    for position, label in enumerate(
        ('target', 'ids', 'date', 'flag', 'user', 'text')
    )
})

In [None]:
# Define the working datasets: the full corpus and a 10,000-row sample.
# .copy() gives each frame its own data, so the label assignments below do not
# write into a selection of `df` (the pattern behind pandas' SettingWithCopyWarning,
# which the global warnings filter would otherwise hide).
# random_state pins the sample so a Restart & Run All reproduces the same rows.
complete_df = df[['target', 'text']].copy()
sample_df = complete_df.sample(10_000, random_state=42).copy()

# Show the label distribution before conversion
print(sample_df['target'].value_counts())

# Convert the labels to binary 0/1 (0 stays 0, 4 becomes 1)
sample_df['target'] = sample_df['target'].replace({0: 0, 4: 1})
complete_df['target'] = complete_df['target'].replace({0: 0, 4: 1})


## <a id='toc3_2_'></a>[Text cleaning](#toc0_)

In [None]:

def tweet_cleaning(tweet):
    """
    Clean and normalize a raw tweet.

    Steps, in order:
        - remove URLs, @mentions and #hashtags
        - drop every non-ASCII character (emojis, accented letters, ...)
        - remove ASCII punctuation and digits
        - collapse whitespace runs, strip both ends and lowercase the result

    Params:
        tweet (str): the raw tweet to clean.

    Return:
        str: the cleaned, lowercased tweet.

    """
    # Remove URLs; the scheme is matched case-insensitively so "HTTP://..." is caught too
    tweet = re.sub(r'https?://\S+|www\.\S+', '', tweet, flags=re.IGNORECASE)

    # Remove mentions (@user)
    tweet = re.sub(r'@\w+', '', tweet)

    # Remove hashtags (#hashtag)
    tweet = re.sub(r'#\w+', '', tweet)

    # Drop non-ASCII characters. After this round-trip the string is pure ASCII,
    # so no separate regex pass over non-ASCII ranges is needed.
    tweet = tweet.encode('ascii', 'ignore').decode('ascii')

    # Remove punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    # Remove digits
    tweet = re.sub(r'\d+', '', tweet)

    # Collapse whitespace runs and trim leading/trailing whitespace
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    # Lowercase last, as the normalization step documented above
    return tweet.lower()



In [None]:
# Preview the cleaner on the sample. Series.map feeds the text column straight
# to tweet_cleaning instead of building one row object per tweet (axis=1 apply),
# and .head() keeps the cell output short. The result is only displayed, not stored.
sample_df['text'].map(tweet_cleaning).head(10)

## <a id='toc3_3_'></a>[Fonctions de split](#toc0_)

### <a id='toc3_3_1_'></a>[Lemmatisation](#toc0_)

In [None]:
def lemmatize_text(text):
    """
    Run `text` through the module-level spaCy pipeline `nlp` and return the
    lemma of every token, in order, as a list of strings.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return lemmas

### <a id='toc3_3_2_'></a>[Stemmatisation](#toc0_)

In [None]:
def stem_text_french(text, language='english'):
    """
    Tokenize `text` with NLTK and stem every token with the module-level `stemmer`.

    The function keeps its historical name for existing callers, but it now
    tokenizes with the English Punkt model by default: the other resources this
    notebook loads (the `en_core_web_sm` spaCy model and the Porter stemmer)
    are English, so the previous hard-coded 'french' tokenizer was inconsistent
    with them.

    Params:
        text (str): the text to split and stem.
        language (str): language name forwarded to `nltk.word_tokenize`.
            Pass 'french' to get the previous behavior.

    Return:
        list[str]: the stemmed tokens, in order.
    """
    tokens = nltk.word_tokenize(text, language=language)
    return [stemmer.stem(token) for token in tokens]

### <a id='toc3_3_3_'></a>[Tokenisation](#toc0_)

In [None]:
def tokenize_text(text):
    """
    Split `text` into tokens with the spaCy tokenizer of the module-level `nlp`
    pipeline and return the token texts, in order, as a list of strings.
    """
    # make_doc runs only the tokenizer. Only token.text is read here, so the
    # remaining pipeline components of a full nlp(text) call are not needed.
    return [token.text for token in nlp.make_doc(text)]

#### <a id='toc3_3_3_1_'></a>[ Exemple de résultat sur 1 sample](#toc0_)

In [None]:
# Draw one random tweet from the sample and show it
text = sample_df['text'].sample(1).iloc[0]
print(text)

# Print the output of each splitting function on that tweet, one result per line
for _splitter in (lemmatize_text, stem_text_french, tokenize_text):
    print(_splitter(text=text))

## <a id='toc3_4_'></a>[Embedding et vectorisation](#toc0_)

### <a id='toc3_4_1_'></a>[TFIDF](#toc0_)

In [None]:
def get_tfidf_vector(texts):
    """
    Fit a new TfidfVectorizer (default settings) on `texts`.

    texts: list of texts (strings)
    Returns a tuple, in this order: (fitted vectorizer, TF-IDF matrix of `texts`)
    """
    fitted = TfidfVectorizer()
    return fitted, fitted.fit_transform(texts)

### <a id='toc3_4_2_'></a>[Sentence Transformers](#toc0_)

In [None]:
# Loaded SentenceTransformer models, keyed by model name, so repeated calls
# reuse the same instance instead of re-loading it each time.
_SENTENCE_MODEL_CACHE = {}


def get_sentence_embeddings(texts, model_name="all-MiniLM-L6-v2"):
    """
    Encode `texts` with a SentenceTransformer model.

    The model for a given `model_name` is instantiated on first use and then
    reused for later calls (re-running this cell resets the cache).

    texts: list of sentences to encode
    model_name: name passed to SentenceTransformer
    Returns the result of `model.encode(texts)`: one embedding per input text
    (presumably a numpy array with the library defaults — TODO confirm).
    """
    model = _SENTENCE_MODEL_CACHE.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _SENTENCE_MODEL_CACHE[model_name] = model
    return model.encode(texts)

#### <a id='toc3_4_2_1_'></a>[ Exemple de résultat sur 1 sample](#toc0_)

In [None]:

# Draw one random tweet from the sample and show it
text = sample_df['text'].sample(1).iloc[0]
print(text)

# Print what each vectorization function returns for that single tweet
for _embed in (get_tfidf_vector, get_sentence_embeddings):
    print(_embed([text]))

## <a id='toc3_5_'></a>[Comparatif global](#toc0_)

In [None]:


def _holdout_accuracy(texts, labels, method):
    """Train a LogisticRegression on an 80/20 hold-out split of `texts` embedded with `method` and return the test accuracy."""
    if method == "tfidf":
        # Split the texts first and fit TF-IDF on the training part only, so the
        # vocabulary and IDF weights are not learned from the test tweets.
        train_texts, test_texts, y_train, y_test = train_test_split(
            texts, labels, test_size=0.2, random_state=42
        )
        vectorizer, X_train = get_tfidf_vector(train_texts)
        X_test = vectorizer.transform(test_texts)
    else:
        # Nothing is fitted on the data here, so the embeddings are computed once
        # for all texts and then split.
        X_train, X_test, y_train, y_test = train_test_split(
            get_sentence_embeddings(texts), labels, test_size=0.2, random_state=42
        )

    model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


def compare_preprocessing_embeddings(df, text_col='text', label_col='target', sample_size=10_000):
    """
    Compare preprocessing + embedding combinations for classification.

    Four text variants (raw, lemmatized, tokenized, stemmed) are each embedded
    with TF-IDF and with sentence transformers; a LogisticRegression is trained
    on an 80/20 hold-out split of every combination and its test accuracy is
    printed and plotted.

    Args:
        df (pd.DataFrame): DataFrame holding the data
        text_col (str): name of the text column
        label_col (str): name of the target column
        sample_size (int): number of rows to sample (for performance reasons);
            capped at len(df)

    Returns:
        None - prints the accuracies and displays a bar chart of them
    """
    # Sampling: never ask for more rows than the frame holds (df.sample would raise)
    df_sample = df.sample(min(sample_size, len(df)), random_state=42)
    texts_raw = df_sample[text_col].tolist()
    labels = df_sample[label_col].tolist()

    # Preprocessing variants as (result-key prefix, printed label, chart label, texts);
    # token lists are joined back into one string per tweet.
    variants = [
        ("raw", "raw", "Raw", texts_raw),
        ("lemmatized", "lemma", "Lemmatization",
         [' '.join(lemmatize_text(text)) for text in texts_raw]),
        ("tokenized", "token", "Tokenization",
         [' '.join(tokenize_text(text)) for text in texts_raw]),
        ("stemmed", "stem", "Stemming",
         [' '.join(stem_text_french(text)) for text in texts_raw]),
    ]

    # Embedding methods as (section title, method key, chart label)
    methods = [
        ("🔹 TF-IDF", "tfidf", "TF-IDF"),
        ("🔹 Sentence Transformers", "transformer", "Transformer"),
    ]

    results = {}
    names = []

    print("🚀 Début des tests...")

    for section_title, method, method_label in methods:
        print(section_title)
        for key, short_label, chart_label, texts in variants:
            acc = _holdout_accuracy(texts, labels, method)
            results[f"{key}_{method}"] = acc
            names.append(f"{chart_label} + {method_label}")
            print(f"  - {short_label} + {method}:", acc)

    print("\n✅ Fin des tests.")

    # --- Chart ---
    # `results` preserves insertion order, which matches `names`.
    scores = list(results.values())
    colors = ['#FF9999' if 'TF-IDF' in name else '#66B2FF' for name in names]

    fig, ax = plt.subplots(figsize=(12, 6))
    bars = ax.bar(names, scores, color=colors)
    # Keep the zoomed-in 0.7 floor when every score clears it, but lower it when
    # a score falls below so that no bar is clipped out of view.
    ax.set_ylim(min(0.7, max(0.0, min(scores) - 0.05)), 1.0)
    ax.set_ylabel("Accuracy")
    ax.set_title("Comparaison des performances par méthode de prétraitement + embedding")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    # Write each value above its bar
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f'{height:.3f}', ha='center', va='bottom')

    fig.tight_layout()
    plt.show()

In [None]:
# Run the comparison on the full dataset; the function draws its own sample
# (default sample_size of 10,000 rows).
compare_preprocessing_embeddings(complete_df)

# <a id='toc4_'></a>[Log des modèles avec MLflow](#toc0_)

In [None]:
def compare_preprocessing_embeddings_mlflow(df, text_col='text', label_col='target', sample_size=5000):
    """
    Compare preprocessing + embedding combinations for classification and log
    every combination as one MLflow run (metrics, hyperparameters, confusion
    matrix image, tags and the fitted model).

    Args:
        df (pd.DataFrame): DataFrame holding the data
        text_col (str): name of the text column
        label_col (str): name of the target column (expected to hold 0/1 labels)
        sample_size (int): number of rows to sample; capped at len(df)

    Returns:
        None - results are logged to MLflow and confusion matrices are saved
        as PNG files under ./confusion_matrices

    Raises:
        ValueError: if an experiment key names an unknown embedding method
    """
    # Directory for the confusion-matrix images
    os.makedirs("confusion_matrices", exist_ok=True)

    # Sampling: never ask for more rows than the frame holds (df.sample would raise)
    df_sample = df.sample(min(sample_size, len(df)), random_state=42)
    texts_raw = df_sample[text_col].tolist()
    labels = df_sample[label_col].tolist()

    # Preprocessing: token lists are joined back into one string per tweet
    texts_lemmatized = [' '.join(lemmatize_text(text)) for text in texts_raw]
    texts_tokenized = [' '.join(tokenize_text(text)) for text in texts_raw]
    texts_stemmed = [' '.join(stem_text_french(text)) for text in texts_raw]

    # Combinations to run; keys are "<preprocessing>_<embedding method>"
    experiments = {
        "raw_tfidf": texts_raw,
        "lemmatized_tfidf": texts_lemmatized,
        "tokenized_tfidf": texts_tokenized,
        "stemmed_tfidf": texts_stemmed,

        "raw_transformer": texts_raw,
        "lemmatized_transformer": texts_lemmatized,
        "tokenized_transformer": texts_tokenized,
        "stemmed_transformer": texts_stemmed,
    }

    # Binarize the labels once (they do not change between experiments) and
    # flatten the result to the 1-D shape the estimator and metrics are fed.
    y = label_binarize(labels, classes=[0, 1]).ravel()

    # Select the MLflow experiment that will hold the runs
    mlflow.set_experiment("Text Classification with Logistic Regression")

    print("🚀 Début des tests...")

    for key, texts in experiments.items():
        preprocessing, method = key.split("_")
        model_name = f"LogisticRegression_{preprocessing}_{method}"

        print(f"\nEntraînement : {model_name}")

        # Vectorization + split (same seed for every experiment)
        if method == "tfidf":
            # Split the texts first and fit TF-IDF on the training part only, so
            # the vocabulary and IDF weights are not learned from the test tweets.
            train_texts, test_texts, y_train, y_test = train_test_split(
                texts, y, test_size=0.2, random_state=42
            )
            vectorizer, X_train = get_tfidf_vector(train_texts)
            X_test = vectorizer.transform(test_texts)
        elif method == "transformer":
            # Nothing is fitted on the data here: embed everything, then split.
            X_train, X_test, y_train, y_test = train_test_split(
                get_sentence_embeddings(texts), y, test_size=0.2, random_state=42
            )
        else:
            raise ValueError("Méthode non reconnue")

        model = LogisticRegression(max_iter=1000, solver="liblinear", penalty="l2", C=1.0)

        with mlflow.start_run(run_name=model_name):

            # Train
            model.fit(X_train, y_train)

            # Predict labels and positive-class probabilities
            y_pred = model.predict(X_test)
            y_proba = model.predict_proba(X_test)[:, 1]

            # Log metrics
            mlflow.log_metrics({
                "accuracy": accuracy_score(y_test, y_pred),
                "f1": f1_score(y_test, y_pred),
                "recall": recall_score(y_test, y_pred),
                "precision": precision_score(y_test, y_pred),
                "roc_auc": roc_auc_score(y_test, y_proba),
                "pr_auc": average_precision_score(y_test, y_proba)
            })

            # Log hyperparameters
            mlflow.log_params({
                "C": model.C,
                "penalty": model.penalty,
                "solver": model.solver,
                "max_iter": model.max_iter
            })

            # Confusion matrix. The display is drawn on an explicitly created
            # axes: calling plot() without `ax` opens a figure of its own, which
            # would leave a separately created figure empty and never closed.
            cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
            fig, ax = plt.subplots(figsize=(4, 4))
            ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, cmap=plt.cm.Blues)
            ax.set_title(model_name)
            cm_path = f"confusion_matrices/{model_name}.png"
            fig.savefig(cm_path)
            plt.close(fig)

            # Log artifact: confusion matrix image
            mlflow.log_artifact(cm_path)

            # Describe the run with tags; sample_size is the number of rows actually used
            mlflow.set_tag("dataset_used", "sentiment140")
            mlflow.set_tag("sample_size", len(df_sample))
            mlflow.set_tag("preprocessing", preprocessing)
            mlflow.set_tag("embedding_method", method)

            # Log the fitted model
            mlflow.sklearn.log_model(model, "model")

            print(f" {model_name} sauvegardé dans MLflow")

    print("\n Toutes les expériences ont été enregistrées dans MLflow.")

In [None]:
# Run the MLflow-logged comparison on the 10,000-row sample; the function
# subsamples it again to its default sample_size of 5000 rows.
compare_preprocessing_embeddings_mlflow(sample_df)