# Preprocessing for classical ML

In [4]:
import pandas as pd
import numpy as np
import re
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk

## Directories and constants

In [5]:
FINETUNING_SPLITS_DIR = "../data/cleaned/finetuning-splits/"
ML_SPLITS_DIR = "../data/cleaned/ml-methods-splits/"

## NLTK stopwords

In [6]:
nltk.download("stopwords")
french_stop = set(stopwords.words("french"))
negations = {"ne", "pas", "jamais", "rien", "aucun", "sans", "not", "no", "never", "none"}
french_stop = french_stop - negations

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\OrdiOne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load train/test splits from fine-tuning

In [7]:
print("Loading stratified train/test splits from fine-tuning notebook...")
train_df = pd.read_csv(f"{FINETUNING_SPLITS_DIR}/train_set.csv")
test_df  = pd.read_csv(f"{FINETUNING_SPLITS_DIR}/test_set.csv")
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

Loading stratified train/test splits from fine-tuning notebook...
Train: (433, 2), Test: (109, 2)


## Text cleaning function for classic ML

In [8]:
def clean_text_ml(text):
    """Clean text for TF-IDF / classical ML."""
    if not isinstance(text, str) or text.strip() == "":
        return ""
    
    # lowercase
    text = text.lower()
    
    # emoji mapping
    emoji_replacements = {
        "üòä": " _emoji_souriant_ ",
        "üòç": " _emoji_coeur_ ",
        "üëç": " _emoji_ok_ ",
        "üëé": " _emoji_pas_ok_ ",
        "üò†": " _emoji_enerve_ ",
        "üòî": " _emoji_triste_ ",
        "‚≠ê": " _emoji_etoile_ ",
        "üåü": " _emoji_etoile_brillante_ ",
    }
    for emoji, replacement in emoji_replacements.items():
        text = text.replace(emoji, replacement)
    
    # remove unwanted chars but keep basic punctuation
    text = re.sub(r"[^\w\s√†√¢√§√©√®√™√´√Æ√Ø√¥√∂√π√ª√º√ß.!?,;:]", " ", text)
    
    # mark punctuation
    text = re.sub(r"(!)", " _exclamation_ ", text)
    text = re.sub(r"(\?)", " _question_ ", text)
    
    # remove URLs, mentions, hashtags
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)
    
    # normalize spaces
    text = re.sub(r"\s+", " ", text).strip()
    
    return text

## Remove stopwords

In [9]:
def remove_stopwords(text):
    words = text.split()
    return " ".join([w for w in words if w not in french_stop])

## Complete preprocessing pipeline

In [10]:
def preprocess_dataframe(df):
    df = df.copy()
    
    # Step 1: clean text
    df["text_clean"] = df["text"].apply(clean_text_ml)
    
    # Step 2: remove stopwords
    df["text_clean"] = df["text_clean"].apply(remove_stopwords)
    
    
    # Step 3: remove empty rows after cleaning
    before = df.shape[0]
    df["text_clean"] = df["text_clean"].replace("", np.nan)
    df = df.dropna(subset=["text_clean"])
    after = df.shape[0]
    print(f"Removed {before - after} empty rows after preprocessing.")
    
    return df

## Apply preprocessing on train/test

In [11]:
print("\nPreprocessing train set...")
train_df = preprocess_dataframe(train_df)

print("\nPreprocessing test set...")
test_df = preprocess_dataframe(test_df)


Preprocessing train set...
Removed 2 empty rows after preprocessing.

Preprocessing test set...
Removed 4 empty rows after preprocessing.


## Distribution of data 

In [15]:
X_train = train_df['text_clean']
y_train = train_df['label']
X_test = test_df['text_clean']
y_test = test_df['label']
print(f"   Train: {len(X_train)} samples")
print(f"   Test:  {len(X_test)} samples")
print(f'Distribution of the training data : ',y_train.value_counts())
print(f'Distribution of the testing data : ',y_test.value_counts())

   Train: 431 samples
   Test:  105 samples
Distribution of the training data :  label
2    298
0     94
1     39
Name: count, dtype: int64
Distribution of the testing data :  label
2    71
0    24
1    10
Name: count, dtype: int64


## Save preprocessed splits

In [13]:
os.makedirs(ML_SPLITS_DIR, exist_ok=True)
train_path = f"{ML_SPLITS_DIR}/train_set.csv"
test_path  = f"{ML_SPLITS_DIR}/test_set.csv"
train_df[["text_clean", "label"]].to_csv(train_path, index=False, encoding="utf-8-sig")
test_df[["text_clean", "label"]].to_csv(test_path,  index=False, encoding="utf-8-sig")
print(f"Train saved to: {train_path} ‚Äî shape: {train_df.shape}")
print(f"Test saved  to: {test_path} ‚Äî shape: {test_df.shape}")

Train saved to: ../data/cleaned/ml-methods-splits//train_set.csv ‚Äî shape: (431, 3)
Test saved  to: ../data/cleaned/ml-methods-splits//test_set.csv ‚Äî shape: (105, 3)
