In [45]:
# Imports standards
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import warnings
import joblib
import os

# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Display configuration: seaborn grid theme, then pandas display limits
# (at most 100 rows, no cap on the number of columns shown).
sns.set_style("whitegrid")
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None)

print("‚úì Imports OK")

‚úì Imports OK


In [46]:
# Load the interim training dataset.
interim_csv = "../data/interim/train_dataset_M1_interim.csv"
df = pd.read_csv(interim_csv)

# Headline figures: shape, covered Day range, class balance of the target.
first_day, last_day = df["Day"].min(), df["Day"].max()
target_share = df["Purchase"].value_counts(normalize=True)

print(f"üìä Dataset shape: {df.shape}")
print(f"üìÖ P√©riode: Day {first_day} √† {last_day}")
print(f"üéØ Target balance: {target_share}")

# Preview the first rows, then list the dtype of every column.
display(df.head())

print("üîç Types de donn√©es:")
display(df.dtypes)

üìä Dataset shape: (13735, 21)
üìÖ P√©riode: Day 1 √† 70
üéØ Target balance: Purchase
0    0.631889
1    0.368111
Name: proportion, dtype: float64


Unnamed: 0,id,Age,Gender,Reviews_Read,Price,Discount,Category,Items_In_Cart,Time_of_Day,Email_Interaction,Device_Type,Payment_Method,Referral_Source,Socioeconomic_Status_Score,Engagement_Score,AB_Bucket,Price_Sine,Session_ID,Day,Campaign_Period,Purchase
0,1,,1.0,3.0,592.975,22.0,1.0,6.0,afternoon,0.0,Mobile,Credit,Social_media,7.26,1.85652,3.0,0.999047,S0000003,59,False,0
1,2,25.0,1.0,1.0,511.279,12.0,0.0,3.0,morning,1.0,Tablet,Cash,Social_media,8.3,1.868138,5.0,-0.129689,S0000005,29,True,1
2,3,22.0,0.0,3.0,218.36,2.0,1.0,4.0,evening,1.0,Mobile,Bank,Social_media,6.61,1.223445,0.0,-0.421646,S0000007,16,False,0
3,4,24.0,0.0,3.0,313.781,1.0,3.0,0.0,evening,1.0,Mobile,PayPal,Social_media,10.51,0.359684,1.0,-0.988239,S0000008,53,False,0
4,5,35.0,1.0,1.0,495.088,13.0,0.0,2.0,evening,0.0,Mobile,Cash,Social_media,8.33,3.84858,2.0,0.695737,S0000009,10,False,0


üîç Types de donn√©es:


id                              int64
Age                           float64
Gender                        float64
Reviews_Read                  float64
Price                         float64
Discount                      float64
Category                      float64
Items_In_Cart                 float64
Time_of_Day                    object
Email_Interaction             float64
Device_Type                    object
Payment_Method                 object
Referral_Source                object
Socioeconomic_Status_Score    float64
Engagement_Score              float64
AB_Bucket                     float64
Price_Sine                    float64
Session_ID                     object
Day                             int64
Campaign_Period                  bool
Purchase                        int64
dtype: object

In [47]:
# Missing-value summary: absolute count and percentage of rows, per column.
print("‚ùå Valeurs manquantes par colonne:")
missing = df.isnull().sum()
missing_pct = (missing / len(df) * 100).round(2)
missing_df = pd.concat(
    [missing, missing_pct], axis=1, keys=["Missing_Count", "Missing_Pct"]
)

# Only show columns that actually have gaps, worst first.
has_missing = missing_df["Missing_Count"] > 0
display(missing_df.loc[has_missing].sort_values("Missing_Pct", ascending=False))

‚ùå Valeurs manquantes par colonne:


Unnamed: 0,Missing_Count,Missing_Pct
Age,2087,15.19
Price,634,4.62
Reviews_Read,291,2.12
Category,287,2.09
Price_Sine,284,2.07
Items_In_Cart,280,2.04
Session_ID,280,2.04
Time_of_Day,279,2.03
Device_Type,278,2.02
Socioeconomic_Status_Score,275,2.0


In [48]:
# For every object-dtype column: number of distinct values and the first ten of them.
print("üìÇ Cat√©gories uniques:")
for col, values in df.select_dtypes(include="object").items():
    print(f"\n{col}: {values.nunique()} valeurs")
    print(f"  ‚Üí {values.unique()[:10]}")

üìÇ Cat√©gories uniques:

Time_of_Day: 3 valeurs
  ‚Üí ['afternoon' 'morning' 'evening' nan]

Device_Type: 3 valeurs
  ‚Üí ['Mobile' 'Tablet' 'Desktop' nan]

Payment_Method: 4 valeurs
  ‚Üí ['Credit' 'Cash' 'Bank' 'PayPal' nan]

Referral_Source: 5 valeurs
  ‚Üí ['Social_media' 'Direct' 'Search_engine' 'Ads' 'Email' nan]

Session_ID: 13455 valeurs
  ‚Üí ['S0000003' 'S0000005' 'S0000007' 'S0000008' 'S0000009' 'S0000010'
 'S0000011' 'S0000013' 'S0000014' 'S0000016']


In [49]:
# Chronological split on Day: train <= 60 < val <= 70 < test.
# NOTE(review): the loaded data ends at Day 70, so `test` comes out empty here.
day = df["Day"]
train = df[day <= 60].copy()
val = df[(day > 60) & (day <= 70)].copy()
test = df[day > 70].copy()

# Row count and share of the full dataset for each split.
for label, part in (("Train:", train), ("Val:  ", val), ("Test: ", test)):
    print(f"üì¶ {label} {len(part):,} lignes ({len(part)/len(df)*100:.1f}%)")

üì¶ Train: 11,766 lignes (85.7%)
üì¶ Val:   1,969 lignes (14.3%)
üì¶ Test:  0 lignes (0.0%)


In [50]:
# Positive-class rate per split; test is reported only when it has rows and a target.
print("\nüéØ Distribution Purchase:")
for label, part in (("Train:", train), ("Val:  ", val)):
    print(f"  {label} {part['Purchase'].mean():.2%}")
if "Purchase" in test.columns and len(test) > 0:
    print(f"  Test:  {test['Purchase'].mean():.2%}")


üéØ Distribution Purchase:
  Train: 37.44%
  Val:   33.06%


In [51]:
def fit_price_bins(prices, q=5):
    """
    Learn quantile bucket edges for ``Price`` on the training split.

    The outermost edges are widened to -inf/+inf so that prices outside the
    training range still fall in the first/last bucket instead of becoming NaN.

    Parameters:
    -----------
    prices : Series
        Training prices
    q : int
        Number of quantile buckets requested (duplicate edges are dropped)

    Returns:
    --------
    edges : ndarray
        Bucket edges, usable as ``price_bins`` in ``engineer_features``
    """
    _, edges = pd.qcut(prices, q=q, retbins=True, duplicates="drop")
    edges = np.array(edges, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    return edges


def engineer_features(df, campaign_days=None, is_train=True, price_bins=None):
    """
    Build all engineered features on a copy of ``df``.

    Parameters:
    -----------
    df : DataFrame
        Data to transform (left untouched; a copy is returned)
    campaign_days : list
        Campaign days learned on train; ignored and re-learned when is_train=True
    is_train : bool
        If True, learn campaign_days from the rows where Campaign_Period is True
    price_bins : array-like, optional
        Bucket edges for ``Price_Bucket`` (see ``fit_price_bins``). When given,
        every split is bucketed with the same edges. When None (default), the
        historical behaviour is kept: quintiles are re-computed on ``df`` itself,
        so bucket ids are NOT comparable between splits.

    Returns:
    --------
    df : transformed DataFrame
    campaign_days : list of plain Python numbers (only returned if is_train=True)
    """

    df = df.copy()

    # === 1. PRICE FEATURES ===
    print("üí∞ Features prix...")
    # Absolute amount of the discount
    df["Effective_Discount"] = df["Price"] * df["Discount"] / 100

    # Price after discount
    df["Net_Price"] = df["Price"] * (1 - df["Discount"] / 100)

    # Price buckets (rank-based, hence robust to outliers)
    if price_bins is None:
        df["Price_Bucket"] = pd.qcut(df["Price"], q=5, labels=False, duplicates="drop")
    else:
        df["Price_Bucket"] = pd.cut(
            df["Price"], bins=price_bins, labels=False, include_lowest=True
        )

    # === 2. ENGAGEMENT FEATURES ===
    print("üéØ Features engagement...")
    # Email interaction x engagement score
    df["Email_x_Engagement"] = df["Email_Interaction"] * df["Engagement_Score"]

    # Items in cart x engagement score
    df["Cart_x_Engagement"] = df["Items_In_Cart"] * df["Engagement_Score"]

    # === 3. CAMPAIGN FEATURES ===
    print("üì¢ Features campagne...")
    # One campaign mask for the whole section; missing flags compare as False.
    in_campaign = df["Campaign_Period"].eq(True)
    device = df["Device_Type"]

    # Tablet / Desktop sessions that happened during a campaign
    df["Tablet_During_Campaign"] = (device.eq("Tablet") & in_campaign).astype(int)
    df["Desktop_During_Campaign"] = (device.eq("Desktop") & in_campaign).astype(int)

    if is_train:
        # Learn which days are campaign days; .tolist() yields plain Python
        # scalars rather than numpy scalars.
        campaign_days = sorted(df.loc[in_campaign, "Day"].unique().tolist())
        print(
            f"  üìÖ Jours de campagne d√©tect√©s: {campaign_days[:5]}... ({len(campaign_days)} jours)"
        )

    # Distance (in days) to the closest known campaign day
    if campaign_days is not None and len(campaign_days) > 0:
        days = df["Day"].to_numpy()
        known_days = np.asarray(campaign_days)
        # (n_rows, n_campaign_days) distance matrix, reduced row-wise
        df["Day_to_Campaign"] = np.abs(days[:, None] - known_days[None, :]).min(axis=1)
    else:
        df["Day_to_Campaign"] = 999  # sentinel when no campaign day is known

    # === 4. CATEGORICAL FEATURES ===
    print("üìÇ Features cat√©gorielles...")
    # Flag for categories 0, 1 and 2
    df["HighValue_Category"] = df["Category"].isin([0.0, 1.0, 2.0]).astype(int)

    # Email interaction on a Tablet or Desktop device
    df["Email_Device_High"] = (
        (df["Email_Interaction"] == 1.0) & (device.isin(["Tablet", "Desktop"]))
    ).astype(int)

    print(f"‚úì {len(df.columns)} colonnes apr√®s feature engineering")

    if is_train:
        return df, campaign_days
    else:
        return df

In [52]:
# Run feature engineering on TRAIN first: this call learns campaign_days.
print("=" * 60, "FEATURE ENGINEERING - TRAIN", "=" * 60, sep="\n")
train_featured, campaign_days = engineer_features(train, is_train=True)

FEATURE ENGINEERING - TRAIN
üí∞ Features prix...
üéØ Features engagement...
üì¢ Features campagne...
  üìÖ Jours de campagne d√©tect√©s: [np.int64(25), np.int64(26), np.int64(27), np.int64(28), np.int64(29)]... (26 jours)
üìÇ Features cat√©gorielles...
‚úì 31 colonnes apr√®s feature engineering


In [53]:
# Then VAL and TEST, reusing the campaign_days learned on train.
print("", "=" * 60, "FEATURE ENGINEERING - VAL", "=" * 60, sep="\n")
val_featured = engineer_features(val, campaign_days=campaign_days, is_train=False)

if len(test) == 0:
    # No test rows: keep an untransformed copy so later cells still find the name.
    test_featured = test.copy()
else:
    print("", "=" * 60, "FEATURE ENGINEERING - TEST", "=" * 60, sep="\n")
    test_featured = engineer_features(test, campaign_days=campaign_days, is_train=False)


FEATURE ENGINEERING - VAL
üí∞ Features prix...
üéØ Features engagement...
üì¢ Features campagne...
üìÇ Features cat√©gorielles...
‚úì 31 colonnes apr√®s feature engineering


In [54]:
# List the columns that feature engineering added on top of the raw dataset.
print("üÜï Nouvelles features cr√©√©es:")
raw_columns = set(df.columns)
new_features = [name for name in train_featured.columns if name not in raw_columns]
print(new_features)

# Summary statistics of the new columns on train.
display(train_featured[new_features].describe())

üÜï Nouvelles features cr√©√©es:
['Effective_Discount', 'Net_Price', 'Price_Bucket', 'Email_x_Engagement', 'Cart_x_Engagement', 'Tablet_During_Campaign', 'Desktop_During_Campaign', 'Day_to_Campaign', 'HighValue_Category', 'Email_Device_High']


Unnamed: 0,Effective_Discount,Net_Price,Price_Bucket,Email_x_Engagement,Cart_x_Engagement,Tablet_During_Campaign,Desktop_During_Campaign,Day_to_Campaign,HighValue_Category,Email_Device_High
count,10988.0,10988.0,11216.0,11312.0,11297.0,11766.0,11766.0,11766.0,11766.0,11766.0
mean,141.852051,421.080031,1.999822,1.032998,7.884988,0.062468,0.168281,5.999575,0.588135,0.237804
std,266.103561,581.592408,1.41434,1.492498,9.160557,0.242014,0.374132,7.307519,0.492192,0.425756
min,0.0,2.87045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,34.760085,182.384527,1.0,0.0,1.795998,0.0,0.0,0.0,0.0,0.0
50%,93.76107,356.14061,2.0,0.0,4.866455,0.0,0.0,3.0,1.0,0.0
75%,188.843385,549.497975,3.0,1.904209,10.842206,0.0,0.0,10.0,1.0,0.0
max,11803.141054,12601.978185,4.0,6.398595,179.160658,1.0,1.0,24.0,1.0,1.0


In [55]:
def add_missing_indicators(df, cols_with_missing=None, is_train=True, threshold=0.01):
    """
    Append a 0/1 ``<col>_missing`` flag for each selected column.

    Parameters:
    -----------
    df : DataFrame
        Input data (not modified; flags are added to a copy)
    cols_with_missing : list
        Columns to flag; overwritten by the detected list when is_train=True
    is_train : bool
        If True, select every column whose share of nulls is > threshold
    threshold : float
        Minimum null share for a flag; only used when is_train=True

    Returns:
    --------
    df : DataFrame with the flags
    cols_with_missing : list (only returned if is_train=True)
    """

    flagged = df.copy()

    if is_train:
        # Share of nulls per column; keep the columns above the threshold.
        null_share = flagged.isnull().sum() / len(flagged)
        cols_with_missing = null_share[null_share > threshold].index.tolist()
        print(f"üìç {len(cols_with_missing)} colonnes avec >{threshold*100}% missing:")
        for name in cols_with_missing:
            print(f"  - {name}: {null_share[name] * 100:.2f}%")

    # Columns absent from this frame are skipped silently.
    for name in cols_with_missing or []:
        if name not in flagged.columns:
            continue
        flagged[f"{name}_missing"] = flagged[name].isnull().astype(int)

    return (flagged, cols_with_missing) if is_train else flagged

In [56]:
# Add the missing-value indicator columns.
print("=" * 60, "MISSING INDICATORS", "=" * 60, sep="\n")

# The list of flagged columns is learned on train ...
train_featured, cols_with_missing = add_missing_indicators(
    train_featured, is_train=True, threshold=0.01
)

# ... and replayed unchanged on val (and on test when it has rows).
replay_kwargs = {"cols_with_missing": cols_with_missing, "is_train": False}
val_featured = add_missing_indicators(val_featured, **replay_kwargs)
if len(test_featured) > 0:
    test_featured = add_missing_indicators(test_featured, **replay_kwargs)

print(f"\n‚úì Flags ajout√©s pour: {cols_with_missing}")

MISSING INDICATORS
üìç 20 colonnes avec >1.0% missing:
  - Age: 15.19%
  - Gender: 1.87%
  - Reviews_Read: 2.14%
  - Price: 4.67%
  - Discount: 2.03%
  - Category: 2.08%
  - Items_In_Cart: 2.01%
  - Time_of_Day: 1.99%
  - Email_Interaction: 1.94%
  - Device_Type: 1.99%
  - Socioeconomic_Status_Score: 2.02%
  - Engagement_Score: 2.00%
  - AB_Bucket: 1.95%
  - Price_Sine: 2.12%
  - Session_ID: 2.08%
  - Effective_Discount: 6.61%
  - Net_Price: 6.61%
  - Price_Bucket: 4.67%
  - Email_x_Engagement: 3.86%
  - Cart_x_Engagement: 3.99%

‚úì Flags ajout√©s pour: ['Age', 'Gender', 'Reviews_Read', 'Price', 'Discount', 'Category', 'Items_In_Cart', 'Time_of_Day', 'Email_Interaction', 'Device_Type', 'Socioeconomic_Status_Score', 'Engagement_Score', 'AB_Bucket', 'Price_Sine', 'Session_ID', 'Effective_Discount', 'Net_Price', 'Price_Bucket', 'Email_x_Engagement', 'Cart_x_Engagement']


In [57]:
def impute_numeric_features(train, val, test=None):
    """
    Fill NaN in numeric columns with the median computed on train.

    Numeric columns are the ones with a numeric dtype in ``train``. A column is
    imputed as soon as ANY split has a NaN in it, so a column that is complete
    in train but has gaps in val/test is filled too. Columns missing from a
    split are skipped for that split.

    Parameters:
    -----------
    train, val : DataFrame
    test : DataFrame or None

    Returns:
    --------
    train, val, test : imputed copies (test is None when no test was given)
    impute_values : dict {column: train median}
    """

    train = train.copy()
    val = val.copy()
    if test is not None:
        test = test.copy()
    frames = [frame for frame in (train, val, test) if frame is not None]

    # Numeric columns (by train dtype) that have a NaN in at least one split
    numeric_cols = train.select_dtypes(include=[np.number]).columns
    cols_to_impute = [
        col
        for col in numeric_cols
        if any(col in frame.columns and frame[col].isnull().any() for frame in frames)
    ]

    print(f"üî¢ Imputation de {len(cols_to_impute)} colonnes num√©riques:")

    impute_values = {}

    for col in cols_to_impute:
        # The median is always computed on TRAIN only (no leakage from val/test)
        median_val = train[col].median()
        impute_values[col] = median_val

        # Same guard for every split: a frame may legitimately lack the column
        for frame in frames:
            if col in frame.columns:
                frame[col] = frame[col].fillna(median_val)

        print(f"  - {col}: m√©diane = {median_val:.2f}")

    print(f"\n‚úì Imputation termin√©e")

    return train, val, test, impute_values

In [58]:
def impute_categorical_features(train, val, test=None):
    """
    Fill NaN in categorical (object-dtype) columns with the label 'Unknown'.

    Categorical columns are the object-dtype columns of ``train``. A column is
    imputed as soon as ANY split has a NaN in it; columns missing from a split
    are skipped for that split.

    Parameters:
    -----------
    train, val : DataFrame
    test : DataFrame or None

    Returns:
    --------
    train, val, test : imputed copies (test is None when no test was given)
    """

    train = train.copy()
    val = val.copy()
    if test is not None:
        test = test.copy()
    frames = [frame for frame in (train, val, test) if frame is not None]

    cat_cols = train.select_dtypes(include="object").columns
    cols_to_impute = [
        col
        for col in cat_cols
        if any(col in frame.columns and frame[col].isnull().any() for frame in frames)
    ]

    print(f"üìÇ Imputation de {len(cols_to_impute)} colonnes cat√©gorielles:")

    for col in cols_to_impute:
        # The reported count is the number of gaps in train
        missing_count = train[col].isnull().sum()

        for frame in frames:
            if col in frame.columns:
                frame[col] = frame[col].fillna("Unknown")

        print(f"  - {col}: {missing_count} valeurs ‚Üí 'Unknown'")

    print(f"‚úì Imputation termin√©e")

    return train, val, test

In [59]:
# Imputation: numeric columns first (train medians), then categorical ('Unknown').
print("=" * 60, "IMPUTATION - NUM√âRIQUES", "=" * 60, sep="\n")

splits = (train_featured, val_featured, test_featured)
train_featured, val_featured, test_featured, impute_vals = impute_numeric_features(
    *splits
)

print("", "=" * 60, "IMPUTATION - CAT√âGORIELLES", "=" * 60, sep="\n")

splits = (train_featured, val_featured, test_featured)
train_featured, val_featured, test_featured = impute_categorical_features(*splits)

IMPUTATION - NUM√âRIQUES
üî¢ Imputation de 17 colonnes num√©riques:
  - Age: m√©diane = 28.00
  - Gender: m√©diane = 0.00
  - Reviews_Read: m√©diane = 3.00
  - Price: m√©diane = 499.72
  - Discount: m√©diane = 25.00
  - Category: m√©diane = 2.00
  - Items_In_Cart: m√©diane = 3.00
  - Email_Interaction: m√©diane = 0.00
  - Socioeconomic_Status_Score: m√©diane = 4.53
  - Engagement_Score: m√©diane = 2.11
  - AB_Bucket: m√©diane = 3.00
  - Price_Sine: m√©diane = 0.03
  - Effective_Discount: m√©diane = 93.76
  - Net_Price: m√©diane = 356.14
  - Price_Bucket: m√©diane = 2.00
  - Email_x_Engagement: m√©diane = 0.00
  - Cart_x_Engagement: m√©diane = 4.87

‚úì Imputation termin√©e

IMPUTATION - CAT√âGORIELLES
üìÇ Imputation de 5 colonnes cat√©gorielles:
  - Time_of_Day: 234 valeurs ‚Üí 'Unknown'
  - Device_Type: 234 valeurs ‚Üí 'Unknown'
  - Payment_Method: 34 valeurs ‚Üí 'Unknown'
  - Referral_Source: 31 valeurs ‚Üí 'Unknown'
  - Session_ID: 245 valeurs ‚Üí 'Unknown'
‚úì Imputation termin√©

In [60]:
# Sanity check: total number of NaN cells left in each split after imputation.
print("üîç V√©rification des valeurs manquantes apr√®s imputation:")
for label, frame in (("Train:", train_featured), ("Val:  ", val_featured)):
    print(f"  {label} {frame.isnull().sum().sum()} NaN")
if len(test_featured) > 0:
    print(f"  Test:  {test_featured.isnull().sum().sum()} NaN")

üîç V√©rification des valeurs manquantes apr√®s imputation:
  Train: 0 NaN
  Val:   0 NaN


In [61]:
def handle_rare_categories(train, val, test=None, cat_features=None, threshold=0.01):
    """
    Collapse rare and unseen categories into the label 'Other'.

    Frequencies are learned on train only. In train, a column is rewritten only
    when it contains at least one category below ``threshold``. In val/test,
    every value that is not a kept train category (including categories never
    seen in train, and NaN) is mapped to 'Other', whether or not train itself
    had rare categories.

    Parameters:
    -----------
    train, val : DataFrame
    test : DataFrame or None
    cat_features : list or None
        Columns to process; defaults to the object-dtype columns of train
    threshold : float
        Minimum share of train rows for a category to be kept

    Returns:
    --------
    train, val, test : transformed copies (test is None when no test was given)
    known_categories : dict {column: list of kept categories}
    """

    train = train.copy()
    val = val.copy()
    if test is not None:
        test = test.copy()

    if cat_features is None:
        cat_features = train.select_dtypes(include="object").columns.tolist()

    known_categories = {}

    print(f"üìä Groupement des cat√©gories rares (<{threshold*100}%):")

    for col in cat_features:
        if col not in train.columns:
            continue

        # Category frequencies, measured on train only
        value_counts = train[col].value_counts()
        freq = value_counts / len(train)

        # Keep only the frequent categories
        valid_cats = freq[freq >= threshold].index.tolist()
        known_categories[col] = valid_cats
        keep = frozenset(valid_cats)  # constant-time membership test

        def collapse(series, keep=keep):
            """Replace every value outside ``keep`` with 'Other'."""
            return series.apply(lambda x: x if x in keep else "Other")

        rare_count = len(value_counts) - len(valid_cats)

        if rare_count > 0:
            print(f"  - {col}: {rare_count}/{len(value_counts)} cat√©gories ‚Üí 'Other'")
            train[col] = collapse(train[col])

        # val/test are always mapped: they may hold categories train never saw,
        # even when train itself has nothing rare in this column.
        for frame in (val, test):
            if frame is not None and col in frame.columns:
                frame[col] = collapse(frame[col])

    print(f"\n‚úì Groupement termin√©")

    return train, val, test, known_categories

In [62]:
# Columns treated as categorical by the rest of the pipeline.
cat_features = ["Device_Type", "Time_of_Day", "Payment_Method", "Referral_Source", "Category"]

print("=" * 60, "RARE CATEGORIES HANDLING", "=" * 60, sep="\n")

# Categories under 1% of train rows are grouped into 'Other'.
splits = (train_featured, val_featured, test_featured)
train_featured, val_featured, test_featured, known_cats = handle_rare_categories(
    *splits, cat_features=cat_features, threshold=0.01
)

# Show the categories kept for each feature.
print("\nüìã Cat√©gories valides par feature:")
for col, cats in known_cats.items():
    print(f"  {col}: {cats}")

RARE CATEGORIES HANDLING
üìä Groupement des cat√©gories rares (<1.0%):
  - Payment_Method: 1/5 cat√©gories ‚Üí 'Other'
  - Referral_Source: 1/6 cat√©gories ‚Üí 'Other'

‚úì Groupement termin√©

üìã Cat√©gories valides par feature:
  Device_Type: ['Mobile', 'Desktop', 'Tablet', 'Unknown']
  Time_of_Day: ['evening', 'afternoon', 'morning', 'Unknown']
  Payment_Method: ['PayPal', 'Cash', 'Bank', 'Credit']
  Referral_Source: ['Search_engine', 'Direct', 'Social_media', 'Email', 'Ads']
  Category: [2.0, 1.0, 4.0, 0.0, 3.0]


In [63]:
def prepare_catboost_data(
    train,
    val,
    test,
    cat_features,
    target_col="Purchase",
    id_col="id",
    session_col="Session_ID",
):
    """
    Prepare feature matrices for CatBoost.

    Categorical columns are cast to ``str``; the target, id and session columns
    are removed from every split that has them. X_val / X_test are laid out with
    exactly the columns of X_train, in the same order, because ``cat_indices``
    are positional (a column missing from val/test shows up as all-NaN).

    Parameters:
    -----------
    train, val : DataFrame
    test : DataFrame or None
        An empty test frame is treated like None
    cat_features : list
        Names of the categorical columns
    target_col, id_col, session_col : str
        Non-feature columns to remove

    Returns:
    --------
    X_train, y_train, X_val, y_val, X_test, cat_indices
        y_val is None when val has no target column; X_test is None without test
    """

    train = train.copy()
    val = val.copy()
    test = test.copy() if test is not None and len(test) > 0 else None
    frames = [frame for frame in (train, val, test) if frame is not None]

    # Cast categorical columns to string in every split that has them
    for col in cat_features:
        if col not in train.columns:
            continue
        for frame in frames:
            if col in frame.columns:
                frame[col] = frame[col].astype(str)

    non_features = [target_col, id_col, session_col]

    def _features(frame):
        """Drop whichever non-feature columns this frame actually contains."""
        return frame.drop(columns=[c for c in non_features if c in frame.columns])

    # Split features and target
    X_train = _features(train)
    y_train = train[target_col]

    # val may come without a target (e.g. unlabeled data): y_val is then None
    X_val = _features(val).reindex(columns=X_train.columns)
    y_val = val[target_col] if target_col in val.columns else None

    if test is not None:
        X_test = _features(test).reindex(columns=X_train.columns)
    else:
        X_test = None

    # Positional indices of the categorical columns, in cat_features order
    cat_indices = [
        X_train.columns.get_loc(col) for col in cat_features if col in X_train.columns
    ]

    print("üì¶ CatBoost Data Ready:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_val:   {X_val.shape}")
    if X_test is not None:
        print(f"  X_test:  {X_test.shape}")
    print(f"  Categorical indices: {cat_indices}")
    print(f"  Categorical features: {[X_train.columns[i] for i in cat_indices]}")

    return X_train, y_train, X_val, y_val, X_test, cat_indices


print("=" * 60, "PIPELINE 1 : CATBOOST", "=" * 60, sep="\n")

catboost_outputs = prepare_catboost_data(
    train_featured, val_featured, test_featured, cat_features=cat_features
)
X_train_cb, y_train_cb, X_val_cb, y_val_cb, X_test_cb, cat_indices = catboost_outputs

# Preview the first rows of the training matrix.
print("\nüìä Aper√ßu X_train_cb:")
display(X_train_cb.head())

# Number of columns per dtype.
print("\nüîç Types de colonnes:")
print(X_train_cb.dtypes.value_counts())

PIPELINE 1 : CATBOOST
üì¶ CatBoost Data Ready:
  X_train: (11766, 48)
  X_val:   (1969, 48)
  Categorical indices: [9, 7, 10, 11, 5]
  Categorical features: ['Device_Type', 'Time_of_Day', 'Payment_Method', 'Referral_Source', 'Category']

üìä Aper√ßu X_train_cb:


Unnamed: 0,Age,Gender,Reviews_Read,Price,Discount,Category,Items_In_Cart,Time_of_Day,Email_Interaction,Device_Type,Payment_Method,Referral_Source,Socioeconomic_Status_Score,Engagement_Score,AB_Bucket,Price_Sine,Day,Campaign_Period,Effective_Discount,Net_Price,Price_Bucket,Email_x_Engagement,Cart_x_Engagement,Tablet_During_Campaign,Desktop_During_Campaign,Day_to_Campaign,HighValue_Category,Email_Device_High,Age_missing,Gender_missing,Reviews_Read_missing,Price_missing,Discount_missing,Category_missing,Items_In_Cart_missing,Time_of_Day_missing,Email_Interaction_missing,Device_Type_missing,Socioeconomic_Status_Score_missing,Engagement_Score_missing,AB_Bucket_missing,Price_Sine_missing,Session_ID_missing,Effective_Discount_missing,Net_Price_missing,Price_Bucket_missing,Email_x_Engagement_missing,Cart_x_Engagement_missing
0,28.0,1.0,3.0,592.975,22.0,1.0,6.0,afternoon,0.0,Mobile,Credit,Social_media,7.26,1.85652,3.0,0.999047,59,False,130.4545,462.5205,2.0,0.0,11.139119,0,0,9,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,25.0,1.0,1.0,511.279,12.0,0.0,3.0,morning,1.0,Tablet,Cash,Social_media,8.3,1.868138,5.0,-0.129689,29,True,61.35348,449.92552,2.0,1.868138,5.604415,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,22.0,0.0,3.0,218.36,2.0,1.0,4.0,evening,1.0,Mobile,Bank,Social_media,6.61,1.223445,0.0,-0.421646,16,False,4.3672,213.9928,1.0,1.223445,4.893782,0,0,9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,24.0,0.0,3.0,313.781,1.0,3.0,0.0,evening,1.0,Mobile,PayPal,Social_media,10.51,0.359684,1.0,-0.988239,53,False,3.13781,310.64319,1.0,0.359684,0.0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,35.0,1.0,1.0,495.088,13.0,0.0,2.0,evening,0.0,Mobile,Cash,Social_media,8.33,3.84858,2.0,0.695737,10,False,64.36144,430.72656,2.0,0.0,7.69716,0,0,15,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0



üîç Types de colonnes:
int64      26
float64    16
object      5
bool        1
Name: count, dtype: int64


In [64]:
def prepare_classic_data(
    train,
    val,
    test,
    cat_features,
    scale=True,
    target_col="Purchase",
    id_col="id",
    session_col="Session_ID",
):
    """
    Prepare feature matrices for classic models (one-hot encoding + scaling).

    The one-hot layout is defined by train alone: train is encoded with
    ``drop_first=True``, while val/test are encoded with every level and then
    projected onto the train columns. Encoding each split independently with
    ``drop_first=True`` would drop a different reference level whenever a split
    lacks train's first category, leaving a real category encoded as all zeros.

    Parameters:
    -----------
    train, val : DataFrame
    test : DataFrame or None
        An empty test frame is treated like None
    cat_features : list
        Columns to one-hot encode
    scale : bool
        If True, fit a StandardScaler on train and apply it to every split
    target_col, id_col, session_col : str
        Non-feature columns, removed from every split that has them

    Returns:
    --------
    X_train, y_train, X_val, y_val, X_test, scaler
        y_val is None when val has no target column; X_test is None without
        test; scaler is None when scale=False
    """

    train = train.copy()
    val = val.copy()
    test = test.copy() if test is not None and len(test) > 0 else None

    non_features = [target_col, id_col, session_col]

    def _features(frame):
        """Drop whichever non-feature columns this frame actually contains."""
        return frame.drop(columns=[c for c in non_features if c in frame.columns])

    # Split features and target
    X_train = _features(train)
    y_train = train[target_col]

    X_val = _features(val)
    y_val = val[target_col] if target_col in val.columns else None

    X_test = _features(test) if test is not None else None

    print("üîÑ One-hot encoding...")

    # Train defines the layout; its first level per feature is the reference
    X_train_encoded = pd.get_dummies(X_train, columns=cat_features, drop_first=True)
    train_cols = X_train_encoded.columns.tolist()
    # Every column train can produce, reference levels included; used only to
    # tell levels unseen in train apart from the dropped reference levels.
    train_levels = set(pd.get_dummies(X_train, columns=cat_features).columns)

    def _encode_like_train(X):
        """Encode all levels, then keep exactly the train columns in train order."""
        full = pd.get_dummies(X, columns=cat_features)
        added = set(train_cols) - set(full.columns)  # absent here, filled with 0
        removed = set(full.columns) - train_levels  # unseen in train, dropped
        return full.reindex(columns=train_cols, fill_value=0), added, removed

    X_val_encoded, missing_in_val, extra_in_val = _encode_like_train(X_val)

    if X_test is not None:
        X_test_encoded, missing_in_test, extra_in_test = _encode_like_train(X_test)
    else:
        X_test_encoded = None

    print(f"  ‚úì Train: {X_train_encoded.shape[1]} features apr√®s encoding")
    print(
        f"  ‚úì Val:   {len(missing_in_val)} colonnes ajout√©es, {len(extra_in_val)} supprim√©es"
    )
    if X_test_encoded is not None:
        print(
            f"  ‚úì Test:  {len(missing_in_test)} colonnes ajout√©es, {len(extra_in_test)} supprim√©es"
        )

    # Scaling
    scaler = None
    if scale:
        print("\nüìè Scaling (StandardScaler)...")
        scaler = StandardScaler()

        def _scaled(X, fit=False):
            """Scale X, keeping its column names and index."""
            values = scaler.fit_transform(X) if fit else scaler.transform(X)
            return pd.DataFrame(values, columns=X.columns, index=X.index)

        # Fit on train only, then reuse the same statistics for val and test
        X_train_encoded = _scaled(X_train_encoded, fit=True)
        X_val_encoded = _scaled(X_val_encoded)
        if X_test_encoded is not None:
            X_test_encoded = _scaled(X_test_encoded)

        print("  ‚úì Scaling termin√©")

    print("\nüì¶ Classic ML Data Ready:")
    print(f"  X_train: {X_train_encoded.shape}")
    print(f"  X_val:   {X_val_encoded.shape}")
    if X_test_encoded is not None:
        print(f"  X_test:  {X_test_encoded.shape}")

    return X_train_encoded, y_train, X_val_encoded, y_val, X_test_encoded, scaler


print("=" * 60, "PIPELINE 2 : CLASSIC ML (ONE-HOT + SCALING)", "=" * 60, sep="\n")

classic_outputs = prepare_classic_data(
    train_featured,
    val_featured,
    test_featured,
    cat_features=cat_features,
    scale=True,
)
X_train_cls, y_train_cls, X_val_cls, y_val_cls, X_test_cls, scaler = classic_outputs

# Preview the first rows of the encoded, scaled training matrix.
print("\nüìä Aper√ßu X_train_cls:")
display(X_train_cls.head())

# Mean and std of the first ten columns (expected close to 0 and 1 after scaling).
print("\nüìä Statistiques apr√®s scaling:")
display(X_train_cls.describe().loc[["mean", "std"]].T.head(10))


# Banner for a quick logistic-regression check on the classic pipeline.
# NOTE(review): only the banner is printed; no model is fitted in this cell.
print("=" * 60, "VALIDATION RAPIDE - LOGISTIC REGRESSION", "=" * 60, sep="\n")

PIPELINE 2 : CLASSIC ML (ONE-HOT + SCALING)
üîÑ One-hot encoding...
  ‚úì Train: 62 features apr√®s encoding
  ‚úì Val:   0 colonnes ajout√©es, 0 supprim√©es

üìè Scaling (StandardScaler)...
  ‚úì Scaling termin√©

üì¶ Classic ML Data Ready:
  X_train: (11766, 62)
  X_val:   (1969, 62)

üìä Aper√ßu X_train_cls:


Unnamed: 0,Age,Gender,Reviews_Read,Price,Discount,Items_In_Cart,Email_Interaction,Socioeconomic_Status_Score,Engagement_Score,AB_Bucket,Price_Sine,Day,Campaign_Period,Effective_Discount,Net_Price,Price_Bucket,Email_x_Engagement,Cart_x_Engagement,Tablet_During_Campaign,Desktop_During_Campaign,Day_to_Campaign,HighValue_Category,Email_Device_High,Age_missing,Gender_missing,Reviews_Read_missing,Price_missing,Discount_missing,Category_missing,Items_In_Cart_missing,Time_of_Day_missing,Email_Interaction_missing,Device_Type_missing,Socioeconomic_Status_Score_missing,Engagement_Score_missing,AB_Bucket_missing,Price_Sine_missing,Session_ID_missing,Effective_Discount_missing,Net_Price_missing,Price_Bucket_missing,Email_x_Engagement_missing,Cart_x_Engagement_missing,Device_Type_Mobile,Device_Type_Tablet,Device_Type_Unknown,Time_of_Day_afternoon,Time_of_Day_evening,Time_of_Day_morning,Payment_Method_Cash,Payment_Method_Credit,Payment_Method_Other,Payment_Method_PayPal,Referral_Source_Direct,Referral_Source_Email,Referral_Source_Other,Referral_Source_Search_engine,Referral_Source_Social_media,Category_1.0,Category_2.0,Category_3.0,Category_4.0
0,-0.279918,1.03477,0.010633,0.044476,-0.209614,0.879285,-0.897613,0.624501,-0.283771,-0.005875,1.416165,1.650357,-0.866841,-0.031923,0.081343,0.000123,-0.672485,0.375142,-0.258129,-0.449811,0.410612,0.836833,-0.558568,2.363095,-0.138037,-0.147941,-0.221443,-0.143993,-0.145827,-0.143377,-0.142448,-0.140573,-0.142448,-0.143685,-0.142758,-0.140887,-0.147038,-0.145827,-0.266091,-0.266091,-0.221443,-0.200336,-0.203753,1.120217,-0.414482,-0.142448,1.541487,-0.799342,-0.64542,-0.581865,1.812396,-0.053834,-0.593252,-0.499841,-0.496651,-0.051397,-0.507135,2.00811,1.988493,-0.516259,-0.484517,-0.501434
1,-0.53128,1.03477,-1.164121,-0.06434,-0.885354,-0.178194,1.114066,0.92739,-0.275632,1.003307,-0.198453,-0.076653,1.153613,-0.300359,0.058942,0.000123,0.592489,-0.240158,3.874037,-0.449811,-0.821049,0.836833,1.790292,-0.423174,-0.138037,-0.147941,-0.221443,-0.143993,-0.145827,-0.143377,-0.142448,-0.140573,-0.142448,-0.143685,-0.142758,-0.140887,-0.147038,-0.145827,-0.266091,-0.266091,-0.221443,-0.200336,-0.203753,-0.892684,2.412648,-0.142448,-0.648724,-0.799342,1.54938,1.718613,-0.551756,-0.053834,-0.593252,-0.499841,-0.496651,-0.051397,-0.507135,2.00811,-0.502893,-0.516259,-0.484517,-0.501434
2,-0.782643,-0.966398,0.010633,-0.454494,-1.561094,0.174299,1.114066,0.435195,-0.727205,-1.519649,-0.616088,-0.825024,-0.866841,-0.521732,-0.360687,-0.724081,0.155948,-0.31916,-0.258129,-0.449811,0.410612,0.836833,-0.558568,-0.423174,-0.138037,-0.147941,-0.221443,-0.143993,-0.145827,-0.143377,-0.142448,-0.140573,-0.142448,-0.143685,-0.142758,-0.140887,-0.147038,-0.145827,-0.266091,-0.266091,-0.221443,-0.200336,-0.203753,1.120217,-0.414482,-0.142448,-0.648724,1.25103,-0.64542,-0.581865,-0.551756,-0.053834,-0.593252,-0.499841,-0.496651,-0.051397,-0.507135,2.00811,1.988493,-0.516259,-0.484517,-0.501434
3,-0.615068,-0.966398,0.010633,-0.327398,-1.628668,-1.235673,1.114066,1.57103,-1.332222,-1.015058,-1.426581,1.304955,-0.866841,-0.526508,-0.188785,-0.724081,-0.428932,-0.863207,-0.258129,-0.449811,-0.410495,-1.194982,-0.558568,-0.423174,-0.138037,-0.147941,-0.221443,-0.143993,-0.145827,-0.143377,-0.142448,-0.140573,-0.142448,-0.143685,-0.142758,-0.140887,-0.147038,-0.145827,-0.266091,-0.266091,-0.221443,-0.200336,-0.203753,1.120217,-0.414482,-0.142448,-0.648724,1.25103,-0.64542,-0.581865,-0.551756,-0.053834,1.685625,-0.499841,-0.496651,-0.051397,-0.507135,2.00811,-0.502893,-0.516259,2.06391,-0.501434
4,0.306594,1.03477,-1.164121,-0.085905,-0.81778,-0.530687,-0.897613,0.936128,1.111558,-0.510467,0.98229,-1.170427,-0.866841,-0.288674,0.024795,0.000123,-0.672485,-0.007505,-0.258129,-0.449811,1.231719,0.836833,-0.558568,-0.423174,-0.138037,-0.147941,-0.221443,-0.143993,-0.145827,-0.143377,-0.142448,-0.140573,-0.142448,-0.143685,-0.142758,-0.140887,-0.147038,-0.145827,-0.266091,-0.266091,-0.221443,-0.200336,-0.203753,1.120217,-0.414482,-0.142448,-0.648724,1.25103,-0.64542,1.718613,-0.551756,-0.053834,-0.593252,-0.499841,-0.496651,-0.051397,-0.507135,2.00811,-0.502893,-0.516259,-0.484517,-0.501434



üìä Statistiques apr√®s scaling:


Unnamed: 0,mean,std
Age,3.985706e-17,1.000042
Gender,-2.053243e-17,1.000042
Reviews_Read,-4.951938e-17,1.000042
Price,-6.824012e-17,1.000042
Discount,9.903876e-17,1.000042
Items_In_Cart,2.174022e-17,1.000042
Email_Interaction,-1.074933e-16,1.000042
Socioeconomic_Status_Score,1.237985e-16,1.000042
Engagement_Score,-2.071359e-16,1.000042
AB_Bucket,2.023048e-17,1.000042


VALIDATION RAPIDE - LOGISTIC REGRESSION


In [65]:
# === SAVE THE PREPROCESSED DATASETS ===
print("=" * 60, "SAUVEGARDE DES DONN√âES PREPROCESSED", "=" * 60, sep="\n")

# `os` is already imported in the notebook's first cell.
os.makedirs("../data/processed", exist_ok=True)

# CatBoost bundle: matrices, targets and the categorical column positions/names.
catboost_data = {
    "X_train": X_train_cb,
    "y_train": y_train_cb,
    "X_val": X_val_cb,
    "y_val": y_val_cb,
    "X_test": X_test_cb,
    "cat_indices": cat_indices,
    "cat_features_names": [
        name for name in X_train_cb.columns if name in cat_features
    ],
}
joblib.dump(catboost_data, "../data/processed/catboost_ready.pkl", protocol=4)
print("‚úÖ Donn√©es CatBoost export√©es dans 'catboost_ready.pkl'")

# Classic-ML bundle: encoded/scaled matrices, targets and the fitted scaler.
classic_data = {
    "X_train": X_train_cls,
    "y_train": y_train_cls,
    "X_val": X_val_cls,
    "y_val": y_val_cls,
    "X_test": X_test_cls,
    "scaler": scaler,
}
joblib.dump(classic_data, "../data/processed/classic_ready.pkl", protocol=4)
print("‚úÖ Donn√©es Classic ML export√©es dans 'classic_ready.pkl'")

# Summary of what was saved; the test matrices may be None.
print("", "=" * 60, "R√âSUM√â DES SAUVEGARDES", "=" * 60, sep="\n")

cb_test_shape = X_test_cb.shape if X_test_cb is not None else "Non disponible"
print("üì¶ CatBoost ready:")
print(f"   Train: {X_train_cb.shape}")
print(f"   Val: {X_val_cb.shape}")
print(f"   Test: {cb_test_shape}")
print(f"   Cat indices: {cat_indices}")

cls_test_shape = X_test_cls.shape if X_test_cls is not None else "Non disponible"
print("\nüì¶ Classic ready:")
print(f"   Train: {X_train_cls.shape}")
print(f"   Val: {X_val_cls.shape}")
print(f"   Test: {cls_test_shape}")

SAUVEGARDE DES DONN√âES PREPROCESSED
‚úÖ Donn√©es CatBoost export√©es dans 'catboost_ready.pkl'
‚úÖ Donn√©es Classic ML export√©es dans 'classic_ready.pkl'

R√âSUM√â DES SAUVEGARDES
üì¶ CatBoost ready:
   Train: (11766, 48)
   Val: (1969, 48)
   Test: Non disponible
   Cat indices: [9, 7, 10, 11, 5]

üì¶ Classic ready:
   Train: (11766, 62)
   Val: (1969, 62)
   Test: Non disponible
