In [1]:
# %%
# === IMPORTS ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
import joblib
import os

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas deprecation /
# FutureWarning messages raised by the cells below.
warnings.filterwarnings("ignore")

# pandas display limits: no cap on printed columns, at most 100 printed rows.
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
):
    pd.set_option(option_name, option_value)

# seaborn theme applied to any figure drawn later in the notebook.
sns.set_style("whitegrid")

print("âœ… Imports OK")

# %%
# === DATA LOADING ===
# Banner: the three items are printed one per line.
print("=" * 60, "CHARGEMENT DES DONNÃ‰ES", "=" * 60, sep="\n")

# Interim training dataset; the path is relative to the working directory.
df = pd.read_csv("../data/interim/train_dataset_M1_interim.csv")

print(f"Dataset shape: {df.shape}")

# Range covered by the "Day" column.
first_day = df["Day"].min()
last_day = df["Day"].max()
print(f"PÃ©riode Day: {first_day} - {last_day}")

# Relative frequency of each target value, shown only when the label exists.
if "Purchase" in df.columns:
    print("\nTarget balance (Purchase):")
    target_shares = df["Purchase"].value_counts(normalize=True)
    print(target_shares)

# %%
# === TRAIN/VAL SPLIT ===
print("", "=" * 60, "SPLIT TEMPOREL : TRAIN / VAL", "=" * 60, sep="\n")

# Time-based hold-out: rows with Day <= 60 form the training set, rows with
# Day > 60 form the validation set. Both are copies, so the feature
# engineering cells can add columns without touching `df`.
train = df.loc[df["Day"] <= 60].copy()
val = df.loc[df["Day"] > 60].copy()

n_train_rows = len(train)
n_val_rows = len(val)
print(f"Train: Days 1-60   â†’ {n_train_rows} lignes")
print(f"Val:   Days 61-{int(df['Day'].max())}  â†’ {n_val_rows} lignes")

# Mean of the target in each split, shown only when the label exists.
if "Purchase" in df.columns:
    train_rate = train["Purchase"].mean()
    val_rate = val["Purchase"].mean()
    print("\nDistribution Purchase:")
    print(f"  Train: {train_rate:.2%}")
    print(f"  Val:   {val_rate:.2%}")

# %%
# === FEATURE ENGINEERING - PRICE ===
print("\n" + "=" * 60)
print("FEATURE ENGINEERING - Prix")
print("=" * 60)

# Price after discount; Discount is divided by 100, i.e. read as a percentage.
for data in [train, val]:
    data["Net_Price"] = data["Price"] * (1 - data["Discount"] / 100)

# Learn the quintile edges on train ONLY and reuse them for val. Calling
# pd.qcut separately on each split would derive different edges per split, so
# the same Price_Bucket code would describe different price ranges in train
# and in val.
price_bin_edges = pd.qcut(
    train["Price"], 5, labels=False, duplicates="drop", retbins=True
)[1].astype(float)

# Open the outer edges so validation prices outside the range seen in train
# still land in the first / last bucket instead of becoming NaN. Train
# assignments are unchanged: the train minimum and maximum already belonged to
# those two buckets.
price_bin_edges[0] = -np.inf
price_bin_edges[-1] = np.inf

for data in [train, val]:
    data["Price_Bucket"] = pd.cut(
        data["Price"], bins=price_bin_edges, labels=False, include_lowest=True
    )

print("âœ… Features prix crÃ©Ã©es")

# %%
# === FEATURE ENGINEERING - ENGAGEMENT ===
print("", "=" * 60, "FEATURE ENGINEERING - Engagement", "=" * 60, sep="\n")

# New column name -> source column multiplied by Engagement_Score.
engagement_products = {
    "Email_x_Engagement": "Email_Interaction",
    "Cart_x_Engagement": "Items_In_Cart",
}

# Columns are added in place to both splits, in the mapping's order.
for frame in (train, val):
    for new_col, source_col in engagement_products.items():
        frame[new_col] = frame[source_col] * frame["Engagement_Score"]

print("âœ… Features engagement crÃ©Ã©es")

# %%
# === FEATURE ENGINEERING - SEGMENTS ===
print("", "=" * 60, "FEATURE ENGINEERING - Segments", "=" * 60, sep="\n")

# Category codes flagged by the HighValue_Category indicator.
high_value_categories = [0.0, 1.0, 2.0]

for frame in (train, val):
    # NOTE(review): if Campaign_Period is a float column holding NaN,
    # astype(bool) turns those NaN into True, and this runs before the
    # missing-indicator cell - confirm the column has no missing values.
    frame["Campaign_Period"] = frame["Campaign_Period"].astype(bool)

    # 1 when Category is one of the listed codes, 0 otherwise (NaN gives 0).
    is_high_value = frame["Category"].isin(high_value_categories)
    frame["HighValue_Category"] = is_high_value.astype(int)

print("âœ… Features segments crÃ©Ã©es")

# %%
# === TARGETED INTERACTIONS (ONLY THE BEST ONES) ===
print("", "=" * 60, "ðŸš€ INTERACTIONS QUADRATIQUES CIBLÃ‰ES", "=" * 60, sep="\n")

# The progress messages sit inside the loop, so they are printed once per
# split (train, then val).
for frame in (train, val):
    cart_items = frame["Items_In_Cart"]
    engagement = frame["Engagement_Score"]
    net_price = frame["Net_Price"]

    # 1. Squared terms
    print("   1. Features quadratiques...")
    frame["Items_Cart_Squared"] = cart_items ** 2
    frame["Engagement_Squared"] = engagement ** 2

    # 2. Ratios; the +1 in the denominator avoids division by zero when the
    #    denominator column is 0.
    print("   2. Ratios ciblÃ©s...")
    frame["Cart_per_Review"] = cart_items / (frame["Reviews_Read"] + 1)
    frame["Engagement_per_Price"] = engagement / (net_price + 1)

    # 3. Campaign interactions: the campaign flag is cast to 0/1 and used as
    #    a multiplier.
    print("   3. Interactions campagne...")
    campaign_flag = frame["Campaign_Period"].astype(int)
    frame["Engagement_x_Campaign"] = engagement * campaign_flag
    frame["Price_x_Campaign"] = net_price * campaign_flag

print("âœ… 6 interactions ciblÃ©es crÃ©Ã©es")

# %%
# === MISSING INDICATORS ===
print("", "=" * 60, "MISSING INDICATORS", "=" * 60, sep="\n")

# Percentage of missing values per column, measured on train only.
missing_pct = train.isna().sum() / len(train) * 100

# Keep columns with more than 1% missing values, except "Day".
# NOTE(review): only "Day" is excluded, so any other column (engineered
# features included) qualifies once it crosses the threshold.
cols_with_missing = [
    name for name in missing_pct[missing_pct > 1].index.tolist() if name != "Day"
]

# One 0/1 "<column>_missing" flag per selected column, added to both splits.
for frame in (train, val):
    for name in cols_with_missing:
        frame[f"{name}_missing"] = frame[name].isna().astype(int)

print(f"âœ… {len(cols_with_missing)} missing indicators crÃ©Ã©s")

# %%
# === IMPUTATION ===
print("\n" + "=" * 60)
print("IMPUTATION")
print("=" * 60)

# Medians are learned on train only, for numeric columns that have at least
# one missing value in train.
median_values = {}
numeric_cols = train.select_dtypes(include=[np.number]).columns

for col in numeric_cols:
    if train[col].isnull().any():
        median_values[col] = train[col].median()

for data in [train, val]:
    for col in numeric_cols:
        if data[col].isnull().any():
            # A column can be complete in train but have gaps in val; fall
            # back to the train median in that case so no statistic is ever
            # computed on the validation rows themselves.
            fill_value = median_values.get(col, train[col].median())
            # Assign the result back instead of `data[col].fillna(...,
            # inplace=True)`: the chained in-place form operates on a
            # temporary Series, triggers a pandas FutureWarning and leaves
            # the frame untouched once Copy-on-Write is enabled.
            data[col] = data[col].fillna(fill_value)

    # Non-numeric columns get an explicit "Unknown" level.
    categorical_cols = data.select_dtypes(include=["object", "bool"]).columns
    for col in categorical_cols:
        if data[col].isnull().any():
            data[col] = data[col].fillna("Unknown")

print(f"âœ… Imputation terminÃ©e")

# %%
# === RARE CATEGORIES ===
print("", "=" * 60, "RARE CATEGORIES", "=" * 60, sep="\n")

# A level is "rare" when its share of the train rows is below this value.
threshold = 0.01
categorical_cols = train.select_dtypes(include=["object"]).columns

# column name -> list of rare levels found in train
rare_handling = {}

for name in categorical_cols:
    # Session_ID is skipped explicitly.
    if name != "Session_ID":
        shares = train[name].value_counts(normalize=True)
        rare_levels = shares.loc[shares < threshold].index.tolist()
        if rare_levels:
            rare_handling[name] = rare_levels

# Replace the rare levels (as learned on train) by "Other" in both splits.
for frame in (train, val):
    for name, rare_levels in rare_handling.items():
        if name not in frame.columns:
            continue
        frame[name] = frame[name].apply(
            lambda level, rare=rare_levels: level if level not in rare else "Other"
        )

print("âœ… Rare categories regroupÃ©es")

# %%
# === RADICAL CLEAN-UP - REMOVAL OF USELESS FEATURES ===
print(
    "",
    "=" * 60,
    "ðŸ§¹ NETTOYAGE RADICAL - SUPPRESSION FEATURES < 1%",
    "=" * 60,
    sep="\n",
)

# Features to remove (importance < 1% or redundant)
low_value_features = [
    "Socioeconomic_Status_Score",
    "AB_Bucket",
    "Discount",
    "Age",
    "Gender",
    "Referral_Source",
    "Payment_Method",
    "Time_of_Day",
    "Price_Sine",
    "Email_Interaction",  # Captured by Email_x_Engagement
]

# Drop whichever listed columns exist in each split, and record every dropped
# name once, in the order it was first encountered.
features_removed = []
for frame in (train, val):
    present = [name for name in low_value_features if name in frame.columns]
    frame.drop(columns=present, inplace=True)
    for name in present:
        if name not in features_removed:
            features_removed.append(name)

print(f"âœ… {len(features_removed)} features supprimÃ©es:")
for name in features_removed:
    print(f"   - {name}")

remaining = len(train.columns)
print(f"\nðŸ“Š Features restantes: {remaining}")

# %%
# === CATBOOST PIPELINE ===
print("", "=" * 60, "PIPELINE CATBOOST", "=" * 60, sep="\n")

# Columns declared as categorical; they are cast to str in place in both
# splits.
cat_features = ["Device_Type", "Category"]

for frame in (train, val):
    for name in cat_features:
        if name not in frame.columns:
            continue
        frame[name] = frame[name].astype(str)

# Columns removed from the feature matrices (missing ones are ignored).
cols_to_drop_cb = ["Session_ID", "Day", "Purchase", "id"]

X_train = train.drop(columns=cols_to_drop_cb, errors="ignore")
X_val = val.drop(columns=cols_to_drop_cb, errors="ignore")

# Targets are None when the label column is absent.
y_train = None if "Purchase" not in train.columns else train["Purchase"]
y_val = None if "Purchase" not in val.columns else val["Purchase"]

# Positions of the categorical columns inside X_train, in cat_features order.
cat_indices = [
    X_train.columns.get_loc(name)
    for name in cat_features
    if name in X_train.columns
]

n_features = X_train.shape[1]
print("âœ… Pipeline CatBoost prÃ©parÃ©:")
print(f"   X_train: {X_train.shape}")
print(f"   X_val:   {X_val.shape}")
print(f"   Features: {n_features}")
print(f"   Categorical: {cat_features}")

# %%
# === CLASSIC PIPELINE ===
print("\n" + "=" * 60)
print("PIPELINE CLASSIC")
print("=" * 60)

# Work on copies so the frames used by the CatBoost cell stay untouched.
train_classic = train.copy()
val_classic = val.copy()

for data in [train_classic, val_classic]:
    data["Campaign_Period"] = data["Campaign_Period"].astype(int)

cols_onehot = ["Device_Type", "Category"]

# The dummy layout (including which level drop_first removes) is defined by
# train alone.
train_classic = pd.get_dummies(train_classic, columns=cols_onehot, drop_first=True)

# Encode val WITHOUT drop_first and then conform it to the train layout.
# Applying drop_first independently on val can remove a different baseline
# level when val does not contain train's first level, which silently shifts
# the encoding. reindex() drops levels unseen in train, adds train-only
# columns filled with 0 and puts the columns in train order.
val_classic = pd.get_dummies(val_classic, columns=cols_onehot, drop_first=False)
val_classic = val_classic.reindex(columns=train_classic.columns, fill_value=0)

# cols_to_drop_cb is the list defined in the CatBoost cell.
X_train_cls = train_classic.drop(columns=cols_to_drop_cb, errors="ignore")
y_train_cls = train_classic["Purchase"] if "Purchase" in train_classic.columns else None

X_val_cls = val_classic.drop(columns=cols_to_drop_cb, errors="ignore")
y_val_cls = val_classic["Purchase"] if "Purchase" in val_classic.columns else None

# The scaler is fitted on train only and merely applied to val.
scaler = StandardScaler()
X_train_cls_scaled = scaler.fit_transform(X_train_cls)
X_val_cls_scaled = scaler.transform(X_val_cls)

# Rebuild the frames WITH the original row index. Without `index=` the scaled
# frames get a fresh 0..n-1 RangeIndex while y_train_cls / y_val_cls keep the
# labels inherited from `df`, so any index-aligned operation between X and y
# would pair the wrong rows.
X_train_cls = pd.DataFrame(
    X_train_cls_scaled, columns=X_train_cls.columns, index=X_train_cls.index
)
X_val_cls = pd.DataFrame(
    X_val_cls_scaled, columns=X_val_cls.columns, index=X_val_cls.index
)

print(f"âœ… Pipeline Classic prÃ©parÃ©:")
print(f"   X_train: {X_train_cls.shape}")
print(f"   X_val:   {X_val_cls.shape}")

# %%
# === SAVE ===
print("", "=" * 60, "SAUVEGARDE", "=" * 60, sep="\n")

# Make sure the output folder exists before writing the two bundles.
os.makedirs("../data/processed", exist_ok=True)

# Bundle for the CatBoost models: unscaled features plus the categorical
# column positions and names (names listed in X_train column order).
catboost_data = dict(
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    cat_indices=cat_indices,
    cat_features_names=[name for name in X_train.columns if name in cat_features],
)
joblib.dump(catboost_data, "../data/processed/catboost_ready.pkl", protocol=4)
print("âœ… CatBoost: 'catboost_ready.pkl'")

# Bundle for the classic models: scaled one-hot features and the fitted
# scaler.
classic_data = dict(
    X_train=X_train_cls,
    y_train=y_train_cls,
    X_val=X_val_cls,
    y_val=y_val_cls,
    scaler=scaler,
)
joblib.dump(classic_data, "../data/processed/classic_ready.pkl", protocol=4)
print("âœ… Classic: 'classic_ready.pkl'")

# Closing summary.
n_catboost_features = X_train.shape[1]
print("", "=" * 60, "ðŸŽ‰ PREPROCESSING OPTIMISÃ‰ TERMINÃ‰ !", "=" * 60, sep="\n")
print(f"ðŸ“¦ CatBoost: {n_catboost_features} features (vs 64 avant)")
print("ðŸš€ Gain attendu: +1-2% F1")
print("=" * 60)

✅ Imports OK
CHARGEMENT DES DONNÉES
Dataset shape: (13455, 21)
Période Day: 1 - 70

Target balance (Purchase):
Purchase
0    0.632553
1    0.367447
Name: proportion, dtype: float64

SPLIT TEMPOREL : TRAIN / VAL
Train: Days 1-60   → 11521 lignes
Val:   Days 61-70  → 1934 lignes

Distribution Purchase:
  Train: 37.37%
  Val:   33.04%

FEATURE ENGINEERING - Prix
✅ Features prix créées

FEATURE ENGINEERING - Engagement
✅ Features engagement créées

FEATURE ENGINEERING - Segments
✅ Features segments créées

🚀 INTERACTIONS QUADRATIQUES CIBLÉES
   1. Features quadratiques...
   2. Ratios ciblés...
   3. Interactions campagne...
   1. Features quadratiques...
   2. Ratios ciblés...
   3. Interactions campagne...
✅ 6 interactions ciblées créées

MISSING INDICATORS
✅ 24 missing indicators créés

IMPUTATION
✅ Imputation terminée

RARE CATEGORIES
✅ Rare categories regroupées

🧹 NETTOYAGE RADICAL - SUPPRESSION FEATURES < 1%
✅ 10 features supprimées:
  