In [1]:
# ============================================================================
# VERSION 1: Baseline LogisticRegression - param√®tres par d√©faut
# ============================================================================
# Objectif: mod√®le le plus simple possible, sans aucune gestion du d√©s√©quilibre
#           ni ajustement de seuil
# Validation: StratifiedKFold (5 folds) pour conserver la proportion de classes
# Mod√®le: LogisticRegression() avec max_iter=1000, random_state=42
# Features: X_train, y_train, X_test, y_test (seront scal√©s avec StandardScaler)
# Seuil fixe: 0.5
# M√©triques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1
# Co√ªt m√©tier: 10 * FN + 1 * FP (avec seuil=0.5)
# MLflow: run_name="V1_LogisticRegression_Baseline"
# Tags: version="1", model="LogisticRegression"

import datetime
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, confusion_matrix
from sklearn.exceptions import ConvergenceWarning
import warnings

warnings.filterwarnings('ignore', message='.*Failed to resolve installed pip version.*')
warnings.filterwarnings('ignore', category=FutureWarning, message='.*penalty.*deprecated.*')
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# ============================================================================
# CONFIGURATION
# ============================================================================
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "OC_P6_Credit_Scoring"

PROJECT_VERSION = "1.0"
MODEL_NAME = "LogisticRegression"
NOTEBOOK_NAME = "04_regression"
RUN_DATE = datetime.datetime.now()

DATA_PATH = "../data/processed/"
TRAIN_FILE = "features_train.csv"
TEST_FILE = "features_test.csv"

# Configuration du mod√®le baseline (param√®tres par d√©faut)
MODEL_CONFIG_V1 = {
    "max_iter": 1000,
    "random_state": 42
}

RANDOM_STATE = 42
THRESHOLD_FIXED = 0.5  # Seuil fixe pour les pr√©dictions

print("Configuration charg√©e avec succ√®s !")
print(f"MLflow Experiment: {MLFLOW_EXPERIMENT_NAME}")
print(f"Model: {MODEL_NAME}")
print(f"Fixed threshold: {THRESHOLD_FIXED}")


Configuration charg√©e avec succ√®s !
MLflow Experiment: OC_P6_Credit_Scoring
Model: LogisticRegression
Fixed threshold: 0.5


In [2]:
# Configuration MLflow
from src.mlflow_config import configure_mlflow

mlflow = configure_mlflow(autolog=False)


In [3]:
# ============================================================================
# CHARGEMENT ET PR√âPARATION DES DONN√âES
# ============================================================================

# Chargement des donn√©es d'entra√Ænement
X_train = pd.read_csv(DATA_PATH + TRAIN_FILE)
y_train = X_train.pop("TARGET")

# Chargement des donn√©es de test
X_test = pd.read_csv(DATA_PATH + TEST_FILE)
y_test = X_test.pop("TARGET")

print(f"‚úì Donn√©es charg√©es:")
print(f"  X_train: {X_train.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  X_test: {X_test.shape}")
print(f"  y_test: {y_test.shape}")
print(f"  Classe 0: {(y_train==0).sum()}, Classe 1: {(y_train==1).sum()}")

# ============================================================================
# NETTOYAGE DES DONN√âES: Suppression des colonnes avec 100% NaN
# ============================================================================
from sklearn.impute import SimpleImputer

# V√©rifier les NaN
nan_train = X_train.isna().sum().sum()
nan_test = X_test.isna().sum().sum()

print(f"\n‚úì V√©rification initiale des NaN:")
print(f"  Total NaN in X_train: {nan_train}")
print(f"  Total NaN in X_test: {nan_test}")

# Identifier et supprimer les colonnes enti√®rement NaN dans X_train
empty_cols_train = X_train.columns[X_train.isna().all()].tolist()
empty_cols_test = X_test.columns[X_test.isna().all()].tolist()

print(f"\n‚úì Identification des colonnes vides (100% NaN):")
print(f"  Colonnes vides dans X_train: {len(empty_cols_train)}")
print(f"  Colonnes vides dans X_test: {len(empty_cols_test)}")

# Supprimer les colonnes vides (union des deux ensembles)
cols_to_drop = set(empty_cols_train) | set(empty_cols_test)
if cols_to_drop:
    print(f"  Suppression de {len(cols_to_drop)} colonnes vides...")
    X_train = X_train.drop(columns=list(cols_to_drop))
    X_test = X_test.drop(columns=list(cols_to_drop))
    print(f"  X_train apr√®s suppression: {X_train.shape}")
    print(f"  X_test apr√®s suppression: {X_test.shape}")

# ============================================================================
# IMPUTATION DES VALEURS NaN RESTANTES AVEC LA M√âDIANE
# ============================================================================

nan_train_remaining = X_train.isna().sum().sum()
nan_test_remaining = X_test.isna().sum().sum()

print(f"\n‚úì Imputation des NaN restants:")
print(f"  NaN restants in X_train: {nan_train_remaining}")
print(f"  NaN restants in X_test: {nan_test_remaining}")

if nan_train_remaining > 0 or nan_test_remaining > 0:
    print(f"  Imputation avec la m√©diane...")
    
    # Cr√©er un imputer avec strat√©gie m√©diane
    imputer = SimpleImputer(strategy='median')
    
    # Fit sur X_train et transformer X_train et X_test
    X_train_imputed = imputer.fit_transform(X_train)
    X_test_imputed = imputer.transform(X_test)
    
    # Reconvertir en DataFrame
    X_train = pd.DataFrame(X_train_imputed, columns=X_train.columns)
    X_test = pd.DataFrame(X_test_imputed, columns=X_test.columns)
    
    print(f"  X_train apr√®s imputation: {X_train.shape}")
    print(f"  X_test apr√®s imputation: {X_test.shape}")
    print(f"  V√©rification post-imputation:")
    print(f"    NaN in X_train: {X_train.isna().sum().sum()}")
    print(f"    NaN in X_test: {X_test.isna().sum().sum()}")
else:
    print(f"  Aucun NaN √† imputer !")

print(f"\n‚úì Donn√©es finales apr√®s nettoyage:")
print(f"  X_train: {X_train.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  X_test: {X_test.shape}")
print(f"  y_test: {y_test.shape}")


‚úì Donn√©es charg√©es:
  X_train: (10000, 741)
  y_train: (10000,)
  X_test: (10000, 741)
  y_test: (10000,)
  Classe 0: 9225, Classe 1: 775

‚úì V√©rification initiale des NaN:
  Total NaN in X_train: 5146964
  Total NaN in X_test: 5106144

‚úì Identification des colonnes vides (100% NaN):
  Colonnes vides dans X_train: 17
  Colonnes vides dans X_test: 1
  Suppression de 17 colonnes vides...
  X_train apr√®s suppression: (10000, 724)
  X_test apr√®s suppression: (10000, 724)

‚úì Imputation des NaN restants:
  NaN restants in X_train: 4976964
  NaN restants in X_test: 4936311
  Imputation avec la m√©diane...
  X_train apr√®s imputation: (10000, 724)
  X_test apr√®s imputation: (10000, 724)
  V√©rification post-imputation:
    NaN in X_train: 0
    NaN in X_test: 0

‚úì Donn√©es finales apr√®s nettoyage:
  X_train: (10000, 724)
  y_train: (10000,)
  X_test: (10000, 724)
  y_test: (10000,)


In [4]:
# ============================================================================
# STANDARDISATION DES FEATURES
# ============================================================================
# La r√©gression logistique est sensible √† l'√©chelle des features
# Utiliser StandardScaler (fit sur train, transform sur test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reconvertir en DataFrame pour conserver les noms de colonnes
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

print(f"\n‚úì Features standardis√©es (StandardScaler):")
print(f"  Shape train: {X_train_scaled.shape}")
print(f"  Mean: {X_train_scaled.mean().mean():.8f} (‚âà 0)")
print(f"  Std:  {X_train_scaled.std().mean():.6f} (‚âà 1)")



‚úì Features standardis√©es (StandardScaler):
  Shape train: (10000, 724)
  Mean: -0.00000000 (‚âà 0)
  Std:  0.874353 (‚âà 1)


In [5]:
# ============================================================================
# CROSS-VALIDATION: LogisticRegression V1 Baseline
# ============================================================================
# StratifiedKFold (5 folds) pour conserver la proportion de classes
# Seuil fixe = 0.5 pour les pr√©dictions (pas d'optimisation)

from src.mlflow_config import configure_mlflow

mlflow = configure_mlflow(autolog=False)

# Terminer tout run actif avant de commencer
mlflow.end_run()

RUN_NAME_V1 = "V1_LogisticRegression_Baseline"

fold_results = []

with mlflow.start_run(run_name=RUN_NAME_V1):
    # ========== Logging des param√®tres et tags ==========
    mlflow.log_params(MODEL_CONFIG_V1)
    mlflow.set_tag("version", "1")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("notebook", NOTEBOOK_NAME)
    mlflow.set_tag("phase", "baseline_cv")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("scaling", "StandardScaler")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # ========== StratifiedKFold (5 folds) ==========
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), start=1):
        X_tr, X_val = X_train_scaled.iloc[train_idx], X_train_scaled.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # ========== Entra√Ænement ==========
        model = LogisticRegression(**MODEL_CONFIG_V1)
        model.fit(X_tr, y_tr)
        
        # ========== Pr√©dictions ==========
        y_val_proba = model.predict_proba(X_val)[:, 1]  # Probabilit√©s classe 1
        y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)  # Seuil fixe 0.5
        
        # ========== M√©triques ==========
        auc = roc_auc_score(y_val, y_val_proba)
        accuracy = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)
        
        # ========== Co√ªt m√©tier (seuil=0.5) ==========
        tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
        cost = 10 * fn + 1 * fp
        
        fold_results.append({
            "fold": fold_idx,
            "auc": auc,
            "accuracy": accuracy,
            "f1_score": f1,
            "recall_class1": recall,
            "business_cost_min": cost,
            "optimal_threshold": THRESHOLD_FIXED,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn
        })
        
        print(f"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | "
              f"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}")
    
    # ========== Agr√©gation des r√©sultats ==========
    cv_results_df = pd.DataFrame(fold_results)
    
    metrics_mean = {
        "auc": cv_results_df["auc"].mean(),
        "f1_score": cv_results_df["f1_score"].mean(),
        "recall_class1": cv_results_df["recall_class1"].mean(),
        "business_cost_min": cv_results_df["business_cost_min"].mean(),
        "optimal_threshold": THRESHOLD_FIXED,
    }
    
    metrics_std = {
        "auc": cv_results_df["auc"].std(),
        "f1_score": cv_results_df["f1_score"].std(),
        "recall_class1": cv_results_df["recall_class1"].std(),
        "business_cost_min": cv_results_df["business_cost_min"].std(),
    }
    
    # ========== Logging dans MLFlow ==========
    # Utiliser les M√äMES noms que le sch√©ma standard MLflow (sans pr√©fixe)
    mlflow.log_metric("auc", metrics_mean["auc"])
    mlflow.log_metric("f1_score", metrics_mean["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean["optimal_threshold"])
    
    # Log artefact JSON avec d√©tails par fold
    mlflow.log_dict(cv_results_df.to_dict(orient="records"), "cv_results_per_fold.json")
    
    print("\n‚úì Cross-Validation LogisticRegression V1 termin√©e")
    print(f"  AUC moyen: {metrics_mean['auc']:.4f} ¬± {metrics_std['auc']:.4f}")
    print(f"  F1 moyen: {metrics_mean['f1_score']:.4f} ¬± {metrics_std['f1_score']:.4f}")
    print(f"  Recall moyen: {metrics_mean['recall_class1']:.4f} ¬± {metrics_std['recall_class1']:.4f}")
    print(f"  Co√ªt m√©tier moyen: {metrics_mean['business_cost_min']:.2f} ¬± {metrics_std['business_cost_min']:.2f}")
    print(f"  Seuil optimal: {metrics_mean['optimal_threshold']:.2f}")


Fold 1/5 | AUC=0.6995 | Acc=0.9170 | F1=0.0568 | Recall=0.0323 | Cost=1516
Fold 2/5 | AUC=0.6972 | Acc=0.9225 | F1=0.1243 | Recall=0.0710 | Cost=1451
Fold 3/5 | AUC=0.7036 | Acc=0.9160 | F1=0.0562 | Recall=0.0323 | Cost=1518
Fold 4/5 | AUC=0.7061 | Acc=0.9140 | F1=0.0444 | Recall=0.0258 | Cost=1531
Fold 5/5 | AUC=0.6984 | Acc=0.9145 | F1=0.0339 | Recall=0.0194 | Cost=1539

‚úì Cross-Validation LogisticRegression V1 termin√©e
  AUC moyen: 0.7010 ¬± 0.0038
  F1 moyen: 0.0631 ¬± 0.0355
  Recall moyen: 0.0361 ¬± 0.0202
  Co√ªt m√©tier moyen: 1511.00 ¬± 34.85
  Seuil optimal: 0.50
üèÉ View run V1_LogisticRegression_Baseline at: http://127.0.0.1:5000/#/experiments/1/runs/00e6a5708f0340678afb3fe611ba11c8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [6]:
# ============================================================================
# TABLEAU R√âCAPITULATIF: M√©triques par fold
# ============================================================================

print("\n" + "="*110)
print("TABLEAU R√âCAPITULATIF: M√©triques par fold")
print("="*110)

display_df = cv_results_df[["fold", "auc", "f1_score", "recall_class1", "business_cost_min", "optimal_threshold"]].copy()
print(display_df.to_string(index=False))

# Afficher les moyennes et √©cart-types
print("\n" + "="*110)
print("R√âSUM√â GLOBAL: Moyennes et √âcart-types sur 5 folds")
print("="*110)

summary_data = {
    "M√©trique": ["AUC-ROC", "F1-Score", "Recall Classe 1", "Co√ªt M√©tier Min", "Seuil Optimal"],
    "Moyenne": [
        f"{metrics_mean['auc']:.4f}",
        f"{metrics_mean['f1_score']:.4f}",
        f"{metrics_mean['recall_class1']:.4f}",
        f"{metrics_mean['business_cost_min']:.2f}",
        f"{metrics_mean['optimal_threshold']:.2f}",
    ],
    "√âcart-type": [
        f"{metrics_std['auc']:.4f}",
        f"{metrics_std['f1_score']:.4f}",
        f"{metrics_std['recall_class1']:.4f}",
        f"{metrics_std['business_cost_min']:.2f}",
        "-",
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))
print("="*110)



TABLEAU R√âCAPITULATIF: M√©triques par fold
 fold      auc  f1_score  recall_class1  business_cost_min  optimal_threshold
    1 0.699467  0.056818       0.032258               1516                0.5
    2 0.697180  0.124294       0.070968               1451                0.5
    3 0.703589  0.056180       0.032258               1518                0.5
    4 0.706146  0.044444       0.025806               1531                0.5
    5 0.698425  0.033898       0.019355               1539                0.5

R√âSUM√â GLOBAL: Moyennes et √âcart-types sur 5 folds
       M√©trique Moyenne √âcart-type
        AUC-ROC  0.7010     0.0038
       F1-Score  0.0631     0.0355
Recall Classe 1  0.0361     0.0202
Co√ªt M√©tier Min 1511.00      34.85
  Seuil Optimal    0.50          -


In [7]:
# ============================================================================
# ENTRA√éNEMENT FINAL: LogisticRegression sur l'ensemble train complet
# ============================================================================

final_model_v1 = LogisticRegression(**MODEL_CONFIG_V1)
final_model_v1.fit(X_train_scaled, y_train)

print("\n‚úì Mod√®le final LogisticRegression V1 entra√Æn√© sur l'ensemble train complet")
print(f"  Nombre de features: {X_train_scaled.shape[1]}")
print(f"  Intercept: {final_model_v1.intercept_[0]:.6f}")
print(f"  Norme des coefficients: {np.linalg.norm(final_model_v1.coef_):.6f}")



‚úì Mod√®le final LogisticRegression V1 entra√Æn√© sur l'ensemble train complet
  Nombre de features: 724
  Intercept: -3.504665
  Norme des coefficients: 3.948967


In [8]:
# ============================================================================
# √âVALUATION SUR L'ENSEMBLE TEST
# ============================================================================
# Utiliser le m√™me seuil fixe de 0.5

# V√©rifier et nettoyer les NaN dans y_test
print(f"‚úì V√©rification initiale y_test:")
print(f"  y_test shape: {y_test.shape}")
print(f"  NaN in y_test: {y_test.isna().sum()}")

# V√©rifier si y_test est enti√®rement NaN
if y_test.isna().sum() == len(y_test):
    print(f"\n‚ö†Ô∏è  ATTENTION: y_test est enti√®rement NaN - √âvaluation test IGNOR√âE")
    print(f"   Les donn√©es de test n'ont pas de cible valide.")
    test_auc = None
    test_accuracy = None
    test_f1 = None
    test_recall = None
    test_cost = None
    tp_test = None
    fp_test = None
    fn_test = None
    tn_test = None
    
else:
    # Supprimer les lignes avec NaN dans y_test
    if y_test.isna().sum() > 0:
        print(f"  Suppression de {y_test.isna().sum()} lignes avec NaN dans y_test...")
        mask_test_clean = ~y_test.isna()
        y_test = y_test[mask_test_clean]
        X_test_scaled = X_test_scaled[mask_test_clean]
        print(f"  y_test apr√®s suppression: {y_test.shape}")
        print(f"  X_test_scaled apr√®s suppression: {X_test_scaled.shape}")

    # R√©initialiser les indices
    y_test.reset_index(drop=True, inplace=True)
    X_test_scaled.reset_index(drop=True, inplace=True)

    # Pr√©dictions sur le test
    y_test_proba = final_model_v1.predict_proba(X_test_scaled)[:, 1]
    y_test_pred = (y_test_proba >= THRESHOLD_FIXED).astype(int)

    # M√©triques sur le test
    test_auc = roc_auc_score(y_test, y_test_proba)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_f1 = f1_score(y_test, y_test_pred)
    test_recall = recall_score(y_test, y_test_pred)

    # Co√ªt m√©tier
    tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).ravel()
    test_cost = 10 * fn_test + 1 * fp_test

    print("\n" + "="*80)
    print("√âVALUATION SUR ENSEMBLE TEST (seuil=0.5)")
    print("="*80)
    print(f"AUC-ROC:        {test_auc:.4f}")
    print(f"Accuracy:       {test_accuracy:.4f}")
    print(f"F1-Score:       {test_f1:.4f}")
    print(f"Recall Classe 1: {test_recall:.4f}")
    print(f"Co√ªt M√©tier:    {test_cost:.0f}")
    print(f"\nConfusion Matrix:")
    print(f"  TP: {int(tp_test):6d}  |  FP: {int(fp_test):6d}")
    print(f"  FN: {int(fn_test):6d}  |  TN: {int(tn_test):6d}")
    print("="*80)


‚úì V√©rification initiale y_test:
  y_test shape: (10000,)
  NaN in y_test: 10000

‚ö†Ô∏è  ATTENTION: y_test est enti√®rement NaN - √âvaluation test IGNOR√âE
   Les donn√©es de test n'ont pas de cible valide.


In [9]:
# ============================================================================
# LOGGING MLFLOW: Sauvegarde des m√©triques et artefacts du mod√®le
# ============================================================================
# Logger les m√©triques CV et le mod√®le comme artefact
# IMPORTANT: Le mod√®le n'est PAS enregistr√© dans la Model Registry automatiquement

# Terminer le run CV pr√©c√©dent
mlflow.end_run()

with mlflow.start_run(run_name="V1_LogisticRegression_Test_Evaluation"):
    # Logging des param√®tres
    mlflow.log_params(MODEL_CONFIG_V1)
    
    # Tags
    mlflow.set_tag("version", "1")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("phase", "test_evaluation")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # TOUJOURS logger les m√©triques CV (pour que le mod√®le ait des m√©triques)
    mlflow.log_metric("auc", metrics_mean["auc"])
    mlflow.log_metric("f1_score", metrics_mean["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean["optimal_threshold"])
    
    # Si m√©triques test disponibles, les logger aussi (avec suffixe pour diff√©rencier)
    if test_auc is not None:
        mlflow.log_metric("test_auc", test_auc)
        mlflow.log_metric("test_f1_score", test_f1)
        mlflow.log_metric("test_recall_class1", test_recall)
        mlflow.log_metric("test_business_cost_min", test_cost)
        
        # R√©sultats test en artefact
        test_results = {
            "auc": float(test_auc),
            "f1_score": float(test_f1),
            "recall_class1": float(test_recall),
            "business_cost_min": float(test_cost),
            "optimal_threshold": float(THRESHOLD_FIXED),
            "confusion_matrix": {
                "tp": int(tp_test),
                "fp": int(fp_test),
                "fn": int(fn_test),
                "tn": int(tn_test),
            }
        }
        mlflow.log_dict(test_results, "test_evaluation.json")
        
        print(f"\n‚úì M√©triques test logg√©es")
        print(f"  Test AUC: {test_auc:.4f}")
        print(f"  Test F1: {test_f1:.4f}")
        print(f"  Test Recall: {test_recall:.4f}")
        print(f"  Test Business Cost Min: {test_cost:.0f}")
    else:
        print(f"\n‚ö†Ô∏è  ATTENTION: M√©triques test non disponibles (y_test √©tait enti√®rement NaN)")
        print(f"   Les m√©triques CV sont utilis√©es.")
        mlflow.set_tag("test_metrics_available", "false")
    
    # LOG: Sauvegarder le mod√®le comme artefact (accessible via MLflow)
    # IMPORTANT: Le mod√®le n'est PAS enregistr√© dans la Model Registry automatiquement
    # Cela doit √™tre fait manuellement via l'interface MLflow
    mlflow.sklearn.log_model(
        final_model_v1,
        artifact_path="logistic_regression_v1"
    )
    
    print(f"\n‚úì M√©triques et artefacts du mod√®le logg√©s dans MLflow")
    print(f"  AUC (CV): {metrics_mean['auc']:.4f}")
    print(f"  F1 (CV): {metrics_mean['f1_score']:.4f}")
    print(f"  Recall (CV): {metrics_mean['recall_class1']:.4f}")
    print(f"  Business Cost Min (CV): {metrics_mean['business_cost_min']:.2f}")
    print(f"\n  ‚ÑπÔ∏è  Pour enregistrer le mod√®le dans la Model Registry :")
    print(f"     - Allez √† http://127.0.0.1:5000/#/experiments/1")
    print(f"     - Trouvez le run 'V1_LogisticRegression_Test_Evaluation'")
    print(f"     - Dans l'onglet 'Artifacts', cliquez 'Register Model'")
    print(f"     - S√©lectionnez ou cr√©ez le nom 'LogisticRegression_V1'")





‚ö†Ô∏è  ATTENTION: M√©triques test non disponibles (y_test √©tait enti√®rement NaN)
   Les m√©triques CV sont utilis√©es.





‚úì M√©triques et artefacts du mod√®le logg√©s dans MLflow
  AUC (CV): 0.7010
  F1 (CV): 0.0631
  Recall (CV): 0.0361
  Business Cost Min (CV): 1511.00

  ‚ÑπÔ∏è  Pour enregistrer le mod√®le dans la Model Registry :
     - Allez √† http://127.0.0.1:5000/#/experiments/1
     - Trouvez le run 'V1_LogisticRegression_Test_Evaluation'
     - Dans l'onglet 'Artifacts', cliquez 'Register Model'
     - S√©lectionnez ou cr√©ez le nom 'LogisticRegression_V1'
üèÉ View run V1_LogisticRegression_Test_Evaluation at: http://127.0.0.1:5000/#/experiments/1/runs/b98cbeb8fddc435f998b929565c06021
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [10]:
# ============================================================================
# COMPARAISON: Cross-Validation vs Test
# ============================================================================
# V√©rifier la stabilit√© du mod√®le (g√©n√©ralisation)

print("\n" + "="*100)
print("COMPARAISON: Cross-Validation vs Test Set")
print("="*100)

if test_auc is None:
    print("\n‚ö†Ô∏è  ATTENTION: M√©triques test non disponibles")
    print("   (y_test √©tait enti√®rement NaN - √âvaluation test ignor√©e)")
    print("\n   Affichage des m√©triques de Cross-Validation uniquement:")
    print(f"   AUC moyen: {metrics_mean['auc']:.4f} ¬± {metrics_std['auc']:.4f}")
    print(f"   F1 moyen: {metrics_mean['f1_score']:.4f} ¬± {metrics_std['f1_score']:.4f}")
    print(f"   Recall moyen: {metrics_mean['recall_class1']:.4f} ¬± {metrics_std['recall_class1']:.4f}")
    print(f"   Co√ªt m√©tier moyen: {metrics_mean['business_cost_min']:.2f} ¬± {metrics_std['business_cost_min']:.2f}")
    print(f"   Seuil optimal: {metrics_mean['optimal_threshold']:.2f}")
else:
    comparison_data = {
        "M√©trique": ["AUC-ROC", "F1-Score", "Recall Classe 1", "Co√ªt M√©tier Min", "Seuil Optimal"],
        "CV Mean": [
            f"{metrics_mean['auc']:.4f}",
            f"{metrics_mean['f1_score']:.4f}",
            f"{metrics_mean['recall_class1']:.4f}",
            f"{metrics_mean['business_cost_min']:.2f}",
            f"{metrics_mean['optimal_threshold']:.2f}",
        ],
        "Test": [
            f"{test_auc:.4f}",
            f"{test_f1:.4f}",
            f"{test_recall:.4f}",
            f"{test_cost:.2f}",
            f"{THRESHOLD_FIXED:.2f}",
        ],
        "Diff (Test-CV)": [
            f"{test_auc - metrics_mean['auc']:+.4f}",
            f"{test_f1 - metrics_mean['f1_score']:+.4f}",
            f"{test_recall - metrics_mean['recall_class1']:+.4f}",
            f"{test_cost - metrics_mean['business_cost_min']:+.2f}",
            "0.00",
        ]
    }
    
    comparison_df = pd.DataFrame(comparison_data)
    print(comparison_df.to_string(index=False))

print("="*100)



COMPARAISON: Cross-Validation vs Test Set

‚ö†Ô∏è  ATTENTION: M√©triques test non disponibles
   (y_test √©tait enti√®rement NaN - √âvaluation test ignor√©e)

   Affichage des m√©triques de Cross-Validation uniquement:
   AUC moyen: 0.7010 ¬± 0.0038
   F1 moyen: 0.0631 ¬± 0.0355
   Recall moyen: 0.0361 ¬± 0.0202
   Co√ªt m√©tier moyen: 1511.00 ¬± 34.85
   Seuil optimal: 0.50


In [11]:
# ============================================================================
# VERSION 2.1: LogisticRegression avec class_weight='balanced'
# ============================================================================
# Objectif: G√©rer le d√©s√©quilibre des classes avec class_weight='balanced'
# Validation: StratifiedKFold (5 folds)
# Mod√®le: LogisticRegression(max_iter=1000, random_state=42, solver='saga', class_weight='balanced', penalty='l2')
# Features: X_train_scaled, y_train (d√©j√† scal√©es)
# Seuil fixe: 0.5
# M√©triques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1
# Co√ªt m√©tier: 10 * FN + 1 * FP (avec seuil=0.5)
# MLflow: run_name="V2_LogisticRegression_ClassWeightBalanced"
# Tags: version="2", imbalance_handling="class_weight"

from sklearn.pipeline import Pipeline

# Configuration du mod√®le V2.1 (class_weight balanced)
MODEL_CONFIG_V2_1 = {
    "max_iter": 3000,
    "random_state": 42,
    "solver": "saga",
    "class_weight": "balanced"
}

RUN_NAME_V2_1 = "V2_LogisticRegression_ClassWeightBalanced"

fold_results_v2_1 = []

# Terminer tout run actif
mlflow.end_run()

with mlflow.start_run(run_name=RUN_NAME_V2_1):
    # ========== Logging des param√®tres et tags ==========
    mlflow.log_params(MODEL_CONFIG_V2_1)
    mlflow.set_tag("version", "2")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("notebook", NOTEBOOK_NAME)
    mlflow.set_tag("phase", "imbalance_handling_cv")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("scaling", "StandardScaler")
    mlflow.set_tag("imbalance_handling", "class_weight")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # ========== StratifiedKFold (5 folds) ==========
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), start=1):
        X_tr, X_val = X_train_scaled.iloc[train_idx], X_train_scaled.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # ========== Entra√Ænement ==========
        model = LogisticRegression(**MODEL_CONFIG_V2_1)
        model.fit(X_tr, y_tr)
        
        # ========== Pr√©dictions ==========
        y_val_proba = model.predict_proba(X_val)[:, 1]
        y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)
        
        # ========== M√©triques ==========
        auc = roc_auc_score(y_val, y_val_proba)
        accuracy = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)
        
        # ========== Co√ªt m√©tier (seuil=0.5) ==========
        tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
        cost = 10 * fn + 1 * fp
        
        fold_results_v2_1.append({
            "fold": fold_idx,
            "auc": auc,
            "accuracy": accuracy,
            "f1_score": f1,
            "recall_class1": recall,
            "business_cost_min": cost,
            "optimal_threshold": THRESHOLD_FIXED,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn
        })
        
        print(f"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | "
              f"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}")
    
    # ========== Agr√©gation des r√©sultats ==========
    cv_results_v2_1_df = pd.DataFrame(fold_results_v2_1)
    
    metrics_mean_v2_1 = {
        "auc": cv_results_v2_1_df["auc"].mean(),
        "f1_score": cv_results_v2_1_df["f1_score"].mean(),
        "recall_class1": cv_results_v2_1_df["recall_class1"].mean(),
        "business_cost_min": cv_results_v2_1_df["business_cost_min"].mean(),
        "optimal_threshold": THRESHOLD_FIXED,
    }
    
    metrics_std_v2_1 = {
        "auc": cv_results_v2_1_df["auc"].std(),
        "f1_score": cv_results_v2_1_df["f1_score"].std(),
        "recall_class1": cv_results_v2_1_df["recall_class1"].std(),
        "business_cost_min": cv_results_v2_1_df["business_cost_min"].std(),
    }
    
    # ========== Logging dans MLFlow ==========
    mlflow.log_metric("auc", metrics_mean_v2_1["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v2_1["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v2_1["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v2_1["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v2_1["optimal_threshold"])
    
    # Log artefact JSON avec d√©tails par fold
    mlflow.log_dict(cv_results_v2_1_df.to_dict(orient="records"), "cv_results_per_fold.json")
    
    print("\n‚úì Cross-Validation LogisticRegression V2.1 (class_weight='balanced') termin√©e")
    print(f"  AUC moyen: {metrics_mean_v2_1['auc']:.4f} ¬± {metrics_std_v2_1['auc']:.4f}")
    print(f"  F1 moyen: {metrics_mean_v2_1['f1_score']:.4f} ¬± {metrics_std_v2_1['f1_score']:.4f}")
    print(f"  Recall moyen: {metrics_mean_v2_1['recall_class1']:.4f} ¬± {metrics_std_v2_1['recall_class1']:.4f}")
    print(f"  Co√ªt m√©tier moyen: {metrics_mean_v2_1['business_cost_min']:.2f} ¬± {metrics_std_v2_1['business_cost_min']:.2f}")
    print(f"  Seuil optimal: {metrics_mean_v2_1['optimal_threshold']:.2f}")

Fold 1/5 | AUC=0.6886 | Acc=0.6995 | F1=0.2324 | Recall=0.5871 | Cost=1177
Fold 2/5 | AUC=0.6828 | Acc=0.6980 | F1=0.2412 | Recall=0.6194 | Cost=1135
Fold 3/5 | AUC=0.7118 | Acc=0.7265 | F1=0.2476 | Recall=0.5806 | Cost=1132
Fold 4/5 | AUC=0.7035 | Acc=0.7000 | F1=0.2347 | Recall=0.5935 | Cost=1167
Fold 5/5 | AUC=0.6920 | Acc=0.7185 | F1=0.2277 | Recall=0.5355 | Cost=1211

‚úì Cross-Validation LogisticRegression V2.1 (class_weight='balanced') termin√©e
  AUC moyen: 0.6957 ¬± 0.0117
  F1 moyen: 0.2367 ¬± 0.0078
  Recall moyen: 0.5832 ¬± 0.0305
  Co√ªt m√©tier moyen: 1164.40 ¬± 32.60
  Seuil optimal: 0.50
üèÉ View run V2_LogisticRegression_ClassWeightBalanced at: http://127.0.0.1:5000/#/experiments/1/runs/d8b12c8475984c75b995472e30f56f69
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [12]:
# ============================================================================
# ENTRA√éNEMENT FINAL V2.1: LogisticRegression avec class_weight='balanced'
# ============================================================================

final_model_v2_1 = LogisticRegression(**MODEL_CONFIG_V2_1)
final_model_v2_1.fit(X_train_scaled, y_train)

print("\n‚úì Mod√®le final LogisticRegression V2.1 entra√Æn√© sur l'ensemble train complet")
print(f"  Nombre de features: {X_train_scaled.shape[1]}")
print(f"  Intercept: {final_model_v2_1.intercept_[0]:.6f}")
print(f"  Norme des coefficients: {np.linalg.norm(final_model_v2_1.coef_):.6f}")

# ============================================================================
# LOGGING MLFLOW V2.1: Sauvegarde du mod√®le
# ============================================================================

mlflow.end_run()

with mlflow.start_run(run_name="V2.1_LogisticRegression_ClassWeight_Final"):
    # Logging des param√®tres
    mlflow.log_params(MODEL_CONFIG_V2_1)
    
    # Tags
    mlflow.set_tag("version", "2.1")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("phase", "final_model")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("imbalance_handling", "class_weight")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # Logger les m√©triques CV
    mlflow.log_metric("auc", metrics_mean_v2_1["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v2_1["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v2_1["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v2_1["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v2_1["optimal_threshold"])
    
    # Sauvegarder le mod√®le comme artefact
    mlflow.sklearn.log_model(
        final_model_v2_1,
        artifact_path="logistic_regression_v2_1_class_weight"
    )
    
    print(f"\n‚úì Mod√®le V2.1 enregistr√© dans MLflow")
    print(f"  AUC (CV): {metrics_mean_v2_1['auc']:.4f}")
    print(f"  F1 (CV): {metrics_mean_v2_1['f1_score']:.4f}")
    print(f"  Recall (CV): {metrics_mean_v2_1['recall_class1']:.4f}")
    print(f"  Business Cost Min (CV): {metrics_mean_v2_1['business_cost_min']:.2f}")


‚úì Mod√®le final LogisticRegression V2.1 entra√Æn√© sur l'ensemble train complet
  Nombre de features: 724
  Intercept: -0.917467
  Norme des coefficients: 2.441323





‚úì Mod√®le V2.1 enregistr√© dans MLflow
  AUC (CV): 0.6957
  F1 (CV): 0.2367
  Recall (CV): 0.5832
  Business Cost Min (CV): 1164.40
üèÉ View run V2.1_LogisticRegression_ClassWeight_Final at: http://127.0.0.1:5000/#/experiments/1/runs/0bc8f5f187c94a349c72011de4524c77
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [13]:
# ============================================================================
# VERSION 2.2: LogisticRegression avec SMOTE
# ============================================================================
# Objectif: G√©rer le d√©s√©quilibre des classes avec SMOTE
# Validation: StratifiedKFold (5 folds)
# Mod√®le: LogisticRegression(max_iter=1000, random_state=42, solver='saga', penalty='l2')
# Pipeline: StandardScaler -> SMOTE -> LogisticRegression (pour √©viter le data leakage)
# Features: X_train, y_train (seront scal√©es dans le pipeline)
# Seuil fixe: 0.5
# M√©triques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1
# Co√ªt m√©tier: 10 * FN + 1 * FP (avec seuil=0.5)
# MLflow: run_name="V2_LogisticRegression_SMOTE"
# Tags: version="2", imbalance_handling="smote"

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Configuration du mod√®le V2.2 (SMOTE)
MODEL_CONFIG_V2_2 = {
    "max_iter": 3000,
    "random_state": 42,
    "solver": "saga"
}

RUN_NAME_V2_2 = "V2_LogisticRegression_SMOTE"

fold_results_v2_2 = []

# Terminer tout run actif
mlflow.end_run()

with mlflow.start_run(run_name=RUN_NAME_V2_2):
    # ========== Logging des param√®tres et tags ==========
    mlflow.log_params(MODEL_CONFIG_V2_2)
    mlflow.set_tag("version", "2")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("notebook", NOTEBOOK_NAME)
    mlflow.set_tag("phase", "imbalance_handling_cv")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("scaling", "StandardScaler")
    mlflow.set_tag("imbalance_handling", "smote")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # ========== StratifiedKFold (5 folds) ==========
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # ========== Pipeline: Scaler -> SMOTE -> Model ==========
        # SMOTE est appliqu√© uniquement sur le train de chaque fold
        pipeline = ImbPipeline([
            ('scaler', StandardScaler()),
            ('smote', SMOTE(random_state=RANDOM_STATE)),
            ('model', LogisticRegression(**MODEL_CONFIG_V2_2))
        ])
        
        # ========== Entra√Ænement ==========
        pipeline.fit(X_tr, y_tr)
        
        # ========== Pr√©dictions ==========
        y_val_proba = pipeline.predict_proba(X_val)[:, 1]
        y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)
        
        # ========== M√©triques ==========
        auc = roc_auc_score(y_val, y_val_proba)
        accuracy = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)
        
        # ========== Co√ªt m√©tier (seuil=0.5) ==========
        tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
        cost = 10 * fn + 1 * fp
        
        fold_results_v2_2.append({
            "fold": fold_idx,
            "auc": auc,
            "accuracy": accuracy,
            "f1_score": f1,
            "recall_class1": recall,
            "business_cost_min": cost,
            "optimal_threshold": THRESHOLD_FIXED,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn
        })
        
        print(f"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | "
              f"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}")
    
    # ========== Agr√©gation des r√©sultats ==========
    cv_results_v2_2_df = pd.DataFrame(fold_results_v2_2)
    
    metrics_mean_v2_2 = {
        "auc": cv_results_v2_2_df["auc"].mean(),
        "f1_score": cv_results_v2_2_df["f1_score"].mean(),
        "recall_class1": cv_results_v2_2_df["recall_class1"].mean(),
        "business_cost_min": cv_results_v2_2_df["business_cost_min"].mean(),
        "optimal_threshold": THRESHOLD_FIXED,
    }
    
    metrics_std_v2_2 = {
        "auc": cv_results_v2_2_df["auc"].std(),
        "f1_score": cv_results_v2_2_df["f1_score"].std(),
        "recall_class1": cv_results_v2_2_df["recall_class1"].std(),
        "business_cost_min": cv_results_v2_2_df["business_cost_min"].std(),
    }
    
    # ========== Logging dans MLFlow ==========
    mlflow.log_metric("auc", metrics_mean_v2_2["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v2_2["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v2_2["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v2_2["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v2_2["optimal_threshold"])
    
    # Log artefact JSON avec d√©tails par fold
    mlflow.log_dict(cv_results_v2_2_df.to_dict(orient="records"), "cv_results_per_fold.json")
    
    print("\n‚úì Cross-Validation LogisticRegression V2.2 (SMOTE) termin√©e")
    print(f"  AUC moyen: {metrics_mean_v2_2['auc']:.4f} ¬± {metrics_std_v2_2['auc']:.4f}")
    print(f"  F1 moyen: {metrics_mean_v2_2['f1_score']:.4f} ¬± {metrics_std_v2_2['f1_score']:.4f}")
    print(f"  Recall moyen: {metrics_mean_v2_2['recall_class1']:.4f} ¬± {metrics_std_v2_2['recall_class1']:.4f}")
    print(f"  Co√ªt m√©tier moyen: {metrics_mean_v2_2['business_cost_min']:.2f} ¬± {metrics_std_v2_2['business_cost_min']:.2f}")
    print(f"  Seuil optimal: {metrics_mean_v2_2['optimal_threshold']:.2f}")

Fold 1/5 | AUC=0.6801 | Acc=0.7145 | F1=0.2397 | Recall=0.5806 | Cost=1156
Fold 2/5 | AUC=0.6807 | Acc=0.6985 | F1=0.2299 | Recall=0.5806 | Cost=1188
Fold 3/5 | AUC=0.7055 | Acc=0.7375 | F1=0.2553 | Recall=0.5806 | Cost=1110
Fold 4/5 | AUC=0.6872 | Acc=0.7190 | F1=0.2301 | Recall=0.5419 | Cost=1201
Fold 5/5 | AUC=0.6914 | Acc=0.7435 | F1=0.2377 | Recall=0.5161 | Cost=1188

‚úì Cross-Validation LogisticRegression V2.2 (SMOTE) termin√©e
  AUC moyen: 0.6890 ¬± 0.0104
  F1 moyen: 0.2386 ¬± 0.0104
  Recall moyen: 0.5600 ¬± 0.0297
  Co√ªt m√©tier moyen: 1168.60 ¬± 36.73
  Seuil optimal: 0.50
üèÉ View run V2_LogisticRegression_SMOTE at: http://127.0.0.1:5000/#/experiments/1/runs/dab29ff5c5a14880bb75287b1c5bcd5c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [14]:
# ============================================================================
# ENTRA√éNEMENT FINAL V2.2: LogisticRegression avec SMOTE
# ============================================================================

# Pipeline complet avec SMOTE sur l'ensemble train
final_pipeline_v2_2 = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=RANDOM_STATE)),
    ('model', LogisticRegression(**MODEL_CONFIG_V2_2))
])

final_pipeline_v2_2.fit(X_train, y_train)

print("\n‚úì Pipeline final LogisticRegression V2.2 (SMOTE) entra√Æn√© sur l'ensemble train complet")
print(f"  Nombre de features: {X_train.shape[1]}")
print(f"  Intercept: {final_pipeline_v2_2.named_steps['model'].intercept_[0]:.6f}")
print(f"  Norme des coefficients: {np.linalg.norm(final_pipeline_v2_2.named_steps['model'].coef_):.6f}")

# ============================================================================
# LOGGING MLFLOW V2.2: Sauvegarde du mod√®le
# ============================================================================

mlflow.end_run()

with mlflow.start_run(run_name="V2.2_LogisticRegression_SMOTE_Final"):
    # Logging des param√®tres
    mlflow.log_params(MODEL_CONFIG_V2_2)
    
    # Tags
    mlflow.set_tag("version", "2.2")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("phase", "final_model")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("imbalance_handling", "smote")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # Logger les m√©triques CV
    mlflow.log_metric("auc", metrics_mean_v2_2["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v2_2["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v2_2["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v2_2["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v2_2["optimal_threshold"])
    
    # Sauvegarder le pipeline complet comme artefact
    mlflow.sklearn.log_model(
        final_pipeline_v2_2,
        artifact_path="logistic_regression_v2_2_smote"
    )
    
    print(f"\n‚úì Pipeline V2.2 enregistr√© dans MLflow")
    print(f"  AUC (CV): {metrics_mean_v2_2['auc']:.4f}")
    print(f"  F1 (CV): {metrics_mean_v2_2['f1_score']:.4f}")
    print(f"  Recall (CV): {metrics_mean_v2_2['recall_class1']:.4f}")
    print(f"  Business Cost Min (CV): {metrics_mean_v2_2['business_cost_min']:.2f}")


‚úì Pipeline final LogisticRegression V2.2 (SMOTE) entra√Æn√© sur l'ensemble train complet
  Nombre de features: 724
  Intercept: -1.226644
  Norme des coefficients: 3.213375





‚úì Pipeline V2.2 enregistr√© dans MLflow
  AUC (CV): 0.6890
  F1 (CV): 0.2386
  Recall (CV): 0.5600
  Business Cost Min (CV): 1168.60
üèÉ View run V2.2_LogisticRegression_SMOTE_Final at: http://127.0.0.1:5000/#/experiments/1/runs/9a1cd90834c84f43a6b660e9dcc0a408
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [15]:
# ============================================================================
# TABLEAU COMPARATIF: V1 Baseline vs V2 Class Weight vs V2 SMOTE
# ============================================================================

print("\n" + "="*120)
print("TABLEAU COMPARATIF: V1 Baseline vs V2.1 Class Weight vs V2.2 SMOTE")
print("="*120)

comparison_data = {
    "Version": ["V1 Baseline", "V2.1 Class Weight", "V2.2 SMOTE"],
    "AUC": [
        f"{metrics_mean['auc']:.4f} ¬± {metrics_std['auc']:.4f}",
        f"{metrics_mean_v2_1['auc']:.4f} ¬± {metrics_std_v2_1['auc']:.4f}",
        f"{metrics_mean_v2_2['auc']:.4f} ¬± {metrics_std_v2_2['auc']:.4f}",
    ],
    "F1-Score": [
        f"{metrics_mean['f1_score']:.4f} ¬± {metrics_std['f1_score']:.4f}",
        f"{metrics_mean_v2_1['f1_score']:.4f} ¬± {metrics_std_v2_1['f1_score']:.4f}",
        f"{metrics_mean_v2_2['f1_score']:.4f} ¬± {metrics_std_v2_2['f1_score']:.4f}",
    ],
    "Recall Classe 1": [
        f"{metrics_mean['recall_class1']:.4f} ¬± {metrics_std['recall_class1']:.4f}",
        f"{metrics_mean_v2_1['recall_class1']:.4f} ¬± {metrics_std_v2_1['recall_class1']:.4f}",
        f"{metrics_mean_v2_2['recall_class1']:.4f} ¬± {metrics_std_v2_2['recall_class1']:.4f}",
    ],
    "Co√ªt M√©tier Min": [
        f"{metrics_mean['business_cost_min']:.2f} ¬± {metrics_std['business_cost_min']:.2f}",
        f"{metrics_mean_v2_1['business_cost_min']:.2f} ¬± {metrics_std_v2_1['business_cost_min']:.2f}",
        f"{metrics_mean_v2_2['business_cost_min']:.2f} ¬± {metrics_std_v2_2['business_cost_min']:.2f}",
    ],
    "Imbalance Handling": [
        "None",
        "class_weight='balanced'",
        "SMOTE",
    ]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))
print("="*120)

# Analyse des am√©liorations
print("\n" + "="*120)
print("ANALYSE DES AM√âLIORATIONS (vs V1 Baseline)")
print("="*120)

improvement_data = {
    "Version": ["V2.1 Class Weight", "V2.2 SMOTE"],
    "Œî AUC": [
        f"{metrics_mean_v2_1['auc'] - metrics_mean['auc']:+.4f}",
        f"{metrics_mean_v2_2['auc'] - metrics_mean['auc']:+.4f}",
    ],
    "Œî F1-Score": [
        f"{metrics_mean_v2_1['f1_score'] - metrics_mean['f1_score']:+.4f}",
        f"{metrics_mean_v2_2['f1_score'] - metrics_mean['f1_score']:+.4f}",
    ],
    "Œî Recall Classe 1": [
        f"{metrics_mean_v2_1['recall_class1'] - metrics_mean['recall_class1']:+.4f}",
        f"{metrics_mean_v2_2['recall_class1'] - metrics_mean['recall_class1']:+.4f}",
    ],
    "Œî Co√ªt M√©tier": [
        f"{metrics_mean_v2_1['business_cost_min'] - metrics_mean['business_cost_min']:+.2f}",
        f"{metrics_mean_v2_2['business_cost_min'] - metrics_mean['business_cost_min']:+.2f}",
    ]
}

improvement_df = pd.DataFrame(improvement_data)
print(improvement_df.to_string(index=False))
print("="*120)

# D√©terminer la meilleure version
best_auc_version = ["V1", "V2.1", "V2.2"][
    np.argmax([metrics_mean['auc'], metrics_mean_v2_1['auc'], metrics_mean_v2_2['auc']])
]
best_f1_version = ["V1", "V2.1", "V2.2"][
    np.argmax([metrics_mean['f1_score'], metrics_mean_v2_1['f1_score'], metrics_mean_v2_2['f1_score']])
]
best_recall_version = ["V1", "V2.1", "V2.2"][
    np.argmax([metrics_mean['recall_class1'], metrics_mean_v2_1['recall_class1'], metrics_mean_v2_2['recall_class1']])
]
best_cost_version = ["V1", "V2.1", "V2.2"][
    np.argmin([metrics_mean['business_cost_min'], metrics_mean_v2_1['business_cost_min'], metrics_mean_v2_2['business_cost_min']])
]

print("\n‚úì Meilleure version par m√©trique:")
print(f"  AUC:              {best_auc_version}")
print(f"  F1-Score:         {best_f1_version}")
print(f"  Recall Classe 1:  {best_recall_version}")
print(f"  Co√ªt M√©tier Min:  {best_cost_version}")
print("="*120)


TABLEAU COMPARATIF: V1 Baseline vs V2.1 Class Weight vs V2.2 SMOTE
          Version             AUC        F1-Score Recall Classe 1 Co√ªt M√©tier Min      Imbalance Handling
      V1 Baseline 0.7010 ¬± 0.0038 0.0631 ¬± 0.0355 0.0361 ¬± 0.0202 1511.00 ¬± 34.85                    None
V2.1 Class Weight 0.6957 ¬± 0.0117 0.2367 ¬± 0.0078 0.5832 ¬± 0.0305 1164.40 ¬± 32.60 class_weight='balanced'
       V2.2 SMOTE 0.6890 ¬± 0.0104 0.2386 ¬± 0.0104 0.5600 ¬± 0.0297 1168.60 ¬± 36.73                   SMOTE

ANALYSE DES AM√âLIORATIONS (vs V1 Baseline)
          Version   Œî AUC Œî F1-Score Œî Recall Classe 1 Œî Co√ªt M√©tier
V2.1 Class Weight -0.0052    +0.1736           +0.5471       -346.60
       V2.2 SMOTE -0.0120    +0.1754           +0.5239       -342.40

‚úì Meilleure version par m√©trique:
  AUC:              V1
  F1-Score:         V2.2
  Recall Classe 1:  V2.1
  Co√ªt M√©tier Min:  V2.1


# VERSION 3: Meilleur mod√®le avec scaling robuste optimis√©

Objectif: R√©entra√Æner le meilleur mod√®le (V2.1 class_weight='balanced') avec un scaling plus adapt√©
- **RobustScaler**: Utilise la m√©diane et l'IQR (moins sensible aux outliers que StandardScaler)
- Validation: StratifiedKFold (5 folds)
- Mod√®le: LogisticRegression avec class_weight='balanced'
- Enregistrement dans MLflow Model Registry sous le nom "regression"

In [17]:
# ============================================================================
# VERSION 3: Meilleur mod√®le avec RobustScaler (adapt√© aux outliers)
# ============================================================================
# Objectif: Am√©liorer le scaling pour des features avec outliers
# RobustScaler utilise la m√©diane et l'IQR au lieu de la moyenne et l'√©cart-type
# Plus robuste face aux valeurs extr√™mes dans les donn√©es de cr√©dit

from sklearn.preprocessing import RobustScaler

# Cr√©er le RobustScaler
robust_scaler = RobustScaler()
X_train_robust = robust_scaler.fit_transform(X_train)
X_test_robust = robust_scaler.transform(X_test)

# Reconvertir en DataFrame
X_train_robust = pd.DataFrame(X_train_robust, columns=X_train.columns)
X_test_robust = pd.DataFrame(X_test_robust, columns=X_test.columns)

print(f"\n‚úì Features scal√©es avec RobustScaler (m√©diane + IQR):")
print(f"  Shape train: {X_train_robust.shape}")
print(f"  Median: {X_train_robust.median().mean():.8f} (‚âà 0)")
print(f"  IQR (Interquartile Range): {(X_train_robust.quantile(0.75) - X_train_robust.quantile(0.25)).mean():.6f}")
print(f"\n  Comparaison vs StandardScaler:")
print(f"  StandardScaler - Mean: {X_train_scaled.mean().mean():.8f}, Std: {X_train_scaled.std().mean():.6f}")
print(f"  RobustScaler   - Median: {X_train_robust.median().mean():.8f}, IQR: {(X_train_robust.quantile(0.75) - X_train_robust.quantile(0.25)).mean():.6f}")


‚úì Features scal√©es avec RobustScaler (m√©diane + IQR):
  Shape train: (10000, 724)
  Median: 0.00000000 (‚âà 0)
  IQR (Interquartile Range): 0.052486

  Comparaison vs StandardScaler:
  StandardScaler - Mean: -0.00000000, Std: 0.874353
  RobustScaler   - Median: 0.00000000, IQR: 0.052486


In [18]:
# ============================================================================
# CROSS-VALIDATION V3: Meilleur mod√®le (V2.1) avec RobustScaler
# ============================================================================

MODEL_CONFIG_V3 = {
    "max_iter": 3000,
    "random_state": 42,
    "solver": "saga",
    "class_weight": "balanced"
}

RUN_NAME_V3 = "V3_LogisticRegression_RobustScaler_ClassWeight"

fold_results_v3 = []

# Terminer tout run actif
mlflow.end_run()

with mlflow.start_run(run_name=RUN_NAME_V3):
    # ========== Logging des param√®tres et tags ==========
    mlflow.log_params(MODEL_CONFIG_V3)
    mlflow.set_tag("version", "3")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("notebook", NOTEBOOK_NAME)
    mlflow.set_tag("phase", "robust_scaling_cv")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("scaling", "RobustScaler")
    mlflow.set_tag("imbalance_handling", "class_weight")
    mlflow.set_tag("model_type", "LogisticRegression")
    
    # ========== StratifiedKFold (5 folds) ==========
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    
    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_robust, y_train), start=1):
        X_tr, X_val = X_train_robust.iloc[train_idx], X_train_robust.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
        
        # ========== Entra√Ænement ==========
        model = LogisticRegression(**MODEL_CONFIG_V3)
        model.fit(X_tr, y_tr)
        
        # ========== Pr√©dictions ==========
        y_val_proba = model.predict_proba(X_val)[:, 1]
        y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)
        
        # ========== M√©triques ==========
        auc = roc_auc_score(y_val, y_val_proba)
        accuracy = accuracy_score(y_val, y_val_pred)
        f1 = f1_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)
        
        # ========== Co√ªt m√©tier (seuil=0.5) ==========
        tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()
        cost = 10 * fn + 1 * fp
        
        fold_results_v3.append({
            "fold": fold_idx,
            "auc": auc,
            "accuracy": accuracy,
            "f1_score": f1,
            "recall_class1": recall,
            "business_cost_min": cost,
            "optimal_threshold": THRESHOLD_FIXED,
            "tp": tp,
            "fp": fp,
            "fn": fn,
            "tn": tn
        })
        
        print(f"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | "
              f"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}")
    
    # ========== Agr√©gation des r√©sultats ==========
    cv_results_v3_df = pd.DataFrame(fold_results_v3)
    
    metrics_mean_v3 = {
        "auc": cv_results_v3_df["auc"].mean(),
        "f1_score": cv_results_v3_df["f1_score"].mean(),
        "recall_class1": cv_results_v3_df["recall_class1"].mean(),
        "business_cost_min": cv_results_v3_df["business_cost_min"].mean(),
        "optimal_threshold": THRESHOLD_FIXED,
    }
    
    metrics_std_v3 = {
        "auc": cv_results_v3_df["auc"].std(),
        "f1_score": cv_results_v3_df["f1_score"].std(),
        "recall_class1": cv_results_v3_df["recall_class1"].std(),
        "business_cost_min": cv_results_v3_df["business_cost_min"].std(),
    }
    
    # ========== Logging dans MLFlow ==========
    mlflow.log_metric("auc", metrics_mean_v3["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v3["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v3["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v3["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v3["optimal_threshold"])
    
    # Log artefact JSON avec d√©tails par fold
    mlflow.log_dict(cv_results_v3_df.to_dict(orient="records"), "cv_results_per_fold.json")
    
    print("\n‚úì Cross-Validation LogisticRegression V3 (RobustScaler + class_weight) termin√©e")
    print(f"  AUC moyen: {metrics_mean_v3['auc']:.4f} ¬± {metrics_std_v3['auc']:.4f}")
    print(f"  F1 moyen: {metrics_mean_v3['f1_score']:.4f} ¬± {metrics_std_v3['f1_score']:.4f}")
    print(f"  Recall moyen: {metrics_mean_v3['recall_class1']:.4f} ¬± {metrics_std_v3['recall_class1']:.4f}")
    print(f"  Co√ªt m√©tier moyen: {metrics_mean_v3['business_cost_min']:.2f} ¬± {metrics_std_v3['business_cost_min']:.2f}")
    print(f"  Seuil optimal: {metrics_mean_v3['optimal_threshold']:.2f}")

Fold 1/5 | AUC=0.5488 | Acc=0.3675 | F1=0.1516 | Recall=0.7290 | Cost=1643
Fold 2/5 | AUC=0.5648 | Acc=0.4400 | F1=0.1592 | Recall=0.6839 | Cost=1561
Fold 3/5 | AUC=0.5284 | Acc=0.3270 | F1=0.1492 | Recall=0.7613 | Cost=1679
Fold 4/5 | AUC=0.5628 | Acc=0.3750 | F1=0.1554 | Recall=0.7419 | Cost=1610
Fold 5/5 | AUC=0.5070 | Acc=0.3575 | F1=0.1462 | Recall=0.7097 | Cost=1690

‚úì Cross-Validation LogisticRegression V3 (RobustScaler + class_weight) termin√©e
  AUC moyen: 0.5424 ¬± 0.0245
  F1 moyen: 0.1523 ¬± 0.0051
  Recall moyen: 0.7252 ¬± 0.0298
  Co√ªt m√©tier moyen: 1636.60 ¬± 52.71
  Seuil optimal: 0.50
üèÉ View run V3_LogisticRegression_RobustScaler_ClassWeight at: http://127.0.0.1:5000/#/experiments/1/runs/f3c1d8a8220a4e5193cba3eb73b30df6
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [19]:
# ============================================================================
# ENTRA√éNEMENT FINAL V3 + ENREGISTREMENT DANS MODEL REGISTRY
# ============================================================================

# Entra√Æner le mod√®le final sur tout le train set avec RobustScaler
final_model_v3 = LogisticRegression(**MODEL_CONFIG_V3)
final_model_v3.fit(X_train_robust, y_train)

print("\n‚úì Mod√®le final LogisticRegression V3 entra√Æn√© sur l'ensemble train complet")
print(f"  Nombre de features: {X_train_robust.shape[1]}")
print(f"  Intercept: {final_model_v3.intercept_[0]:.6f}")
print(f"  Norme des coefficients: {np.linalg.norm(final_model_v3.coef_):.6f}")

# ============================================================================
# LOGGING MLFLOW V3 + ENREGISTREMENT DANS MODEL REGISTRY
# ============================================================================

mlflow.end_run()

with mlflow.start_run(run_name="V3_LogisticRegression_RobustScaler_Final"):
    # Logging des param√®tres
    mlflow.log_params(MODEL_CONFIG_V3)
    
    # Tags
    mlflow.set_tag("version", "3")
    mlflow.set_tag("model", "LogisticRegression")
    mlflow.set_tag("phase", "final_model")
    mlflow.set_tag("threshold", str(THRESHOLD_FIXED))
    mlflow.set_tag("scaling", "RobustScaler")
    mlflow.set_tag("imbalance_handling", "class_weight")
    mlflow.set_tag("model_type", "LogisticRegression")
    mlflow.set_tag("best_model", "true")
    
    # Logger les m√©triques CV
    mlflow.log_metric("auc", metrics_mean_v3["auc"])
    mlflow.log_metric("f1_score", metrics_mean_v3["f1_score"])
    mlflow.log_metric("recall_class1", metrics_mean_v3["recall_class1"])
    mlflow.log_metric("business_cost_min", metrics_mean_v3["business_cost_min"])
    mlflow.log_metric("optimal_threshold", metrics_mean_v3["optimal_threshold"])
    
    # Sauvegarder le mod√®le ET l'enregistrer dans le Model Registry
    model_info = mlflow.sklearn.log_model(
        final_model_v3,
        artifact_path="logistic_regression_v3_robust_scaler",
        registered_model_name="regression"  # Enregistrement automatique dans Model Registry
    )
    
    print(f"\n‚úì Mod√®le V3 enregistr√© dans MLflow Model Registry sous le nom 'regression'")
    print(f"  AUC (CV): {metrics_mean_v3['auc']:.4f}")
    print(f"  F1 (CV): {metrics_mean_v3['f1_score']:.4f}")
    print(f"  Recall (CV): {metrics_mean_v3['recall_class1']:.4f}")
    print(f"  Business Cost Min (CV): {metrics_mean_v3['business_cost_min']:.2f}")
    print(f"\n  Model URI: {model_info.model_uri}")
    print(f"  ‚ÑπÔ∏è  Ce mod√®le est maintenant disponible dans le Model Registry")
    print(f"     Accessible via: mlflow.sklearn.load_model('models:/regression/latest')")


‚úì Mod√®le final LogisticRegression V3 entra√Æn√© sur l'ensemble train complet
  Nombre de features: 724
  Intercept: -0.000000
  Norme des coefficients: 0.000000


Registered model 'regression' already exists. Creating a new version of this model...
2026/02/06 02:11:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: regression, version 3
Created version '3' of model 'regression'.



‚úì Mod√®le V3 enregistr√© dans MLflow Model Registry sous le nom 'regression'
  AUC (CV): 0.5424
  F1 (CV): 0.1523
  Recall (CV): 0.7252
  Business Cost Min (CV): 1636.60

  Model URI: models:/m-6f0e559865f84c4a9bae981ffb44747e
  ‚ÑπÔ∏è  Ce mod√®le est maintenant disponible dans le Model Registry
     Accessible via: mlflow.sklearn.load_model('models:/regression/latest')
üèÉ View run V3_LogisticRegression_RobustScaler_Final at: http://127.0.0.1:5000/#/experiments/1/runs/6d7ce4bf0fa94725a3b69b3f85e5bdc8
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1


In [None]:
# ============================================================================
# TABLEAU COMPARATIF FINAL: Toutes les versions (V1, V2.1, V2.2, V3)
# ============================================================================

print("\n" + "="*130)
print("TABLEAU COMPARATIF FINAL: Toutes les versions")
print("="*130)

comparison_data_final = {
    "Version": ["V1 Baseline", "V2.1 Class Weight", "V2.2 SMOTE", "V3 RobustScaler + CW"],
    "Scaling": ["StandardScaler", "StandardScaler", "StandardScaler", "RobustScaler"],
    "Imbalance": ["None", "class_weight", "SMOTE", "class_weight"],
    "AUC": [
        f"{metrics_mean['auc']:.4f} ¬± {metrics_std['auc']:.4f}",
        f"{metrics_mean_v2_1['auc']:.4f} ¬± {metrics_std_v2_1['auc']:.4f}",
        f"{metrics_mean_v2_2['auc']:.4f} ¬± {metrics_std_v2_2['auc']:.4f}",
        f"{metrics_mean_v3['auc']:.4f} ¬± {metrics_std_v3['auc']:.4f}",
    ],
    "F1-Score": [
        f"{metrics_mean['f1_score']:.4f} ¬± {metrics_std['f1_score']:.4f}",
        f"{metrics_mean_v2_1['f1_score']:.4f} ¬± {metrics_std_v2_1['f1_score']:.4f}",
        f"{metrics_mean_v2_2['f1_score']:.4f} ¬± {metrics_std_v2_2['f1_score']:.4f}",
        f"{metrics_mean_v3['f1_score']:.4f} ¬± {metrics_std_v3['f1_score']:.4f}",
    ],
    "Recall": [
        f"{metrics_mean['recall_class1']:.4f} ¬± {metrics_std['recall_class1']:.4f}",
        f"{metrics_mean_v2_1['recall_class1']:.4f} ¬± {metrics_std_v2_1['recall_class1']:.4f}",
        f"{metrics_mean_v2_2['recall_class1']:.4f} ¬± {metrics_std_v2_2['recall_class1']:.4f}",
        f"{metrics_mean_v3['recall_class1']:.4f} ¬± {metrics_std_v3['recall_class1']:.4f}",
    ],
    "Co√ªt M√©tier": [
        f"{metrics_mean['business_cost_min']:.2f} ¬± {metrics_std['business_cost_min']:.2f}",
        f"{metrics_mean_v2_1['business_cost_min']:.2f} ¬± {metrics_std_v2_1['business_cost_min']:.2f}",
        f"{metrics_mean_v2_2['business_cost_min']:.2f} ¬± {metrics_std_v2_2['business_cost_min']:.2f}",
        f"{metrics_mean_v3['business_cost_min']:.2f} ¬± {metrics_std_v3['business_cost_min']:.2f}",
    ]
}

comparison_df_final = pd.DataFrame(comparison_data_final)
print(comparison_df_final.to_string(index=False))
print("="*130)

# D√©terminer la meilleure version finale
all_metrics = {
    "V1": metrics_mean,
    "V2.1": metrics_mean_v2_1,
    "V2.2": metrics_mean_v2_2,
    "V3": metrics_mean_v3
}

best_auc_v = max(all_metrics.items(), key=lambda x: x[1]['auc'])
best_f1_v = max(all_metrics.items(), key=lambda x: x[1]['f1_score'])
best_recall_v = max(all_metrics.items(), key=lambda x: x[1]['recall_class1'])
best_cost_v = min(all_metrics.items(), key=lambda x: x[1]['business_cost_min'])

print("\n‚úì Meilleure version par m√©trique:")
print(f"  AUC:              {best_auc_v[0]} ({best_auc_v[1]['auc']:.4f})")
print(f"  F1-Score:         {best_f1_v[0]} ({best_f1_v[1]['f1_score']:.4f})")
print(f"  Recall Classe 1:  {best_recall_v[0]} ({best_recall_v[1]['recall_class1']:.4f})")
print(f"  Co√ªt M√©tier Min:  {best_cost_v[0]} ({best_cost_v[1]['business_cost_min']:.2f})")

# S√©lection dynamique bas√©e sur le Co√ªt M√©tier (m√©trique m√©tier principale)
best_overall_version = best_cost_v[0]

version_names = {
    "V1": "V1 Baseline",
    "V2.1": "V2.1 StandardScaler + class_weight='balanced'",
    "V2.2": "V2.2 StandardScaler + SMOTE",
    "V3": "V3 RobustScaler + class_weight='balanced'"
}

print("\n" + "="*130)
print(f"‚úì MOD√àLE FINAL S√âLECTIONN√â: {version_names[best_overall_version]}")
print(f"  Bas√© sur le Co√ªt M√©tier (m√©trique m√©tier principale): {best_cost_v[1]['business_cost_min']:.2f}")

if best_overall_version == "V3":
    print("  Enregistr√© dans MLflow Model Registry sous le nom: 'regression'")
else:
    print(f"  Note: V3 a √©t√© enregistr√© dans Model Registry pour d√©monstration,")
    print(f"        mais {version_names[best_overall_version]} a de meilleures performances")

print("="*130)


TABLEAU COMPARATIF FINAL: Toutes les versions
             Version        Scaling    Imbalance             AUC        F1-Score          Recall     Co√ªt M√©tier
         V1 Baseline StandardScaler         None 0.7010 ¬± 0.0038 0.0631 ¬± 0.0355 0.0361 ¬± 0.0202 1511.00 ¬± 34.85
   V2.1 Class Weight StandardScaler class_weight 0.6957 ¬± 0.0117 0.2367 ¬± 0.0078 0.5832 ¬± 0.0305 1164.40 ¬± 32.60
          V2.2 SMOTE StandardScaler        SMOTE 0.6890 ¬± 0.0104 0.2386 ¬± 0.0104 0.5600 ¬± 0.0297 1168.60 ¬± 36.73
V3 RobustScaler + CW   RobustScaler class_weight 0.5424 ¬± 0.0245 0.1523 ¬± 0.0051 0.7252 ¬± 0.0298 1636.60 ¬± 52.71

‚úì Meilleure version par m√©trique:
  AUC:              V1 (0.7010)
  F1-Score:         V2.2 (0.2386)
  Recall Classe 1:  V3 (0.7252)
  Co√ªt M√©tier Min:  V2.1 (1164.40)

‚úì MOD√àLE FINAL S√âLECTIONN√â: V1 Baseline
  Bas√© sur l'AUC (m√©trique principale): 0.7010
  Note: V3 a √©t√© enregistr√© dans Model Registry pour d√©monstration,
        mais V1 Baseline a de 