In [None]:
# ============================================================================
# NOTEBOOK CONFIGURATION
# ============================================================================
# Constants and settings for this notebook, grouped by concern.

import datetime


# --- MLflow tracking ---
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "OC_P6_Credit_Scoring"

# --- Project identity ---
PROJECT_VERSION, MODEL_NAME, NOTEBOOK_NAME = "1.0", "LightGBM", "03_LGBM"
RUN_DATE = datetime.datetime.now()  # captured once, when this cell is executed

# --- Data locations ---
DATA_PATH = "../data/processed/"
TRAIN_FILE, TEST_FILE = "features_train.csv", "features_test.csv"

# --- Validation split ---
RANDOM_STATE = 42
VALIDATION_SPLIT_RATIO = 0.2

# --- Baseline model hyper-parameters ---
MODEL_CONFIG = dict(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=31,
    class_weight="balanced",
    random_state=42,
)

# --- Tags that the run cells copy onto their MLflow runs ---
MLFLOW_TAGS = dict(
    project_version=PROJECT_VERSION,
    notebook=NOTEBOOK_NAME,
    phase="baseline",
    desequilibre_handling="class_weight_balanced",
    date=RUN_DATE,
)

# Short confirmation so the reader sees which experiment/model is configured.
print(
    "Configuration charg√©e avec succ√®s !",
    f"MLflow Experiment: {MLFLOW_EXPERIMENT_NAME}",
    f"Project Version: {PROJECT_VERSION}",
    f"Model: {MODEL_NAME}",
    sep="\n",
)


# 03 - LightGBM Modeling with MLflow Tracking

Configuration and experimentation notebook for a credit scoring model.
All runs are tracked in MLflow for comparison and reproducibility.

In [None]:
# Project helper imported from `src.mlflow_config`; its implementation is not
# visible from this notebook.
from src.mlflow_config import configure_mlflow

# The returned object is bound to the name `mlflow` and is used by the cells
# below through `mlflow.start_run`, `mlflow.log_params`, `mlflow.set_tag`, etc.
# NOTE(review): presumably this is the configured `mlflow` module itself — confirm
# against src/mlflow_config.
mlflow = configure_mlflow()

In [None]:
from pathlib import Path

import pandas as pd

# Load the engineered training features from the location declared in the
# configuration cell (DATA_PATH / TRAIN_FILE) rather than repeating the path.
train_path = Path(DATA_PATH) / TRAIN_FILE
X_train = pd.read_csv(train_path)

# Separate the label from the feature matrix; fail early with an explicit
# message if the expected label column is missing from the file.
if "TARGET" not in X_train.columns:
    raise KeyError(f"Column 'TARGET' not found in {train_path}")
y_train = X_train.pop("TARGET")
# No separate validation file is loaded here: the cells below derive their own
# splits from X_train / y_train.

In [None]:
# ============================================================================
# LightGBM cross-validation + business cost
# ============================================================================
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, confusion_matrix

# Business cost of one error of each kind: a false negative weighs ten times
# a false positive.
FN_COST = 10
FP_COST = 1

# --- Column clean-up before training (dtypes + names) ---
# Object columns are coerced to numbers; values that cannot be parsed become 0.
# NOTE: X_train is modified in place and the cells below rely on the cleaned frame.
object_cols = X_train.select_dtypes(include=["object"]).columns.tolist()
for col in object_cols:
    X_train[col] = pd.to_numeric(X_train[col], errors="coerce").fillna(0)

# Keep only [a-zA-Z0-9_] in feature names and collapse runs of underscores.
X_train.columns = (
    X_train.columns
    .str.replace(" ", "_")
    .str.replace("[^a-zA-Z0-9_]", "_", regex=True)
    .str.replace("__+", "_", regex=True)
)

# Two distinct raw names can collapse to the same sanitised name; stop here
# with an explicit message instead of failing later inside the model fit.
duplicated_names = X_train.columns[X_train.columns.duplicated()].tolist()
if duplicated_names:
    raise ValueError(f"Column sanitisation produced duplicate names: {duplicated_names}")

# Model parameters (class_weight forced to "balanced" whatever MODEL_CONFIG says)
MODEL_CONFIG_CV = {**MODEL_CONFIG, "class_weight": "balanced"}

# Stratified K-fold, seeded from the configuration cell
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

fold_results = []
# Candidate decision thresholds 0.10, 0.15, ..., 0.90 (rounded to remove float noise)
thresholds = np.round(np.arange(0.1, 0.91, 0.05), 2)

RUN_NAME_CV = "LGBM_baseline_CV"

with mlflow.start_run(run_name=RUN_NAME_CV):
    # Log hyper-parameters
    mlflow.log_params(MODEL_CONFIG_CV)

    # Shared tags first, then the run-specific ones ("phase" is overridden on purpose)
    for tag_key, tag_value in MLFLOW_TAGS.items():
        mlflow.set_tag(tag_key, tag_value)
    mlflow.set_tag("model_type", MODEL_NAME)
    mlflow.set_tag("phase", "baseline_cv")

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = LGBMClassifier(**MODEL_CONFIG_CV)
        model.fit(X_tr, y_tr)

        y_val_proba = model.predict_proba(X_val)[:, 1]
        auc = roc_auc_score(y_val, y_val_proba)

        best_threshold = None
        min_cost = None
        best_fp = None
        best_fn = None

        # Scan thresholds in increasing order; the strict "<" keeps the lowest
        # threshold when several reach the same cost.
        for thr in thresholds:
            y_val_pred = (y_val_proba >= thr).astype(int)
            # labels=[0, 1] guarantees a 2x2 matrix even when one class is
            # absent from both y_val and the predictions (assumes a 0/1 target).
            tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred, labels=[0, 1]).ravel()
            cost = FN_COST * fn + FP_COST * fp
            if (min_cost is None) or (cost < min_cost):
                min_cost = cost
                best_threshold = thr
                best_fp = fp
                best_fn = fn

        fold_results.append({
            "fold": fold_idx,
            "auc": float(auc),
            "best_threshold": float(best_threshold),
            "min_cost": int(min_cost),
            "fp": int(best_fp),
            "fn": int(best_fn),
        })

        # Per-fold values (step = fold number) so the spread is visible in MLflow
        mlflow.log_metric("fold_auc", float(auc), step=fold_idx)
        mlflow.log_metric("fold_min_cost", int(min_cost), step=fold_idx)
        mlflow.log_metric("fold_best_threshold", float(best_threshold), step=fold_idx)

    cv_results_df = pd.DataFrame(fold_results)
    cv_auc_mean = cv_results_df["auc"].mean()
    cv_min_cost_mean = cv_results_df["min_cost"].mean()
    cv_best_threshold_mean = cv_results_df["best_threshold"].mean()

    # Log the fold averages
    mlflow.log_metric("cv_auc_mean", cv_auc_mean)
    mlflow.log_metric("cv_min_cost_mean", cv_min_cost_mean)
    mlflow.log_metric("cv_best_threshold_mean", cv_best_threshold_mean)

    # Log the per-fold table as a JSON artifact
    mlflow.log_dict(cv_results_df.to_dict(orient="records"), "cv_results.json")

    print("✓ Cross-validation terminée")
    print(f"AUC moyen: {cv_auc_mean:.4f}")
    print(f"Coût métier moyen: {cv_min_cost_mean:.2f}")
    print(f"Seuil optimal moyen: {cv_best_threshold_mean:.2f}")

In [None]:
# ============================================================================
# FAST Optuna optimisation (~10 min max)
# ============================================================================
import optuna
from lightgbm import log_evaluation
from optuna.pruners import MedianPruner

# Coarse threshold grid for speed: exactly 0.2, 0.3, ..., 0.7 (6 values).
# linspace + round avoids the float-step pitfalls of np.arange with step 0.1.
thresholds = np.round(np.linspace(0.2, 0.7, 6), 2)

# Faster CV: 3 folds
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)


def compute_min_cost_per_fold(y_true, y_proba, thresholds, fn_cost=10, fp_cost=1):
    """Return ``(min_cost, best_threshold)`` for one validation fold.

    For every candidate threshold the probabilities are binarised
    (``proba >= threshold`` -> class 1) and the business cost
    ``fn_cost * FN + fp_cost * FP`` is computed. The lowest cost and the first
    threshold reaching it are returned.

    Raises:
        ValueError: if ``thresholds`` is empty.
    """
    best_cost, best_thr = None, None
    for thr in thresholds:
        y_pred = (y_proba >= thr).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 whatever the predictions are
        # (assumes a 0/1 target).
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
        cost = fn_cost * fn + fp_cost * fp
        if best_cost is None or cost < best_cost:
            best_cost, best_thr = int(cost), float(thr)
    if best_cost is None:
        raise ValueError("thresholds must contain at least one value")
    return best_cost, best_thr


def objective(trial):
    """Optuna objective: negative mean business cost over the CV folds.

    The study maximises this value, so every number handed to Optuna (both the
    intermediate reports and the final return) is the *negative* mean cost;
    reporting the positive cost would make the pruner stop the best trials.
    """
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 31, 128),         # reduced range
        "max_depth": trial.suggest_int("max_depth", -1, 12),            # -1 = no limit
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.1, log=True),
        "n_estimators": 800,                                            # high cap + early stopping
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 80),
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        "class_weight": "balanced",
        "random_state": 42,
        "early_stopping_rounds": 50,                                    # early stopping
        "verbose": -1,
    }

    fold_costs = []
    for step, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        model = LGBMClassifier(**params)
        # Evaluation logs are silenced through a callback: the `verbose`
        # argument of fit() no longer exists in recent LightGBM releases.
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            callbacks=[log_evaluation(period=0)],
        )

        y_val_proba = model.predict_proba(X_val)[:, 1]
        min_cost, _ = compute_min_cost_per_fold(y_val, y_val_proba, thresholds)
        fold_costs.append(min_cost)

        # Intermediate value after each fold, same sign as the final objective.
        trial.report(-float(np.mean(fold_costs)), step=step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return -float(np.mean(fold_costs))


# Run the search inside its own MLflow run
with mlflow.start_run(run_name="LGBM_optuna_rapide", nested=True):
    # Only 3 steps (folds) are reported per trial, so the warm-up must be
    # shorter than that for pruning to ever trigger.
    pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=1)
    study = optuna.create_study(
        direction="maximize",
        pruner=pruner,
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),  # reproducible suggestions
    )
    study.optimize(objective, n_trials=15, timeout=600)  # 10 min safety cap

    # Record the outcome; best_* raise when no trial completed, hence the guard.
    completed_trials = [
        t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
    ]
    mlflow.log_metric("optuna_n_trials", len(study.trials))
    if completed_trials:
        mlflow.log_params(study.best_params)
        mlflow.log_metric("optuna_best_mean_cost", -study.best_value)

# ... (keep the post-optimisation code here: best_params, CV evaluation, final run)

In [None]:
# ============================================================================
# FINAL SECTION: validation & business-threshold optimisation (hold-out)
# ============================================================================
# Why a hold-out:
# - Checks generalisation on a stratified 20% split the final model is not
#   fitted on.
# - NOTE(review): the hold-out is carved out of X_train, the same data the
#   cross-validation / Optuna cells iterate over, and it is also used below for
#   early stopping and for threshold selection. The metrics reported here are
#   therefore optimistic; a separate validation split would remove that bias.
#
# Why the fine threshold search happens here (not inside Optuna):
# - The Optuna cell uses a coarse grid (0.2-0.7, step 0.1) to stay fast.
# - A fine grid (0.05-0.95, step 0.01) is cheap on a single fitted model.
#
# MLflow: everything is logged to one top-level run, "LGBM_final_validation".
# ============================================================================

from pathlib import Path

from lightgbm import log_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score
import matplotlib.pyplot as plt

# 1. Stratified hold-out (20% hold-out, 80% train)
X_train_final, X_holdout, y_train_final, y_holdout = train_test_split(
    X_train, y_train,
    test_size=0.2,
    stratify=y_train,
    random_state=RANDOM_STATE,
)

print("🔀 Hold-out split:")
print(f"   Train: {X_train_final.shape[0]} | Hold-out: {X_holdout.shape[0]}")

# 2. Hyper-parameters retained from an earlier Optuna search (hard-coded).
# NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0,
# which is not set here — confirm whether bagging was intended.
best_params = {
    'num_leaves': 90,
    'max_depth': 8,
    'learning_rate': 0.034964706548010455,
    'n_estimators': 206,
    'min_child_samples': 99,
    'subsample': 0.6916174727904909,
    'colsample_bytree': 0.8928138662796579,
    'class_weight': 'balanced',
    'early_stopping_rounds': 50,
    'random_state': RANDOM_STATE,  # same seed as the rest of the notebook
}

# 3. Train the final model on the 80% split
print("\n🚀 Entraînement du modèle final...")
final_model = LGBMClassifier(**best_params)

# eval_set is required by early stopping. The callbacks list must contain
# LightGBM callbacks: log_evaluation(period=0) silences per-iteration logs.
final_model.fit(
    X_train_final, y_train_final,
    eval_set=[(X_holdout, y_holdout)],
    eval_metric='auc',
    callbacks=[log_evaluation(period=0)],
)

print("✓ Modèle final entraîné")

# 4. Predicted probabilities of class 1 on the hold-out
y_holdout_proba = final_model.predict_proba(X_holdout)[:, 1]

# 5. ROC AUC on the hold-out
holdout_auc = roc_auc_score(y_holdout, y_holdout_proba)
print(f"\n📊 Hold-out AUC-ROC: {holdout_auc:.4f}")

# 6. FINE threshold search (0.05-0.95, step 0.01); rounding removes float
#    noise such as 0.060000000000000005 from the logged values.
fine_thresholds = np.round(np.arange(0.05, 0.96, 0.01), 2)
threshold_costs = []

for thr in fine_thresholds:
    y_holdout_pred = (y_holdout_proba >= thr).astype(int)
    # labels=[0, 1] keeps the matrix 2x2 whatever the predictions are
    # (assumes a 0/1 target).
    tn, fp, fn, tp = confusion_matrix(y_holdout, y_holdout_pred, labels=[0, 1]).ravel()
    cost = 10 * fn + 1 * fp
    threshold_costs.append({
        'threshold': float(thr),
        'cost': int(cost),
        'tp': int(tp),
        'fp': int(fp),
        'fn': int(fn),
        'tn': int(tn),
    })

threshold_costs_df = pd.DataFrame(threshold_costs)
optimal_idx = threshold_costs_df['cost'].idxmin()  # first minimum when tied
optimal_threshold = float(threshold_costs_df.loc[optimal_idx, 'threshold'])
min_cost = int(threshold_costs_df.loc[optimal_idx, 'cost'])

print(f"🎯 Seuil optimal : {optimal_threshold:.2f}")
print(f"💰 Coût minimal : {min_cost:.2f}")

# 7. F1 and recall at the optimal threshold
y_holdout_optimal = (y_holdout_proba >= optimal_threshold).astype(int)
holdout_f1 = f1_score(y_holdout, y_holdout_optimal)
holdout_recall = recall_score(y_holdout, y_holdout_optimal)

print(f"📈 F1-score (seuil optimal) : {holdout_f1:.4f}")
print(f"📈 Recall classe 1 (seuil optimal) : {holdout_recall:.4f}")

# 8. Cost-vs-threshold curve (figure left open so it renders under the cell)
fig_cost, ax_cost = plt.subplots(figsize=(10, 6))
ax_cost.plot(threshold_costs_df['threshold'], threshold_costs_df['cost'],
             marker='o', linewidth=2, markersize=4)
ax_cost.axvline(optimal_threshold, color='red', linestyle='--',
                label=f'Optimal = {optimal_threshold:.2f}')
ax_cost.set_xlabel('Seuil de décision')
ax_cost.set_ylabel('Coût métier (10*FN + 1*FP)')
ax_cost.set_title('Courbe de coût vs seuil (Hold-out)')
ax_cost.legend()
ax_cost.grid(True, alpha=0.3)
fig_cost.tight_layout()

# Save next to the notebook (relative path: no machine-specific directory)
plot_path = Path("threshold_cost_curve.png")
fig_cost.savefig(plot_path, dpi=300, bbox_inches='tight')
print(f"\n📊 Plot sauvegardé : {plot_path}")

# 9. Log everything to a single top-level MLflow run
run_name_final = "LGBM_final_validation"

with mlflow.start_run(run_name=run_name_final, nested=False) as parent_run:
    # Log params
    mlflow.log_params(best_params)

    # Shared tags first, then the run-specific ones ("phase" is overridden)
    for tag_key, tag_value in MLFLOW_TAGS.items():
        mlflow.set_tag(tag_key, tag_value)
    mlflow.set_tag("model_type", MODEL_NAME)
    mlflow.set_tag("phase", "final_validation")
    mlflow.set_tag("validation_method", "hold-out_20pct")

    # Log metrics
    mlflow.log_metric("holdout_auc", float(holdout_auc))
    mlflow.log_metric("holdout_min_cost", min_cost)
    mlflow.log_metric("optimal_threshold", optimal_threshold)
    mlflow.log_metric("holdout_f1", float(holdout_f1))
    mlflow.log_metric("holdout_recall", float(holdout_recall))

    # Log plot
    mlflow.log_artifact(str(plot_path))

    # Log every 10th row of the threshold/cost table (JSON)
    decile_costs = threshold_costs_df[::10].to_dict(orient='records')
    mlflow.log_dict(decile_costs, "threshold_costs_deciles.json")

    print(f"\n✅ Run MLflow parent : {parent_run.info.run_name}")
    print(f"   📊 Métriques : AUC={holdout_auc:.4f}, Min Cost={min_cost:.2f}, F1={holdout_f1:.4f}")
    print(f"   🎯 Seuil optimal : {optimal_threshold:.2f}")

## Interprétabilité (global + local) avec SHAP
SHAP est pertinent pour la transparence métier car il fournit une attribution **cohérente et locale** des contributions de chaque variable à une décision, tout en restant **agrégeable au niveau global**. Cela permet d’expliquer un score client individuel (force plot) et de justifier les facteurs principaux à l’échelle du portefeuille (summary plot), ce qui est attendu en contexte de scoring de crédit.

In [None]:
# ============================================================================
# Final model + feature importance + SHAP
# ============================================================================
import os
from pathlib import Path
import matplotlib.pyplot as plt
import lightgbm as lgb
import shap

# Train the final model on the whole training set.
# There is no validation set here, so early stopping must be disabled:
# LightGBM raises when `early_stopping_rounds` is set without an eval_set.
final_fit_params = {
    key: value for key, value in best_params.items() if key != "early_stopping_rounds"
}
final_model = LGBMClassifier(**final_fit_params)
final_model.fit(X_train, y_train)

with mlflow.start_run(run_name="LGBM_final_interpretability"):
    # Tags + the parameters actually used for this fit
    mlflow.log_params(final_fit_params)
    for tag_key, tag_value in MLFLOW_TAGS.items():
        mlflow.set_tag(tag_key, tag_value)
    mlflow.set_tag("model_type", MODEL_NAME)
    mlflow.set_tag("phase", "final_interpretability")

    # Log the final model
    mlflow.lightgbm.log_model(final_model, MODEL_NAME)

    # --- Global feature importance (gain) ---
    fig_gain, ax_gain = plt.subplots(figsize=(8, 6))
    lgb.plot_importance(final_model, importance_type="gain", ax=ax_gain, max_num_features=30)
    ax_gain.set_title("Feature Importance (Gain)")
    mlflow.log_figure(fig_gain, "feature_importance_gain.png")
    plt.close(fig_gain)

    # --- Global feature importance (split) ---
    fig_split, ax_split = plt.subplots(figsize=(8, 6))
    lgb.plot_importance(final_model, importance_type="split", ax=ax_split, max_num_features=30)
    ax_split.set_title("Feature Importance (Split)")
    mlflow.log_figure(fig_split, "feature_importance_split.png")
    plt.close(fig_split)

    # --- SHAP: local & global interpretability (on a sample, for speed) ---
    sample_size = min(1000, len(X_train))
    X_sample = X_train.sample(n=sample_size, random_state=42)

    explainer = shap.TreeExplainer(final_model)
    shap_values = explainer.shap_values(X_sample)

    # Depending on the SHAP version, a binary classifier yields either a list
    # [class0, class1], a 3-D array (samples, features, classes) or a single
    # 2-D array. Always keep the contributions towards class 1.
    if isinstance(shap_values, list):
        shap_values_to_use = shap_values[1]
    elif np.ndim(shap_values) == 3:
        shap_values_to_use = shap_values[:, :, 1]
    else:
        shap_values_to_use = shap_values

    # The base value may be a scalar, a list/tuple or a numpy array; np.ravel
    # handles all of them, and the class-1 entry is used when there are several.
    base_values = np.ravel(explainer.expected_value)
    base_value = float(base_values[1] if base_values.size > 1 else base_values[0])

    # Summary plot (bee swarm)
    shap.summary_plot(shap_values_to_use, X_sample, show=False)
    fig_summary = plt.gcf()
    fig_summary.set_size_inches(10, 6)
    mlflow.log_figure(fig_summary, "shap_summary_beeswarm.png")
    plt.close(fig_summary)

    # Force plots for 5 random clients. Rows are drawn by position so that
    # SHAP rows and feature rows stay aligned even with a non-unique index.
    force_dir = Path("shap_force_plots")
    force_dir.mkdir(parents=True, exist_ok=True)
    rng = np.random.default_rng(42)
    sample_positions = rng.choice(len(X_sample), size=min(5, len(X_sample)), replace=False)

    for i, pos in enumerate(sample_positions, start=1):
        force_plot = shap.force_plot(
            base_value,
            shap_values_to_use[pos],
            X_sample.iloc[pos],
            matplotlib=False,
        )
        force_path = force_dir / f"shap_force_plot_{i}.html"
        shap.save_html(str(force_path), force_plot)
        mlflow.log_artifact(str(force_path))

    print("✓ Modèle final et artefacts d'interprétabilité loggés dans MLflow")

In [None]:
# Convert object columns to numeric dtypes and sanitise column names.
# The same clean-up is applied at the top of the cross-validation cell; both
# use identical rules so the result does not depend on which cell runs first.
import numpy as np

# Identify the object columns
object_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"Colonnes object détectées: {object_cols}")

# Coerce each object column to numbers; values that cannot be parsed become
# NaN and are then replaced by 0.
for col in object_cols:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce').fillna(0)

# Sanitise column names: keep only [a-zA-Z0-9_] and collapse runs of
# underscores (same three steps as in the cross-validation cell).
X_train.columns = (
    X_train.columns
    .str.replace(' ', '_')
    .str.replace('[^a-zA-Z0-9_]', '_', regex=True)
    .str.replace('__+', '_', regex=True)
)

# Check that every column is now numeric
print(f"Dtypes après conversion:\n{X_train.dtypes.value_counts()}")
print(f"\nColonnes (exemples): {X_train.columns[:5].tolist()}")

## Runs de modèles
Les entraînements et le logging MLflow commencent ici.

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split

# Stratified train/validation split driven by the configuration cell
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train,
    test_size=VALIDATION_SPLIT_RATIO,
    stratify=y_train,
    random_state=RANDOM_STATE
)

# Apply the same column-name sanitisation as the cross-validation cell
# (including the collapse of repeated underscores) to both splits.
for split_frame in (X_train_split, X_val_split):
    split_frame.columns = (
        split_frame.columns
        .str.replace(' ', '_')
        .str.replace('[^a-zA-Z0-9_]', '_', regex=True)
        .str.replace('__+', '_', regex=True)
    )

# Run name including the project version
RUN_NAME = f"{MODEL_NAME}_baseline_{PROJECT_VERSION}"

with mlflow.start_run(run_name=RUN_NAME):

    # Model built from the baseline configuration
    model = LGBMClassifier(**MODEL_CONFIG)

    # Training
    model.fit(X_train_split, y_train_split)

    # Predictions and metrics (F1 / recall use the classifier's default decision rule)
    y_pred_proba = model.predict_proba(X_val_split)[:, 1]
    y_pred = model.predict(X_val_split)

    auc = roc_auc_score(y_val_split, y_pred_proba)
    f1 = f1_score(y_val_split, y_pred)
    recall_1 = recall_score(y_val_split, y_pred)

    # === MLflow tracking ===
    # Hyper-parameters, logged like in the other runs so they can be compared
    mlflow.log_params(MODEL_CONFIG)

    # Tags from the configuration cell
    for tag_key, tag_value in MLFLOW_TAGS.items():
        mlflow.set_tag(tag_key, tag_value)

    # Additional tags
    mlflow.set_tag("model_type", MODEL_NAME)

    # Main metrics
    mlflow.log_metric("auc_roc", auc)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall_class_1", recall_1)

    # TODO: optional artifacts, e.g. a feature-importance plot saved with
    # plt.savefig("feature_importance.png") and logged with
    # mlflow.log_artifact("feature_importance.png").

    # Log the model under the name defined in the configuration
    mlflow.lightgbm.log_model(model, MODEL_NAME)

    print(f"✓ Run terminé: {RUN_NAME}")
    print(f"  AUC: {auc:.4f} | F1: {f1:.4f} | Recall_1: {recall_1:.4f}")
    print(f"  Tags appliqués: {MLFLOW_TAGS}")
