In [10]:
# ============================================================================
# NOTEBOOK CONFIGURATION
# ============================================================================
# Every tunable value used by the notebook lives in this cell so that a reader
# can change an experiment without hunting through the code cells.

# MLflow configuration
MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT_NAME = "OC_P6_Credit_Scoring"

# Project configuration
PROJECT_VERSION = "1.0"
MODEL_NAME = "LightGBM"
NOTEBOOK_NAME = "03_LGBM"
# Static run date recorded as an MLflow tag; it must be updated by hand when
# the notebook is re-run on another day.
RUN_DATE = "2026-02-03"

# Data configuration
DATA_PATH = "../data/processed/"
TRAIN_FILE = "features_train.csv"
TEST_FILE = "features_test.csv"

# Validation configuration
VALIDATION_SPLIT_RATIO = 0.2
# Single source of truth for the seed: it is used for the train/validation
# split and injected into the model configuration below, so the two can never
# drift apart.
RANDOM_STATE = 42

# Baseline model configuration (keyword arguments for the LightGBM classifier)
MODEL_CONFIG = {
    "n_estimators": 500,
    "learning_rate": 0.05,
    "num_leaves": 31,
    "class_weight": "balanced",
    "random_state": RANDOM_STATE,
}

# MLflow tags attached to every run started from this notebook
MLFLOW_TAGS = {
    "project_version": PROJECT_VERSION,
    "notebook": NOTEBOOK_NAME,
    "phase": "baseline",
    "desequilibre_handling": "class_weight_balanced",
    "date": RUN_DATE,
}

# Short confirmation so the reader can see which setup is active
print("Configuration charg√©e avec succ√®s !")
print(f"MLflow Experiment: {MLFLOW_EXPERIMENT_NAME}")
print(f"Project Version: {PROJECT_VERSION}")
print(f"Model: {MODEL_NAME}")


Configuration charg√©e avec succ√®s !
MLflow Experiment: OC_P6_Credit_Scoring
Project Version: 1.0
Model: LightGBM


# 03 - LightGBM Modeling with MLflow Tracking

Configuration and experimentation notebook for a credit scoring model.
All runs will be tracked in MLflow for comparison and reproducibility.

In [11]:
from src.mlflow_config import configure_mlflow

# Set up MLflow through the project helper and keep whatever it returns under
# the name `mlflow`; the training cell later calls mlflow.start_run, set_tag,
# log_metric and mlflow.lightgbm.log_model on this object.
# NOTE(review): presumably the helper returns the configured `mlflow` module
# and applies the tracking URI / experiment itself - confirm in
# src/mlflow_config.py, since MLFLOW_TRACKING_URI and MLFLOW_EXPERIMENT_NAME
# are not passed in here.
mlflow = configure_mlflow()

2026/02/03 23:29:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2026/02/03 23:29:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


In [12]:
from pathlib import Path

import pandas as pd

# Load the processed training features from the location declared in the
# configuration cell (DATA_PATH / TRAIN_FILE) rather than repeating the path
# here, so changing the configuration actually changes what is loaded.
X_train = pd.read_csv(Path(DATA_PATH) / TRAIN_FILE)

# Separate the label from the features. pop() removes the column from X_train
# in place and raises KeyError if "TARGET" is missing from the file.
y_train = X_train.pop("TARGET")

# No dedicated validation file is loaded here; a validation set, if needed,
# has to be split off from X_train / y_train.
# NOTE(review): the first feature column is SK_ID_CURR, which looks like a row
# identifier - confirm whether it should be dropped before training.

In [13]:
# Convert object-typed columns to numeric types and sanitize column names
import numpy as np

# Identify the columns pandas loaded as generic `object`
object_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"Colonnes object d√©tect√©es: {object_cols}")

# Convert each object column to a numeric dtype
for col in object_cols:
    original_values = X_train[col]
    numeric_values = pd.to_numeric(original_values, errors='coerce')

    # Only values that could not be parsed are replaced by 0. Values that were
    # already missing stay NaN, so genuine missingness is not silently turned
    # into a real-looking zero.
    coerced_mask = numeric_values.isna() & original_values.notna()
    n_coerced = int(coerced_mask.sum())
    if n_coerced:
        # Make the lossy replacement visible instead of hiding it
        print(f"  {col}: {n_coerced} non-numeric value(s) replaced by 0")

    X_train[col] = numeric_values.mask(coerced_mask, 0)

# Sanitize column names: every character outside [a-zA-Z0-9_] (spaces
# included) becomes an underscore.
X_train.columns = X_train.columns.str.replace('[^a-zA-Z0-9_]', '_', regex=True)

# Sanitizing can map two distinct raw names onto the same clean name; fail
# early with an explicit message rather than deep inside model training.
duplicated_names = X_train.columns[X_train.columns.duplicated()].unique().tolist()
if duplicated_names:
    raise ValueError(
        f"Column names are no longer unique after sanitization: {duplicated_names}"
    )

# Check which dtypes remain after the conversion
print(f"Dtypes apr√®s conversion:\n{X_train.dtypes.value_counts()}")
print(f"\nColonnes (exemples): {X_train.columns[:5].tolist()}")

Colonnes object d√©tect√©es: []
Dtypes apr√®s conversion:
float64    568
bool       131
int64       42
Name: count, dtype: int64

Colonnes (exemples): ['SK_ID_CURR', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']


## Runs de modèles
Les entraînements et le logging MLflow commencent ici.

In [14]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split

# Stratified hold-out split; ratio and seed come from the configuration cell.
split_options = dict(
    test_size=VALIDATION_SPLIT_RATIO,
    stratify=y_train,
    random_state=RANDOM_STATE,
)
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, **split_options
)

# Apply the same column-name sanitization to both halves of the split.
for split_frame in (X_train_split, X_val_split):
    split_frame.columns = (
        split_frame.columns
        .str.replace(' ', '_')
        .str.replace('[^a-zA-Z0-9_]', '_', regex=True)
    )

# Run name carries the model name and the project version.
RUN_NAME = "{}_baseline_{}".format(MODEL_NAME, PROJECT_VERSION)

with mlflow.start_run(run_name=RUN_NAME):

    # Build the classifier from the shared configuration and train it.
    model = LGBMClassifier(**MODEL_CONFIG)
    model.fit(X_train_split, y_train_split)

    # Score the validation set: positive-class probabilities for the AUC,
    # predicted labels for F1 and recall.
    y_pred_proba = model.predict_proba(X_val_split)[:, 1]
    y_pred = model.predict(X_val_split)

    auc = roc_auc_score(y_val_split, y_pred_proba)
    f1 = f1_score(y_val_split, y_pred)
    recall_1 = recall_score(y_val_split, y_pred)

    # === MLflow tracking ===
    # Tags from the configuration, followed by one extra tag for the model type.
    for tag_name in MLFLOW_TAGS:
        mlflow.set_tag(tag_name, MLFLOW_TAGS[tag_name])
    mlflow.set_tag("model_type", MODEL_NAME)

    # Headline validation metrics.
    for metric_name, metric_value in (
        ("auc_roc", auc),
        ("f1_score", f1),
        ("recall_class_1", recall_1),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # TODO: log a feature-importance plot as a run artifact.

    # Store the fitted model under the configured model name.
    # NOTE(review): the setup cell output reports autologging enabled for
    # lightgbm - confirm this explicit call does not log the model twice.
    mlflow.lightgbm.log_model(model, MODEL_NAME)

    print(f"‚úì Run termin√©: {RUN_NAME}")
    print(f"  AUC: {auc:.4f} | F1: {f1:.4f} | Recall_1: {recall_1:.4f}")
    print(f"  Tags appliqu√©s: {MLFLOW_TAGS}")




[LightGBM] [Info] Number of positive: 620, number of negative: 7380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19292
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 647
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




‚úì Run termin√©: LightGBM_baseline_1.0
  AUC: 0.7402 | F1: 0.1166 | Recall_1: 0.0839
  Tags appliqu√©s: {'project_version': '1.0', 'notebook': '03_LGBM', 'phase': 'baseline', 'desequilibre_handling': 'class_weight_balanced', 'date': '2026-02-03'}
üèÉ View run LightGBM_baseline_1.0 at: http://127.0.0.1:5000/#/experiments/1/runs/15a3e06c66634ba0ae72b10d13cf480c
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/1
