In [1]:
from src.mlflow_config import configure_mlflow

mlflow = configure_mlflow()

  return FileStore(store_uri, store_uri)
2026/02/03 19:13:55 INFO mlflow.tracking.fluent: Experiment with name 'OC_P6_Credit_Scoring' does not exist. Creating a new experiment.


In [2]:
import pandas as pd

# Exemple si tu as sauvegardé les features
X_train = pd.read_csv("../data/processed/features_train.csv")
y_train = X_train.pop("TARGET")  # ou le nom de ta cible
# Même chose pour X_val, y_val si tu as un split

In [3]:
# Convertir les colonnes object en types numériques
import numpy as np

# Identifier et convertir les colonnes object
object_cols = X_train.select_dtypes(include=['object']).columns.tolist()
print(f"Colonnes object détectées: {object_cols}")

# Convertir chaque colonne object en numeric
for col in object_cols:
    X_train[col] = pd.to_numeric(X_train[col], errors='coerce')
    # Remplacer les NaN introduits par la conversion par 0
    X_train[col] = X_train[col].fillna(0)

# Nettoyer les noms de colonnes (remplacer les caractères spéciaux)
X_train.columns = X_train.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '_', regex=True)

# Vérifier que toutes les colonnes sont numériques
print(f"Dtypes après conversion:\n{X_train.dtypes.value_counts()}")
print(f"\nColonnes (exemples): {X_train.columns[:5].tolist()}")

Colonnes object détectées: []
Dtypes après conversion:
float64    568
bool       131
int64       42
Name: count, dtype: int64

Colonnes (exemples): ['SK_ID_CURR', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']


## Runs de modèles
Les entraînements et le logging MLflow commencent ici.

In [4]:
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, recall_score
from sklearn.model_selection import train_test_split

# Split si pas déjà fait
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

# Appliquer les mêmes transformations aux données splittées
X_train_split.columns = X_train_split.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '_', regex=True)
X_val_split.columns = X_val_split.columns.str.replace(' ', '_').str.replace('[^a-zA-Z0-9_]', '_', regex=True)

with mlflow.start_run(run_name="LGBM_baseline_v1"):  # Nom clair pour identifier le run dans l'UI
    
    # Définition du modèle
    model = LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        class_weight="balanced",  # Gère le déséquilibre (ou utilise scale_pos_weight)
        random_state=42
    )
    
    # Entraînement
    model.fit(X_train_split, y_train_split)
    
    # Prédictions et métriques
    y_pred_proba = model.predict_proba(X_val_split)[:, 1]
    y_pred = model.predict(X_val_split)
    
    auc = roc_auc_score(y_val_split, y_pred_proba)
    f1 = f1_score(y_val_split, y_pred)
    recall_1 = recall_score(y_val_split, y_pred)  # Recall sur la classe minoritaire
    
    # === TRACKING MLFlow ===
    # Tags pour contextualiser (MLFlow autolog enregistre déjà les paramètres)
    mlflow.set_tag("model_type", "LightGBM")
    mlflow.set_tag("phase", "baseline")
    mlflow.set_tag("desequilibre", "class_weight_balanced")
    
    # Métriques principales
    mlflow.log_metric("auc_roc", auc)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("recall_class_1", recall_1)
    
    # Artefacts utiles (ex: plot importance)
    # import matplotlib.pyplot as plt
    # ... plot feature importance ...
    # plt.savefig("feature_importance.png")
    # mlflow.log_artifact("feature_importance.png")
    
    # Log du modèle (important pour le registry plus tard)
    mlflow.lightgbm.log_model(model, "model")
    
    print(f"AUC: {auc:.4f} | F1: {f1:.4f} | Recall_1: {recall_1:.4f}")

2026/02/03 19:13:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2026/02/03 19:13:56 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.


[LightGBM] [Info] Number of positive: 620, number of negative: 7380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002744 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 19292
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 647
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000




AUC: 0.7402 | F1: 0.1166 | Recall_1: 0.0839
