In [7]:
import lightgbm as lgb

import mlflow
from mlflow.models import infer_signature

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score


## Set global parameters

In [8]:
# Filesystem locations, all derived from a single project root
project_root = r"C:\Users\Z478SG\Desktop\Ecole\OpenClassrooms-Projet-7\modeling"
data_path = project_root + r"\data\04_feature\second_features_selection.csv"
artifact_path = project_root + r"\data\06_models"

# Train/test split settings
test_size = 0.2
random_state = 18

# Misclassification costs: cost of one false negative / one false positive
cost_fn = 10
cost_fp = 1

### Set experiment and tracking URI

In [9]:
# Point MLflow at the local file-based tracking store, then activate the experiment.
# set_experiment stays the last expression so the cell still displays the Experiment object.
tracking_uri = "file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns"
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name="Credit_Scoring_Model")

<Experiment: artifact_location='file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns/534552444611581124', creation_time=1725124865324, experiment_id='534552444611581124', last_update_time=1725124865324, lifecycle_stage='active', name='Credit_Scoring_Model', tags={}>

## Load data

In [11]:
# Load the full feature table; no column selection happens at this stage.
raw_data = pd.read_csv(data_path)
# Downcast to float32 to halve memory versus the float64 default.
# float16 is NOT safe here: it overflows to inf above 65504 and keeps only
# about three significant digits, which silently corrupts large-valued features.
data = raw_data.astype('float32')


In [8]:
# Optional: keep only the first 50,000 rows to speed up test runs
# data = data[:50000]


## Split data

In [12]:
# Separate the label column from the feature matrix
y = data["TARGET"]
X = data.drop(columns=["TARGET"])

In [13]:
# Hold out a test split; the fixed seed keeps the partition reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

## Create model

### Train and search best parameters of model

In [14]:
def train_model(X_train, y_train, param_grid):
    """Grid-search LightGBM hyper-parameters and return the fitted search.

    Args:
        X_train: Training features.
        y_train: Training labels.
        param_grid: Mapping of LGBMClassifier parameter names to the
            candidate values to try.

    Returns:
        The fitted ``GridSearchCV`` object (5-fold CV, scored by ROC AUC).
    """
    search = GridSearchCV(
        estimator=lgb.LGBMClassifier(),
        param_grid=param_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    # fit() returns the search object itself, so it can be returned directly
    return search.fit(X_train, y_train)

## Evaluate model

In [15]:
# Business cost function: weighted count of false negatives and false positives
def cost_metric(y_true, y_pred, cost_fn=cost_fn, cost_fp=cost_fp):
    """Return the total misclassification cost of binary predictions.

    Args:
        y_true: Ground-truth binary labels (0 = negative, 1 = positive).
        y_pred: Predicted binary labels.
        cost_fn: Cost charged for each false negative.
        cost_fp: Cost charged for each false positive.

    Returns:
        ``fn * cost_fn + fp * cost_fp``.
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present
    # in both inputs; without it the matrix is 1x1 and the unpacking below fails.
    _tn, fp, fn, _tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return (fn * cost_fn) + (fp * cost_fp)

In [16]:
def optimize_threshold(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    """Return the decision threshold with the lowest misclassification cost.

    Candidate thresholds are the ones returned by ``roc_curve``. For each
    candidate, a sample is predicted positive when ``y_proba >= threshold``
    and the cost is ``fn * cost_fn + fp * cost_fp``.

    Args:
        y_true: Ground-truth binary labels (0 = negative, 1 = positive).
        y_proba: Predicted scores for the positive class.
        cost_fn: Cost charged for each false negative.
        cost_fp: Cost charged for each false positive.

    Returns:
        The candidate threshold with minimal total cost (the first one when
        several candidates tie). Note that ``roc_curve`` includes a sentinel
        threshold above every score ("predict no positives"), which can be
        returned if that is the cheapest policy.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_proba)

    labels = np.asarray(y_true)
    n_pos = np.count_nonzero(labels == 1)
    n_neg = labels.size - n_pos

    # roc_curve already reports the error rates at every candidate threshold,
    # so recover the FP/FN counts from them instead of rebuilding a confusion
    # matrix per threshold (one pass instead of one pass per threshold).
    # nan_to_num covers the single-class case where a rate is undefined (0/0);
    # rint removes floating-point noise so ties break exactly as with counts.
    fp = np.rint(np.nan_to_num(fpr) * n_neg)
    fn = np.rint((1.0 - np.nan_to_num(tpr)) * n_pos)
    costs = fn * cost_fn + fp * cost_fp

    return thresholds[np.argmin(costs)]


In [17]:
def assess_model(best_model, X_test, y_test, threshold=None):
    """Score a fitted classifier on an evaluation set.

    Args:
        best_model: Fitted classifier exposing ``predict_proba``.
        X_test: Evaluation features.
        y_test: Evaluation binary labels.
        threshold: Optional decision threshold chosen beforehand (for example
            on a separate validation split). When ``None`` (the default, and
            the original behaviour) the threshold is tuned on ``y_test``
            itself, which makes the threshold-dependent metrics (cost, F1,
            accuracy) optimistic.

    Returns:
        Tuple ``(accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold)``.
    """
    # Positive-class scores on the evaluation set
    y_proba = best_model.predict_proba(X_test)[:, 1]

    if threshold is None:
        optimal_threshold = optimize_threshold(y_test, y_proba)
    else:
        optimal_threshold = threshold
    y_pred = (y_proba >= optimal_threshold).astype(int)

    # Threshold-independent metrics
    auc_roc = roc_auc_score(y_test, y_proba)
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(recall, precision)  # area under the precision-recall curve

    # Threshold-dependent metrics
    cost = cost_metric(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold

# MLFlow

## Log experiment data into MLFlow

### Log data

In [18]:
# Class-imbalance ratio (negatives per positive) in the training split;
# it is the baseline value for the scale_pos_weight search below.
n_positive = y_train.eq(1).sum()
n_negative = y_train.eq(0).sum()
base_scale_pos_weight = n_negative / n_positive
print(base_scale_pos_weight)
# Value observed on the full training split: 11.334168964652795

11.334168964652795


General advice for efficient testing:

Use a smaller subset of your data for initial tests to speed up the process.  
Focus on parameters that typically have the most impact first: learning_rate, num_leaves, and n_estimators.
Keep other parameters at their default values while testing these key parameters.  
Once you've found good values for the key parameters, move on to fine-tuning the others.  
Remember that parameters can interact with each other, so you may need to revisit some parameters after changing others.  

By following this strategy, you can quickly narrow down the most promising parameter values with just two initial tests for each parameter. This focused approach will help you efficiently find a good set of parameters for your LightGBM model, considering your specific requirements for handling imbalanced data and asymmetric misclassification costs.

In [22]:
# Search space for LGBMClassifier: two candidate values per tuned parameter.
# Trailing comments and commented-out entries are extra candidates kept for later runs.
param_grid = dict(
    num_leaves=[31, 127],  # 255
    learning_rate=[0.01, 0.1],  # 0.3
    n_estimators=[100, 500],  # 1000
    # max_depth=[6, 12],  # 20
    # min_child_samples=[20, 50],  # 100
    # subsample=[0.6, 0.8],  # 1.0
    # colsample_bytree=[0.6, 0.8],  # 1.0
    # reg_alpha=[0, 1],  # 10
    # reg_lambda=[0, 1],  # 10
    scale_pos_weight=[
        base_scale_pos_weight,
        5 * base_scale_pos_weight,
        # 10 * base_scale_pos_weight,
    ],
)



In [24]:
# Start an MLflow run: tune, evaluate, then log what is needed to reproduce the result
with mlflow.start_run():
    ## Run experiment
    # Hyper-parameter search on the training split
    grid_search = train_model(X_train, y_train, param_grid)

    best_model = grid_search.best_estimator_

    # Evaluate the best estimator on the held-out split
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold = assess_model(best_model, X_test, y_test)

    ## Log data
    # Set a (key, value) tag to remind ourselves what this run was for
    mlflow.set_tag("LGBM hyperparameters tuning", "first try")

    # Log the global settings of the experiment
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    # Log the searched grid under "<name>_tested" so it cannot clash with
    # the selected values logged right after under the plain parameter names
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_params(grid_search.best_params_)

    mlflow.log_param("columns", str(X_train.columns))

    # Log evaluation metrics
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        })

    # A handful of rows is enough to describe the input/output schema; passing
    # the whole training set would serialise every row into the model artifact.
    input_example = X_train.head(5)
    signature = infer_signature(input_example, best_model.predict(input_example))

    # Log model. MLflow's artifact_path is relative to the run's artifact
    # root, so the absolute filesystem path held in the module-level
    # `artifact_path` must not be passed here.
    model_info = mlflow.sklearn.log_model(
        sk_model=best_model,
        artifact_path="model",
        signature=signature,
        input_example=input_example,
        )

    # Attach a copy of the training data file itself to the run
    mlflow.log_artifact(data_path)
    

### View experiments in MLFlow UI

In a background terminal, run:  

./run_mlflow_ui.py

### If result is good : Register the model as an official model

In [None]:
# If not registered during the run, officially register the model artifact with a specific name

# result = mlflow.register_model(
#     model_info.model_uri, "LGBMClassifier-V1"
# )