In [2]:
import lightgbm as lgb

import mlflow
from mlflow.models import infer_signature

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score

import time


In [3]:
import sys
sys.path.insert(0, 'C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\')


## Set global parameters

In [4]:
data_path = "C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\modeling\\data\\04_feature\\second_features_selection.csv"

test_size = 0.2
random_state = 18
cost_fn = 10
cost_fp = 1

### Set experiment and tracking URI

In [5]:
mlflow.set_tracking_uri(uri="file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns")
mlflow.set_experiment("Credit_Scoring_Model")

<Experiment: artifact_location='file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns/534552444611581124', creation_time=1725124865324, experiment_id='534552444611581124', last_update_time=1725124865324, lifecycle_stage='active', name='Credit_Scoring_Model', tags={}>

## Load data

In [6]:
raw_data = pd.read_csv(data_path)
data = raw_data.astype('float16')
data.shape

(307507, 12)

In [7]:
#store the 10 000 first rows for testing
# data = data[:10000]
# data.shape

## Split data

In [7]:
X = data.drop("TARGET", axis=1)
y = data["TARGET"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

## Create model

### Train and search best parameters of model

In [8]:
def train_model(X_train, y_train, param_grid):
    # Initialisation du modèle
    model = lgb.LGBMClassifier()

    # Initialisation de GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

    # Entraînement du modèle
    grid_search.fit(X_train, y_train)

    return grid_search

## Evaluate model

In [9]:
# Define a work cost function
def cost_metric(y_true, y_pred, cost_fn=cost_fn, cost_fp=cost_fp):
    _, fp, fn, _ = confusion_matrix(y_true, y_pred).ravel() # c'est bien arg 2 et 3 pour fp et fn ? -> oui
    cost = (fn * cost_fn) + (fp * cost_fp)
    return cost

In [10]:
def optimize_threshold(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    _, _, thresholds = roc_curve(y_true, y_proba)
    costs = []
    for threshold in thresholds:
        y_pred = (y_proba <= threshold).astype(int)
        cost = cost_metric(y_true, y_pred, cost_fn, cost_fp)
        costs.append(cost)
    optimal_idx = np.argmin(costs)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [11]:
def optimize_threshold2(y_true, y_proba, cost_fn=cost_fn):
    y_true = np.array(y_true)
    _, _, thresholds = roc_curve(y_true, y_proba)

    scores = []
    for threshold in thresholds:        
        y_pred = (y_proba <= threshold).astype(int) 
        
        false_positives = np.sum((y_true == 0) & (y_pred == 0))
        false_negatives = np.sum((y_true == 1) & (y_pred == 0))
        
        score = -(false_positives*cost_fp + cost_fn*false_negatives)
        scores.append(score)
    
    optimal_idx = np.argmax(scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [12]:
def f_beta_score(y_true, y_pred, beta=1):
    tp, fp, fn, _ = confusion_matrix(y_true, y_pred).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return f_beta

# bonne métrique pour les déséquilibres de coût.

In [13]:
def manual_scoring(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    thresholds = np.linspace(0.02, 0.5, 50)
    costs = []

    for threshold in thresholds:
        y_pred = (y_proba <= threshold).astype(int)
        cost = 0 if (y_true==y_pred).all() else (cost_fp if (y_true==0).all() and (y_pred==1).all() else cost_fn)
        costs.append(cost)
    optimal_idx = np.argmin(costs)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [14]:
def assess_model(best_model, X_test, y_test):
    # Prédictions sur l'ensemble de test
    y_proba = best_model.predict_proba(X_test)[:, 1]
    
    time1 = time.time()
    # optimal_threshold = optimize_threshold(y_test, y_proba) # Production
    optimal_threshold = 0.2 # Test
    print(f"Time for optimal threshold: {time.time()-time1}")
    print(f"optimal_threshold1: {optimal_threshold}")

    time8 = time.time()
    optimal_threshold2 = optimize_threshold2(y_test, y_proba, cost_fn)
    print(f"\nTime for optimal threshold2: {time.time()-time8}")
    print(f"optimal_threshold2: {optimal_threshold2}")

    time9 = time.time()
    # manual_threshold = manual_scoring(y_test, y_proba, cost_fn, cost_fp) # Production
    manual_threshold = 0.2 # Test
    print(f"Time for manual threshold: {time.time()-time9}")
    print(f"manual_threshold: {manual_threshold}")

    y_pred = (y_proba <= manual_threshold).astype(int)

    # Calcul des métriques
    time2 = time.time()
    auc_roc = roc_auc_score(y_test, y_proba)
    print(f"\nTime for auc roc: {time.time()-time2}")

    time3 = time.time()
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    print(f"Time for precision recall: {time.time()-time3}")
    
    time4 = time.time()
    auc_pr = auc(recall, precision) # Area Under the Precision-Recall Curve
    print(f"Time for auc pr: {time.time()-time4}")

    time5 = time.time()
    cost = cost_metric(y_test, y_pred)
    print(f"Time for cost: {time.time()-time5}")

    time6 = time.time()
    f1 = f1_score(y_test, y_pred) # F1 score
    print(f"Time for f1: {time.time()-time6}")

    time7 = time.time()
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Time for accuracy: {time.time()-time7}")

    time10 = time.time()
    f_beta = f_beta_score(y_test, y_pred)
    print(f"Time for f beta: {time.time()-time10}")

    return accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta

# MLFlow

## Log experiment data into MLFlow

### Log data

In [15]:
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
base_scale_pos_weight = n_negative / n_positive
print(base_scale_pos_weight)
# base_scale_pos_weight = 11.334168964652795

11.334168964652795


General advice for efficient testing:

Use a smaller subset of your data for initial tests to speed up the process.  
Focus on parameters that typically have the most impact first: learning_rate, num_leaves, and n_estimators.
Keep other parameters at their default values while testing these key parameters.  
Once you've found good values for the key parameters, move on to fine-tuning the others.  
Remember that parameters can interact with each other, so you may need to revisit some parameters after changing others.  

By following this strategy, you can quickly narrow down the most promising parameter values with just two initial tests for each parameter. This focused approach will help you efficiently find a good set of parameters for your LightGBM model, considering your specific requirements for handling imbalanced data and asymmetric misclassification costs.

In [16]:
# create param grid with 2 or 3 options on each for LGBMClassifier
param_grid = {
    'num_leaves': [60, 44], # 255, 127, 31
    'learning_rate': [0.05, 0.1, ], # 0.01, 0.3
    'n_estimators': [150, 200 ], # 1000, 500, 100
    # 'max_depth': [6, 12, ], # 20
    # 'min_child_samples': [20, 50, ], # 100
    # 'subsample': [0.6, 0.8, ], # 1.0
    # 'colsample_bytree': [0.6, 0.8, ], # 1.0
    # 'reg_alpha': [0, 1, ], # 10
    # 'reg_lambda': [0, 1, ], # 10
    'scale_pos_weight': [
        base_scale_pos_weight, 
        # base_scale_pos_weight * 5, 
        # base_scale_pos_weight * 10,
        ]
}

# scale_pos_weight y a t il auto comme param?

# Modèle de référence

## 1 (non) pour tout le monde

In [37]:
from modeling.pipelines.utils.always_one_classifier import AlwaysOneClassifier

with mlflow.start_run():  
    always_one_model = AlwaysOneClassifier()

    # Évaluation finale sur l'ensemble de test
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta = assess_model(always_one_model, X_test, y_test)
    
    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("Process", "Baseline model")
    mlflow.set_tag("Model", "AlwaysOneClassifier")
    mlflow.set_tag("Data", "full")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Optimal Threshold 2": optimal_threshold2,
        "Manual Threshold": manual_threshold,
        "F beta score": f_beta,
        })

    mlflow.sklearn.log_model(always_one_model, "AlwaysOne Baseline Model")

Time for optimal threshold: 0.0
optimal_threshold1: 0.2

Time for optimal threshold2: 0.031264305114746094
optimal_threshold2: inf
Time for manual threshold: 0.07474446296691895
manual_threshold: 0.02

Time for auc roc: 0.03842282295227051
Time for precision recall: 0.015636682510375977
Time for auc pr: 0.0
Time for cost: 0.5618212223052979
Time for f1: 0.08158016204833984
Time for accuracy: 0.016668319702148438
Time for f beta: 0.4590303897857666




## 0 (oui) pour tout le monde

In [38]:
from modeling.pipelines.utils.always_zero_classifier import AlwaysZeroClassifier

with mlflow.start_run():  
    always_zero_model = AlwaysZeroClassifier()

    # Évaluation finale sur l'ensemble de test
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta = assess_model(always_zero_model, X_test, y_test)
    
    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("Process", "Baseline model")
    mlflow.set_tag("Model", "AlwaysOneClassifier")
    mlflow.set_tag("Data", "full")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Optimal Threshold 2": optimal_threshold2,
        "Manual Threshold": manual_threshold,
        "F beta score": f_beta,
        })

    mlflow.sklearn.log_model(always_zero_model, "AlwaysZero Baseline Model")

Time for optimal threshold: 0.0
optimal_threshold1: 0.2

Time for optimal threshold2: 0.03454089164733887
optimal_threshold2: inf
Time for manual threshold: 0.0914607048034668
manual_threshold: 0.02

Time for auc roc: 0.03906059265136719
Time for precision recall: 0.018339872360229492
Time for auc pr: 0.0
Time for cost: 0.45151257514953613
Time for f1: 0.058870553970336914
Time for accuracy: 0.015625953674316406


  recall = tp / (tp + fn)


Time for f beta: 0.5461218357086182




## ...

In [17]:
# Start an MLflow run
with mlflow.start_run():
    ## Run experiment
    # Train model
    grid_search = train_model(X_train, y_train, param_grid)

    best_model = grid_search.best_estimator_

    # Assess model
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold = assess_model(best_model, X_test, y_test)

    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("LGBM hyperparameters tuning", "second try")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_params(grid_search.best_params_)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics (accuracy and AUC-ROC)
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        })
    
    # Log model
    mlflow.sklearn.log_model(best_model, "model")

KeyboardInterrupt: 

# Affinage des hyperparamètres en fonction des seuils

In [18]:
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def custom_scorer(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    optimal_threshold = optimize_threshold2(y_true, y_proba, cost_fn, cost_fp)
    y_pred = (y_proba <= optimal_threshold).astype(int)
    return -cost_metric(y_true, y_pred, cost_fn, cost_fp)  # Négatif car GridSearchCV maximise le score

def evaluate_model(model, X, y, param_grid, cv=5):
    # Définir le scorer personnalisé
    scorer = make_scorer(custom_scorer, needs_proba=True, greater_is_better=False)
    
    # Configurer la recherche sur grille avec validation croisée
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring=scorer,
        cv=KFold(n_splits=cv, shuffle=True, random_state=random_state),
        n_jobs=-1,
        verbose=1
    )
    
    # Effectuer la recherche sur grille
    grid_search.fit(X, y)
    
    return grid_search.best_estimator_, grid_search.best_params_, grid_search.best_score_

with mlflow.start_run():
    model = lgb.LGBMClassifier()
    
    # Utilisation
    best_model, best_params, best_score = evaluate_model(model, X_train, y_train, param_grid)

    # Évaluation finale sur l'ensemble de test
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta = assess_model(best_model, X_test, y_test)
    
        ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("Process", "hyper params and score finetuning")
    mlflow.set_tag("Model", "LGBM")
    mlflow.set_tag("Data", "full")
    mlflow.set_tag("y pred __ y proba X threshold", "<=")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_param("best_params", best_params)
    mlflow.log_param("best_score", best_score)
    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Optimal Threshold 2": optimal_threshold2,
        "Manual Threshold": manual_threshold,
        "F beta score": f_beta,
        })

    mlflow.sklearn.log_model(best_model, "model")




Fitting 5 folds for each of 8 candidates, totalling 40 fits




[LightGBM] [Info] Number of positive: 19945, number of negative: 226060
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027094 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2628
[LightGBM] [Info] Number of data points in the train set: 246005, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.081076 -> initscore=-2.427822
[LightGBM] [Info] Start training from score -2.427822
Time for optimal threshold: 0.0
optimal_threshold1: 0.2

Time for optimal threshold2: 10.61282992362976
optimal_threshold2: inf
Time for manual threshold: 0.0
manual_threshold: 0.2

Time for auc roc: 0.12268614768981934
Time for precision recall: 0.03197193145751953
Time for auc pr: 0.0
Time for cost: 0.5494816303253174
Time for f1: 0.07756805419921875
Time for accuracy: 0.01603531837463379
Time for f beta: 0.4054219722747803




# Register best model

In [19]:
import os
from mlflow.tracking import MlflowClient
from modeling.pipelines.utils.custom_threshold_model import CustomThresholdModel

# Créer un client MLflow
client = MlflowClient()

# Récupérer tous les runs de l'expériment
runs = client.search_runs(experiment_ids=['534552444611581124'])

best_run = runs[0]

model_path = os.path.join(os.path.expanduser("~"), "Desktop", "Ecole", "OpenClassrooms-Projet-7", "modeling", "data", "06_models", "latest")

best_model = mlflow.sklearn.load_model(f"runs:/{best_run.info.run_id}/model")

# créer un model custom qui a une fonction .predict() qui contient la valeur de la métrique "Manual Threshold". Cette fonction utilise .predict_proba(), comparer le résultat au seuil et retourne 1 ou 0 en fonction de si la probabilité est supérieure ou inférieure à la valeur de la métrique "Manual Threshold" 
manual_threshold = client.get_metric_history(best_run.info.run_id, "Manual Threshold")[0].value

custom_model = CustomThresholdModel(best_model, manual_threshold)

mlflow.sklearn.save_model(custom_model, model_path)

print(f"Meilleur modèle sauvegardé dans: {model_path}")


Meilleur modèle sauvegardé dans: C:\Users\Z478SG\Desktop\Ecole\OpenClassrooms-Projet-7\modeling\data\06_models\latest


# Test without training

In [21]:
# Test without training

import pickle
with open('C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\modeling\\data\\06_models\\model.pkl', 'rb') as f:
    best_model = pickle.load(f)

In [1]:
with mlflow.start_run():
    # Assess model
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta = assess_model(best_model, X_test, y_test)

    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("LGBM hyperparameters tuning", "second try")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    # mlflow.log_params(grid_search.best_params_)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics (accuracy and AUC-ROC)
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Optimal Threshold 2": optimal_threshold2,
        "Manual Threshold": manual_threshold,
        "F beta score": f_beta,
        })

    mlflow.sklearn.log_model(best_model, "model")
