In [1]:
import lightgbm as lgb

import mlflow
from mlflow.models import infer_signature

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, precision_recall_curve, auc, confusion_matrix, f1_score

import time


## Set global parameters

In [2]:
data_path = "C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\modeling\\data\\04_feature\\second_features_selection.csv"
artifact_path = "C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\modeling\\data\\06_models"

test_size = 0.2
random_state = 18
cost_fn = 10
cost_fp = 1

### Set experiment and tracking URI

In [3]:
mlflow.set_tracking_uri(uri="file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns")
mlflow.set_experiment("Credit_Scoring_Model")

<Experiment: artifact_location='file:///C:/Users/Z478SG/Desktop/Ecole/OpenClassrooms-Projet-7/modeling/mlruns/534552444611581124', creation_time=1725124865324, experiment_id='534552444611581124', last_update_time=1725124865324, lifecycle_stage='active', name='Credit_Scoring_Model', tags={}>

## Load data

In [4]:
# Load data and select only EXT_SOURCE_2 columns
raw_data = pd.read_csv(data_path)
data = raw_data.astype('float16')


In [5]:
#store the 10 000 first rows for testing
data = data[:10000]


## Split data

In [6]:
X = data.drop("TARGET", axis=1)
y = data["TARGET"]

In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

## Create model

### Train and search best parameters of model

In [8]:
def train_model(X_train, y_train, param_grid):
    # Initialisation du modèle
    model = lgb.LGBMClassifier()

    # Initialisation de GridSearchCV
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)

    # Entraînement du modèle
    grid_search.fit(X_train, y_train)

    return grid_search

## Evaluate model

In [8]:
# Define a work cost function
def cost_metric(y_true, y_pred, cost_fn=cost_fn, cost_fp=cost_fp):
    _, fp, fn, _ = confusion_matrix(y_true, y_pred).ravel()
    cost = (fn * cost_fn) + (fp * cost_fp)
    return cost

In [20]:
def optimize_threshold(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    """
    Optimize the threshold for binary classification based on cost metrics.

    Parameters:
    - y_true (array-like): True labels of the binary classification.
    - y_proba (array-like): Predicted probabilities of the positive class.
    - cost_fn (function, optional): Cost function for false negatives. Default is `cost_fn`.
    - cost_fp (function, optional): Cost function for false positives. Default is `cost_fp`.

    Returns:
    - optimal_threshold (float): The optimal threshold that minimizes the cost metric.
    """
    _, _, thresholds = roc_curve(y_true, y_proba)
    costs = []
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        cost = cost_metric(y_true, y_pred, cost_fn, cost_fp)
        costs.append(cost)
    optimal_idx = np.argmin(costs)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [10]:
# test a score with loss of money or not :
# -10 if I grant wrongly
# 0 if I refuse wrongly
# 0 if I rightly refuse
# 1 if I accept rightly

def optimize_threshold2(y_true, y_proba, cost_fn=cost_fn):
    _, _, thresholds = roc_curve(y_true, y_proba)
    
    scores = []
    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        
        true_positives = np.sum((y_true == 1) & (y_pred == 1))
        false_positives = np.sum((y_true == 0) & (y_pred == 1))
        
        score = true_positives - cost_fn * false_positives
        scores.append(score)
    
    optimal_idx = np.argmax(scores)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold


In [11]:
# function to calculate the f beta score
def f_beta_score(y_true, y_pred, beta=1):
    """
    Calculate the F-beta score of a binary classification.

    Parameters:
    - y_true (array-like): True labels of the binary classification.
    - y_pred (array-like): Predicted labels of the binary classification.
    - beta (float, optional): Beta value for the F-beta score. Default is 1.

    Returns:
    - f_beta (float): The F-beta score of the binary classification.
    """
    tp, fp, fn, _ = confusion_matrix(y_true, y_pred).ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
    return f_beta

In [21]:
def manual_scoring(y_true, y_proba, cost_fn=cost_fn, cost_fp=cost_fp):
    """
    Calculate the manual scoring of a binary classification.

    Parameters:
    - y_true (array-like): True labels of the binary classification.
    - y_pred (array-like): Predicted labels of the binary classification.
    - cost_fn (function, optional): Cost function for false negatives. Default is `cost_fn`.
    - cost_fp (function, optional): Cost function for false positives. Default is `cost_fp`.

    Returns:
    - manual_score (float): The manual scoring of the binary classification.
    """
    thresholds = np.linspace(0.5, 0.98, 25)
    costs = []

    for threshold in thresholds:
        y_pred = (y_proba >= threshold).astype(int)
        cost = 0 if y_true==y_pred else cost_fp if y_true==0 and y_pred==1 else cost_fn
        costs.append(cost)
    optimal_idx = np.argmin(costs)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold

In [23]:
def assess_model(best_model, X_test, y_test):
    # Prédictions sur l'ensemble de test
    y_proba = best_model.predict_proba(X_test)[:, 1]
    
    time1 = time.time()
    optimal_threshold = optimize_threshold(y_test, y_proba)
    print(f"Time for optimal threshold: {time.time()-time1}")

    y_pred = (y_proba >= optimal_threshold).astype(int)

    # Calcul des métriques
    time2 = time.time()
    auc_roc = roc_auc_score(y_test, y_proba)
    print(f"Time for auc roc: {time.time()-time2}")

    time3 = time.time()
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    print(f"Time for precision recall: {time.time()-time3}")
    
    time4 = time.time()
    auc_pr = auc(recall, precision) # Area Under the Precision-Recall Curve
    print(f"Time for auc pr: {time.time()-time4}")

    time5 = time.time()
    cost = cost_metric(y_test, y_pred)
    print(f"Time for cost: {time.time()-time5}")

    time6 = time.time()
    f1 = f1_score(y_test, y_pred) # F1 score
    print(f"Time for f1: {time.time()-time6}")

    time7 = time.time()
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Time for accuracy: {time.time()-time7}")

    time8 = time.time()
    optimal_threshold2 = optimize_threshold2(y_test, y_proba, cost_fn)
    print(f"Time for optimal threshold2: {time.time()-time8}")

    # time9 = time.time()
    # manual_threshold = manual_scoring(y_test, y_proba, cost_fn, cost_fp)
    # print(f"Time for manual threshold: {time.time()-time9}")

    time10 = time.time()
    f_beta = f_beta_score(y_test, y_pred)
    print(f"Time for f beta: {time.time()-time10}")

    # return accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, manual_threshold, f_beta
    return accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, f_beta

# tester sur un autre modèle, 
# ajouter les 3 nouvelles fonctions de métrique

# MLFlow

## Log experiment data into MLFlow

### Log data

In [14]:
n_negative = (y_train == 0).sum()
n_positive = (y_train == 1).sum()
base_scale_pos_weight = n_negative / n_positive
print(base_scale_pos_weight)
# base_scale_pos_weight = 11.334168964652795

12.029315960912053


General advice for efficient testing:

Use a smaller subset of your data for initial tests to speed up the process.  
Focus on parameters that typically have the most impact first: learning_rate, num_leaves, and n_estimators.
Keep other parameters at their default values while testing these key parameters.  
Once you've found good values for the key parameters, move on to fine-tuning the others.  
Remember that parameters can interact with each other, so you may need to revisit some parameters after changing others.  

By following this strategy, you can quickly narrow down the most promising parameter values with just two initial tests for each parameter. This focused approach will help you efficiently find a good set of parameters for your LightGBM model, considering your specific requirements for handling imbalanced data and asymmetric misclassification costs.

In [15]:
# create param grid with 2 or 3 options on each for LGBMClassifier
param_grid = {
    'num_leaves': [60, 44], # 255, 127, 31
    'learning_rate': [0.05, 0.1, ], # 0.01, 0.3
    'n_estimators': [150, 200 ], # 1000, 500, 100
    # 'max_depth': [6, 12, ], # 20
    # 'min_child_samples': [20, 50, ], # 100
    # 'subsample': [0.6, 0.8, ], # 1.0
    # 'colsample_bytree': [0.6, 0.8, ], # 1.0
    # 'reg_alpha': [0, 1, ], # 10
    # 'reg_lambda': [0, 1, ], # 10
    'scale_pos_weight': [
        base_scale_pos_weight, 
        # base_scale_pos_weight * 5, 
        # base_scale_pos_weight * 10,
        ]
}

# scale_pos_weight y a t il auto comme param?

In [13]:
# Start an MLflow run
with mlflow.start_run():
    ## Run experiment
    # Train model
    grid_search = train_model(X_train, y_train, param_grid)

    best_model = grid_search.best_estimator_

    # Assess model
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold = assess_model(best_model, X_test, y_test)

    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("LGBM hyperparameters tuning", "second try")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    mlflow.log_params(grid_search.best_params_)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics (accuracy and AUC-ROC)
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        })

    # Infer the model signature (input and output schema) from the training data
    signature = infer_signature(X_train, best_model.predict(X_train))
    
    # Log model
    model_info = mlflow.sklearn.log_model(
        sk_model=best_model, 
        artifact_path=artifact_path, 
        signature=signature,
        input_example=X_train,
        )

    # Log data path
    mlflow.log_artifact(data_path)

[       inf 0.92454165 0.91842248 ... 0.05712654 0.05708317 0.01899781]


In [16]:
# Test without training

import pickle
with open('C:\\Users\\Z478SG\\Desktop\\Ecole\\OpenClassrooms-Projet-7\\modeling\\data\\06_models\\model.pkl', 'rb') as f:
    model = pickle.load(f)

In [24]:
with mlflow.start_run():
    # Assess model
    accuracy, auc_roc, cost, auc_pr, f1, optimal_threshold, optimal_threshold2, f_beta_score = assess_model(model, X_test, y_test)

    ## Log data
    # Set a tag that (key, value) we can use to remind ourselves what this run was for
    mlflow.set_tag("LGBM hyperparameters tuning", "second try")

    # Log parameters
    mlflow.log_params({
        "test_size": test_size,
        "random_state": random_state,
        "cost_fn": cost_fn,
        "cost_fp": cost_fp,
        })
    grid_params_with_suffix = {f"{k}_tested": v for k, v in param_grid.items()}
    mlflow.log_params(grid_params_with_suffix)

    # mlflow.log_params(grid_search.best_params_)

    mlflow.log_param("columns", str(X_train.columns.tolist()))

    # Log metrics (accuracy and AUC-ROC)
    mlflow.log_metrics({
        "Accuracy": accuracy,
        "AUC-ROC": auc_roc,
        "Cost": cost,
        "AUC-PR": auc_pr,
        "F1": f1,
        "Optimal Threshold": optimal_threshold,
        "Optimal Threshold 2": optimal_threshold2,
        # "Manual Threshold": manual_threshold,
        "F beta score": f_beta_score,

        })

    # Infer the model signature (input and output schema) from the training data
    # signature = infer_signature(X_train, model.predict(X_train))
    
    # Log model
    model_info = mlflow.sklearn.log_model(
        sk_model=model, 
        artifact_path=artifact_path, 
        # signature=signature,
        input_example=X_train,
        )

    # Log data path
    mlflow.log_artifact(data_path)

Time for optimal threshold: 4.837181568145752
Time for auc roc: 0.0
Time for precision recall: 0.0
Time for auc pr: 0.0
Time for cost: 0.015623807907104492
Time for f1: 0.01564478874206543
Time for accuracy: 0.0
Time for optimal threshold2: 0.2700319290161133
Time for f beta: 0.015623807907104492


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts:   0%|          | 0/8 [00:00<?, ?it/s]
  "dataframe_split": {
    "columns": [
      "PAYMENT_RATE",
      "EXT_SOURCE_3",
      "EXT_SOURCE_2",
      "DAYS_BIRTH",
      "EXT_SOURCE_1",
      "DAYS_EMPLOYED_PERC",
      "ANNUITY_INCOME_PERC",
      "INSTAL_DBD_MEAN",
      "DAYS_LAST_PHONE_CHANGE",
      "REGION_POPULATION_RELATIVE",
      "ACTIVE_DAYS_CREDIT_UPDATE_MEAN"
    ],
    "data": [
      [
        0.107666015625,
        NaN,
        0.6552734375,
        -10680.0,
        NaN,
        0.041839599609375,
        0.124755859375,
        14.3359375,
        -1100.0,
        0.01446533203125,
        NaN
      ],
      [
        0.049102783203125,
        NaN,
        0.265380859375,
        -10368.0,
        NaN,
        0.264892578125,
        0.1551513671875,
        14.2734375,
        0.0,
        0.0251617431640625,
        NaN
      ],
      [
        0.04998779296875,
        0.6591796875,
        

### If result is good : Register the model as an official model

In [15]:
# If not registered during the run, officially register the model artifact with a specific name

# result = mlflow.register_model(
#     model_info.model_uri, "LGBMClassifier-V1"
# )