# MODEL SELECTION

In [89]:

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from mlflow.models import infer_signature
import matplotlib.pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score,ConfusionMatrixDisplay, recall_score, f1_score, roc_curve, roc_auc_score, fbeta_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from preprocessing import preprocessing
import re
import statsmodels.api as sm
from sklearn.pipeline import Pipeline
import mlflow
from mlflow.models import infer_signature


In [159]:
df = pd.read_csv('data/df_final.csv')
df.shape

(356251, 42)

In [160]:
df = df[df['TARGET'].notnull()]
df.shape

(307507, 42)

In [161]:
Y = df['TARGET']
df_train = df.drop(labels='TARGET', axis=1)

In [None]:
df_train.shape, Y.shape

((307507, 41), (307507,))

### Split Train Data : Train, Test, Validation

In [None]:
feats = [f for f in df_train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
df_train[feats].info()

In [165]:
X_train, X_test, y_train, y_test = train_test_split(df_train[feats], Y, stratify=Y, test_size=0.3, random_state=101)
X_train.shape, X_test.shape

((215254, 40), (92253, 40))

In [166]:
y_train.value_counts()/len(y_train) * 100

TARGET
0.0    91.927212
1.0     8.072788
Name: count, dtype: float64

In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # Distribution des données n'est pas normale
scaler.fit(df_train[feats])
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Fonction de coût métier

**Objectif :** Prédiction sur la probabilité de faillite d'un client (classe positive de la variable Cible), le client aura des difficultés de paiement.
Minimiser les Faux Négatifs (FN), et maximiser la précision de la prédiction d'un client en défaut (1) qui est la classe minoritaire.
Maximiser le rappel pour minimiser les Faux Négatifs (Prédiction d'absence de diffcultés de paiement pour un client qui a en réalité des difficultés de paiement) => Erreur de type II.

On souhaite éviter en priorité d'accorder un prêt à un mauvais client à tord => Coût FN > Coût FP

Suivi du score **F2 ou Fbeta**, on supposera que le coût d'un FN est **10 foix supérieur** que le coût d'un FP


In [37]:
# calculate f2-measure
def f2_measure(y_true, y_pred):
    return fbeta_score(y_true, y_pred, beta=10)

In [38]:
# evaluate a model
def evaluate_model(X, y, model):
    # define evaluation procedure
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=101)
    # define the model evaluation metric
    metric = make_scorer(f2_measure)
    # evaluate model
    scores = cross_val_score(model, X, y, scoring=metric, cv=cv, n_jobs=-2)
    return scores

In [131]:
def log_model_mlflow(x_train, metric_name, metric, model, model_name):
    with mlflow.start_run():
        params = {
            "nthread":6,
            "n_estimators":10000,
            "learning_rate":0.02,
            "num_leaves":34,
            "colsample_bytree":0.9497036,
            "subsample":0.8715623,
            "max_depth":8,
            "reg_alpha":0.041545473,
            "reg_lambda":0.0735294,
            "min_split_gain":0.0222415,
            "min_child_weight":39.3259775,
            "silent":-1,
            "verbose":-1
        }
        mlflow.log_params(params)

        # Log the loss metric
        mlflow.log_metric(metric_name, metric)
        # mlflow.log_metric("f1_score", metric) # à renommer roc_auc

        # Set a tag that we can use to remind ourselves what this run was for
        mlflow.set_tag("Training Info", model_name)
        # Infer the model signature
        signature = infer_signature(x_train, model.predict(x_train))
        # Log the model
        model_info = mlflow.sklearn.log_model(
            sk_model=model,
            artifact_path=model_name,
            signature=signature,
            registered_model_name=model_name,
        )

In [132]:
def get_model_metrics(model, y_test, y_pred, y_prob):
    metrics = {
        "f2_score": fbeta_score(y_test, y_pred, beta=10),
        "accuracy_score": accuracy_score(y_test, y_pred), # Utilisation de accuracy_score
        "score": model.score(X_test, y_test), # Ou utilisation de la méthode score
        "recall": recall_score(y_true=y_test, y_pred=y_pred),        
        "f1_score": f1_score(y_true=y_test, y_pred=y_pred),
        "auc": roc_auc_score(y_test, y_prob),    
    }

    print(f"La méthode accuracy_score donne: {metrics['accuracy_score']}")
    print(f"La méthode score donne: {metrics['score']}")
    print(f"Le recall est de: {metrics['recall']}")
    print(f"Le F1-score est de: {metrics['f1_score']}")
    print(f"L'AUC est de: {metrics['auc']}")
    print(f"Le F2-score est de: {metrics['f2_score']}")

    return metrics

In [None]:
regressor = ensemble.HistGradientBoostingRegressor()
params = {'learning_rate': [1e-1, 2e-1, 3e-1],
          'max_depth': [2, 4, 5],
          'min_samples_leaf': [30, 31, 32],
          'max_iter': [100, 150]
          }
gsv = model_selection.GridSearchCV(regressor, params, cv=5)
gsv.fit(X_train_scaled, y_train)

##### **Model Dummy**

Métriques

In [61]:
# define the reference model
model = DummyClassifier(strategy='most_frequent')
# evaluate the model
scores = evaluate_model(X_train_scaled, y_train, model)

In [None]:
# summarize performance
print("Calcul du F2 pour le Modèle de Référence strategy most_frequent et sans variable explicative :")
print(f"Score F2 : moyenne = {np.mean(scores):.2%}, écart_type = {np.std(scores):.2%}")

Calcul du F2 pour le Modèle de Référence strategy most_frequent et sans variable explicative :
Score F2 : moyenne = 0.00%, écart_type = 0.00%


In [91]:
np.mean(scores)

np.float64(0.0)

In [94]:
models = []
# models.append(("LogisticRegression",LogisticRegression()))
models.append(("LinearSVC",LinearSVC()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
models.append(("XGBClassifier",XGBClassifier()))
models.append(("LGBM_Classifier",LGBMClassifier()))


In [None]:

results = []
names = []
for name, model in models:
    result = cross_val_score(model, X_train_scaled, y_train,  cv=3)
    names.append(name)
    results.append(result)

for i in range(len(names)):
    print(names[i],results[i].mean())

## **Model Dummy**

In [133]:
# dummy uniform strategy = uniform/ most_frequent / stratified
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train_scaled, y_train)
y_pred = dummy_clf.predict(X_test_scaled)
y_train_pred = dummy_clf.predict(X_train_scaled)

# Prédire les probabilités
y_prob = dummy_clf.predict_proba(X_test_scaled)[:, 1]
y_train_prob = dummy_clf.predict_proba(X_train_scaled)[:, 1]
metrics = get_model_metrics(dummy_clf, y_test, y_pred, y_prob)
metrics

La méthode accuracy_score donne: 0.9192654981409819
La méthode score donne: 0.9192654981409819
Le recall est de: 0.0
Le F1-score est de: 0.0
L'AUC est de: 0.5
Le F2-score est de: 0.0


{'f2_score': np.float64(0.0),
 'accuracy_score': 0.9192654981409819,
 'score': 0.9192654981409819,
 'recall': np.float64(0.0),
 'f1_score': np.float64(0.0),
 'auc': np.float64(0.5)}

In [143]:
y_prob

array([0., 0., 0., ..., 0., 0., 0.])

In [111]:
# Set MLflow Experiment ---------------------------------------------------------
mlflow.set_tracking_uri(uri="http://localhost:8080")
mlflow.set_experiment("credit_score_experiment")
log_model_mlflow(X_train_scaled, "f2_score", metrics['f2_score'], dummy_clf, "DummyClassifier")

Registered model 'DummyClassifier' already exists. Creating a new version of this model...
2024/11/29 00:40:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: DummyClassifier, version 3
Created version '3' of model 'DummyClassifier'.
2024/11/29 00:40:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-roo-216 at: http://localhost:8080/#/experiments/672589219446151488/runs/6da0e5a50c8a49e6a7d70c1d9f1dbfe8.
2024/11/29 00:40:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/672589219446151488.


## Logistic Regression

In [81]:
clf = LogisticRegression(random_state=0).fit(X_train_scaled, y_train)
y_pred__lr = clf.predict(X_test)
y_train_pred__lr = clf.predict(X_train)



## LinearSVC

In [115]:
svc = LinearSVC().fit(X_train_scaled, y_train)
y_pred__svc = svc.predict(X_test_scaled)
y_train_pred__svc = svc.predict(X_train_scaled)

# Prédire les probabilités
y_prob__svc = svc._predict_proba_lr(X_test_scaled)[:, 1]
y_train_prob__svc = svc._predict_proba_lr(X_train_scaled)[:, 1]

In [116]:
metrics_SVC = get_model_metrics(svc, y_test, y_pred__svc, y_prob__svc)
metrics_SVC

La méthode accuracy_score donne: 0.9192546583850931
La méthode score donne: 0.5394079325333594
Le recall est de: 0.0
Le F1-score est de: 0.0
L'AUC est de: 0.7248651927265247
Le F2-score est de: 0.0




{'f2_score': np.float64(0.0),
 'accuracy_score': 0.9192546583850931,
 'score': 0.5394079325333594,
 'recall': np.float64(0.0),
 'f1_score': np.float64(0.0),
 'auc': np.float64(0.7248651927265247)}

In [117]:
log_model_mlflow(X_train_scaled, "f2_score", metrics_SVC['f2_score'], svc, "LinearSVC")

Successfully registered model 'LinearSVC'.
2024/11/29 00:49:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LinearSVC, version 1
Created version '1' of model 'LinearSVC'.
2024/11/29 00:49:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-stork-715 at: http://localhost:8080/#/experiments/672589219446151488/runs/8322d377cda643789463d56c9b3b24bb.
2024/11/29 00:49:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/672589219446151488.


In [135]:
X_train_scaled.shape, y_train.shape

((215254, 40), (215254,))

# Random Forest Classifier

In [None]:
rfc = RandomForestClassifier(random_state=0).fit(X_train_scaled, y_train)
y_pred__rfc = rfc.predict(X_test_scaled)
# y_train_pred__rfc = rfc.predict(X_train_scaled)

# Prédire les probabilités
y_prob__rfc = rfc.predict_proba(X_test_scaled)[:, 1]
# y_train_prob__rfc = rfc.predict_proba(X_train_scaled)[:, 1]
metrics_RFC = get_model_metrics(rfc, y_test, y_pred__rfc, y_prob__rfc)




La méthode accuracy_score donne: 0.919330536676314
La méthode score donne: 0.16770186335403728
Le recall est de: 0.002551020408163265
Le F1-score est de: 0.0050802139037433155
L'AUC est de: 0.7207379113428285
Le F2-score est de: 0.0025764199175116


In [137]:
log_model_mlflow(X_train_scaled, "f2_score", metrics_RFC['f2_score'], rfc, "RandomForestClassifier")

Successfully registered model 'RandomForestClassifier'.
2024/11/29 01:35:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 1
Created version '1' of model 'RandomForestClassifier'.
2024/11/29 01:35:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-stag-410 at: http://localhost:8080/#/experiments/672589219446151488/runs/ed0b32c7e91249188ac07f19c67fd2f4.
2024/11/29 01:35:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/672589219446151488.


In [142]:
len(y_pred__rfc)

92253

In [148]:
rfc_balanced = RandomForestClassifier(random_state=0, class_weight="balanced").fit(X_train_scaled, y_train)
y_pred__rfc_balanced = rfc_balanced.predict(X_test_scaled)
# y_train_pred__rfc = rfc.predict(X_train_scaled)

# Prédire les probabilités
y_prob__rfc_balanced = rfc_balanced.predict_proba(X_test_scaled)[:, 1]
# y_train_prob__rfc = rfc.predict_proba(X_train_scaled)[:, 1]
metrics_RFC_Balanced = get_model_metrics(rfc_balanced, y_test, y_pred__rfc_balanced, y_prob__rfc_balanced)




La méthode accuracy_score donne: 0.9192654981409819
La méthode score donne: 0.9192654981409819
Le recall est de: 0.005773361976369495
Le F1-score est de: 0.011414919033713831
L'AUC est de: 0.7315477090267931
Le F2-score est de: 0.0058304223733564595


In [149]:
log_model_mlflow(X_train_scaled, "f2_score", metrics_RFC_Balanced['f2_score'], rfc_balanced, "RandomForestClassifier")

Registered model 'RandomForestClassifier' already exists. Creating a new version of this model...
2024/11/29 12:54:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: RandomForestClassifier, version 2
Created version '2' of model 'RandomForestClassifier'.
2024/11/29 12:54:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run crawling-calf-746 at: http://localhost:8080/#/experiments/672589219446151488/runs/0423f3b662bd4273adbcf37c265601e3.
2024/11/29 12:54:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/672589219446151488.


# LGBM Classifier

In [None]:
clf = LGBMClassifier(objective= 'binary', class_weight="balanced").fit(X_train_scaled, y_train)
y_pred__clf = clf.predict(X_test_scaled)
y_prob__clf = rfc_balanced.predict_proba(X_test_scaled)
metrics_LGBM = get_model_metrics(clf, y_test, y_pred__clf, y_prob__clf)


[LightGBM] [Info] Number of positive: 17377, number of negative: 197877
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.057043 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9711
[LightGBM] [Info] Number of data points in the train set: 215254, number of used features: 40
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
La méthode accuracy_score donne: 0.7191202454120733
La méthode score donne: 0.7527560079347013
Le recall est de: 0.6821965628356605
Le F1-score est de: 0.28169873038753673
L'AUC est de: 0.7315477090267931
Le F2-score est de: 0.6635166130954998


In [156]:
y_prob__clf = rfc_balanced.predict_proba(X_test_scaled)


In [154]:
log_model_mlflow(X_train_scaled, "f2_score", metrics_LGBM['f2_score'], clf, "LGBMClassifier")

Registered model 'LGBMClassifier' already exists. Creating a new version of this model...
2024/11/29 14:59:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: LGBMClassifier, version 2
Created version '2' of model 'LGBMClassifier'.
2024/11/29 14:59:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run peaceful-bear-131 at: http://localhost:8080/#/experiments/672589219446151488/runs/d0508890b35c445491f65478b927b493.
2024/11/29 14:59:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:8080/#/experiments/672589219446151488.


# Create Model File -- Pickling

In [None]:
import pickle
# Save to file in the current working directory
pkl_filename = "model.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(clf, file)

# Load from file
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)
    
# Calculate the accuracy score and predict target values
f2_score = metrics_LGBM['f2_score']
score = pickle_model.score(X_test_scaled, y_test)
print("score : {0:.2f} %".format(100 * score))
print("F2 score : {0:.2f} %".format(100 * f2_score))

Y_pred = pickle_model.predict(X_test_scaled)
Y_proba = pickle_model.predict_proba(X_test_scaled)
print(Y_pred)
print(Y_proba)

y_pred_df = pd.DataFrame(Y_pred, columns=['y_pred_test'])
y_pred_proba_df = pd.DataFrame(Y_proba, columns=['proba_classe_0', 'proba_classe_1'])

# Récupération du score du client
y_pred_proba_df = pd.DataFrame(y_pred_proba_df, columns=['proba_classe_0', 'proba_classe_1'])
y_pred_proba_df = pd.concat([y_pred_proba_df['proba_classe_1'], X_test['SK_ID_CURR']], axis=1)
# score = y_pred_df[y_pred_df['SK_ID_CURR']==ID_client]
# score_value = round(score.proba_classe_1.iloc[0]*100, 2)


# ================================================================
# score = pickle_model.score(x_test, y_test)
# print("Test score: {0:.2f} %".format(100 * score))
# Ypredict = pickle_model.predict(x_test)

# ##loading the model from the saved file
# pkl_filename = "model.pkl"
# with open(pkl_filename, 'rb') as f_in:
#     model = pickle.load(f_in)

# predictValue = predict_mpg(config, model)
# predictValue


# y_pred_lgbm = model_LGBM.predict(lecture_X_test_clean().drop(labels="sk_id_curr", axis=1))    # Prédiction de la classe 0 ou 1
# y_pred_lgbm_proba = model_LGBM.predict_proba(lecture_X_test_clean().drop(labels="sk_id_curr", axis=1)) # Prédiction du % de risque

# # Récupération du score du client
# y_pred_lgbm_proba_df = pd.DataFrame(y_pred_lgbm_proba, columns=['proba_classe_0', 'proba_classe_1'])
# y_pred_lgbm_proba_df = pd.concat([y_pred_lgbm_proba_df['proba_classe_1'],
#                                 lecture_X_test_clean()['sk_id_curr']], axis=1)
# #st.dataframe(y_pred_lgbm_proba_df)
# score = y_pred_lgbm_proba_df[y_pred_lgbm_proba_df['sk_id_curr']==ID_client]
# score_value = round(score.proba_classe_1.iloc[0]*100, 2)

score : 71.91 %
F2 score : 66.35 %
[0. 1. 0. ... 0. 0. 1.]
[[0.59252341 0.40747659]
 [0.48312982 0.51687018]
 [0.75870168 0.24129832]
 ...
 [0.82041659 0.17958341]
 [0.9094301  0.0905699 ]
 [0.1289471  0.8710529 ]]


In [168]:
y_pred_proba_df

Unnamed: 0,proba_classe_0,proba_classe_1
0,0.592523,0.407477
1,0.483130,0.516870
2,0.758702,0.241298
3,0.407403,0.592597
4,0.782169,0.217831
...,...,...
92248,0.945559,0.054441
92249,0.280925,0.719075
92250,0.820417,0.179583
92251,0.909430,0.090570


DF PROCESSED MANUALLY

In [118]:
df = pd.read_csv('input/df_final_manual_feature_engineering.csv')
df.shape

(307511, 167)

In [119]:
Y = df['TARGET']
df.drop(labels='TARGET', axis=1, inplace=True)

In [120]:
X_train, X_test, y_train, y_test = train_test_split(df, Y, stratify=Y, test_size=0.3, random_state=101)
X_train.shape, X_test.shape
scaler = MinMaxScaler()
scaler.fit(df.drop(labels=['SK_ID_CURR'], axis=1))
X_train_scaled = scaler.transform(X_train.drop(labels='SK_ID_CURR', axis=1))
X_test_scaled = scaler.transform(X_test.drop(labels='SK_ID_CURR', axis=1))

In [121]:
# define models to test
def get_models():
    models, names = list(), list()
    # LR
    models.append(LogisticRegression())
    names.append('Logistic_Regression')
    # SVM
    models.append(LinearSVC())
    names.append('Linear_SVC')
    # Random Forest
    models.append(RandomForestClassifier())
    names.append('Random_Forest_Classifier')
    # XGB
    models.append(XGBClassifier())
    names.append('XGB_Classifier') 
    # LGBM
    models.append(LGBMClassifier())
    names.append('LGBM_Classifier')
    return models, names

In [122]:
import time

# define models
models, names = get_models()
results = list()

# evaluate each model
for i in range(len(models)):
    temps = time.time()
    # wrap the model in a pipeline
    pipeline = Pipeline(steps=[('m', models[i])])
    scores = evaluate_model(X_train_scaled, y_train, pipeline)
    results.append(scores)
    duration=time.time()-temps
    # summarize and store
    print(f"Score F2 du modèle {names[i]} : moyenne = {np.mean(scores):.2%}, écart_type = {np.std(scores):.2%}")
    print(f"Temps de Calcul pour {names[i]} : {duration:>15.2f} secondes")
    print(end='\n\n')

Score F2 du modèle Logistic_Regression : moyenne = 1.80%, écart_type = 0.32%
Temps de Calcul pour Logistic_Regression :          200.99 secondes




KeyboardInterrupt: 