# ************ NOTEBOOK 3 :  MACHINE LEARNING*********** 

Dans ce notebook, nous allons essayer d'avoir un aperçu de notre modèle à travers le package shap. Nous allons comprendre et expliquer les variables crédibles sélectionnées.

Le graphique le plus important est le graphique récapitulatif (plus bas dans ce notebook), qui montre les 20 variables indépendantes les plus importantes. Pour chaque variable, une distribution montre comment les échantillons du jeu de validation (valid_x) influencent la sortie du modèle. Plus les points sont rouges, plus la valeur de la feature est élevée ; plus ils sont bleus, plus la valeur de la feature est faible.

Dans ce cas, EXT_SOURCE_2 est la variable qui a le plus d'impact sur la sortie du modèle. Les échantillons du jeu de validation avec une faible valeur d'EXT_SOURCE_2 ont une probabilité plus élevée d'obtenir un prêt. Si le client a une valeur EXT_SOURCE_2 élevée, la probabilité d'obtenir un prêt est faible. D'après la zone rouge à droite, on voit que beaucoup de clients sont dans ce cas.


# 1 Importation des librairies et jeux de données

In [None]:
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python
import matplotlib.pyplot as plt # for plotting
import seaborn as sns # for making plots with seaborn
color = sns.color_palette()
import time, pickle
#Preprocessing, Upsampling, Model Selection, Model Evaluation
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score, precision_score, recall_score 
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
from sklearn.model_selection import cross_val_predict, cross_val_score, learning_curve, cross_validate
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_row',250)
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore")

# 2_Importation de la librairie lightgbm

In [None]:
#Predictive Models
from lightgbm import LGBMClassifier


 # 3_Fonctions utiles

In [None]:
def cf_matrix_roc_auc(model, y_true, y_pred, y_pred_proba):
    """Plot an annotated confusion-matrix heatmap next to the ROC curve.

    Parameters
    ----------
    model
        Not used by the function; kept so existing calls keep working.
    y_true
        True labels. The 2x2 annotation grid assumes a binary problem.
    y_pred
        Predicted class labels, compared with ``y_true`` in the confusion matrix.
    y_pred_proba
        Scores of the positive class, used for the ROC curve and its AUC.

    Returns
    -------
    None
        The plots are drawn on a new 20x15 matplotlib figure.
    """
    plt.figure(figsize=(20,15))

    # Left panel: each cell shows its name, its raw count and its share of all samples.
    plt.subplot(221)
    cf_matrix = confusion_matrix(y_true, y_pred)
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = ["{0:0.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]

    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names,group_counts,group_percentages)]
    labels = np.asarray(labels).reshape(2,2)
    sns.heatmap(cf_matrix, annot=labels, fmt="", cmap='Blues')

    # Right panel: ROC curve with the chance diagonal as reference.
    plt.subplot(222)
    fpr,tpr,_ = roc_curve(y_true, y_pred_proba)
    # Compute the AUC from the arguments instead of reading a global `roc_auc`,
    # so the function works regardless of which notebook cells ran before it.
    auc_value = roc_auc_score(y_true, y_pred_proba)
    plt.plot(fpr, tpr, color='orange', linewidth=5, label='AUC = %0.4f' % auc_value)
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()

# 4_Importation du jeu de données avec les features les plus crédibles.

In [None]:
# Load the feature table from X_data.csv and use the SK_ID_CURR column as the row index.
data=pd.read_csv('X_data.csv').set_index('SK_ID_CURR')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Features: every column of the table except the label column.
X = data.drop(columns=['TARGET'])
# Label: the TARGET column on its own.
Y = data.loc[:, 'TARGET']

In [None]:
# Report the shapes of the feature matrix and of the label vector.
print("Data Size : ", X.shape, Y.shape)

In [None]:
#Create train and validation set

In [None]:
# 90 % / 10 % split, stratified on Y so both parts keep the same label proportions;
# random_state=123 makes the split reproducible.
train_x, valid_x,train_y,valid_y = train_test_split(X, Y, train_size=0.90, test_size=0.1, stratify=Y, random_state=123)

In [None]:
# Report the shapes of the four arrays produced by the split.
print("Train/Valid Sizes : ", train_x.shape, valid_x.shape,train_y.shape,valid_y.shape)

# 5_Chargement du Modèle 

In [None]:
# Load the pickled classifier. The context manager closes the file even when
# unpickling raises, which the manual open()/close() pair did not guarantee.
# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
with open("finalized_model_auc.pkl", 'rb') as file:
    lgbm_clf_auc = pickle.load(file)

In [None]:
# Show the textual representation of the loaded model.
print(lgbm_clf_auc)

# 6_Génération des probabilités

In [None]:
# Probability predictions of the loaded model on the validation features.
# The evaluation cells below call predict_proba again and read column 1 of the result.
Y_pred=lgbm_clf_auc.predict_proba(valid_x)

# 7_ Evaluation de Performance

## 7_1 Confusion Matrix et Graphique Roc_Auc 

In [None]:
# Score of column 1 of predict_proba for every validation row.
valid_scores = lgbm_clf_auc.predict_proba(valid_x)[:, 1]
roc_auc = roc_auc_score(valid_y, valid_scores)
print('AUC : %0.4f' % roc_auc)

# Per-class precision / recall / F1 computed from the predicted labels.
valid_labels_pred = lgbm_clf_auc.predict(valid_x)
print(classification_report(valid_y, valid_labels_pred))

In [None]:
# Predicted labels feed the confusion matrix; column-1 scores feed the ROC curve.
predicted_labels = lgbm_clf_auc.predict(valid_x)
positive_scores = lgbm_clf_auc.predict_proba(valid_x)[:, 1]
cf_matrix_roc_auc(lgbm_clf_auc, valid_y, predicted_labels, positive_scores)

# 8_ Interprétabilité du modèle de prédiction avec SHAP

## 8_1 Importation du package SHAP

In [None]:
import shap


## 8_2 Création de shapley values

In [None]:
# Build a tree explainer for the loaded model, then compute the SHAP values
# of every validation row.
tree_explainer = shap.TreeExplainer(lgbm_clf_auc)
shap_values = tree_explainer.shap_values(valid_x)

## 8_3 Graphiques d'interprétabilité

## 8_4 Graphique global

In [None]:
# Global summary plot of the SHAP values computed on the validation features.
shap.summary_plot(shap_values, valid_x)

# 9_Graphiques locaux avec dépendances

In [None]:
# Dependence plot for EXT_SOURCE_2 using element 0 of shap_values.
# NOTE(review): presumably element 0 holds the SHAP values of class 0 — confirm with the installed shap version.
shap.dependence_plot("EXT_SOURCE_2", shap_values[0], valid_x)

In [None]:
# Dependence plot for EXT_SOURCE_2 using element 1 of shap_values.
# NOTE(review): presumably element 1 holds the SHAP values of class 1 — confirm with the installed shap version.
shap.dependence_plot("EXT_SOURCE_2", shap_values[1], valid_x)

In [None]:
# Dependence plot for CODE_GENDER_M using element 0 of shap_values.
# NOTE(review): presumably element 0 holds the SHAP values of class 0 — confirm with the installed shap version.
shap.dependence_plot("CODE_GENDER_M", shap_values[0], valid_x)

In [None]:
# Dependence plot for CODE_GENDER_M using element 1 of shap_values.
# NOTE(review): presumably element 1 holds the SHAP values of class 1 — confirm with the installed shap version.
shap.dependence_plot("CODE_GENDER_M", shap_values[1], valid_x)

In [None]:
# Dependence plot for Age using element 0 of shap_values.
# NOTE(review): presumably element 0 holds the SHAP values of class 0 — confirm with the installed shap version.
shap.dependence_plot("Age", shap_values[0], valid_x)

In [None]:
# Dependence plot for Age using element 1 of shap_values.
# NOTE(review): presumably element 1 holds the SHAP values of class 1 — confirm with the installed shap version.
shap.dependence_plot("Age", shap_values[1], valid_x)

In [None]:
# Dependence plot for NAME_FAMILY_STATUS_Married using element 1 of shap_values.
# NOTE(review): presumably element 1 holds the SHAP values of class 1 — confirm with the installed shap version.
shap.dependence_plot("NAME_FAMILY_STATUS_Married", shap_values[1], valid_x)

In [None]:
# Dependence plot for NAME_FAMILY_STATUS_Married using element 0 of shap_values.
# NOTE(review): presumably element 0 holds the SHAP values of class 0 — confirm with the installed shap version.
shap.dependence_plot("NAME_FAMILY_STATUS_Married", shap_values[0], valid_x)

# 10 Conception de la mise à jour de la description des variables à implémenter dans l'application Scoring Client

In [None]:
# Creating the dataframe
data = pd.read_csv("X_data.csv").rename(columns={'Age':'DAYS_BIRTH'})
data

In [None]:
# Draw a reproducible sample of at most 10000 rows: the fixed seed keeps the exported
# file stable across runs, and capping n avoids the ValueError that sample() raises
# when the frame holds fewer rows than requested.
ds_sample=data.sample(n=min(10000, len(data)), random_state=123)
ds_sample

In [None]:
# sort by index labels
data_ski=ds_sample.set_index('SK_ID_CURR').sort_index(axis = 0)

# sorting based on column labels
data_skil=data_ski.sort_index(axis=1)

data_skil.to_csv('data.skill.csv',index=True)
data=pd.read_csv('data.skill.csv')

In [None]:
# One-column frame ('Row') listing the column names of `data`.
appli_descriptif=pd.DataFrame(data.columns.values,columns=['Row'])
# Displayed sorted by name; the sorted result is not stored.
appli_descriptif.sort_values(by=['Row'])

In [None]:
# Read only the Row and Description columns of the column-description file
# (decoded as latin).
Old_Descriptif = (pd.read_csv('HomeCredit_columns_description.csv', usecols = ['Row','Description'], encoding='latin'))
Old_Descriptif

In [None]:
# Keep the columns of `data` whose name appears verbatim in the description file,
# drop exact duplicate rows, then keep the last description for each name and renumber the rows.
Matching_Descriptif = (
    appli_descriptif.merge(Old_Descriptif, on="Row", how='inner')
    .drop_duplicates()
    .drop_duplicates(subset='Row', keep="last")
    .reset_index(drop=True)
)
Matching_Descriptif

In [None]:
# Each table is held as two parallel lists: element 0 holds the variable names,
# element 1 the matching descriptions.
list_old_descriptif = [list(Old_Descriptif['Row']),list(Old_Descriptif['Description'])]
list_new_descriptif= [list(Matching_Descriptif['Row']),list(Matching_Descriptif['Description'])]
list_new_row= list(data.columns.values)

# Names that already have a description. The set is kept in sync with
# list_new_descriptif[0] so membership tests do not rescan the list every time.
described = set(list_new_descriptif[0])

# Pass 1: a column of `data` that has no description yet inherits the description
# of the first original variable whose name is a substring of the column name.
# The substring test is deliberately loose (e.g. a prefix shared by several columns matches all of them).
for old_name, old_description in zip(list_old_descriptif[0], list_old_descriptif[1]):
    if old_name in described:
        continue
    for column in list_new_row:
        if old_name in column and column not in described:
            list_new_descriptif[0].append(column)
            list_new_descriptif[1].append(old_description)
            described.add(column)

# Pass 2: every column still without a description gets the "NULL" placeholder.
for column in list_new_row:
    if column not in described:
        list_new_descriptif[0].append(column)
        list_new_descriptif[1].append("NULL")
        described.add(column)
        

In [None]:
# Turn the two parallel lists into a two-column table (Variable, Description),
# ordered by variable name and renumbered from 0.
appli_var_descriptif = (
    pd.DataFrame(list_new_descriptif)
    .transpose()
    .rename(columns={0: "Variable", 1: "Description"})
    .sort_values(by=['Variable'])
    .reset_index(drop=True)
)
appli_var_descriptif

In [None]:
# Manually set the description of the row labelled 31 in the name-sorted table.
# NOTE(review): the label depends on the current column set and sort order — verify it still targets the intended variable.
appli_var_descriptif.loc[31,'Description']="ratio of credit amount to total customer revenue "

In [None]:
# Manually set the description of the row labelled 36 in the name-sorted table.
# NOTE(review): the label depends on the current column set and sort order — verify it still targets the intended variable.
appli_var_descriptif.loc[36,'Description']="ratio of the total amount of debits in credits to the sum of credits"

In [None]:
# Manually set the description of the row labelled 72 in the name-sorted table.
# NOTE(review): the label depends on the current column set and sort order — verify it still targets the intended variable.
appli_var_descriptif.loc[72,'Description']="ratio of the total customer overdue to the total customer debt"

In [None]:
# Manually set the description of the row labelled 73 in the name-sorted table.
# NOTE(review): the label depends on the current column set and sort order — verify it still targets the intended variable.
appli_var_descriptif.loc[73,'Description']="Previous amount of credit"

In [None]:
# Manually set the description of the row labelled 75 in the name-sorted table.
# NOTE(review): the label depends on the current column set and sort order — verify it still targets the intended variable.
appli_var_descriptif.loc[75,'Description']="Previous applicant count"

In [None]:
# Display the description table after the manual corrections.
appli_var_descriptif

In [None]:
# Export the description table to appli_descriptif.csv without the row index.
appli_var_descriptif.to_csv('appli_descriptif.csv',index=False)