### Partie modélisation:

* Importation du dataframe preprocessed
* Séparation des données entre Train Data & Test Data
* Équilibrage des données avec SMOTE (Train Data uniquement)
* Choix du modèle de classification optimal et optimisation de celui-ci à l'aide de métriques techniques et métier.
* Interprétabilité des résultats de prédiction avec SHAP.
* Sérialisation avec Joblib, Pickle et Dill.

In [None]:
# Import des bibliothèques nécessaires:
import numpy as np
import pandas as pd
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, KFold, LeaveOneOut, StratifiedKFold, GridSearchCV

from sklearn.model_selection import train_test_split

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot
from numpy import where

from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import warnings
warnings.filterwarnings('ignore')

In [None]:
@contextmanager
def timer(title):
    """Context manager that prints how long the wrapped block took.

    On normal exit of the ``with`` body, prints
    ``"<title> - done in <seconds>s"`` with the elapsed wall-clock time
    rounded to whole seconds. If the body raises, the exception propagates
    and nothing is printed.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print(f"{title} - done in {elapsed:.0f}s")

In [None]:
# Population variance helper.
def variance(data):
    """Return the population variance of *data* (sum of squared deviations / n).

    *data* must support ``len()`` and iteration. An empty input raises
    ``ZeroDivisionError`` because the mean is computed as ``sum(data) / n``.
    """
    count = len(data)
    center = sum(data) / count
    # Squared distance of every observation from the mean.
    squared_gaps = [(value - center) ** 2 for value in data]
    return sum(squared_gaps) / count

### Chargement du dataframe preprocessed:

In [None]:
# Load the preprocessed dataframe from 'df.csv' (path relative to the working directory).
df= pd.read_csv('df.csv')

In [None]:
# Use the 'sk_id_curr' column as the row index (modifies df in place).
df.set_index('sk_id_curr',inplace=True)

In [None]:
# Preview the first rows of the loaded dataframe.
df.head()

In [None]:
# Print the dataframe summary (columns, dtypes, memory usage).
df.info()

### Séparation des données:

In [None]:
# Separate the label from the features, then hold out 40% of the rows as a
# stratified, shuffled test set (fixed seed).
y = df['target']
X = df.drop(columns=['target'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Drop the name bound to the full dataframe and run a garbage-collection pass.
del df
gc.collect()

In [None]:
# A 20-row sample of X_test is exported for later use by the API part.
# The seed is fixed so that the exported CSV, the user-ID list and the SHAP
# artefacts computed from this sample stay identical across notebook runs.
X_test_sample = X_test.sample(n=20, random_state=42)
X_test_sample.to_csv('X_test_sample.csv')

In [None]:
# Print the summary of the exported sample.
X_test_sample.info()

In [None]:
# List of the index values of the sample (presumably the user IDs, since the
# index was set from 'sk_id_curr').
user_id_list= X_test_sample.index.to_list()

In [None]:
# Show the sampled user IDs converted to plain Python ints.
[int(user_id) for user_id in user_id_list]

In [None]:
# Class balance of the test labels: count per class and the 0-to-1 ratio.
test_target_count = y_test.value_counts()
print(f'Class 0: {test_target_count.loc[0]}')
print(f'Class 1: {test_target_count.loc[1]}')
print(f'Proportion: {round(test_target_count.loc[0] / test_target_count.loc[1], 2)} : 1')

In [None]:
# Class balance of the training labels: count per class and the 0-to-1 ratio.
train_target_count = y_train.value_counts()
print(f'Class 0: {train_target_count.loc[0]}')
print(f'Class 1: {train_target_count.loc[1]}')
print(f'Proportion: {round(train_target_count.loc[0] / train_target_count.loc[1], 2)} : 1')

### SMOTE for unbalanced class: training set only

In [None]:
# Oversample the minority class of the training set with SMOTE and report the
# class distribution before and after.
print("class distribution before SMOTE:")
counter = Counter(y_train)
print(counter)
print('\n')

# Resample the training data only; the test set is left untouched.
# The seed is fixed so the synthetic samples (and every model fitted on
# X_train_s / y_train_s) are reproducible across runs.
oversample = SMOTE(random_state=42)
X_train_s, y_train_s = oversample.fit_resample(X_train, y_train)

# Class distribution after resampling.
print("class distribution after SMOTE:")
counter = Counter(y_train_s)
print(counter)

In [None]:
# Print the summary of the resampled training features.
X_train_s.info()

### Comparaison des modèles de classification avec cross_val_score:

#### LightGBM classifier:

In [None]:
# LightGBM classifier, compared through 4-fold cross-validated ROC AUC.
#
# SMOTE is placed inside an imblearn Pipeline so that it is fitted on the
# training part of each fold only and every validation fold contains real
# observations exclusively. Cross-validating directly on the pre-resampled
# X_train_s / y_train_s lets synthetic points interpolated from training rows
# end up in the validation folds, which inflates the score.
# NOTE(review): the figures recorded elsewhere in this notebook for this model
# (61s; fold scores 0.88 / ~0.9999) predate this change.
from imblearn.pipeline import Pipeline

clf_lgbm = LGBMClassifier()
cv_lgbm = Pipeline([("smote", SMOTE(random_state=42)), ("clf", clf_lgbm)])
with timer(" process cross_val_score for lgbm"):
    scores_lgbm = cross_val_score(cv_lgbm, X_train, y_train, cv=4, scoring='roc_auc')

In [None]:
# Per-fold scores, their mean, and their population variance.
# The trailing comments are values recorded from a past run.
print(scores_lgbm) # [0.88162016 0.99997028 0.99998038 0.99996053]
print(scores_lgbm.mean()) # 0.9703828383557017
print(variance(scores_lgbm)) # 0.0026262712086987383

#### Gradient Boosting Classifier: 

In [None]:
from sklearn.ensemble import GradientBoostingClassifier 

# Experiment log: three configurations of GradientBoostingClassifier.
# The assignments overwrite one another, so only the last one (default
# parameters) is bound to clf_xgb when this cell finishes.
# The commented figures after each assignment are recorded results of past
# runs (presumably timing, per-fold ROC AUC, mean and variance — the order
# printed by the cross-validation cell).
# NOTE(review): despite its name, clf_xgb is a scikit-learn gradient-boosting
# model, not an XGBoost one.

# Variant 1: max_features='log2'
clf_xgb= GradientBoostingClassifier(max_features= 'log2')
#process cross_val_score for xgb - done in 342s
#[0.87237321 0.99936311 0.99935889 0.9992222 ]
#0.9675793521462993
#0.0030214061820368976

# Variant 2: max_features='sqrt'
clf_xgb= GradientBoostingClassifier(max_features= 'sqrt')
# process cross_val_score for xgb - done in 626s
#[0.87243582 0.99948681 0.99961232 0.99966911]
#0.9678010162024336
#0.003031511055784114

# Variant 3 (the one left in effect): default parameters
clf_xgb= GradientBoostingClassifier()
#process cross_val_score for xgb - done in 5706s (approx. 96 min)
#[0.87529565 0.99992271 0.99993919 0.99994977]
#0.9687768301317181
#0.0029129103424552654

In [None]:
# 4-fold cross-validated ROC AUC for the gradient-boosting model bound to
# clf_xgb, followed by the per-fold scores, their mean and their variance.
#
# SMOTE runs inside an imblearn Pipeline so it is fitted on the training part
# of each fold only; validation folds then contain real observations only.
# Cross-validating on the pre-resampled X_train_s / y_train_s leaks synthetic
# points derived from training rows into the validation folds.
from imblearn.pipeline import Pipeline

cv_xgb = Pipeline([("smote", SMOTE(random_state=42)), ("clf", clf_xgb)])
with timer(" process cross_val_score for xgb"):
    scores_xgb = cross_val_score(cv_xgb, X_train, y_train, cv=4, scoring='roc_auc')
print(scores_xgb)
print(scores_xgb.mean())
print(variance(scores_xgb))

#### Random Forest Classifier:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default parameters, used in the model comparison.
clf_rf = RandomForestClassifier()

In [None]:
# 4-fold cross-validated ROC AUC for the random forest, followed by the
# per-fold scores, their mean and their variance.
#
# SMOTE runs inside an imblearn Pipeline so it is fitted on the training part
# of each fold only; validation folds then contain real observations only.
# Cross-validating on the pre-resampled X_train_s / y_train_s leaks synthetic
# points derived from training rows into the validation folds.
# NOTE(review): figures recorded before this change, on the pre-resampled
# data: 929s; [0.93448035 0.99999919 0.9999995 0.99999898]; mean
# 0.9836195039754854; variance 0.0008.
from imblearn.pipeline import Pipeline

cv_rf = Pipeline([("smote", SMOTE(random_state=42)), ("clf", clf_rf)])
with timer(" process cross_val_score for rf"):
    scores_rf = cross_val_score(cv_rf, X_train, y_train, cv=4, scoring='roc_auc')
print(scores_rf)
print(scores_rf.mean())
print(variance(scores_rf))

* On choisit le LightGBM classifier qui se démarque par sa durée de traitement.

### Optimisation des hyperparamètres du modèle LightGBM classifier:

In [None]:
# Optimisation avec hyperopt
from hyperopt import fmin, tpe, hp, anneal, Trials
from random import *
from sklearn.metrics import fbeta_score

    
def lgbm_cv(params):
    """Hyperopt objective: negated mean 3-fold ROC AUC of an LGBMClassifier.

    Parameters
    ----------
    params : dict
        Point sampled from the search space, with keys ``'n_estimators'``,
        ``'max_depth'`` and ``'learning_rate'``. The first two are cast to
        ``int`` before being passed to the classifier.

    Returns
    -------
    float
        ``-mean(ROC AUC)`` over the folds; negated because the optimiser
        minimises the objective.

    Notes
    -----
    Reads the module-level ``X_train`` / ``y_train``. SMOTE is applied inside
    an imblearn Pipeline so that it is fitted on the training part of each
    fold only; scoring on the pre-resampled ``X_train_s`` / ``y_train_s``
    would put synthetic points derived from training rows into the
    validation folds and bias the search towards over-fitted settings.
    """
    # Function-scope import keeps this cell self-contained.
    from imblearn.pipeline import Pipeline

    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
    }

    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', LGBMClassifier(**model_params)),
    ])

    fold_auc = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='roc_auc')
    return -fold_auc.mean()

In [None]:
%%time

from random import *

# Trials object that collects the log of every evaluation.
trials = Trials()

# Search space: candidate values for each tuned hyperparameter.
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

# Minimise lgbm_cv over the space with the TPE suggestion algorithm,
# for at most 50 evaluations.
best = fmin(
    fn=lgbm_cv,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)
   

In [None]:
# Display the best hyperparameters found by the search above.
best

In [None]:
# Instantiate the classifier with fixed hyperparameters; nothing is fitted or
# scored in this cell.
# NOTE(review): the values are presumably copied from an earlier `best`
# result — confirm they match the run you want to reproduce.
lgbm = LGBMClassifier( n_estimators= 1082,learning_rate= 0.007939102526359921,max_depth= 20)

In [None]:
# Fit the classifier on the SMOTE-resampled training set.
# The trailing comment records the timing of a past run.
with timer(" process fit for lgbm"): #  process fit for lgbm - done in 154s
        lgbm.fit(X_train_s,y_train_s)

In [None]:
# Predict class labels for the held-out test set.
pred_lgbm= lgbm.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class report of the test predictions, with readable class labels.
target_names = ['Class 0', 'Class 1']
print(classification_report(y_test, pred_lgbm, target_names=target_names))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions (true labels first, predictions second).
confusion_matrix(y_test, pred_lgbm)

### Optimisation des hyperparamètres avec un scoring métier:

#### Application de hyperopt:

In [None]:
from hyperopt import fmin, tpe, hp, anneal, Trials
from random import *
from sklearn.metrics import fbeta_score, make_scorer

    
def lgbm_cv(params):
    """Hyperopt objective: negated mean 3-fold F-beta (beta=3) of an LGBMClassifier.

    Parameters
    ----------
    params : dict
        Point sampled from the search space, with keys ``'n_estimators'``,
        ``'max_depth'`` and ``'learning_rate'``. The first two are cast to
        ``int`` before being passed to the classifier.

    Returns
    -------
    float
        ``-mean(F-beta)`` over the folds; negated because the optimiser
        minimises the objective.

    Notes
    -----
    Reads the module-level ``X_train`` / ``y_train``. SMOTE is applied inside
    an imblearn Pipeline so that it is fitted on the training part of each
    fold only; scoring on the pre-resampled ``X_train_s`` / ``y_train_s``
    would put synthetic points derived from training rows into the
    validation folds and bias the search towards over-fitted settings.
    """
    # Function-scope import keeps this cell self-contained.
    from imblearn.pipeline import Pipeline

    model_params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'learning_rate': params['learning_rate'],
    }

    pipeline = Pipeline([
        ('smote', SMOTE(random_state=42)),
        ('clf', LGBMClassifier(**model_params)),
    ])

    # beta=3 weights recall more heavily than precision.
    fbeta_scorer = make_scorer(fbeta_score, beta=3)
    fold_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring=fbeta_scorer)
    return -fold_scores.mean()

In [None]:
%%time

from random import *

# Trials object that collects the log of every evaluation.
trials = Trials()

# Search space: candidate values for each tuned hyperparameter.
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 2000, 1),
    'max_depth': hp.quniform('max_depth', 2, 20, 1),
    'learning_rate': hp.loguniform('learning_rate', -5, 0),
}

# Minimise lgbm_cv over the space with the TPE suggestion algorithm,
# for at most 50 evaluations.
best = fmin(
    fn=lgbm_cv,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
)
                       
    

In [None]:
# Display the best hyperparameters found by the search above.
best

In [None]:
# Instantiate the classifier with fixed hyperparameters; nothing is fitted or
# scored in this cell.
# NOTE(review): the values are presumably copied from an earlier `best`
# result of the F-beta search — confirm.
lgbm = LGBMClassifier( n_estimators= 1754,learning_rate= 0.012662637532738413,max_depth= 5)

In [None]:
# Fit the classifier on the SMOTE-resampled training set.
# The trailing comment records the timing of a past run.
with timer(" process fit for lgbm"): #  process fit for lgbm - done in 146s
        lgbm.fit(X_train_s,y_train_s)

In [None]:
# Predict class labels for the held-out test set.
pred_lgbm= lgbm.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# Per-class report of the test predictions.
# The labels match the ones used for the earlier report in this notebook
# (the second label previously carried a stray leading space), so the two
# reports can be compared line by line.
target_names = ['Class 0', 'Class 1']
print(classification_report(y_test, pred_lgbm, target_names=target_names))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the test predictions, with the label order fixed to [0, 1].
confusion_matrix(y_test, pred_lgbm,labels= [0,1])

* Amélioration du score du Recall après optimisation métier (FN passe de 9793 à 9647)

### Features importance

In [None]:
import pandas as pd
%matplotlib inline
#do code to support model
#"data" is the X dataframe and model is the SKlearn object

# Map every training column to the importance reported by the fitted model.
feats = dict(zip(X_train_s.columns, lgbm.feature_importances_))

# One row per feature, with a single 'importance' column.
importances = pd.DataFrame.from_dict(feats, orient='index')
importances = importances.rename(columns={0: 'importance'})
#importances.sort_values(by='importance').plot(kind='bar', rot=45)

In [None]:
# Bar chart of the 20 features with the highest importance.
importances.sort_values(by='importance',ascending= False).head(20).plot(kind='bar', rot=90)

In [None]:
# Table of the 20 features with the highest importance.
importances.sort_values(by='importance', ascending= False).head(20)

### Features importance with SHAP

In [None]:
import shap
# Load SHAP's JavaScript support into the notebook (needed for the interactive plots).
shap.initjs()

In [None]:
# Build a SHAP TreeExplainer around the fitted LightGBM model.
explainer = shap.TreeExplainer(lgbm)

In [None]:
# SHAP values for the 20-row sample exported for the API.
shap_values = explainer.shap_values(X_test_sample)

In [None]:
# Visualize a single prediction: force plot for the first row of the sample.
# NOTE(review): indexing expected_value[1] and shap_values[1] assumes the
# explainer returns one entry per class (index 1 presumably being the positive
# class); this layout depends on the installed shap version — confirm.
shap.force_plot(explainer.expected_value[1], shap_values[1][0,:], X_test_sample.iloc[0,:])

In [None]:
# Force plot over every row of the sample, using the same class-1 entries as
# the single-row plot.
# NOTE(review): same per-class indexing assumption as above — confirm against
# the installed shap version.
shap.force_plot(explainer.expected_value[1], shap_values[1], X_test_sample)

In [None]:
# SHAP summary plot over the sample.
# NOTE(review): the whole shap_values object is passed here, not the
# shap_values[1] entry used by the force plots — confirm this is intended.
shap.summary_plot(shap_values, X_test_sample)

### Sauvegarde de explainer & shap_values pour le dashboard:

In [None]:
import dill 
# Serialize the explainer to 'explainer.pkl' with dill.
with open('explainer.pkl', 'wb') as file:
    dill.dump(explainer, file)

In [None]:
# Recompute the SHAP values for the sample before saving them.
# NOTE(review): this repeats the computation already done in an earlier cell
# with the same explainer and sample.
shap_values = explainer.shap_values(X_test_sample)

In [None]:
# Serialize the SHAP values to 'shap_values.pkl' with dill.
with open('shap_values.pkl', 'wb') as file:
    dill.dump(shap_values, file)

### Sérialisation du modèle avec Joblib:

In [None]:
import joblib

In [None]:
# Serialize the fitted model to 'lgbm.joblib'.
joblib.dump(lgbm, 'lgbm.joblib')

### Sérialisation du modèle avec pickle:


In [None]:
# Serialize the fitted model to 'lgbm.pkl' with pickle.
# The context manager closes the file even if pickle.dump raises, so a
# failed dump no longer leaves an open handle behind.
import pickle
with open("lgbm.pkl", mode="wb") as pickle_out:
    pickle.dump(lgbm, pickle_out)