# Prediction d'éligibilité à un crédit
## Partie Modélisation

Deux options sont possibles pour les données sur lesquelles on entraîne le modèle : \
    - soit mon propre feature engineering : dans ce cas, on commente la ligne `df = main()` ; \
    - soit un feature engineering issu de Kaggle : dans ce cas, on conserve la ligne `df = main()`.

####  Chargement de fonctions standards

In [None]:
from srfunctions import *
import os

In [None]:
import numpy as np

#### Chargement de fonctions spécifiques

In [None]:
from sklearn.dummy import DummyClassifier

In [None]:
import imblearn
print(imblearn.__version__)

In [None]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
oversample = SMOTE()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [None]:
import gc
import time
from contextlib import contextmanager
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
import itertools
import joblib
from tqdm import tqdm
from sklearn.metrics import plot_confusion_matrix

In [None]:
import lime
import random
import warnings
warnings.filterwarnings("ignore")
from lime import lime_tabular
import shap

In [None]:
from tqdm.notebook import tqdm_notebook
from tqdm.notebook import trange, tqdm
import time
from sklearn.model_selection import KFold

In [None]:
from sklearn.metrics import roc_auc_score

#### Chargement du df issu de mon Feature Engineering

In [None]:
# Load the table produced by the in-house feature engineering and work on a copy,
# so that `apptrain` keeps the data exactly as read from disk.
cleanpath = "data\\cleaned\\"
apptrain=pd.read_csv(cleanpath+"ApptrainSaved1.csv")
# Not the last expression of the cell, so this preview is not displayed.
apptrain.head()
print(apptrain.shape)
df=apptrain.copy()

In [None]:
# Display the column labels of the loaded table.
apptrain.columns

#### Ou bien recalcul d'un feature engineering amélioré venant de Kaggle

In [None]:
@contextmanager
def timer(title):
    """Context manager that reports how long the enclosed block took.

    On normal exit, prints "<title> - done in <N>s", N being the elapsed
    wall-clock time formatted without decimals. The report comes after the
    ``yield`` without a ``finally``, so nothing is printed when the block
    raises.
    """
    started_at = time.time()
    yield
    elapsed = time.time() - started_at
    print("{} - done in {:.0f}s".format(title, elapsed))

# One-hot encode the object-typed columns with get_dummies
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every column of ``df`` whose dtype is ``object``.

    Parameters
    ----------
    df : DataFrame to encode; it is not modified.
    nan_as_category : forwarded to ``pd.get_dummies`` as ``dummy_na``; when
        true, an extra indicator column is created for missing values.

    Returns
    -------
    (encoded, new_columns) : the encoded frame and the list of column labels
        that were not present in the input, in the order they appear in the
        encoded frame.
    """
    columns_before = set(df.columns)
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == 'object']
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

# Preprocess application_train.csv and application_test.csv
def application_train_test(num_rows = None, nan_as_category = False):
    """Load, stack and encode the two main application tables.

    Reads ``application_train.csv`` and ``application_test.csv``, stacks them
    into one frame, removes the rows whose CODE_GENDER is 'XNA',
    integer-encodes three two-valued columns, one-hot encodes the remaining
    object columns and adds five ratio features.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows`` (None reads everything).
    nan_as_category : forwarded to ``one_hot_encoder``.

    Returns
    -------
    DataFrame holding train and test rows. ``reset_index()`` keeps the
    previous row labels in an 'index' column.
    """
    # Read data and merge
    df = pd.read_csv('data\\source\\application_train.csv', nrows= num_rows)
    test_df = pd.read_csv('data\\source\\application_test.csv', nrows= num_rows)
    print("Train samples: {}, test samples: {}".format(len(df), len(test_df)))
    # DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
    df = pd.concat([df, test_df]).reset_index()
    # Optional: Remove 4 applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], uniques = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = one_hot_encoder(df, nan_as_category)

    # DAYS_EMPLOYED: 365243 -> NaN. Assigned back instead of using a chained
    # inplace replace, which does not reliably modify df under copy-on-write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)
    # Some simple new features (percentages)
    df['DAYS_EMPLOYED_PERC'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['INCOME_CREDIT_PERC'] = df['AMT_INCOME_TOTAL'] / df['AMT_CREDIT']
    df['INCOME_PER_PERSON'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']
    df['ANNUITY_INCOME_PERC'] = df['AMT_ANNUITY'] / df['AMT_INCOME_TOTAL']
    df['PAYMENT_RATE'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
    del test_df
    gc.collect()
    return df

# Preprocess bureau.csv and bureau_balance.csv
def bureau_and_balance(num_rows = None, nan_as_category = True):
    """Aggregate bureau.csv and bureau_balance.csv to one row per SK_ID_CURR.

    The monthly balance rows are first summarised per SK_ID_BUREAU and joined
    onto the bureau table. The result is then summarised per SK_ID_CURR three
    times: over all credits (prefix BURO_), over rows flagged
    CREDIT_ACTIVE_Active (prefix ACTIVE_) and over rows flagged
    CREDIT_ACTIVE_Closed (prefix CLOSED_).

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows`` for both files.
    nan_as_category : forwarded to ``one_hot_encoder``.

    Returns
    -------
    DataFrame indexed by SK_ID_CURR.
    """
    def _flat_names(frame, prefix):
        # Collapse the (column, statistic) pairs produced by agg() into single labels.
        return pd.Index([prefix + col + "_" + stat.upper() for col, stat in frame.columns.tolist()])

    def _numeric_summary(rows, prefix):
        # Numeric aggregations only, one row per client.
        summary = rows.groupby('SK_ID_CURR').agg(num_aggregations)
        summary.columns = _flat_names(summary, prefix)
        return summary

    bureau = pd.read_csv('data\\source\\bureau.csv', nrows = num_rows)
    bb = pd.read_csv('data\\source\\bureau_balance.csv', nrows = num_rows)
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: summarise per bureau credit, then attach to bureau.csv
    monthly_spec = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    monthly_spec.update({col: ['mean'] for col in bb_cat})
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(monthly_spec)
    bb_agg.columns = _flat_names(bb_agg, '')
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace= True)
    del bb, bb_agg
    gc.collect()

    # Numeric features of bureau and of the joined balance summary
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Categorical indicators: bureau's own, then the per-credit means of the balance ones
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    bureau_agg.columns = _flat_names(bureau_agg, 'BURO_')

    # Active credits first, then closed credits - numeric aggregations only
    for flag_column, prefix in (('CREDIT_ACTIVE_Active', 'ACTIVE_'), ('CREDIT_ACTIVE_Closed', 'CLOSED_')):
        subset = bureau[bureau[flag_column] == 1]
        bureau_agg = bureau_agg.join(_numeric_summary(subset, prefix), how='left', on='SK_ID_CURR')
        del subset
        gc.collect()
    del bureau
    return bureau_agg

# Preprocess previous_applications.csv
def previous_applications(num_rows = None, nan_as_category = True):
    """Aggregate previous_application.csv to one row per SK_ID_CURR.

    Produces three groups of columns: statistics over all previous
    applications (prefix PREV_), over rows flagged
    NAME_CONTRACT_STATUS_Approved (prefix APPROVED_) and over rows flagged
    NAME_CONTRACT_STATUS_Refused (prefix REFUSED_).

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder`` (it was previously
        ignored and always treated as True; the default keeps that behaviour).

    Returns
    -------
    DataFrame indexed by SK_ID_CURR.
    """
    # Backslashes are escaped: '\s' and '\p' are invalid escape sequences.
    prev = pd.read_csv('data\\source\\previous_application.csv', nrows = num_rows)
    prev, cat_cols = one_hot_encoder(prev, nan_as_category)
    # Days 365243 values -> NaN. Assigned back instead of using chained
    # inplace replaces, which do not reliably modify the frame under copy-on-write.
    day_columns = ['DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE', 'DAYS_LAST_DUE_1ST_VERSION',
                   'DAYS_LAST_DUE', 'DAYS_TERMINATION']
    prev[day_columns] = prev[day_columns].replace(365243, np.nan)
    # Add feature: value asked / value received ratio
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
    prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    approved_agg.columns = pd.Index(['APPROVED_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')
    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index(['REFUSED_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev
    gc.collect()
    return prev_agg

# Preprocess POS_CASH_balance.csv
def pos_cash(num_rows = None, nan_as_category = True):
    """Aggregate POS_CASH_balance.csv to one row per SK_ID_CURR.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder`` (it was previously
        ignored and always treated as True; the default keeps that behaviour).

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with POS_-prefixed statistics and a
    POS_COUNT column holding the number of rows per client.
    """
    # Backslashes are escaped: '\s' and '\P' are invalid escape sequences.
    pos = pd.read_csv('data\\source\\POS_CASH_balance.csv', nrows = num_rows)
    pos, cat_cols = one_hot_encoder(pos, nan_as_category)
    # Features
    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    pos_agg.columns = pd.Index(['POS_' + e[0] + "_" + e[1].upper() for e in pos_agg.columns.tolist()])
    # Count pos cash accounts
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
    
# Preprocess installments_payments.csv
def installments_payments(num_rows = None, nan_as_category = True):
    """Aggregate installments_payments.csv to one row per SK_ID_CURR.

    Adds payment ratio / difference features and non-negative "days past due"
    (DPD) and "days before due" (DBD) columns before aggregating.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder`` (it was previously
        ignored and always treated as True; the default keeps that behaviour).

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with INSTAL_-prefixed statistics and an
    INSTAL_COUNT column holding the number of rows per client.
    """
    # Backslashes are escaped: '\s' and '\i' are invalid escape sequences.
    ins = pd.read_csv('data\\source\\installments_payments.csv', nrows = num_rows)
    ins, cat_cols = one_hot_encoder(ins, nan_as_category)
    # Percentage and difference paid in each installment (amount paid and installment value)
    ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
    ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
    # Days past due and days before due (no negative values).
    # where(cond, 0) keeps strictly positive values and maps everything else,
    # NaN included, to 0 - the same result as the former row-wise lambda,
    # computed in one vectorised pass.
    days_past_due = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
    days_before_due = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
    ins['DPD'] = days_past_due.where(days_past_due > 0, 0)
    ins['DBD'] = days_before_due.where(days_before_due > 0, 0)
    # Features: Perform aggregations
    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    for cat in cat_cols:
        aggregations[cat] = ['mean']
    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
    # Count installments accounts
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg

# Preprocess credit_card_balance.csv
def credit_card_balance(num_rows = None, nan_as_category = True):
    """Aggregate credit_card_balance.csv to one row per SK_ID_CURR.

    Every remaining column (after dropping SK_ID_PREV) is summarised with
    min, max, mean, sum and var.

    Parameters
    ----------
    num_rows : passed to ``pd.read_csv`` as ``nrows``.
    nan_as_category : forwarded to ``one_hot_encoder`` (it was previously
        ignored and always treated as True; the default keeps that behaviour).

    Returns
    -------
    DataFrame indexed by SK_ID_CURR with CC_-prefixed statistics and a
    CC_COUNT column holding the number of rows per client.
    """
    cc = pd.read_csv('data\\source\\credit_card_balance.csv', nrows = num_rows)
    cc, cat_cols = one_hot_encoder(cc, nan_as_category)
    # General aggregations
    cc.drop(['SK_ID_PREV'], axis= 1, inplace = True)
    cc_agg = cc.groupby('SK_ID_CURR').agg(['min', 'max', 'mean', 'sum', 'var'])
    cc_agg.columns = pd.Index(['CC_' + e[0] + "_" + e[1].upper() for e in cc_agg.columns.tolist()])
    # Count credit card lines
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg

# Plot the features with the highest average importance and save the figure
def display_importances(feature_importance_df_):
    """Bar-plot the 40 features with the highest mean importance.

    ``feature_importance_df_`` must have 'feature' and 'importance' columns.
    All rows belonging to the selected features are plotted, and the figure
    is written to 'lgbm_importances01.png' in the working directory.
    """
    mean_importance = feature_importance_df_[["feature", "importance"]].groupby("feature").mean()
    ranked = mean_importance.sort_values(by="importance", ascending=False)
    cols = ranked[:40].index
    is_top_feature = feature_importance_df_.feature.isin(cols)
    best_features = feature_importance_df_.loc[is_top_feature]
    plot_rows = best_features.sort_values(by="importance", ascending=False)
    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=plot_rows)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')


def main(debug = False):
    """Build the full feature table by joining every auxiliary aggregate.

    Starts from ``application_train_test`` and left-joins, on SK_ID_CURR, the
    per-client aggregates of each auxiliary table, timing every step.

    Parameters
    ----------
    debug : when true, only the first 10000 rows of each CSV are read.

    Returns
    -------
    The joined DataFrame.
    """
    num_rows = 10000 if debug else None
    df = application_train_test(num_rows)
    # (timer title, label of the shape printout, loader) for each auxiliary table.
    # The five steps were copy-pasted blocks; they only differ by these values.
    steps = [
        ("Process bureau and bureau_balance", "Bureau df shape:", bureau_and_balance),
        ("Process previous_applications", "Previous applications df shape:", previous_applications),
        ("Process POS-CASH balance", "Pos-cash balance df shape:", pos_cash),
        ("Process installments payments", "Installments payments df shape:", installments_payments),
        ("Process credit card balance", "Credit card balance df shape:", credit_card_balance),
    ]
    for title, shape_label, loader in steps:
        with timer(title):
            aggregated = loader(num_rows)
            print(shape_label, aggregated.shape)
            df = df.join(aggregated, how='left', on='SK_ID_CURR')
            # Release the aggregate before loading the next table.
            del aggregated
            gc.collect()
            print(df.shape)
    return df
# Script-style entry point, kept for reference only (disabled in the notebook):
#if __name__ == "__main__":
#    #submission_file_name = "submission_kernel02.csv"
#    with timer("Full model run"):
#        df=main()
#        print(df.shape)
#        return df
# Rebuild the feature table with the functions above. This overwrites the `df`
# loaded from the pre-computed CSV; comment the line out to keep that one instead.
df=main()

#### Downsampling du df pour la tractabilité de la classification qui peut prendre du temps

In [None]:
# Keep a random half of the rows to shorten training time. The seed is fixed
# (same value as the train/validation splits below) so that re-running the
# notebook selects the same rows.
df = df.sample(frac=0.5, random_state=42)
df.shape

#### Passage de l'ID client en index

In [None]:
# Check that the client identifier is present exactly once among the columns.
if df.columns.isin(['SK_ID_CURR']).sum()==1 :
    print('YES')
    # NOTE(review): set_index returns a new frame and the result is discarded,
    # so df is left unchanged. Later cells read df.SK_ID_CURR as a regular
    # column, so assigning the result would break them - confirm the intent
    # before changing this line.
    df.set_index('SK_ID_CURR')

#### Suppression de la colonne `index`
Elle n'est pas pertinente pour la classification.

In [None]:
# Remove the 'index' column (e.g. the one created by reset_index()) when it is
# present exactly once; it carries no information for the classifier.
if (df.columns == 'index').sum() == 1:
    df.drop(columns=['index'], inplace=True)

#### On enleve les lignes si la TARGET n'est pas renseignée

In [None]:
# Rows without a TARGET value cannot be used for supervised training: drop them in place.
df.dropna(subset=['TARGET'], inplace=True)

#### On traite/formatte les valeurs infinies

In [None]:
# Turn +/-infinity into NaN and rebuild a clean 0..n-1 row index.
# np.NaN no longer exists in NumPy 2.0, and mapping NaN to NaN was a no-op,
# so only the two infinities are listed.
df = df.replace([np.inf, -np.inf], np.nan).reset_index(drop=True)

In [None]:
# Sanity check: smallest and largest client identifier left in the frame.
print('min max indice client',df.SK_ID_CURR.min(),df.SK_ID_CURR.max())

# Partie commune à tous les modèles
#### Séparation de la target

In [None]:
# Feature matrix: every column except the label and the client identifier.
feature_frame = df.drop(columns=['TARGET', 'SK_ID_CURR'])
X_train = np.array(feature_frame)
# Label vector.
Y_train = np.array(df['TARGET'])

#### Option de Shuffling des index pour éviter une éventuelle organisation des données

In [None]:
#p=np.arange(Y_train.shape[0])
#np.random.shuffle(p)
#len(p)
#p[np.int(0.8*len(p)):]

### Création du jeu de test pour l'évaluation
### Partition qu'on initie une seule fois pour tous les modèles

In [None]:
# Hold out a stratified 20% validation set.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)
# Mean imputation: the statistics are learned on the training part only and
# re-used for the validation part. Re-fitting on X_VALID leaked validation
# information and could produce a different set of columns if a feature is
# entirely missing in one of the two parts.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN=imp_mean.transform(X_TRAIN)
X_VALID=imp_mean.transform(X_VALID)
print(X_TRAIN.shape,Y_TRAIN.shape,)

### Création d'un dummy model : valeur constante la plus frequente de la prediction

In [None]:
# Baseline: a classifier that always predicts class 0, whatever the input.
# X and y are plain aliases of the training arrays; they are not used by the fit below.
X = X_TRAIN
y = Y_TRAIN
dummy_clf = DummyClassifier(strategy="constant",constant=0) #DummyClassifier(strategy='most_frequent')
dummy_clf.fit(X_TRAIN, Y_TRAIN)

#### Calcul du score auroc 

In [None]:
pred=dummy_clf.predict(X_VALID)
# The positive class is 1. With pos_label=2 no sample counted as positive,
# which made the true-positive rate (and the resulting AUC) undefined.
fpr, tpr, thresholds = metrics.roc_curve(Y_VALID, pred, pos_label=1)
print(fpr, tpr, thresholds)
metrics.auc(fpr, tpr)

In [None]:
# Cross-check with roc_auc_score. `pred` is expected to still hold the dummy
# model's constant prediction when this cell runs.
roc_auc_score(Y_VALID, pred)

#### Représentation des matrices de confusion

In [None]:
c=confusion_matrix(Y_VALID,pred)
# Normalise each column (predicted class) by its total. The np.float alias was
# removed in NumPy 1.24; the builtin float is the documented replacement.
# A class that is never predicted has a zero total and shows up as NaN.
c=c/c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(c, annot=True, fmt='.2f')
plt.ylabel('Réel')
plt.xlabel('Prédit')
plt.show(block=False)

In [None]:
# Confusion matrix of the dummy model with absolute counts.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions expose ConfusionMatrixDisplay.from_estimator instead - confirm the
# pinned scikit-learn version before upgrading.
plot_confusion_matrix(dummy_clf, X_VALID, Y_VALID)  
plt.show()

## Test de 5 modèles :  RandomForestClassifier, LogisticRegression, LGBM, XGBoost, CATBoost
### avec GridSearch

## 1 RandomForestClassifier

#### Creation d'un jeu de validation

In [None]:
# Re-create the same stratified 80/20 split (same seed) from the un-imputed
# arrays; imputation is redone in the next cell.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)

#### Boucle sur toutes les combinaisons d'hyperparamètres possibles

In [None]:
from tqdm.notebook import trange, tqdm
# Hyper-parameter grid; every combination is evaluated below.
parameters = { 
    'n_estimators': [150,200],
    'max_features': ['log2','sqrt'],
    'criterion' :['gini', 'entropy'],
    'max_depth' :[5,8],
    'min_samples_leaf' :[5,10]
}

keys = parameters.keys()
values = (parameters[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

# Mean imputation, fitted on the training part and applied to both parts.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN=imp_mean.transform(X_TRAIN)
X_VALID=imp_mean.transform(X_VALID)

# Manual cross-validated grid search.
n_cv=4
# One row per combination, one column per fold. The width follows n_cv
# instead of a hard-coded 5, so changing the number of folds cannot overflow it.
score=np.empty((len(combinations),n_cv))
score_combi=np.empty((len(combinations),1))

print("all combinations of hyperparameters=",combinations)
for n, combination in enumerate(tqdm_notebook(combinations)):
    print(combination)
    kf = KFold(n_splits=n_cv)

    for i, (train_index, test_index) in enumerate(kf.split(X_TRAIN)):
        x_train, x_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = Y_TRAIN[train_index], Y_TRAIN[test_index]

        # The grid keys are exactly the constructor argument names.
        rf_clf = RandomForestClassifier(class_weight='balanced', **combination)
        rf_clf.fit(x_train, y_train)

        # ROC AUC measures how well continuous scores rank the samples, so it
        # is computed from the predicted probability of the positive class
        # rather than from hard 0/1 labels.
        fold_auc = roc_auc_score(y_test, rf_clf.predict_proba(x_test)[:, 1])
        print('scoretest:',fold_auc)
        print(n)
        score[n,i]=fold_auc
    score_combi[n,0]=score[n].mean()
    print("score_combi",score_combi[n])

best_index = np.argmax(score_combi)
print(best_index)
bestparameters=combinations[best_index]
print('bestparam:',bestparameters)

# Once the best combination is known, the model is re-trained on the whole
# training part rather than picking one of the models fitted on a CV subset.
rf_clf = RandomForestClassifier(class_weight='balanced', **bestparameters)
rf_clf = rf_clf.fit(X_TRAIN, Y_TRAIN)
pred = rf_clf.predict(X_VALID)
# NOTE(review): this validation figure is still computed from hard labels, so
# that it stays comparable with the one printed after re-loading the model.
print(roc_auc_score(Y_VALID, pred))

# Save the fitted best model.
filename = 'model_rf_clf' + '.sav'
joblib.dump(rf_clf, filename)

In [None]:
# Shape checks: last CV training fold versus the full training part, then the training labels.
print(x_train.shape)
print(X_TRAIN.shape)
print(Y_TRAIN)

In [None]:
#BESTMODEL='model_' + str(np.argmax(score_combi)) + '.sav'
# Load the saved random forest back from disk and score it on the validation set.
BESTMODEL='model_rf_clf.sav'

loaded_model = joblib.load(BESTMODEL)

pred=loaded_model.predict(X_VALID)
# NOTE(review): roc_auc_score receives hard 0/1 labels here, not probabilities.
print(roc_auc_score(Y_VALID, pred))

### Représentation des résultats

#### matrice de confusion avec les proportions

In [None]:
c=confusion_matrix(Y_VALID,pred)
# Normalise each column (predicted class) by its total. The np.float alias was
# removed in NumPy 1.24; the builtin float is the documented replacement.
c=c/c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5,5))
res=sns.heatmap(c, annot=True, fmt='.2f')
plt.ylabel('Réel',fontsize=28)
plt.xlabel('Prédit',fontsize=28)
res.set_xticklabels(res.get_xmajorticklabels(), rotation=0, horizontalalignment='right',fontsize = 18)
res.set_yticklabels(res.get_ymajorticklabels(), rotation=0, horizontalalignment='right',fontsize = 18)
res.tick_params(labelsize=28)
# Enlarge the colorbar tick labels as well.
cbar = ax.collections[0].colorbar
cbar.ax.tick_params(labelsize=20)
plt.show(block=False)

#### matrice de confusion avec les nombres absolus

In [None]:
# Confusion matrix with absolute counts.
res=plot_confusion_matrix(loaded_model, X_VALID, Y_VALID) 
plt.ylabel('Réel',fontsize=28)
plt.xlabel('Prédit',fontsize=28)
plt.xticks(fontsize=28)
plt.yticks(fontsize=28)

# Style the colorbar of THIS figure through the returned display object.
# The previous code went through `ax`, which still referred to the heatmap
# drawn in the preceding cell.
cbar = res.im_.colorbar
cbar.ax.tick_params(labelsize=20)
plt.show()

### Explication du modèle

In [None]:
# LIME explainer built on the training matrix.
# - feature_names must line up with the columns of X_TRAIN, which was built
#   without TARGET *and* without SK_ID_CURR; leaving SK_ID_CURR in the list
#   attached the wrong name to part of the features.
# - class_names are sorted so that position i names the i-th probability
#   column (unique() returns values in order of first appearance).
explainer = lime_tabular.LimeTabularExplainer(X_TRAIN, mode="classification",
                                              class_names=sorted(df.TARGET.unique()),
                                              feature_names=list(df.columns.drop(['TARGET','SK_ID_CURR'])),
                                             )

explainer

#### on choisit plusieurs cas particuliers à expliquer.

In [None]:
BESTMODEL='model_rf_clf.sav'
loaded_model = joblib.load(BESTMODEL)

pred=loaded_model.predict(X_VALID)

# Pick a random validation row. randrange draws from 0..len-1, whereas
# randint(1, len) could never pick row 0 and could return len itself,
# which is out of bounds.
idx = random.randrange(len(X_VALID))

print("Prediction : ", loaded_model.predict(X_VALID[idx].reshape(1,-1)))
print("Actual :     ", Y_VALID[idx])

# Ask for a weight on every feature actually fed to the model.
explanation = explainer.explain_instance(X_VALID[idx], loaded_model.predict_proba,
                                         num_features=X_VALID.shape[1])

explanation.show_in_notebook()

In [None]:
# Explain one randomly chosen validation row that the model correctly assigns to class 0.
preds = loaded_model.predict(X_VALID)
correct_negative = (preds == Y_VALID) & (preds == 0)
target_preds = np.flatnonzero(correct_negative)
idx = random.choice(target_preds)

selected_row = X_VALID[idx]
print("Prediction : ", loaded_model.predict(selected_row.reshape(1,-1)))
print("Actual :     ", Y_VALID[idx])

# One less than the number of columns of df, to leave out the target, which is not a feature.
explanation = explainer.explain_instance(selected_row, loaded_model.predict_proba,
                                         num_features=len(df.columns)-1)

explanation.show_in_notebook()

In [None]:
# Explain one randomly chosen validation row that the model correctly assigns to class 1.
preds = loaded_model.predict(X_VALID)
correct_positive = (preds == Y_VALID) & (preds == 1)
target_preds = np.flatnonzero(correct_positive)
idx = random.choice(target_preds)

pred = loaded_model.predict(X_VALID)

selected_row = X_VALID[idx]
print("Prediction : ", loaded_model.predict(selected_row.reshape(1,-1)))
print("Actual :     ", Y_VALID[idx])

# One less than the number of columns of df, to leave out the target, which is not a feature.
explanation = explainer.explain_instance(selected_row, loaded_model.predict_proba,
                                         num_features=len(df.columns)-1)

explanation.show_in_notebook()

In [None]:
# First ten (feature, weight) entries of the last explanation.
explanation.as_list()[0:10]

In [None]:
# Compare the number of columns of df with the shape of the validation matrix.
print(len(df.columns))
print(np.shape(X_VALID))

#### Explication globale avec shap

In [None]:
# Global explanation of the loaded model with SHAP.
# Note: this rebinds `explainer`, which previously held the LIME explainer;
# the LIME explainer cell must be re-run before re-running the LIME cells.
explainer=shap.TreeExplainer(loaded_model)
values=explainer.shap_values(X_TRAIN)
# Feature labels: every column of df except the label and the client identifier.
listfeat=df.columns.drop(['TARGET','SK_ID_CURR'])
shap.summary_plot(values,X_TRAIN, feature_names = listfeat)

In [None]:
# Display the client identifier column.
df.SK_ID_CURR

#### Extraction des features avec le plus d'importance

In [None]:
# Mean absolute SHAP value per feature. Two reductions are needed so that one
# number per feature remains (the same reduction as in the LightGBM section);
# a single mean(0) left a (samples x features) matrix, so each feature name
# was paired with a whole row of values.
# Assumes shap_values returned one (samples x features) array per class - TODO confirm
# against the installed shap version.
vals= np.abs(values).mean(0).mean(0)
feature_importance = pd.DataFrame(list(zip(listfeat,vals)),columns=['col_name','feature_importance_vals'])
# Overall mean importance, repeated on every row as a reference level.
feature_importance['importance_mean']=feature_importance.feature_importance_vals.mean()
# Rank by the per-feature value; 'importance_mean' is constant and cannot order anything.
feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
feature_importance.head(10)

In [None]:
# Display the full importance table.
feature_importance

In [None]:
# Display the DAYS_BIRTH column.
df.DAYS_BIRTH

# Application de la meme grid search manuelle pour Lgbm

In [None]:
# Rebuild the raw (un-imputed) arrays from df, then redo the stratified 80/20 split with the same seed.
feature_frame = df.drop(columns=['TARGET', 'SK_ID_CURR'])
X_train = np.array(feature_frame)
Y_train = np.array(df['TARGET'])
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42
)

In [None]:
import lightgbm as lgb
# Hyper-parameter grid; every combination is evaluated below.
parameters = {'learning_rate': [0.09, 0.1, 0.11, 0.12],
    'n_estimators': [64, 76, 96, 110, 120],
    'num_leaves': [8, 9, 10, 11, 12],} # large num_leaves helps improve accuracy but might lead to over-fitting
keys = parameters.keys()
values = (parameters[key] for key in keys)
combinations = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

# Mean imputation, fitted on the training part and applied to both parts.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN=imp_mean.transform(X_TRAIN)
X_VALID=imp_mean.transform(X_VALID)

# Manual cross-validated grid search.
n_cv=2
# One row per combination, one column per fold. The width follows n_cv
# instead of a hard-coded 5, so changing the number of folds cannot overflow it.
score=np.empty((len(combinations),n_cv)) # per-fold CV scores
score_combi=np.empty((len(combinations),1)) # mean CV score per combination

print("all combinations of hyperparameters=",combinations)
for n, combination in enumerate(tqdm_notebook(combinations)):
    print(combination)
    kf = KFold(n_splits=n_cv)
    for i, (train_index, test_index) in enumerate(kf.split(X_TRAIN)):
        x_train, x_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = Y_TRAIN[train_index], Y_TRAIN[test_index]

        # The grid keys are exactly the constructor argument names.
        LightGBM_clf=lgb.LGBMClassifier(class_weight='balanced', **combination)
        LightGBM_clf.fit(x_train, y_train)

        # ROC AUC measures how well continuous scores rank the samples, so it
        # is computed from the predicted probability of the positive class
        # rather than from hard 0/1 labels.
        fold_auc = roc_auc_score(y_test, LightGBM_clf.predict_proba(x_test)[:, 1])
        print('scoretest:',fold_auc)
        print(n)
        score[n,i]=fold_auc
    score_combi[n,0]=score[n].mean()
    print("score_combi",score_combi[n])
best_index = np.argmax(score_combi)
print(best_index)
bestparameters=combinations[best_index]
print('bestparam:',bestparameters)

# Re-train with the best hyper-parameters on the whole training part.
lgb_clf = lgb.LGBMClassifier(random_state = 0, class_weight='balanced', **bestparameters)
lgb_clf = lgb_clf.fit(X_TRAIN, Y_TRAIN)
# Apply the model to the validation set.
pred=lgb_clf.predict(X_VALID)
# NOTE(review): this validation figure is still computed from hard labels, so
# that it stays comparable with the one printed after re-loading the model.
print(roc_auc_score(Y_VALID, pred))

# Save the fitted best model.
filename = 'model_lgb_clf' + '.sav'
joblib.dump(lgb_clf, filename)
##############


In [None]:
# Load the saved LightGBM model back from disk and score it on the validation set.
BESTMODEL='model_lgb_clf.sav'

loaded_model = joblib.load(BESTMODEL)

pred=loaded_model.predict(X_VALID)
# NOTE(review): roc_auc_score receives hard 0/1 labels here, not probabilities.
print(roc_auc_score(Y_VALID, pred))

## Représentation des résultats 

#### matrice de confusion avec proportions

In [None]:
c=confusion_matrix(Y_VALID,pred)
# Normalise each column (predicted class) by its total. The np.float alias was
# removed in NumPy 1.24; the builtin float is the documented replacement.
c=c/c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5,5))
sns.heatmap(c, annot=True, fmt='.2f')
plt.ylabel('Réel')
plt.xlabel('Prédit')
plt.show(block=False)


#### mat de confusion avec nombres absolus

In [None]:
# Confusion matrix of the loaded model with absolute counts.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; newer
# versions expose ConfusionMatrixDisplay.from_estimator instead - confirm the
# pinned scikit-learn version before upgrading.
plot_confusion_matrix(loaded_model, X_VALID, Y_VALID)  
plt.show()

In [None]:
BESTMODEL='model_lgb_clf.sav'
loaded_model = joblib.load(BESTMODEL)

pred=loaded_model.predict(X_VALID)

# Pick a random validation row. randrange draws from 0..len-1, whereas
# randint(1, len) could never pick row 0 and could return len itself,
# which is out of bounds.
idx = random.randrange(len(X_VALID))

print("Prediction : ", loaded_model.predict(X_VALID[idx].reshape(1,-1)))
print("Actual :     ", Y_VALID[idx])



#### On applique un SimpleImputer, car SHAP n'accepte pas les NaN, contrairement à LightGBM

In [None]:
# Same stratified split as before (same seed), followed by mean imputation
# fitted on the training part and applied to both parts.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN=imp_mean.transform(X_TRAIN)
X_VALID=imp_mean.transform(X_VALID) 

#### Extraction des paramètres globaux

In [None]:
#BESTMODEL='model_lgb_clf.sav'
#loaded_model = joblib.load(BESTMODEL)
#loaded_model.fit(X_TRAIN, Y_TRAIN)
# Global SHAP explanation of the in-memory LightGBM model (lgb_clf, not the re-loaded copy).
explainer=shap.TreeExplainer(lgb_clf)
values=explainer.shap_values(X_TRAIN)
# Feature labels: every column of df except the label and the client identifier.
listfeat=df.columns.drop(['TARGET','SK_ID_CURR'])
shap.summary_plot(values,X_TRAIN, feature_names = listfeat)

In [None]:
# Mean absolute SHAP value per feature: the two successive means reduce the
# first two axes of `values`.
# Assumes `values` holds one (samples x features) array per class - TODO confirm
# against the installed shap version.
vals= np.abs(values).mean(0).mean(0)
feature_importance = pd.DataFrame(list(zip(listfeat,vals)),columns=['col_name','feature_importance_vals'])
# Most important features first.
feature_importance.sort_values(by=['feature_importance_vals'],ascending=False,inplace=True)
feature_importance.head(11)

In [None]:
#1/0

# Realisation d'un modèle simplifié pour la mise en production
#### Stratégie : on réentraine le modèle sur les n=100 features les plus importantes pour alléger et rationaliser la modélisation

In [None]:
# Reduced frame: the first 100 features listed in feature_importance, plus
# the target and the client id.
top_features = feature_importance.head(100).col_name.tolist()
# .copy() makes df1 an independent frame: columns are added to it later, and
# assigning into a slice of df triggers pandas' SettingWithCopyWarning.
df1 = df[top_features + ['TARGET', 'SK_ID_CURR']].copy()
df1.columns

#### on réentraîne le modèle avec les mêmes paramètres et on le sauve pour l'utilisation par Streamlit

In [None]:
# Rebuild the design matrix and labels from the reduced frame df1.
feature_frame = df1.drop(columns=['TARGET', 'SK_ID_CURR'])
X_train = np.array(feature_frame)
Y_train = np.array(df1['TARGET'])
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(
    X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42
)

# Mean imputation fitted on the training split and applied to both splits.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_TRAIN = imp_mean.fit_transform(X_TRAIN)
X_VALID = imp_mean.transform(X_VALID)

# Refit LightGBM on the training split, reusing the hyper-parameters stored
# in bestparameters.
lgb_clf = lgb.LGBMClassifier(
    random_state=0,
    learning_rate=bestparameters["learning_rate"],
    n_estimators=bestparameters["n_estimators"],
    num_leaves=bestparameters["num_leaves"],
    class_weight='balanced',
).fit(X_TRAIN, Y_TRAIN)

# Score on the validation split.
# NOTE(review): AUC is computed from hard labels, not probabilities.
pred = lgb_clf.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))

# Persist the reduced model.
# NOTE(review): only the classifier is dumped; the fitted imputer is not
# saved here — confirm how missing values are handled downstream.
filename = 'defaultcredit-fd/fdboard/model/model_lgb_clf_light' + '.sav'
joblib.dump(lgb_clf, filename)
##############

Logiquement le score reste bon en enlevant les features ayant peu d'importance

#### on recharge le modèle pour vérifier que tout fonctionne bien

In [None]:
# Round-trip check: reload the reduced model from disk and score it on the
# validation split (AUC on hard labels, as elsewhere in this notebook).
BESTMODEL = 'defaultcredit-fd/fdboard/model/model_lgb_clf_light.sav'
loaded_model = joblib.load(BESTMODEL)

pred = loaded_model.predict(X_VALID)
print(roc_auc_score(y_true=Y_VALID, y_score=pred))

Application à un exemple de client : permet de vérifier que tout fonctionne correctement en vue du dashboard.

In [None]:
# Draw one client from df1 (the row is determined by the random seed) and
# compare the true target with the model's label and class-1 probability.
seed = np.random.randint(5, 30, size=1)
client = df1.sample(1, random_state=seed[0])
client_features = client.drop(columns=['TARGET', 'SK_ID_CURR']).values

print('REAL TARGET : ', client.TARGET)

pred = lgb_clf.predict(client_features)
print(lgb_clf.predict_proba(client_features)[:, 1])
print('PREDICTED TARGET : ', pred)

In [None]:
# je sauve mes prédictions sur tout le df
X_df=df1.copy()
X_df.drop(columns=['TARGET','SK_ID_CURR'],inplace=True)

In [None]:
# Store the predicted label and the class-1 probability for every client.
# Both are computed from X_df (features only). Deriving the inputs from df1
# would break on re-execution, because df1 then already holds PRED/PREDproba.
df1['PRED'] = lgb_clf.predict(X_df.values)
df1['PREDproba'] = lgb_clf.predict_proba(X_df.values)[:, 1]


In [None]:
# Display column 1 of predict_proba (probability of the second class) for every row of X_df.
lgb_clf.predict_proba(X_df.values)[:,1]

In [None]:
# Display the predicted class label for every row of X_df.
lgb_clf.predict(X_df.values)

#### on fait un KMeans

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

##### on est obligé de faire un imputer car les données NaN ne sont pas acceptées par le KMeans

In [None]:
from sklearn.impute import SimpleImputer

# KMeans rejects NaN, so missing values are replaced by the column median.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')
# The client identifier carries no information for clustering, so it is
# excluded. 'cluster' and 'bias' exist only when this cell is re-run after
# the clustering cells; errors='ignore' tolerates their absence.
# NOTE(review): TARGET / PRED / PREDproba are still part of X and features
# are not scaled — confirm this is intended.
X = df1.drop(columns=['SK_ID_CURR', 'cluster', 'bias'], errors='ignore')
X = imp_mean.fit_transform(X)
print(X)

In [None]:
# Elbow search for the number of clusters, k between 4 and 11.
# A fixed random_state (the value used for the final KMeans fit in this
# notebook) makes the detected elbow reproducible between runs.
model = KMeans(random_state=40)
visualizer = KElbowVisualizer(model, k=(4, 12))
visualizer.fit(X)
# show() is the current name of the deprecated poof().
visualizer.show()

In [None]:
# Number of clusters retained: the elbow detected by the visualizer.
# NOTE(review): presumably elbow_value_ is None when no elbow is found, which
# would break the KMeans fit that follows — confirm.
kcluster=visualizer.elbow_value_
kcluster

In [None]:
# Fit KMeans with the retained number of clusters and attach the cluster
# label of each row to df1.
model = KMeans(n_clusters=kcluster, random_state=40).fit(X)
df1["cluster"] = model.labels_

In [None]:
# Preview the first rows of df1, now including the cluster column.
df1.head(5)

In [None]:
# Distinct cluster labels assigned by KMeans.
df1["cluster"].unique()

In [None]:
# Number of rows per cluster.
df1["cluster"].value_counts()

In [None]:
# Create the sampling-weight column with a placeholder value; it is
# overwritten per cluster in the next cell.
df1["bias"]=10.0 #initialisation

In [None]:
# Sampling weights that make every cluster equally likely to be drawn:
# each row is weighted by 1 / (size of its cluster).
# The vectorized form replaces a per-cluster loop that called float() on a
# one-element Series (deprecated in recent pandas, TypeError on an empty
# cluster).
cluster_sizes = df1["cluster"].map(df1["cluster"].value_counts())
df1["bias"] = 1.0 / cluster_sizes

## on sauve une base plus PETITE (n=x) pour les représentations graphique du dashboard

Je sauve les valeurs réelles et les valeurs prédites de la Target , ça peut être utile dans le Dashboard

In [None]:
# Draw two weighted samples of df1 using the per-row "bias" weights:
# a small one (30 rows) and a larger one (4000 rows).
bias = df1["bias"]
df2, df2xl = (df1.sample(n=size, weights=bias) for size in (30, 4000))

In [None]:
# The sampling weight is no longer needed once the samples are drawn.
df2 = df2.drop(columns=['bias'])
df2xl = df2xl.drop(columns=['bias'])
df2

In [None]:
# Check how the small sample is distributed across clusters.
df2["cluster"].value_counts()

In [None]:
# Export both samples as CSV without the index column.
df2.to_csv("defaultcredit-fd/fdboard/df_for_prod.csv",index=False)
df2xl.to_csv("defaultcredit-fd/fdboard/dfXL_for_prod.csv",index=False)

In [None]:
# (rows, columns) of the small sample.
df2.shape 

In [None]:
# Display the small sample.
df2

### Pour cette plus petite base on va sauver une explication

In [None]:
# Column names of the small sample.
df2.columns

In [None]:
# LIME explainer built on the training matrix X_TRAIN.
# Feature names are the df2 columns that are not target, id or model outputs.
feature_cols = df2.columns.drop(['TARGET', 'SK_ID_CURR', 'PRED', 'PREDproba', 'cluster'])
explainer = lime_tabular.LimeTabularExplainer(
    X_TRAIN,
    mode="classification",
    # classes_ follows the column order of predict_proba. The previous
    # df2.TARGET.unique() depended on the order of the 30 sampled rows and
    # could hold a single class, which mislabels or breaks explanations.
    class_names=loaded_model.classes_,
    feature_names=feature_cols,
)

In [None]:
# Display the explainer object.
explainer

In [None]:
# Sanity check: shape of the training matrix versus the number of df2 columns
# left after dropping TARGET, PRED and cluster.
# NOTE(review): SK_ID_CURR and PREDproba are not dropped here, so the second
# number is presumably not the model's feature count — confirm intent.
print(X_TRAIN.shape)
print(len(df2.columns.drop(['TARGET','PRED','cluster'])))

In [None]:
# Read the exported small sample back from disk.
df3=pd.read_csv("defaultcredit-fd/fdboard/df_for_prod.csv")

In [None]:
# Display the reloaded sample.
df3

In [None]:
# Display the reloaded sample (same output as the previous cell).
df3

#### on génère des images pour un certain nombre de clients pour l'appli en ligne

In [None]:
# One LIME explanation image per client of the small sample, saved as
# static/images/lime_<client id>.png.
non_feature_cols = ['TARGET', 'SK_ID_CURR', 'PRED', 'PREDproba', 'cluster']
for i, idc in enumerate(df3.SK_ID_CURR.values):
    print('i,idc:', i, idc)
    print('pred,proba,target', df3.loc[df3.SK_ID_CURR == idc, ['PRED', 'PREDproba', 'TARGET']].values)
    # Row i of df2 is used for the id read from df3.
    # NOTE(review): this presumably relies on df3 being the CSV round-trip
    # of df2 with the same row order — confirm.
    client_row = df2.iloc[i].drop(non_feature_cols).values
    explanation = explainer.explain_instance(
        client_row, loaded_model.predict_proba, labels=(1,), num_features=10
    )
    figi = explanation.as_pyplot_figure()
    plt.tight_layout()
    figname = 'defaultcredit-fd/fdboard/static/images/lime_' + str(idc) + '.png'
    figi.savefig(figname)
    # Close each figure once saved; otherwise one open figure per client
    # accumulates in memory (matplotlib warns beyond 20 open figures).
    plt.close(figi)

#### de même, on génère une image SHAP pour la mise en prod (trop longue à calculer en ligne)

In [None]:
# Global SHAP summary of the reduced LightGBM model, saved as a static image.
explainer = shap.TreeExplainer(lgb_clf)
values = explainer.shap_values(X_TRAIN)
# Keep only the model's input columns: the client id and the prediction /
# clustering columns added to df1 are not features. errors='ignore' tolerates
# columns that have not been created yet.
listfeat = df1.columns.drop(
    ['TARGET', 'SK_ID_CURR', 'PRED', 'PREDproba', 'cluster', 'bias'],
    errors='ignore',
)
# show=False leaves the figure open so it is saved before being displayed;
# saving after summary_plot has shown it can write an empty image.
shap.summary_plot(values, X_TRAIN, feature_names=listfeat, show=False)
fig = plt.gcf()
figname = 'defaultcredit-fd/fdboard/static/images/imageshap.png'
fig.savefig(figname)
plt.show()

In [None]:
#shap.plots.waterfall(explainer.shap_values[X_TRAIN])

In [None]:
# Client ids contained in the reloaded sample.
df3.SK_ID_CURR

In [None]:
# Disabled: use SK_ID_CURR as the index of df3 when the column is present.
#if df3.columns.isin(['SK_ID_CURR']).sum()==1 :
#    print('YES')
#    df3.set_index('SK_ID_CURR',drop=True, inplace=True)
df2 # this version is not saved because the index is reset when saving.

In [None]:
# Read the exported small sample back from disk again.
df3=pd.read_csv("defaultcredit-fd/fdboard/df_for_prod.csv")

In [None]:
# Display the reloaded sample.
df3

In [None]:
# Resume the search for an optimal model, now with other methods.

### LogisticRegression 

In [None]:
# Fresh stratified 80/20 split of X_train / Y_train (fixed seed); this rebinds
# X_TRAIN / X_VALID, so any earlier imputation of those arrays is discarded.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)

In [None]:
# Manual grid search for LogisticRegression with K-fold cross-validation.
parameters = {'solver': ['newton-cg', 'lbfgs'],
              'penalty': ['none', 'l2'],
              'C': [0.01, 0.05]}

# One dict per hyper-parameter combination (cartesian product of the grid).
keys = list(parameters)
combinations = [dict(zip(keys, combo)) for combo in itertools.product(*parameters.values())]

# Mean imputation, fitted on the training split and applied to both splits.
# NOTE(review): the imputer is fitted before the folds are cut, so each CV
# test fold has contributed to the imputation means.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN = imp_mean.transform(X_TRAIN)
X_VALID = imp_mean.transform(X_VALID)

n_cv = 2
# Score matrices are sized from n_cv (previously hard-coded to 5 columns,
# which broke for n_cv > 5).
score = np.empty((len(combinations), n_cv))       # per-fold scores
score_combi = np.empty((len(combinations), 1))    # mean score per combination

print("all combinations of hyperparameters=", combinations)

for n, combination in enumerate(tqdm_notebook(combinations)):
    print(combination)
    kf = KFold(n_splits=n_cv)
    for i, (train_index, test_index) in enumerate(kf.split(X_TRAIN)):
        x_train, x_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = Y_TRAIN[train_index], Y_TRAIN[test_index]

        lr_clf = LogisticRegression(random_state=0,
                                    solver=combination["solver"],
                                    penalty=combination["penalty"],
                                    C=combination["C"],
                                    class_weight='balanced')
        lr_clf.fit(x_train, y_train)

        # NOTE(review): AUC is computed from hard labels (equals
        # (TPR + TNR) / 2 for a binary target), not from probabilities.
        pred = lr_clf.predict(x_test)
        fold_score = roc_auc_score(y_test, pred)
        print('scoretest:', fold_score)
        print(n)
        score[n, i] = fold_score

    score_combi[n, 0] = score[n].mean()
    print("score_combi", score_combi[n])

best_index = np.argmax(score_combi)
print(best_index)
bestparameters = combinations[best_index]
print('bestparam:', bestparameters)

# Refit the best combination on the whole training split and score it on the
# validation split.
lr_clf = LogisticRegression(random_state=0,
                            solver=bestparameters["solver"],
                            penalty=bestparameters["penalty"],
                            C=bestparameters["C"],
                            class_weight='balanced')
lr_clf = lr_clf.fit(X_TRAIN, Y_TRAIN)
pred = lr_clf.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))

# Persist the fitted model.
filename = 'model_lr_clf' + '.sav'
joblib.dump(lr_clf, filename)


#### Application du modèle aux données de validation

In [None]:
# Reload the logistic-regression model from disk and score it on the
# validation split (AUC on hard labels, as elsewhere in this notebook).
BESTMODEL = 'model_lr_clf.sav'
loaded_model = joblib.load(BESTMODEL)

pred = loaded_model.predict(X_VALID)
print(roc_auc_score(y_true=Y_VALID, y_score=pred))

### Représentation des résultats

In [None]:
# Confusion matrix expressed as proportions: every column (predicted class)
# is divided by its own total, so each column sums to 1.
c = confusion_matrix(Y_VALID, pred)
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
c = c / c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(c, annot=True, fmt='.2f', ax=ax)
ax.set_ylabel('Réel')
ax.set_xlabel('Prédit')
plt.show(block=False)

In [None]:
# Confusion matrix computed by scikit-learn from the reloaded estimator's
# predictions on the validation split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned version before upgrading.
plot_confusion_matrix(loaded_model, X_VALID, Y_VALID)  
plt.show()

In [None]:
#explainer=shap.TreeExplainer(lr_clf)
#values=explainer.shap_values(X_TRAIN)
#shap.plots.waterfall(values)#beeswarm(values[0])#waterfall(values[0])
#listfeat=df.columns.drop(['TARGET'])
#shap.summary_plot(values,X_TRAIN, feature_names = listfeat)

# Application de la même grid search manuelle pour XGBoost

In [None]:
# Fresh stratified 80/20 split of X_train / Y_train (fixed seed); this rebinds
# X_TRAIN / X_VALID, so any earlier imputation of those arrays is discarded.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)

In [None]:
from xgboost import XGBClassifier


def _scale_pos_weight(labels):
    """Return n_negative / n_positive for 0/1 labels (1.0 if no positive)."""
    # assumes labels are encoded 0/1 with 1 as the positive class — TODO confirm
    n_pos = int(np.sum(labels == 1))
    n_neg = int(np.sum(labels == 0))
    return n_neg / n_pos if n_pos else 1.0


# Manual grid search for XGBoost with K-fold cross-validation.
parameters = {
    'min_child_weight': [5, 10],
    'colsample_bytree': [0.6, 1.0],
    'max_depth': [3, 5],
}

# One dict per hyper-parameter combination (cartesian product of the grid).
keys = list(parameters)
combinations = [dict(zip(keys, combo)) for combo in itertools.product(*parameters.values())]

n_cv = 2
# Score matrices are sized from n_cv (previously hard-coded to 5 columns).
score = np.empty((len(combinations), n_cv))       # per-fold scores
score_combi = np.empty((len(combinations), 1))    # mean score per combination

print("all combinations of hyperparameters=", combinations)

for n, combination in enumerate(tqdm_notebook(combinations)):
    print(combination)
    kf = KFold(n_splits=n_cv)
    for i, (train_index, test_index) in enumerate(kf.split(X_TRAIN)):
        x_train, x_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = Y_TRAIN[train_index], Y_TRAIN[test_index]

        # XGBClassifier has no class_weight parameter (it was silently
        # ignored); scale_pos_weight is XGBoost's way to re-balance a binary
        # target, computed on the training fold only.
        xgb_clf = XGBClassifier(random_state=0,
                                min_child_weight=combination["min_child_weight"],
                                colsample_bytree=combination["colsample_bytree"],
                                max_depth=combination["max_depth"],
                                scale_pos_weight=_scale_pos_weight(y_train))
        xgb_clf.fit(x_train, y_train)

        # NOTE(review): AUC is computed from hard labels, not probabilities.
        pred = xgb_clf.predict(x_test)
        fold_score = roc_auc_score(y_test, pred)
        print('scoretest:', fold_score)
        print(n)
        score[n, i] = fold_score

    score_combi[n, 0] = score[n].mean()
    print("score_combi", score_combi[n])

best_index = np.argmax(score_combi)
print(best_index)
bestparameters = combinations[best_index]
print('bestparam:', bestparameters)

# Refit the best combination on the whole training split and score it on the
# validation split.
xgb_clf = XGBClassifier(random_state=0,
                        min_child_weight=bestparameters["min_child_weight"],
                        colsample_bytree=bestparameters["colsample_bytree"],
                        max_depth=bestparameters["max_depth"],
                        scale_pos_weight=_scale_pos_weight(Y_TRAIN))
xgb_clf = xgb_clf.fit(X_TRAIN, Y_TRAIN)
pred = xgb_clf.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))

# Persist the fitted model.
filename = 'model_xgb_clf' + '.sav'
joblib.dump(xgb_clf, filename)


In [None]:
# load the model from disk
BESTMODEL='model_xgb_clf.sav'

loaded_model = joblib.load(BESTMODEL)


pred=loaded_model.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))

## Représentation des résultats 

In [None]:
# Confusion matrix expressed as proportions: every column (predicted class)
# is divided by its own total, so each column sums to 1.
c = confusion_matrix(Y_VALID, pred)
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
c = c / c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(c, annot=True, fmt='.2f', ax=ax)
ax.set_ylabel('Réel')
ax.set_xlabel('Prédit')
plt.show(block=False)

In [None]:
# Confusion matrix computed by scikit-learn from the reloaded estimator's
# predictions on the validation split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned version before upgrading.
plot_confusion_matrix(loaded_model, X_VALID, Y_VALID)  
plt.show()

In [None]:
# Global SHAP summary of the XGBoost model on the training matrix.
explainer = shap.TreeExplainer(xgb_clf)
values = explainer.shap_values(X_TRAIN)
# X_TRAIN is split from X_train, which was rebuilt from df1 (reduced feature
# set) minus TARGET and SK_ID_CURR. Feature names must therefore come from
# df1, without the id and the prediction / clustering columns added later.
# Taking them from df mislabelled the plot.
listfeat = df1.columns.drop(
    ['TARGET', 'SK_ID_CURR', 'PRED', 'PREDproba', 'cluster', 'bias'],
    errors='ignore',
)
shap.summary_plot(values, X_TRAIN, feature_names=listfeat)


# Application de la même grid search manuelle pour CatBoost



In [None]:
# Fresh stratified 80/20 split of X_train / Y_train (fixed seed); this rebinds
# X_TRAIN / X_VALID, so any earlier imputation of those arrays is discarded.
X_TRAIN, X_VALID, Y_TRAIN, Y_VALID = train_test_split(X_train, Y_train, test_size=0.2, stratify=Y_train, random_state=42)

In [None]:
from catboost import CatBoostClassifier

# Manual grid search for CatBoost with K-fold cross-validation and SMOTE
# oversampling of each training fold.
parameters = {'learning_rate': [0.4, 0.6],
              'depth': [2, 10],
              'l2_leaf_reg': [2, 4, 6]}

# One dict per hyper-parameter combination (cartesian product of the grid).
keys = list(parameters)
combinations = [dict(zip(keys, combo)) for combo in itertools.product(*parameters.values())]

# Mean imputation, fitted on the training split and applied to both splits,
# before oversampling.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imp_mean.fit(X_TRAIN)
X_TRAIN = imp_mean.transform(X_TRAIN)
X_VALID = imp_mean.transform(X_VALID)

n_cv = 3
# Score matrices are sized from n_cv (previously hard-coded to 5 columns).
score = np.empty((len(combinations), n_cv))       # per-fold scores
score_combi = np.empty((len(combinations), 1))    # mean score per combination

print("all combinations of hyperparameters=", combinations)

for n, combination in enumerate(tqdm_notebook(combinations)):
    print(combination)
    kf = KFold(n_splits=n_cv)
    for i, (train_index, test_index) in enumerate(kf.split(X_TRAIN)):
        x_train, x_test = X_TRAIN[train_index], X_TRAIN[test_index]
        y_train, y_test = Y_TRAIN[train_index], Y_TRAIN[test_index]

        # Oversample the training fold only; the test fold keeps its
        # original class distribution.
        x_train, y_train = oversample.fit_resample(x_train, y_train)

        cat_clf = CatBoostClassifier(learning_rate=combination["learning_rate"],
                                     depth=combination["depth"],
                                     l2_leaf_reg=combination["l2_leaf_reg"],
                                     iterations=50,
                                     eval_metric='AUC',
                                     verbose=False)
        cat_clf.fit(x_train, y_train)

        # NOTE(review): AUC is computed from hard labels, not probabilities.
        pred = cat_clf.predict(x_test)
        fold_score = roc_auc_score(y_test, pred)
        print('scoretest:', fold_score)
        print(n)
        score[n, i] = fold_score

    score_combi[n, 0] = score[n].mean()
    print("score_combi", score_combi[n])

best_index = np.argmax(score_combi)
print(best_index)
bestparameters = combinations[best_index]
print('bestparam:', bestparameters)

# Refit the best combination on the whole (oversampled) training split and
# score it on the untouched validation split.
cat_clf = CatBoostClassifier(random_state=0,
                             learning_rate=bestparameters["learning_rate"],
                             depth=bestparameters["depth"],
                             l2_leaf_reg=bestparameters["l2_leaf_reg"],
                             iterations=50,
                             eval_metric='AUC',
                             verbose=False)
# X_TRAIN / Y_TRAIN are overwritten with the oversampled data from here on.
X_TRAIN, Y_TRAIN = oversample.fit_resample(X_TRAIN, Y_TRAIN)
cat_clf = cat_clf.fit(X_TRAIN, Y_TRAIN)
pred = cat_clf.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))

# Persist the fitted model.
filename = 'model_cat_clf' + '.sav'
joblib.dump(cat_clf, filename)


In [None]:
# load the model from disk
BESTMODEL='model_cat_clf.sav'

loaded_model = joblib.load(BESTMODEL)


pred=loaded_model.predict(X_VALID)
print(roc_auc_score(Y_VALID, pred))
print(roc_auc_score(Y_VALID, pred))

## Représentation des résultats 

In [None]:
# Confusion matrix expressed as proportions: every column (predicted class)
# is divided by its own total, so each column sums to 1.
c = confusion_matrix(Y_VALID, pred)
# The builtin float replaces the np.float alias, which NumPy 1.24 removed.
c = c / c.astype(float).sum(axis=0)
fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(c, annot=True, fmt='.2f', ax=ax)
ax.set_ylabel('Réel')
ax.set_xlabel('Prédit')
plt.show(block=False)

In [None]:
# Confusion matrix computed by scikit-learn from the reloaded estimator's
# predictions on the validation split.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned version before upgrading.
plot_confusion_matrix(loaded_model, X_VALID, Y_VALID)
plt.show(block=False)

In [None]:
# Global SHAP summary of the CatBoost model on the training matrix.
explainer = shap.TreeExplainer(cat_clf)
values = explainer.shap_values(X_TRAIN)
# X_TRAIN derives from X_train, which was rebuilt from df1 (reduced feature
# set) minus TARGET and SK_ID_CURR. Feature names must therefore come from
# df1, without the id and the prediction / clustering columns added later.
# Taking them from df mislabelled the plot.
listfeat = df1.columns.drop(
    ['TARGET', 'SK_ID_CURR', 'PRED', 'PREDproba', 'cluster', 'bias'],
    errors='ignore',
)
shap.summary_plot(values, X_TRAIN, feature_names=listfeat)

inspiré de :
https://www.kaggle.com/code/shailaja4247/tackle-any-credit-risk-analysis-problem-homecredit/notebook