# Data Scientist - P7 - Laurent Trichet

## Implémentez un modèle de scoring

## 3 Classification (with conversion to log for some data)

### Import required libraries and define constants

In [None]:
# Import default libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Import Garbage Collector (empty dataFrame memory)
import gc

# Remove some warnings
import warnings
warnings.filterwarnings('ignore')
import logging
logging.disable(logging.WARNING)


# Import Imbalanced-learn necessary tools
import imblearn
from collections import Counter

# Import for classification GradientBoostingClassifier & SVC
from sklearn import ensemble
from sklearn import svm
# Import for classification xgboost
from xgboost import XGBClassifier

# Import evaluation tool for classification optimisations
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

# Import tools for model interpretation: AUC, ROC curve, permutation importance
from sklearn import metrics
from sklearn import inspection

# tools for execution time estimates
from datetime import datetime

# Pandas display options (attribute form of pd.set_option)
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 120
pd.options.display.max_info_rows = 2000

# Seaborn theme, then Matplotlib font sizes applied in a single update
sns.set_palette("Set1")
sns.set_style('whitegrid')
sns.set_context('paper')
mpl.rcParams.update({
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,
})

# Constants: source/dataset folders and size of the sampled dataset
DIRSOURCE = '../Sources/'
DIRDATASET = './credithome_datasets/'
NUMROWS = 15000    # 1000000 to get the complete dataset
# File names for the NUMROWS-line extracts where NaN values were filled with zeros
FILESTD_FNAN0 = f'{DIRDATASET}Credit_Home_Junction_Std_Fnan0_{NUMROWS}.csv'
FILELOG_FNAN0 = f'{DIRDATASET}Credit_Home_Junction_Log_Fnan0_{NUMROWS}.csv'

### 3.1 Load training and test sets, apply correction of imbalanced classes

#### Read reduced dataset (15000) and prepare training and test features and result class

In [None]:
# Load the tab-separated, log-transformed extract
df = pd.read_csv(FILELOG_FNAN0, sep='\t', encoding='Latin-1')

# Rows whose TARGET equals 999 form the test set; all the others form the train set
df_test = df[df['TARGET'] == 999]
df_train = df[~(df['TARGET'] == 999)]

# Feature columns are everything except identifiers and the class column
c_class = 'TARGET'
c_features = [col for col in df.columns
              if col not in ('index', 'TARGET', 'SK_ID_CURR')]

# Release the full frame now that the two subsets exist
del df
gc.collect()

#### Fix imbalanced data with prototype selection (random under-sampling of the majority class; the kept rows all come from the original sample)

In [None]:
# Count rows per class in the training target before any resampling
counter1 = Counter(df_train[c_class])
print(counter1)

In [None]:
# Randomly under-sample the training rows (fixed seed); the resampled
# features and class are kept as X and y for the rest of the notebook
undersample = imblearn.under_sampling.RandomUnderSampler(random_state=0)
X, y = undersample.fit_resample(df_train[c_features], df_train[c_class])

# Class counts after resampling
counter2 = Counter(y)
print(counter2)

In [None]:
# Side-by-side scatter plots of two arbitrary features (positions 10 and 11 of
# the feature list), before and after under-sampling, one series per class.
fig, axes = plt.subplots(nrows=1, ncols=2,
                        sharex=False, sharey=False,
                        figsize=(16,5))
fig.subplots_adjust(hspace=0.5)

for label in counter1:
    row_ix = np.where(df_train[c_class].values == label)[0]
    # x and y must be passed by keyword: seaborn >= 0.12 rejects positional
    # data vectors. Selecting the single column first avoids copying the
    # whole feature frame just to read one column.
    sns.scatterplot(x=df_train[c_features[10]].iloc[row_ix],
                    y=df_train[c_features[11]].iloc[row_ix],
                    label=str(label),
                    ax=axes[0]
                    )
axes[0].set_title('Imbalanced data')

for label in counter2:
    row_ix = np.where(y.values == label)[0]
    sns.scatterplot(x=X.iloc[row_ix, 10],
                    y=X.iloc[row_ix, 11],
                    label=str(label),
                    ax=axes[1]
                    )
axes[1].set_title('Random Under Sample')
print('\n\tArbitrary selection of 2 variables to see effect of under sampling ...')
plt.show()

### 3.2 Search for Classification method & Hyperparameters

#### LinearSVC, XGBClassifier, GradientBoostingClassifier best scores

In [None]:
# Candidate classifiers: each entry is [display name, estimator, parameter grid].
models=[]
iname, itype, iparam = 0, 1, 2
# LinearSVC: liblinear only supports loss='hinge' with penalty='l2' and
# dual=True, and penalty='l1' only with loss='squared_hinge' and dual=False.
# The grid is therefore split into sub-grids that contain valid combinations
# only, instead of letting half of the candidates fail during fitting.
models.append(['LinearSVC ', svm.LinearSVC(),
               [
                {
                 'C': np.logspace(-4, 4, 9),
                 'penalty' : ['l1', 'l2'],
                 'loss': ['squared_hinge'],
                 'dual': [False],
                },
                {
                 'C': np.logspace(-4, 4, 9),
                 'penalty' : ['l2'],
                 'loss': ['hinge'],
                 'dual': [True],
                },
               ]
              ])
models.append(['XGBClassifier', XGBClassifier(),
               {
                 'max_depth': [3,5],
                 'min_child_weight': [1, 5, 10],
                 'gamma': [0.5, 1, 1.5, 2, 5],
                 'subsample': [0.6, 0.8, 1.0],
                 'colsample_bytree': [0.6, 0.8, 1.0],
                 'verbosity': [0],
               }
              ])
models.append(['GradBoostC', ensemble.GradientBoostingClassifier(),
               {
                'n_estimators': [200],
                'max_depth': [3,5],
                'criterion': ['friedman_mse', 'squared_error'],
                'min_samples_split': [2, 3, 4],
                'min_weight_fraction_leaf': [0.0, 0.2, 0.4],
               }
              ])
# 5-fold grid search scored on ROC AUC for every candidate model
for model in models:
    mdl = GridSearchCV(model[itype], model[iparam], cv=5, scoring='roc_auc')
    datedeb = datetime.now()
    mdl.fit(X, y)
    duree = datetime.now() - datedeb
    # total_seconds() covers the whole elapsed time; .seconds drops full days
    print(f'{model[iname]} \tduree: {int(duree.total_seconds())}s \tbest_score: {mdl.best_score_:4.3} \tbest_params: {mdl.best_params_}')


> For a 1174 '1 and 0 balanced classes' sample:  
  
>> LinearSVC  	duree: 1356s 	best_score:  0.7 	best_params: {'C': 0.1, 'dual': False, 'loss': 'squared_hinge', 'penalty': 'l1'}  
>>  
>> XGBClassifier 	duree: 1889s 	best_score: 0.72 	best_params: {'colsample_bytree': 1.0, 'gamma': 5, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.8, 'verbosity': 0}  
>>  
>> GradBoostC 	duree: 1993s 	best_score: 0.73 	best_params: {'criterion': 'friedman_mse', 'max_depth': 3, 'min_samples_split': 3, 'min_weight_fraction_leaf': 0.2, 'n_estimators': 200}  

In [None]:
# The original train/test frames are no longer needed: X and y hold the resampled data
del df_train, df_test
gc.collect()

### 3.3 Kfold Roc Curve and Feature Importances

In [None]:
# Stratified K-fold evaluation of the selected GradientBoostingClassifier.
n_splits = 8
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Out-of-fold targets and probabilities, filled fold by fold at the validation
# positions, plus the per-fold AUC and feature importances
tot_valid_y = np.zeros(y.shape[0])
tot_valid_prob = np.zeros(y.shape[0])
tot_score = []
tot_feature_importances = []

for splt, (train_idx, valid_idx) in enumerate(skf.split(X, y)):

    train_x, train_y = X.iloc[train_idx], y.iloc[train_idx]
    valid_x, valid_y = X.iloc[valid_idx], y.iloc[valid_idx]

    # GradientBoostingClassifier
    gbc = ensemble.GradientBoostingClassifier(
        n_estimators=200,
        criterion='friedman_mse',
        max_depth=3,
        min_samples_split=3,
        min_weight_fraction_leaf=0.2,
    )
    gbc.fit(train_x, train_y)

    tot_valid_y[valid_idx] = valid_y

    # Probability of class 1 for the validation rows of this fold
    valid_prob = gbc.predict_proba(valid_x)[:,1]
    tot_valid_prob[valid_idx] = valid_prob

    tot_score.append(metrics.roc_auc_score(valid_y.values, valid_prob))
    tot_feature_importances.append(gbc.feature_importances_)

# Average the raw fold scores first, then round the per-fold values for
# display, so that the mean does not accumulate rounding error
mean_score = float(sum(tot_score)/len(tot_score))
tot_score = [round(1000*s)/1000 for s in tot_score]
print(f'tot_score   = {tot_score}')
print(f'mean scores = {mean_score:5.3}')

In [None]:

# ROC curve built from the pooled out-of-fold probabilities
fig, axe = plt.subplots(figsize=(8,8))
fpr, tpr, thr = metrics.roc_curve(tot_valid_y,
                                  tot_valid_prob,
                                  pos_label=1)
axe.plot(fpr, tpr, color='orange', lw=2)
axe.set_title(f'Roc curve ({n_splits} splits) mean AUC = {mean_score:5.3}')
# The x axis shows the false positive rate, i.e. 1 - specificity
axe.set_xlabel('1 - Specificity')
axe.set_ylabel('Sensitivity')
axe.grid(visible=True, color='#eeeeee')

plt.show()


#### Shape features and importances to find features with main role in classification

In [None]:
# Mean and standard deviation of the per-fold feature importances
importance_mean = pd.DataFrame(tot_feature_importances).mean().to_list()
importance_std = pd.DataFrame(tot_feature_importances).std().to_list()
# Build the frame from typed columns directly: no round trip of the floats
# through a string ndarray followed by a cast back to float64
df_features = pd.DataFrame({
    'col name': list(X.columns),
    'mean def': importance_mean,
    'std def': importance_std,
})
df_features.describe()

In [None]:

# Horizontal bars for the 80 largest mean K-fold importances, std as error bars
df_draw = df_features.sort_values('mean def').tail(80)
fig, axes = plt.subplots(figsize=(14, int(len(df_draw) // 3.5)))
axes.barh(list(range(len(df_draw))),
          df_draw['mean def'].to_numpy(),
          xerr=df_draw['std def'].to_numpy(),
          color='#33aa33',
          tick_label=df_draw['col name'].to_numpy())
axes.set_title(f'Features Importance KFOLD - {len(df_draw)} first features')
axes.grid(visible=True)
plt.show()


### 3.4 Permutation Feature Importance

In [None]:
# Refit the selected classifier on the whole resampled sample (fit returns the estimator)
gbc = ensemble.GradientBoostingClassifier(
    max_depth=3,
    n_estimators=200,
    criterion='friedman_mse',
    min_samples_split=3,
    min_weight_fraction_leaf=0.2,
).fit(X, y)

# Permutation importance is measured on the same X, y that were used for fitting
result = inspection.permutation_importance(
    estimator=gbc, X=X, y=y, n_repeats=8, random_state=0,
)

In [None]:
# Permutation importances per feature, built from typed columns directly
# (no round trip of the floats through a string ndarray)
df_featpermut = pd.DataFrame({
    'col permut': list(X.columns),
    'mean permut': result.importances_mean,
    'std permut': result.importances_std,
})
df_featpermut.describe()

In [None]:
# Horizontal bars for the 80 largest mean permutation importances, std as error bars
df_draw = df_featpermut.sort_values('mean permut').tail(80)
fig, axes = plt.subplots(figsize=(14, int(len(df_draw) // 3.5)))
axes.barh(list(range(len(df_draw))),
          df_draw['mean permut'].to_numpy(),
          xerr=df_draw['std permut'].to_numpy(),
          color='#4444aa',
          tick_label=df_draw['col permut'].to_numpy())
axes.set_title(f'Features Importance PERMUTATION - {len(df_draw)} first features')
axes.grid(visible=True)
plt.show()


### 3.5 Combination Kfold and Permutation Feature Importance

In [None]:
# Put both importance tables side by side; the duplicated name column is dropped
df_feat = pd.concat([df_features, df_featpermut], axis=1).drop(columns='col permut')

# The two source tables are no longer needed
del df_features, df_featpermut
gc.collect()

In [None]:
# Mirrored bar chart: K-fold importances grow leftwards from x=100 and
# permutation importances grow rightwards, each rescaled to a 0-100 width.
df_draw = (df_feat.sort_values('mean permut')
                  .tail(80)
                  .sort_values('mean permut', ascending=False))
max_val = max(df_draw['mean def'].max(), df_draw['mean permut'].max())
label_max_left = df_draw['mean def'].max()
label_max_right = df_draw['mean permut'].max()
ratio1 = 100/df_draw['mean def'].max()
ratio2 = 100/df_draw['mean permut'].max()
barWidth=0.8

plt.figure(figsize=(14, int(len(df_draw)/3.5)))
plt.gca().invert_yaxis()
# Left side: each bar ends at x=100
plt.barh(df_draw['col name'].str[:40], df_draw['mean def']*ratio1,
         left=100-(df_draw['mean def']*ratio1), color='#33aa33')
# Right side: each bar starts at x=100
plt.barh(df_draw['col name'].str[:40], df_draw['mean permut']*ratio2,
         left=100, color ='#4444aa')
plt.title('        Kfold mean  -   Permutation mean', fontsize=20)
plt.yticks(fontsize=9, color='#222222')
plt.xlabel('Importance')

# Tick labels give the real importance values: left maximum down to 0,
# then up to the right maximum
labels = (
    [f'{label_max_left*share:4.2}' for share in (1, 0.75, 0.5, 0.25)]
    + ['0']
    + [f'{label_max_right*share:4.2}' for share in (0.25, 0.5, 0.75, 1)]
)
plt.xticks(np.arange(0,225, step=25), labels)
plt.grid(True)


In [None]:
# Arithmetic average of the K-fold and permutation columns, for the means and for the stds.
# NOTE(review): the two raw measures are averaged without rescaling - confirm this is intended.
df_feat['mean importance'] = (df_feat['mean def'] + df_feat['mean permut']) / 2
df_feat['std importance'] = (df_feat['std def'] + df_feat['std permut']) / 2


In [None]:
# Horizontal bars for the 80 largest combined importances, std as error bars
df_draw = df_feat.sort_values('mean importance').tail(80)
fig, axes = plt.subplots(figsize=(14, int(len(df_draw) // 3.5)))
axes.barh(list(range(len(df_draw))),
          df_draw['mean importance'].to_numpy(),
          xerr=df_draw['std importance'].to_numpy(),
          color='#33aa33',
          tick_label=df_draw['col name'].to_numpy())
axes.set_title(f'Mean Features Importance KFOLD & Permut. - {len(df_draw)} first features')
axes.grid(visible=True)
plt.show()


### 3.6 Save File with Feature Importance, Description, Min, Max & Mean

In [None]:
# Keep only the combined importance columns, most important feature first
df_feat = (
    df_feat
    .drop(columns=['mean def', 'std def', 'mean permut', 'std permut'])
    .sort_values('mean importance', ascending=False)
)

In [None]:
# Column description table; its 'Row' and 'Description' columns are read by get_col_HC_description
df_descr = pd.read_csv(DIRSOURCE+'HomeCredit_columns_description.csv',
                             encoding='Latin-1')

def get_col_HC_description(c, descr_table=None):
    """Build a readable description for the feature column ``c``.

    The result starts with a source tag chosen from the column prefix
    ('(Application) ' when no known prefix matches). The prefix is stripped
    and the first row of the description table whose 'Row' value is a prefix
    of the remaining name is looked up. When one is found, an aggregation tag
    ('MEAN, ', 'MAX, ', 'MIN, ' or 'SUM, ') is appended if the name ends with
    the matching suffix, followed by the row's 'Description' with tabs
    replaced by spaces.

    Parameters
    ----------
    c : str
        Feature column name.
    descr_table : optional
        Table exposing 'Row' and 'Description' columns. Defaults to the
        module-level ``df_descr``.

    Returns
    -------
    str
        The source tag alone when no description row matches, otherwise the
        tag followed by the optional aggregation tag and the description.
    """
    if descr_table is None:
        descr_table = df_descr

    # (column prefix, source tag); the prefixes are mutually exclusive
    sources = (
        ('BURO_', '(Bureau) '),
        ('PREV_', '(Previous Application) '),
        ('APPROVED_', '(Previous Application) '),
        ('REFUSED_', '(Previous Application) '),
        ('POS_', '(POS CASH Balance) '),
        ('INSTAL_', '(Installments Payments) '),
        ('CC_', '(Credit Card Balance) '),
    )
    colname = c
    descr = '(Application) '
    for prefix, tag in sources:
        if c.startswith(prefix):
            colname = c[len(prefix):]
            descr = tag
            break

    # Iterate the two columns directly rather than one .iloc row at a time
    for row_name, row_descr in zip(descr_table['Row'], descr_table['Description']):
        if colname.startswith(str(row_name)):
            for suffix in ('_MEAN', '_MAX', '_MIN', '_SUM'):
                if colname.endswith(suffix):
                    descr = descr + suffix[1:] + ', '
            descr = descr + str(row_descr).replace('\t', ' ')
            break
    return descr

In [None]:
# Attach a readable description to every feature name
df_feat['description'] = df_feat['col name'].map(get_col_HC_description)

# The description table is not needed once every feature has been mapped
del df_descr
gc.collect()

In [None]:
NUMROWS = 1000000    # 1000000 to get the complete dataset
# File name for the NUMROWS-line extract where NaN values were filled with zeros
FILELOG_FNAN0 = f'{DIRDATASET}Credit_Home_Junction_Log_Fnan0_{NUMROWS}.csv'

df = pd.read_csv(FILELOG_FNAN0, sep='\t', encoding='Latin-1')

In [None]:
def calc_min(c):
    """Return the smallest value of column ``c`` of the module-level ``df``."""
    column = df[c]
    return column.min()


def calc_max(c):
    """Return the largest value of column ``c`` of the module-level ``df``."""
    column = df[c]
    return column.max()


def calc_mean(c):
    """Return the mean of column ``c`` of the module-level ``df``."""
    column = df[c]
    return column.mean()


# One statistics column per reducer, computed feature by feature
for stat_column, reducer in (('min val', calc_min),
                             ('max val', calc_max),
                             ('mean val', calc_mean)):
    df_feat[stat_column] = df_feat['col name'].map(reducer)


In [None]:
# Write the feature table as a tab-separated file without the index column
df_feat.to_csv(f'{DIRDATASET}Credit_Home_Features.csv', index=False, sep='\t')