In [22]:
#import libraries
import pandas as pd #pandas class
import numpy as np
import traceback
import os
import copy
import math
import statistics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer
import category_encoders as ce
import time
from scipy.stats import chi2_contingency
from scipy.stats import boxcox
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import precision_score, recall_score, confusion_matrix, f1_score, roc_auc_score, accuracy_score
from imblearn.over_sampling import RandomOverSampler
import logging
plt.style.use('_mpl-gallery')

In [23]:
#configure logger
log_filename=f"/Users/chiragshah/Downloads/Projects/Data Science/home-credit-default-risk/logs/{(time.asctime()).replace(' ','_')}"
logging.basicConfig(filename=log_filename,filemode='a',format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',level=logging.DEBUG)

In [24]:
try:
        
        filepath=f'/Users/chiragshah/Downloads/Projects/Data Science/home-credit-default-risk/Cleaning_3.csv'
        df=pd.read_csv(filepath)
        df.drop(columns='Unnamed: 0',inplace=True)
        #take backup of file
        backup=copy.deepcopy(df)
        
except Exception as ex:
    print(f'Following exception:\n {ex}')
    traceback.print_exc()

In [25]:
df.shape

(307511, 158)

In [6]:
#check for no of positive examples
df[df.iloc[:,0]==1].shape

(24825, 158)

In [None]:
'''Firstly, we'll select all the features and use few tree-based models.

'''

In [27]:
'''Separate target from predictors.'''
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [28]:
#Split the training data in train, cross validation

X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.30,random_state=42)

The basic XGBoost model with no hyperparamter tuning.

The suitable evaluation metrics for this analysis is AUC score because of binary classification task.

In [29]:
def xgboost_model(X_train,X_val,y_train,y_val):
    '''XGBoost model to fit and score various'''
    
    #define model
    xgboost = GradientBoostingClassifier(random_state=0).fit(X_train,y_train)
        
    #predict
    y_predict = xgboost.predict(X_val)
    
    #compute metric
    auc_score = roc_auc_score(y_val, y_predict)
    tn, fp, fn, tp = confusion_matrix(y_val, y_predict).ravel() 
    precision = precision_score(y_val, y_predict)
    recall = recall_score(y_val, y_predict)
    accuracy = accuracy_score(y_val, y_predict)
    
    #print results
    print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ', fn, '\nTrue Positives: ', tp)
    print ('Precision: ', precision, '\nRecall: ', recall)
    print('Accuracy: ', accuracy)
    print('The AUC score is {}'.format(auc_score))
    
    return xgboost

In [30]:
xgboost_default = xgboost_model(X_train,X_val,y_train,y_val)

True negatives:  84763 
False positives:  78 
False negatives:  7322 
True Positives:  91
Precision:  0.5384615384615384 
Recall:  0.012275731822474031
Accuracy:  0.9197866759164914
The AUC score is 0.5056781825034506


The default xgboost model is pretty bad with only 91 TP were identified even with 50% percent score.

In order to not to limit ourselves to any one ensemble methods, we will run another model random forest.

Default Random forest model.

In [31]:
'''Random Forest classifier.'''
def randomforest_model(X_train,X_val,y_train,y_val):
    '''Random forest model to fit and score various'''

    #define default model
    random_forest=RandomForestClassifier(random_state=0).fit(X_train,y_train)
    
    #predict
    y_predict = random_forest.predict(X_val)
    
    #compute metric
    auc_score = roc_auc_score(y_val,y_predict)
    tn, fp, fn, tp = confusion_matrix(y_val, y_predict).ravel() 
    precision = precision_score(y_val, y_predict)
    recall = recall_score(y_val, y_predict)
    accuracy = accuracy_score(y_val, y_predict)
    
    #print results
    print('True negatives: ', tn, '\nFalse positives: ', fp, '\nFalse negatives: ', fn, '\nTrue Positives: ', tp)
    print ('Precision: ', precision, '\nRecall: ', recall)
    print('Accuracy: ', accuracy)
    print('The AUC score is {}'.format(auc_score))
    
    return random_forest

In [32]:
randomforest_default = randomforest_model(X_train,X_val,y_train,y_val)

True negatives:  84836 
False positives:  5 
False negatives:  7404 
True Positives:  9
Precision:  0.6428571428571429 
Recall:  0.0012140833670578712
Accuracy:  0.9196891191709845
The AUC score is 0.5005775747984145


Random forest model is also not good in identifying the true positives.

Before we perform hyperparameter tunning it appear our positive examples are very less. It might be a good idea to perform oversampling so that our model can learn the
positive examples better.

In [33]:
#define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')

#fit and apply the transform
X_over, y_over = oversample.fit_resample(X, y)

#split the data again with oversampled dataset.
X_train_over,X_val_over,y_train_over,y_val_over = train_test_split(X_over,y_over,test_size=0.30,random_state=42)

In [34]:
#random forest with oversampling
randomforest_default_over = randomforest_model(X_train_over,X_val,y_train_over,y_val)

True negatives:  84649 
False positives:  192 
False negatives:  3 
True Positives:  7410
Precision:  0.9747434885556433 
Recall:  0.9995953055443141
Accuracy:  0.9978862705140157
The AUC score is 0.9986661243837598


In [35]:
#XGBoost with oversampled dataset.
xgboost_model(X_train_over,X_val_over,y_train_over,y_val_over)

True negatives:  58087 
False positives:  26758 
False negatives:  26083 
True Positives:  58684
Precision:  0.6868284918424195 
Recall:  0.6922977101938255
Accuracy:  0.6884595429568663
The AUC score is 0.6884613072154818


GradientBoostingClassifier(random_state=0)

In [None]:
#XGBoost with oversampling increased our model's AUC score to 0.69.

Clearly, oversampling our dataset is performing really well, even without any hyperparameter tunning.

The default XGBoost with oversample gave us any output of 0.69 AUC score, whereas the Random Forest performed with 0.99 percent. The number of true positives are also
really high.

So, now the question is out of 157 predictors how many are important to train our model and what are irrelavant.
We'll try to check the feature importance in our model to identify the best subset of predictors to achieve global optimum.

For this we'll use a greedy feature selection approach called RFE (Recursive Feature Elimination)

In [8]:
'''Random Forest with recursive feature elimination.
'''
def compute_metrics(model,X_train,y_train,X_val,y_val):
    '''Function to compute test and validation accuracy'''
    
    logging.info('Execution start compute_metrics')
    
    f_score = 0 
    
    #calculate accuracy
    train_accuracy_score = model.score(X_train,y_train)
    val_accuracy_score = model.score(X_val,y_val)
    logging.info('Train accuracy achieved is {}'.format(train_accuracy_score))
    logging.info('Validation accuracy achieved is {}'.format(val_accuracy_score))
    
    #calculate TP, TN, FP, FN
    prediction = model.predict(X_val)
    
    tn, fp, fn, tp = confusion_matrix(y_val, prediction).ravel()
    logging.info(f'True negatives: {tn}, False positives: {fp}, False negatives: {fn}, True Positives: {tp}')
    
    #calculate f1_score
    f_score = f1_score(y_val,prediction)
    logging.info('F1 Score: {}'.format(f_score))
    
    logging.info('Execution end compute_metrics')
    
    #calculate auc_score
    auc_score = roc_auc_score(y_val,prediction)
    logging.info('AUC score: {}'.format(auc_score))
    
    return auc_score
    
def compute_permutation_importance(model,X_val,y_val):
    '''Calculate permuation importance of model's predictors.
        Reason for choosing this method is because the predictors are of various data types.
    '''
    
    logging.info('Execution start compute_permutation_importance function')
    
    #train_result = permutation_importance(model,X_train,y_train,n_repeats=5,n_jobs=2,random_state=42)
    val_result = permutation_importance(model,X_val,y_val,n_repeats=5,n_jobs=2,random_state=42)
    
    sorted_idx = val_result.importances_mean.argsort()
    
    #train_importances = pd.DataFrame(train_result.imp[sorted_idx].T,columns=X_train.column[sorted_idx])
    test_importances = pd.DataFrame(val_result.importances[sorted_idx].T,columns=X_val.columns[sorted_idx])
    
    logging.info('Execution end compute_permutation_importance function.')
    
    return test_importances

def recursive_feature_selection(X_over,y_over):
    '''The function will employ a feature selection technique names RFE to identify minimun subset of features that can achieve
        our objective and make prediction with acceptable accuracy.
    '''
    
    logging.info('Execution start recursive_feature_selection function.')
    
    #declare variables
    assessment_sheet = {}
    sampling_count = 10
    rfe_df = copy.deepcopy(df)
    
    logging.info(f'Initial dataframe predictors shape: {X_over.shape}, and response shape: {y_over.shape}')

    while sampling_count > 0:
        
        auc_score = 0
        logging.info('-'*180)
        
        #assign predictors and response to X, y for simplicity
        X = X_over.sample(frac = 0.5, random_state = 42, axis = 1)
        y =  y_over
        logging.info(f'Predictors shape: {X.shape}, Response shape: {y.shape}')
    
        #Split the data in train, cross validation.
        X_train,X_val,y_train,y_val = train_test_split(X,y,test_size=0.30,random_state=42)
                    
        num_of_predictors = X_train.shape[1]
        
        while num_of_predictors > 25:
            
            #define default model
            random_forest = RandomForestClassifier(n_estimators = 10).fit(X_train,y_train)
    
            #compute evaluation metrics
            auc_score = compute_metrics(random_forest,X_train,y_train,X_val,y_val)
        
            #print the auc score with current subset of predictors.
            logging.info('Current subset of predictors are: {}'.format(list(X.columns)))
        
            #compute permuation importance
            test_importances = compute_permutation_importance(random_forest,X_val,y_val)
    
            #identify the least importance feature
            lst_imp=np.mean(test_importances,axis=0).sort_values().head(1)
            lst_imp_index,lst_imp_value = list(lst_imp.index)[0],lst_imp.iloc[0]
            logging.info(f'The least important predictor found in this iteration is: {lst_imp_index}, Value: {lst_imp_value}')  
    
            #remove the least important predictor
            X.drop(columns = lst_imp_index,inplace = True)
            logging.info(f'Dropped the least important predictor from this iteration.')
            logging.info(f'Current shape of predictors: {X.shape}')
        
            #assign remaining num of predictors.
            num_of_predictors = X.shape[1]
        
        sampling_count -= 1
    
    
    logging.info(f'Remaining predictors are {list(sampled_df.columns)}')
    logging.info('Execution end recursive_feature_selection function.')
    
    

In [9]:
recursive_feature_selection(X_over,y_over)
print('Process end')

KeyboardInterrupt: 