This notebook is starting point for Home Credit Default Risk Kaggle Problem Solution
Primary objective of this notbook is to load data from /input.nosync folder and perform basic EDA.
Detailed modelling and prediction should be done in separate notebook based on EDA results that we get here

In [1]:
# all the imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import os
import pprint
import copy

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler,Imputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
#Notebook lavel configurations
warnings.filterwarnings('ignore')
pp = pprint.PrettyPrinter(indent=4)

In [3]:
# List input files, if not available please download from below url inot a /input.nosync folder
input_dir = 'input.nosync'
input_files = os.listdir(input_dir)
if input_files is None or len(input_files) < 11 :
    raise Exception('You do not have all the files in {} directory'.format(input_dir))

print('You have all the input files listed below')
pp.pprint(input_files)

You have all the input files listed below
[   'application_test.csv',
    '.DS_Store',
    'HomeCredit_columns_description.csv',
    'POS_CASH_balance.csv',
    'credit_card_balance.csv',
    'installments_payments.csv',
    'application_train.csv',
    'bureau.csv',
    'previous_application.csv',
    'bureau_balance.csv',
    'sample_submission.csv']


Read in application_train and application test, note any changes to train need to be made to test as well

In [4]:
# Read application_train.csv
app_train = pd.read_csv(input_dir+'/application_train.csv')
#app_test = pd.read_csv(input_dir+'/application_test.csv')
print('Training data shape: ', app_train.shape)
#print('Test data shape: ', app_test.shape)
app_train.head()

Training data shape:  (307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


Now lets crate test data by splitting training set

In [5]:
app_train_labels = app_train['TARGET']
app_train=app_train.drop('TARGET', axis=1)

# remove SK_ID_CURR for now since this column does not give any info about defaults
app_train=app_train.drop('SK_ID_CURR', axis=1)


app_train_data, app_test_data, train_labels, test_labels = train_test_split(
    app_train,app_train_labels,test_size = .2, random_state = 23 )

print('Training data shape: ', app_train_data.shape, train_labels.shape)
print('Test data shape: ', app_test_data.shape, test_labels.shape)

Training data shape:  (246008, 120) (246008,)
Test data shape:  (61503, 120) (61503,)


Convert categorical columns to numeric using One Hot Encoding

In [6]:
app_train_data = pd.get_dummies(app_train_data)
app_test_data = pd.get_dummies(app_test_data)

print('Training Features shape: ', app_train_data.shape)
print('Testing Features shape: ', app_test_data.shape)

Training Features shape:  (246008, 244)
Testing Features shape:  (61503, 243)


Align train and test so clumns match

In [7]:
app_train_data, app_test_data = app_train_data.align(app_test_data, join = 'inner', axis = 1)

In [8]:
print(app_train_data.shape)
print(app_test_data.shape)

(246008, 243)
(61503, 243)


Impute missing values, using the column mean

In [9]:
imputer = Imputer(strategy = 'median')

app_train_data = imputer.fit_transform(app_train_data)
app_test_data = imputer.transform(app_test_data)

In [10]:
scaler = MinMaxScaler(feature_range = (0, 1))
app_train_data = scaler.fit_transform(app_train_data)
app_test_data = scaler.transform(app_test_data)

In [11]:
class Stat_Holder:
    
    def __init__(self, pca_component_no, classifier_name, classifier_params
                 , accurancy): 
        self.pca_component_no = pca_component_no
        self.classifier_name = classifier_name
        self.classifier_params = classifier_params
        self.accurancy = accurancy
        
    def __str__(self):
        return '{pca_component_no='+ str(self.pca_component_no) +\
                ' classifier_name=' + str(self.classifier_name) +\
                ' classifier_params=' + str(self.classifier_params) +\
                ' accurancy=' + str(self.accurancy) +\
                '}'
        
    def __lt__(self, other):

        
        # sort decending by accurancy
        return self.accurancy > other.accurancy 
# End of class Stat_Hlder   

def sort_print_stats(stats,top=10):    
    stats=sorted(stats)
    print_stats(stats,top)

def print_stats(stats,top=10):
    print('{:^6}{:^12}{:^25}{:^10}{:^30}'
          .format('Sr. No.','No. of PCA','Classifier','Accuracy','Classifier Params'))
    print('_'*80)
    for index,stat in enumerate(stats[:top]):
        print('{:^6}{:^12}{:^25}{:>10.5}{:^30}'
              .format(index+1,stat.pca_component_no,stat.classifier_name
                      ,stat.accurancy,stat.classifier_params))

Now lets use various combinations of PCA and Random Forest to get baseline predictions

In [12]:
def run_classifier(clf, train_data,train_labels,test_data,test_labels,pca_no,clf_params):
    clf.fit(train_data,train_labels)
    test_labels_predicted = clf.predict(test_data)
    #print(test_labels_predicted[:10])
    #print(test_labels[:10])
    accurancy = metrics.roc_auc_score(test_labels,test_labels_predicted)
    st = Stat_Holder(pca_no,type(clf).__name__,clf_params,accurancy)
    return st

def run_classifiers(min_PCA, max_PCA, classifier_list):
    
    stats = []
    for classifier in classifier_list:
        clf = classifier.get('clf')
        clf_params = classifier.get('params')
        print('{} started with params {}'.format(type(clf).__name__,clf_params))
        #print(clf)
        
        #print(clf_params)
        if min_PCA is None:
            st = run_classifier(clf, app_train_data,train_labels
                                        ,app_test_data,test_labels,'NA',clf_params)
            stats.append(st)
        else:
            pca_range = range(min_PCA,max_PCA+1)
            for pca_no in pca_range:
                    print('\tPCA {} started'.format(pca_no), end=" ")
                    pca = PCA(n_components = pca_no,copy = True)
                    train_data_pca = pca.fit_transform(app_train_data) 
                    test_data_pca = pca.transform(app_test_data) 

                    st = run_classifier(clf, train_data_pca,train_labels
                                        ,test_data_pca,test_labels,pca_no,clf_params)
                    stats.append(st)
                    print('\tPCA {} completed'.format(pca_no))
        print('{} completed'.format(type(clf).__name__))
    return stats
    


In [13]:
stats=[]

minPCA = 1
maxPCA = 5

classifiers = []
classifiers.append({'clf':DecisionTreeClassifier(max_depth=5), 'params':'max_depth=5'} )
classifiers.append({'clf':KMeans(n_clusters=5), 'params':'n_clusters=5'} )
classifiers.append({'clf':GaussianMixture(n_components=3), 'params':'n_components=3'} )
classifiers.append({'clf':KNeighborsClassifier(n_neighbors=3), 'params':'n_neighbors=3'} )
#classifiers
#stats=run_classifiers(minPCA,maxPCA,classifiers)

In [14]:
sort_print_stats(stats,top=15)

Sr. No. No. of PCA        Classifier         Accuracy       Classifier Params       
________________________________________________________________________________


In [None]:
classifiers = []
classifiers.append({'clf':LogisticRegression(C = 0.1), 'params':'C = 0.1'} )
classifiers.append({'clf':LogisticRegression(C = 0.01), 'params':'C = 0.01'} )
classifiers.append({'clf':LogisticRegression(C = 0.001), 'params':'C = 0.001'} )
classifiers.append({'clf':LogisticRegression(C = 0.0001), 'params':'C = 0.0001'} )
classifiers.append({'clf':RandomForestClassifier(n_estimators = 50), 'params':'n_estimators = 50'} )

stats=run_classifiers(None,None,classifiers)
sort_print_stats(stats)

LogisticRegression started with params C = 0.1
[0 0 0 0 0 0 0 0 0 0]
250029    0
70283     0
155376    0
283056    0
179554    0
219018    0
174515    1
131643    0
71706     0
122526    0
Name: TARGET, dtype: int64
LogisticRegression completed
LogisticRegression started with params C = 0.01
[0 0 0 0 0 0 0 0 0 0]
250029    0
70283     0
155376    0
283056    0
179554    0
219018    0
174515    1
131643    0
71706     0
122526    0
Name: TARGET, dtype: int64
LogisticRegression completed
LogisticRegression started with params C = 0.001
[0 0 0 0 0 0 0 0 0 0]
250029    0
70283     0
155376    0
283056    0
179554    0
219018    0
174515    1
131643    0
71706     0
122526    0
Name: TARGET, dtype: int64
LogisticRegression completed
LogisticRegression started with params C = 0.0001
[0 0 0 0 0 0 0 0 0 0]
250029    0
70283     0
155376    0
283056    0
179554    0
219018    0
174515    1
131643    0
71706     0
122526    0
Name: TARGET, dtype: int64
LogisticRegression completed
RandomForestCl