This Jupyter notebook tackles the Home Credit default-prediction problem using Logistic Regression, Random Forest, and Gradient Boosting models.

In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os
import warnings
warnings.simplefilter('ignore')

from IPython.display import display, HTML

# from imputer import Imputer
from sklearn.preprocessing import Imputer
from missingpy import KNNImputer
import lightgbm as lgb
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, StandardScaler, RobustScaler
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

from memory_profiler import profile

In [2]:
# Directory holding the Home Credit CSV files, relative to the notebook's
# working directory. Every read_csv call for competition data is built from it.
path = '../home_credit_data/'

In [3]:
# Load the pre-processed feature table.
# `path` already ends with a separator, so concatenating '/<file>' produced a
# doubled slash; os.path.join builds the same location without it.
df = pd.read_csv(os.path.join(path, 'home_credit_processed.csv'))

## Data Cleaning 

Because a Logistic Regression model cannot be fitted on data that contains missing (NA) values, we do some additional data preprocessing here.

In [4]:
# Print a compact summary of the frame: index range, number of columns,
# dtype counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356251 entries, 0 to 356250
Columns: 799 entries, Unnamed: 0 to CC_COUNT
dtypes: float64(622), int64(177)
memory usage: 2.1 GB


In [5]:
# Checking missing data: per-column NaN count and NaN percentage, worst first.
# The null mask is computed once and reused; the original scanned the whole
# frame three times (two isnull().sum() calls plus isnull().count()).
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
# isnull().count() is simply the row count for every column, so len(df) is an
# equivalent denominator.
percent = (null_counts / len(df) * 100).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
REFUSED_RATE_DOWN_PAYMENT_MEAN,303648,85.234287
REFUSED_AMT_DOWN_PAYMENT_MIN,303648,85.234287
REFUSED_RATE_DOWN_PAYMENT_MAX,303648,85.234287
REFUSED_RATE_DOWN_PAYMENT_MIN,303648,85.234287
REFUSED_AMT_DOWN_PAYMENT_MEAN,303648,85.234287
REFUSED_AMT_DOWN_PAYMENT_MAX,303648,85.234287
REFUSED_APP_CREDIT_PERC_VAR,298034,83.658432
CC_AMT_PAYMENT_CURRENT_VAR,284649,79.901249
CC_CNT_DRAWINGS_ATM_CURRENT_VAR,284559,79.875986
CC_AMT_DRAWINGS_ATM_CURRENT_VAR,284559,79.875986


In [7]:
# (rows, columns) of the processed feature table.
df.shape

(356251, 799)

Select the 150 most important features and check them for NaN values.

In [9]:
# Load the feature ranking; the file provides a `feature` column (column
# names) and an `importance` column, one row per selected feature.
# NOTE(review): the name `index` is easy to confuse with a pandas Index;
# kept as-is because later cells reference it.
index = pd.read_csv('features_selected.csv')

In [10]:
# Display the feature-ranking table.
index

Unnamed: 0,feature,importance
0,PAYMENT_RATE,1201
1,EXT_SOURCE_3,872
2,EXT_SOURCE_1,822
3,EXT_SOURCE_2,776
4,DAYS_BIRTH,735
5,DAYS_EMPLOYED,535
6,AMT_ANNUITY,511
7,ACTIVE_DAYS_CREDIT_MAX,469
8,INSTAL_DAYS_ENTRY_PAYMENT_MAX,460
9,AMT_GOODS_PRICE,442


In [12]:
# Show just the selected feature names.
index['feature']

0                                       PAYMENT_RATE
1                                       EXT_SOURCE_3
2                                       EXT_SOURCE_1
3                                       EXT_SOURCE_2
4                                         DAYS_BIRTH
5                                      DAYS_EMPLOYED
6                                        AMT_ANNUITY
7                             ACTIVE_DAYS_CREDIT_MAX
8                      INSTAL_DAYS_ENTRY_PAYMENT_MAX
9                                    AMT_GOODS_PRICE
10                                   DAYS_ID_PUBLISH
11                                DAYS_EMPLOYED_PERC
12                                   INSTAL_DPD_MEAN
13                    ACTIVE_DAYS_CREDIT_ENDDATE_MIN
14                               ANNUITY_INCOME_PERC
15                         APPROVED_CNT_PAYMENT_MEAN
16                            CLOSED_DAYS_CREDIT_MAX
17                                 DAYS_REGISTRATION
18                        REGION_POPULATION_RE

In [15]:
# Keep only the columns named in the feature ranking, in ranking order.
selected_features = index['feature'].tolist()
df_new = df[selected_features]

In [18]:
# (rows, columns) after restricting to the selected features.
df_new.shape

(356251, 150)

Check the NaN values in the new dataframe.

In [19]:
# checking missing data
total = df_new.isnull().sum().sort_values(ascending = False)
percent = (df_new.isnull().sum()/df.isnull().count()*100).sort_values(ascending = False)
missing_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(40)

Unnamed: 0,Total,Percent
ACTIVE_AMT_ANNUITY_MAX,,
ACTIVE_AMT_ANNUITY_MEAN,,
ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN,223254.0,62.667614
ACTIVE_AMT_CREDIT_SUM_DEBT_MAX,116391.0,32.671066
ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN,116391.0,32.671066
ACTIVE_AMT_CREDIT_SUM_DEBT_SUM,104440.0,29.316409
ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN,138151.0,38.779119
ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM,,
ACTIVE_AMT_CREDIT_SUM_MAX,104443.0,29.317251
ACTIVE_AMT_CREDIT_SUM_MEAN,104443.0,29.317251


In [20]:
# Re-display the 40 leading rows of the missing-value table built in the
# previous cell.
missing_data.head(40)

Unnamed: 0,Total,Percent
ACTIVE_AMT_ANNUITY_MAX,,
ACTIVE_AMT_ANNUITY_MEAN,,
ACTIVE_AMT_CREDIT_MAX_OVERDUE_MEAN,223254.0,62.667614
ACTIVE_AMT_CREDIT_SUM_DEBT_MAX,116391.0,32.671066
ACTIVE_AMT_CREDIT_SUM_DEBT_MEAN,116391.0,32.671066
ACTIVE_AMT_CREDIT_SUM_DEBT_SUM,104440.0,29.316409
ACTIVE_AMT_CREDIT_SUM_LIMIT_MEAN,138151.0,38.779119
ACTIVE_AMT_CREDIT_SUM_LIMIT_SUM,,
ACTIVE_AMT_CREDIT_SUM_MAX,104443.0,29.317251
ACTIVE_AMT_CREDIT_SUM_MEAN,104443.0,29.317251


In [21]:
# DataFrame.fillna returns a new frame by default, so the result has to be
# bound back to df_new; the bare call only displayed a filled copy and left
# the NaNs in df_new untouched.
# NOTE(review): -999 is a sentinel rather than a real imputation; confirm it
# is appropriate for the models trained on this frame.
df_new = df_new.fillna(-999)
# Show a bounded preview instead of rendering the whole frame.
df_new.head(10)

Unnamed: 0,PAYMENT_RATE,EXT_SOURCE_3,EXT_SOURCE_1,EXT_SOURCE_2,DAYS_BIRTH,DAYS_EMPLOYED,AMT_ANNUITY,ACTIVE_DAYS_CREDIT_MAX,INSTAL_DAYS_ENTRY_PAYMENT_MAX,AMT_GOODS_PRICE,...,BURO_AMT_CREDIT_SUM_DEBT_MAX,PREV_CNT_PAYMENT_SUM,BURO_CREDIT_TYPE_Consumer credit_MEAN,PREV_WEEKDAY_APPR_PROCESS_START_MONDAY_MEAN,CC_CNT_DRAWINGS_CURRENT_VAR,PREV_NAME_CLIENT_TYPE_New_MEAN,BURO_CREDIT_ACTIVE_Active_MEAN,PREV_AMT_ANNUITY_MAX,INSTAL_DPD_SUM,POS_SK_DPD_MEAN
0,0.060749,0.139376,0.083037,0.262949,-9461,-637.0,24700.5,-103.0,-49.0,351000.0,...,245781.000,24.0,0.500000,0.000000,-999.000000,1.000000,0.250000,9251.775,0.0,0.000000
1,0.027598,-999.000000,0.311267,0.622246,-16765,-1188.0,35698.5,-606.0,-544.0,1129500.0,...,0.000,30.0,0.500000,0.000000,-999.000000,0.000000,0.250000,98356.995,0.0,0.000000
2,0.050000,0.729567,-999.000000,0.555912,-19046,-225.0,6750.0,-999.0,-727.0,135000.0,...,0.000,4.0,1.000000,0.000000,-999.000000,1.000000,0.000000,5357.250,0.0,0.000000
3,0.094941,-999.000000,-999.000000,0.650442,-19005,-3039.0,29686.5,-999.0,-12.0,297000.0,...,-999.000,138.0,-999.000000,0.000000,0.000000,0.111111,-999.000000,39954.510,0.0,0.000000
4,0.042623,-999.000000,-999.000000,0.322738,-19932,-3038.0,21865.5,-999.0,-14.0,513000.0,...,0.000,124.0,1.000000,0.166667,-999.000000,0.166667,0.000000,22678.785,63.0,0.000000
5,0.056101,0.621226,-999.000000,0.354225,-16941,-1588.0,27517.5,-78.0,-82.0,454500.0,...,240057.000,56.0,1.000000,0.600000,-999.000000,0.200000,0.333333,25309.575,1317.0,339.060241
6,0.026463,0.492060,0.774761,0.724000,-13778,-3130.0,41301.0,-239.0,-58.0,1395000.0,...,557959.500,56.0,0.888889,0.142857,-999.000000,0.000000,0.222222,17341.605,7.0,0.000000
7,0.027500,0.540654,-999.000000,0.714279,-18850,-449.0,42075.0,-1138.0,-774.0,1530000.0,...,348007.500,10.0,0.500000,0.000000,-999.000000,1.000000,0.500000,27463.410,0.0,0.000000
8,0.033176,0.751724,0.587334,0.205747,-20099,-999.0,33826.5,-999.0,-37.0,913500.0,...,0.000,42.0,0.750000,0.250000,0.216216,0.250000,0.000000,31295.250,1033.0,205.666667
9,0.050000,-999.000000,-999.000000,0.746644,-14469,-2019.0,20250.0,-999.0,-151.0,405000.0,...,-999.000,54.0,-999.000000,0.250000,-999.000000,0.250000,-999.000000,11188.035,27.0,0.000000


## Model Building 

In [23]:
# Load the raw test applications.
# `path` already ends with a separator, so concatenating '/<file>' produced a
# doubled slash; os.path.join builds the same location without it.
test_df = pd.read_csv(os.path.join(path, 'application_test.csv'))

In [24]:
# (rows, columns) of the raw test applications.
test_df.shape

(48744, 121)

In [25]:
# Load the raw training applications.
# `path` already ends with a separator, so os.path.join is used to avoid the
# doubled slash that path + '/<file>' produced.
# NOTE(review): this rebinds `df`, which until now held the processed feature
# table read from home_credit_processed.csv. Re-running earlier cells after
# this one will operate on a different frame; consider a distinct name once
# downstream usage is confirmed.
df = pd.read_csv(os.path.join(path, 'application_train.csv'))

In [26]:
# (rows, columns) of the raw training applications now bound to `df`.
df.shape

(307511, 122)

In [None]:
df.