In [4]:
import numpy as np
import pandas as pd
import gc
import time
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

import os
warnings.simplefilter(action='ignore', category=FutureWarning)

In [6]:
# Show up to 200 columns / rows when a wide DataFrame is displayed.
# The fully qualified option names are used on purpose: the bare patterns
# 'max_columns' / 'max_rows' match more than one option in recent pandas
# (e.g. the 'styler.render.*' options) and then raise OptionError.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Data preprocessing

There are 8 tables in total:
application_train 
application_test 
bureau 
bureau_balance 
credit_card_balance 
installments_payments 
POS_CASH_balance 
previous_application 

## Basic preprocessing functions


In [None]:
def one_hot_encoder(df, nan_as_category=True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. A new frame is returned by ``pd.get_dummies``.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies(dummy_na=...)``; when True an extra
        ``<column>_nan`` indicator is created for each encoded column.

    Returns
    -------
    tuple
        ``(encoded_frame, new_columns)`` where ``new_columns`` lists the
        column names that were not present in the input frame.
    """
    columns_before = list(df.columns)

    # Columns stored with the generic 'object' dtype are treated as categorical.
    object_columns = []
    for name in df.columns:
        if df[name].dtype == 'object':
            object_columns.append(name)

    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

def categorical_to_numeric(series):
    """Replace each distinct value of ``series`` by an integer code.

    Codes are handed out in the order in which ``series.unique()`` returns the
    values, so the first distinct value becomes 0, the next one 1, and so on.
    The resulting codes therefore depend on the data order of the input.

    Returns the mapped Series.
    """
    codes = {}
    for position, label in enumerate(series.unique()):
        codes[label] = position
    return series.map(codes)


## application_train & application_test

In [5]:
# Read the train and test application tables (in that order) from the working directory.
df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)

In [35]:
# preview the first 20 rows of the training table
df.head(20)

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_YEAR,APARTMENTS_AVG,APARTMENTS_MEDI,APARTMENTS_MODE,BASEMENTAREA_AVG,BASEMENTAREA_MEDI,BASEMENTAREA_MODE,CNT_CHILDREN,CNT_FAM_MEMBERS,CODE_GENDER,COMMONAREA_AVG,COMMONAREA_MEDI,COMMONAREA_MODE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,DAYS_LAST_PHONE_CHANGE,DAYS_REGISTRATION,DEF_30_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,ELEVATORS_AVG,ELEVATORS_MEDI,ELEVATORS_MODE,EMERGENCYSTATE_MODE,ENTRANCES_AVG,ENTRANCES_MEDI,ENTRANCES_MODE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,FLAG_CONT_MOBILE,FLAG_DOCUMENT_3,FLAG_EMAIL,FLAG_EMP_PHONE,FLAG_MOBIL,FLAG_OWN_CAR,FLAG_OWN_REALTY,FLAG_PHONE,FLAG_WORK_PHONE,FLOORSMAX_AVG,FLOORSMAX_MEDI,FLOORSMAX_MODE,FLOORSMIN_AVG,FLOORSMIN_MEDI,FLOORSMIN_MODE,FONDKAPREMONT_MODE,HOUR_APPR_PROCESS_START,HOUSETYPE_MODE,LANDAREA_AVG,LANDAREA_MEDI,LANDAREA_MODE,LIVE_CITY_NOT_WORK_CITY,LIVE_REGION_NOT_WORK_REGION,LIVINGAPARTMENTS_AVG,LIVINGAPARTMENTS_MEDI,LIVINGAPARTMENTS_MODE,LIVINGAREA_AVG,LIVINGAREA_MEDI,LIVINGAREA_MODE,NAME_CONTRACT_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,NAME_INCOME_TYPE,NAME_TYPE_SUITE,NONLIVINGAPARTMENTS_AVG,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_AVG,NONLIVINGAREA_MEDI,NONLIVINGAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,OCCUPATION_TYPE,ORGANIZATION_TYPE,OWN_CAR_AGE,REGION_POPULATION_RELATIVE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,SK_ID_CURR,TARGET,TOTALAREA_MODE,WALLSMATERIAL_MODE,WEEKDAY_APPR_PROCESS_START,YEARS_BEGINEXPLUATATION_AVG,YEARS_BEGINEXPLUATATION_MEDI,...,CC_AMT_DRAWINGS_ATM_CURRENT_MEAN,CC_AMT_DRAWINGS_ATM_CURRENT_SUM,CC_AMT_DRAWINGS_ATM_CURRENT_VAR,CC_AMT_DRAWINGS_CURRENT_MAX,CC_AMT_DRAWINGS_CURRENT_MEAN,CC_AMT_DRAWINGS_CUR
RENT_SUM,CC_AMT_DRAWINGS_CURRENT_VAR,CC_AMT_DRAWINGS_OTHER_CURRENT_MAX,CC_AMT_DRAWINGS_OTHER_CURRENT_MEAN,CC_AMT_DRAWINGS_OTHER_CURRENT_SUM,CC_AMT_DRAWINGS_OTHER_CURRENT_VAR,CC_AMT_DRAWINGS_POS_CURRENT_MAX,CC_AMT_DRAWINGS_POS_CURRENT_MEAN,CC_AMT_DRAWINGS_POS_CURRENT_SUM,CC_AMT_DRAWINGS_POS_CURRENT_VAR,CC_AMT_INST_MIN_REGULARITY_MAX,CC_AMT_INST_MIN_REGULARITY_MEAN,CC_AMT_INST_MIN_REGULARITY_SUM,CC_AMT_INST_MIN_REGULARITY_VAR,CC_AMT_PAYMENT_CURRENT_MAX,CC_AMT_PAYMENT_CURRENT_MEAN,CC_AMT_PAYMENT_CURRENT_SUM,CC_AMT_PAYMENT_CURRENT_VAR,CC_AMT_PAYMENT_TOTAL_CURRENT_MAX,CC_AMT_PAYMENT_TOTAL_CURRENT_MEAN,CC_AMT_PAYMENT_TOTAL_CURRENT_SUM,CC_AMT_PAYMENT_TOTAL_CURRENT_VAR,CC_AMT_RECEIVABLE_PRINCIPAL_MAX,CC_AMT_RECEIVABLE_PRINCIPAL_MEAN,CC_AMT_RECEIVABLE_PRINCIPAL_SUM,CC_AMT_RECEIVABLE_PRINCIPAL_VAR,CC_AMT_RECIVABLE_MAX,CC_AMT_RECIVABLE_MEAN,CC_AMT_RECIVABLE_SUM,CC_AMT_RECIVABLE_VAR,CC_AMT_TOTAL_RECEIVABLE_MAX,CC_AMT_TOTAL_RECEIVABLE_MEAN,CC_AMT_TOTAL_RECEIVABLE_SUM,CC_AMT_TOTAL_RECEIVABLE_VAR,CC_CNT_DRAWINGS_ATM_CURRENT_MAX,CC_CNT_DRAWINGS_ATM_CURRENT_MEAN,CC_CNT_DRAWINGS_ATM_CURRENT_SUM,CC_CNT_DRAWINGS_ATM_CURRENT_VAR,CC_CNT_DRAWINGS_CURRENT_MAX,CC_CNT_DRAWINGS_CURRENT_MEAN,CC_CNT_DRAWINGS_CURRENT_SUM,CC_CNT_DRAWINGS_CURRENT_VAR,CC_CNT_DRAWINGS_OTHER_CURRENT_MAX,CC_CNT_DRAWINGS_OTHER_CURRENT_MEAN,CC_CNT_DRAWINGS_OTHER_CURRENT_SUM,CC_CNT_DRAWINGS_OTHER_CURRENT_VAR,CC_CNT_DRAWINGS_POS_CURRENT_MAX,CC_CNT_DRAWINGS_POS_CURRENT_MEAN,CC_CNT_DRAWINGS_POS_CURRENT_SUM,CC_CNT_DRAWINGS_POS_CURRENT_VAR,CC_CNT_INSTALMENT_MATURE_CUM_MAX,CC_CNT_INSTALMENT_MATURE_CUM_MEAN,CC_CNT_INSTALMENT_MATURE_CUM_SUM,CC_CNT_INSTALMENT_MATURE_CUM_VAR,CC_SK_DPD_MAX,CC_SK_DPD_MEAN,CC_SK_DPD_SUM,CC_SK_DPD_VAR,CC_SK_DPD_DEF_MAX,CC_SK_DPD_DEF_MEAN,CC_SK_DPD_DEF_SUM,CC_SK_DPD_DEF_VAR,CC_NAME_CONTRACT_STATUS_Active_MAX,CC_NAME_CONTRACT_STATUS_Active_MEAN,CC_NAME_CONTRACT_STATUS_Active_SUM,CC_NAME_CONTRACT_STATUS_Active_VAR,CC_NAME_CONTRACT_STATUS_Approved_MAX,CC_NAME_CONTRACT_STATUS_Approved_MEAN,CC_NAME_CONTRACT
_STATUS_Approved_SUM,CC_NAME_CONTRACT_STATUS_Approved_VAR,CC_NAME_CONTRACT_STATUS_Completed_MAX,CC_NAME_CONTRACT_STATUS_Completed_MEAN,CC_NAME_CONTRACT_STATUS_Completed_SUM,CC_NAME_CONTRACT_STATUS_Completed_VAR,CC_NAME_CONTRACT_STATUS_Demand_MAX,CC_NAME_CONTRACT_STATUS_Demand_MEAN,CC_NAME_CONTRACT_STATUS_Demand_SUM,CC_NAME_CONTRACT_STATUS_Demand_VAR,CC_NAME_CONTRACT_STATUS_Refused_MAX,CC_NAME_CONTRACT_STATUS_Refused_MEAN,CC_NAME_CONTRACT_STATUS_Refused_SUM,CC_NAME_CONTRACT_STATUS_Refused_VAR,CC_NAME_CONTRACT_STATUS_Sent proposal_MAX,CC_NAME_CONTRACT_STATUS_Sent proposal_MEAN,CC_NAME_CONTRACT_STATUS_Sent proposal_SUM,CC_NAME_CONTRACT_STATUS_Sent proposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MAX,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MAX,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,0,24700.5,406597.5,351000.0,202500.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0247,0.025,0.0252,0.0369,0.0369,0.0383,0,1.0,0,0.0143,0.0144,0.0144,-9461,-637.0,-2120,-1134.0,-3648.0,2.0,2.0,0.0,0.0,0.0,0,0.069,0.069,0.069,0.083037,0.262949,0.139376,1,1,0,1,1,0,0,1,0,0.0833,0.0833,0.0833,0.125,0.125,0.125,0,10,0,0.0369,0.0375,0.0377,0,0,0.0202,0.0205,0.022,0.019,0.0193,0.0198,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0,0,,0.018801,2,2,0,0,0,0,100002,1.0,0.0149,0,0,0.9722,0.9722,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,35698.5,1293502.5,1129500.0,270000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0959,0.0968,0.0924,0.0529,0.0529,0.0538,0,2.0,1,0.0605,0.0608,0.0497,-16765,-1188.0,-291,-828.0,-1186.0,0.0,0.0,0.08,0.08,0.0806,0,0.0345,0.0345,0.0345,0.311267,0.622246,,1,1,0,1,1,0,1,1,0,0.2917,0.2917,0.2917,0.3333,0.3333,0.3333,0,11,0,0.013,0.0132,0.0128,0,0,0.0773,0.0787,0.079,0.0549,0.0558,0.0554,0,1,1,0,1,1,0.0039,0.0039,0.0,0.0098,0.01,0.0,1.0,1.0,1,1,,0.003541,1,1,0,0,0,0,100003,0.0,0.0714,1,1,0.9851,0.9851,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,6750.0,135000.0,135000.0,67500.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0,1.0,0,,,,-19046,-225.0,-2531,-815.0,-4260.0,0.0,0.0,,,,1,,,,,0.555912,0.729567,1,0,0,1,1,1,0,1,1,,,,,,,1,9,1,,,,0,0,,,,,,,1,0,0,0,0,0,,,,,,,0.0,0.0,0,2,26.0,0.010032,2,2,0,0,0,0,100004,0.0,,2,1,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,29686.5,312682.5,297000.0,135000.0,,,,,,,,,,,,,0,2.0,1,,,,-19005,-3039.0,-2437,-617.0,-9833.0,0.0,0.0,,,,1,,,,,0.650442,,1,1,0,1,1,0,0,0,0,,,,,,,1,17,1,,,,0,0,,,,,,,0,0,2,0,0,0,,,,,,,2.0,2.0,0,0,,0.008019,2,2,0,0,0,0,100006,0.0,,2,0,,,...,,0.0,,0.0,0.0,0.0,0.0,,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,,0.0,0.0,0.0,0.0,,,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,4,21865.5,513000.0,513000.0,121500.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0,1.0,0,,,,-19932,-3038.0,-3458,-1106.0,-4311.0,0.0,0.0,,,,1,,,,,0.322738,,1,0,0,1,1,0,0,0,0,,,,,,,1,11,1,,,,1,0,,,,,,,0,0,0,0,0,0,,,,,,,0.0,0.0,1,3,,0.028663,2,2,0,1,0,0,100007,0.0,,2,2,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,5,27517.5,490495.5,454500.0,99000.0,0.0,0.0,0.0,1.0,0.0,1.0,,,,,,,0,2.0,0,,,,-16941,-1588.0,-477,-2536.0,-4970.0,0.0,0.0,,,,1,,,,,0.354225,0.621226,1,1,0,1,1,0,0,1,1,,,,,,,1,16,1,,,,0,0,,,,,,,0,0,1,0,1,2,,,,,,,0.0,0.0,0,4,,0.035792,2,2,0,0,0,0,100008,0.0,,2,0,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,6,41301.0,1560726.0,1395000.0,171000.0,0.0,0.0,1.0,1.0,0.0,2.0,,,,,,,1,3.0,1,,,,-13778,-3130.0,-619,-1562.0,-1213.0,0.0,0.0,,,,1,,,,0.774761,0.724,0.49206,1,0,0,1,1,1,0,1,0,,,,,,,1,16,1,,,,0,0,,,,,,,0,1,1,0,2,0,,,,,,,1.0,1.0,2,0,17.0,0.035792,2,2,0,0,0,0,100009,0.0,,2,3,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,7,42075.0,1530000.0,1530000.0,360000.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,0,2.0,0,,,,-18850,-449.0,-2379,-1070.0,-4597.0,0.0,0.0,,,,1,,,,,0.714279,0.540654,1,1,0,1,1,1,0,0,1,,,,,,,1,16,1,,,,1,0,,,,,,,0,1,1,0,1,0,,,,,,,2.0,2.0,3,4,8.0,0.003122,3,3,0,1,0,0,100010,0.0,,2,1,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
8,8,33826.5,1019610.0,913500.0,112500.0,0.0,0.0,0.0,0.0,0.0,1.0,,,,,,,0,2.0,1,,,,-20099,,-3514,0.0,-7427.0,0.0,0.0,,,,1,,,,0.587334,0.205747,0.751724,1,1,0,0,1,0,0,0,0,,,,,,,1,14,1,,,,0,0,,,,,,,0,0,1,0,3,3,,,,,,,1.0,1.0,4,5,,0.018634,2,2,0,0,0,0,100011,0.0,,2,0,,,...,2432.432432,180000.0,437837800.0,180000.0,2432.432432,180000.0,437837800.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9000.0,3956.221849,288804.195,20139910.0,55485.0,4843.064189,358386.75,52992600.0,55485.0,4520.067568,334485.0,55858770.0,180000.0,52402.088919,3877754.58,4324223000.0,189000.0,54433.179122,4028055.255,4646736000.0,189000.0,54433.179122,4028055.255,4646736000.0,4.0,0.054054,4.0,0.216216,4.0,0.054054,4.0,0.216216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,25.767123,1881.0,105.847793,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74.0
9,9,20250.0,405000.0,405000.0,135000.0,,,,,,,,,,,,,0,1.0,0,,,,-14469,-2019.0,-3992,-1673.0,-14437.0,0.0,0.0,,,,1,,,,,0.746644,,1,0,0,1,1,0,0,0,0,,,,,,,1,8,1,,,,0,0,,,,,,,1,0,0,0,0,0,,,,,,,2.0,2.0,0,6,,0.019689,2,2,0,0,0,0,100012,0.0,,2,2,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [36]:
# preview the first 20 rows of the test table
test_df.head(20)

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_
DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,Cash loans,F,N,Y,0,135000.0,568800.0,20560.5,450000.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.01885,-19241,-2329,-5170.0,-812,,1,1,0,1,0,1,,2.0,2,2,TUESDAY,18,0,0,0,0,0,0,Kindergarten,0.752614,0.789654,0.15952,0.066,0.059,0.9732,,,,0.1379,0.125,,,,0.0505,,,0.0672,0.0612,0.9732,,,,0.1379,0.125,,,,0.0526,,,0.0666,0.059,0.9732,,,,0.1379,0.125,,,,0.0514,,,,block of flats,0.0392,"Stone, brick",No,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,Cash loans,M,N,Y,0,99000.0,222768.0,17370.0,180000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.035792,-18064,-4469,-9118.0,-1623,,1,1,0,1,0,0,Low-skill Laborers,2.0,2,2,FRIDAY,9,0,0,0,0,0,0,Self-employed,0.56499,0.291656,0.432962,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,Cash loans,M,Y,Y,0,202500.0,663264.0,69777.0,630000.0,,Working,Higher education,Married,House / apartment,0.019101,-20038,-4458,-2175.0,-3503,5.0,1,1,0,1,0,0,Drivers,2.0,2,2,MONDAY,14,0,0,0,0,0,0,Transport: type 3,,0.699787,0.610991,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,Cash loans,F,N,Y,2,315000.0,1575000.0,49018.5,1575000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.026392,-13976,-1866,-2000.0,-4208,,1,1,0,1,1,0,Sales staff,4.0,2,2,WEDNESDAY,11,0,0,0,0,0,0,Business Entity Type 3,0.525734,0.509677,0.612704,0.3052,0.1974,0.997,0.9592,0.1165,0.32,0.2759,0.375,0.0417,0.2042,0.2404,0.3673,0.0386,0.08,0.3109,0.2049,0.997,0.9608,0.1176,0.3222,0.2759,0.375,0.0417,0.2089,0.2626,0.3827,0.0389,0.0847,0.3081,0.1974,0.997,0.9597,0.1173,0.32,0.2759,0.375,0.0417,0.2078,0.2446,0.3739,0.0388,0.0817,reg oper account,block of flats,0.37,Panel,No,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,Cash loans,M,Y,N,1,180000.0,625500.0,32067.0,625500.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010032,-13040,-2191,-4000.0,-4262,16.0,1,1,1,1,0,0,,3.0,2,2,FRIDAY,5,0,0,0,0,1,1,Business Entity Type 3,0.202145,0.425687,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,
5,100042,Cash loans,F,Y,Y,0,270000.0,959688.0,34600.5,810000.0,Unaccompanied,State servant,Secondary / secondary special,Married,House / apartment,0.025164,-18604,-12009,-6116.0,-2027,10.0,1,1,0,1,1,0,Drivers,2.0,2,2,MONDAY,15,0,0,0,0,0,0,Government,,0.628904,0.392774,0.2412,0.0084,0.9821,0.7552,0.0452,0.16,0.1379,0.3333,0.375,0.1683,0.1942,0.2218,0.0116,0.0731,0.2458,0.0088,0.9821,0.7648,0.0457,0.1611,0.1379,0.3333,0.375,0.1721,0.2121,0.2311,0.0117,0.0774,0.2436,0.0084,0.9821,0.7585,0.0455,0.16,0.1379,0.3333,0.375,0.1712,0.1975,0.2258,0.0116,0.0746,not specified,block of flats,0.2151,Block,No,0.0,0.0,0.0,0.0,-1705.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,2.0
6,100057,Cash loans,M,Y,Y,2,180000.0,499221.0,22117.5,373500.0,Unaccompanied,Working,Higher education,Married,House / apartment,0.0228,-16685,-2580,-10125.0,-241,3.0,1,1,0,1,0,0,High skill tech staff,4.0,2,2,THURSDAY,9,0,0,0,0,1,1,Industry: type 9,0.760851,0.571084,0.65126,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,1.0,0.0,-1182.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
7,100065,Cash loans,M,N,Y,0,166500.0,180000.0,14220.0,180000.0,Unaccompanied,Working,Higher education,Single / not married,With parents,0.005144,-9516,-1387,-5063.0,-2055,,1,1,1,1,1,0,Core staff,1.0,2,2,FRIDAY,7,0,0,0,0,0,0,Self-employed,0.56529,0.613033,0.312365,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-1182.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
8,100066,Cash loans,F,N,Y,0,315000.0,364896.0,28957.5,315000.0,Unaccompanied,State servant,Higher education,Married,House / apartment,0.04622,-12744,-1013,-1686.0,-3171,,1,1,0,1,0,0,Core staff,2.0,1,1,THURSDAY,18,0,0,0,0,0,0,School,0.718507,0.808788,0.522697,0.1031,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,0.105,0.1157,0.9782,,,0.0,0.2069,0.1667,,,,,,,0.1041,0.1115,0.9781,,,0.0,0.2069,0.1667,,,,,,,,block of flats,0.0702,"Stone, brick",No,0.0,0.0,0.0,0.0,-829.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
9,100067,Cash loans,F,Y,Y,1,162000.0,45000.0,5337.0,45000.0,Family,Working,Higher education,Civil marriage,House / apartment,0.018634,-10395,-2625,-8124.0,-3041,5.0,1,1,1,1,1,0,Sales staff,3.0,2,2,TUESDAY,14,0,0,0,0,0,0,Trade: type 2,0.210562,0.444848,0.194068,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4.0,0.0,4.0,0.0,-1423.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0


In [15]:
# (rows, columns) of the train table, then of the test table, on separate lines
print(df.shape, test_df.shape, sep=' \n ')

(307511, 122) 
 (48744, 121)


In [None]:
# CODE_GENDER: fold the 'XNA' placeholder into 'F'.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df[col]: the chained in-place form works on an intermediate Series and is not
# guaranteed to write through to df (it is a no-op under pandas Copy-on-Write).
df['CODE_GENDER'] = df['CODE_GENDER'].replace('XNA', 'F')
# DAYS_EMPLOYED: the value 365243 is treated as missing -> NaN
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

# Document flags, and every other FLAG_* indicator column
docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
live = [_f for _f in df.columns if 'FLAG_' in _f and 'FLAG_DOC' not in _f and '_FLAG_' not in _f]

# Encode the Y/N ownership flags with one fixed mapping. An order-of-appearance
# encoding gives each column its own meaning of "1" (whichever value shows up
# second), which makes the flags inconsistent when they are summed or combined
# below. The dtype guard keeps already-encoded columns untouched on a re-run.
# NOTE(review): assumes these two columns only contain 'Y' / 'N'; any other
# value becomes NaN.
yes_no_mapping = {'N': 0, 'Y': 1}
for flag in ['FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
    if not pd.api.types.is_numeric_dtype(df[flag]):
        df[flag] = df[flag].map(yes_no_mapping)

# Encode categorical features. The popular one-hot encoding is not used here
# because it lacks efficiency; each combination of active flags becomes one code.
def numeric_to_categorical(series, features):
    """Describe one row by the flags that are set to 1.

    Parameters
    ----------
    series : row of the frame, as passed by ``DataFrame.apply(..., axis=1)``.
    features : names of the flag columns to inspect.

    Returns
    -------
    str
        String form of the list of active flag names with the leading
        'FLAG_' removed. The rest of the name is kept so that flags sharing a
        last token (FLAG_EMP_PHONE / FLAG_WORK_PHONE / FLAG_PHONE) stay
        distinguishable.
    """
    active = [feature.replace('FLAG_', '', 1) for feature in features if series[feature] == 1]
    return str(active)

# Turn each distinct combination of active flags into an integer code
# (codes follow the order of first appearance in df).
for new_col, flags in (('NEW_FLAG_DOCUMENT', docs), ('NEW_FLAG_LIVING', live)):
    df[new_col] = df[flags].apply(numeric_to_categorical, axis=1, features=flags)
    class_mapping = {cls: value for value, cls in enumerate(df[new_col].unique())}
    df[new_col] = df[new_col].map(class_mapping)


In [None]:
# New features about money
df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
df['NEW_CREDIT_TO_GOODS_RATIO'] = df['AMT_CREDIT'] / df['AMT_GOODS_PRICE']
df['NEW_CREDIT_TO_INCOME_RATIO'] = df['AMT_CREDIT'] / df['AMT_INCOME_TOTAL']
df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
# The "+ 1" keeps these two denominators away from zero
df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 + df['CNT_CHILDREN'])
df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (1 + df['AMT_INCOME_TOTAL'])

# Median income of the applicant's organization type
inc_by_org = df[['AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE']].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)

# New features about time periods
df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
df['NEW_PHONE_TO_BIRTH_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_BIRTH']
df['NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_EMPLOYED']
df['NEW_PHONE_TO_ID_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_ID_PUBLISH']
df['NEW_PHONE_TO_REGISTRATION_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['DAYS_REGISTRATION']
df['NEW_PHONE_TO_CAR_RATIO'] = df['DAYS_LAST_PHONE_CHANGE'] / df['OWN_CAR_AGE']

# A zero denominator turns a ratio into +/-inf (x / 0). Store such values as
# NaN so they are handled as missing instead of as extreme numbers.
ratio_features = [
    'NEW_CREDIT_TO_ANNUITY_RATIO', 'NEW_CREDIT_TO_GOODS_RATIO', 'NEW_CREDIT_TO_INCOME_RATIO',
    'NEW_EMPLOY_TO_BIRTH_RATIO', 'NEW_CAR_TO_BIRTH_RATIO', 'NEW_CAR_TO_EMPLOY_RATIO',
    'NEW_PHONE_TO_BIRTH_RATIO', 'NEW_PHONE_TO_BIRTH_RATIO_EMPLOYER', 'NEW_PHONE_TO_ID_RATIO',
    'NEW_PHONE_TO_REGISTRATION_RATIO', 'NEW_PHONE_TO_CAR_RATIO',
]
df[ratio_features] = df[ratio_features].replace([np.inf, -np.inf], np.nan)

# New features about external sources
df['NEW_SOURCES_PROD'] = df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] * df['EXT_SOURCE_3']
df['NEW_EXT_SOURCES_MEAN'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
df['NEW_SCORES_STD'] = df[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
# Rows without a computable std get the column mean
df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(df['NEW_SCORES_STD'].mean())
# Plain "+" propagates NaN: the sum is missing as soon as one source is missing
df['NEW_SCORES_SUM'] = df['EXT_SOURCE_1'] + df['EXT_SOURCE_2'] + df['EXT_SOURCE_3']

# Information about these FLAG_DOCUMENT features has already been extracted,
# so the raw columns are dropped (FLAG_DOCUMENT_3 is not in the list and stays).
dropcolum = ['FLAG_DOCUMENT_2', 'FLAG_DOCUMENT_4',
             'FLAG_DOCUMENT_5', 'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7',
             'FLAG_DOCUMENT_8', 'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10',
             'FLAG_DOCUMENT_11', 'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13',
             'FLAG_DOCUMENT_14', 'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16',
             'FLAG_DOCUMENT_17', 'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19',
             'FLAG_DOCUMENT_20', 'FLAG_DOCUMENT_21']
df = df.drop(dropcolum, axis=1)
# Release the test table to free memory
del test_df
gc.collect()

## bureau & bureau_balance

In [18]:
# Read the bureau table and its balance history (in that order) from the working directory.
bureau, bb = (
    pd.read_csv(csv_name)
    for csv_name in ("bureau.csv", "bureau_balance.csv")
)

In [37]:
# preview the first 20 rows of the bureau table
bureau.head(20)

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,
5,215354,5714467,Active,currency 1,-273,0,27460.0,,0.0,0,180000.0,71017.38,108982.62,0.0,Credit card,-31,
6,215354,5714468,Active,currency 1,-43,0,79.0,,0.0,0,42103.8,42103.8,0.0,0.0,Consumer credit,-22,
7,162297,5714469,Closed,currency 1,-1896,0,-1684.0,-1710.0,14985.0,0,76878.45,0.0,0.0,0.0,Consumer credit,-1710,
8,162297,5714470,Closed,currency 1,-1146,0,-811.0,-840.0,0.0,0,103007.7,0.0,0.0,0.0,Consumer credit,-840,
9,162297,5714471,Active,currency 1,-1146,0,-484.0,,0.0,0,4500.0,0.0,0.0,0.0,Credit card,-690,


In [38]:
# preview the first 20 rows of the bureau balance table
bb.head(20)

Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
5,5715448,-5,C
6,5715448,-6,C
7,5715448,-7,C
8,5715448,-8,C
9,5715448,-9,0


In [21]:
# (rows, columns) of bureau, then of bureau_balance, on separate lines
print(bureau.shape, bb.shape, sep=' \n ')

(1716428, 17) 
 (27299925, 3)


In [None]:
# One-hot encode the object columns of both tables; *_cat hold the new dummy column names
bb, bb_cat = one_hot_encoder(bb)
bureau, bureau_cat = one_hot_encoder(bureau)

# Bureau balance: summarise the monthly history of each bureau credit.
# MONTHS_BALANCE gets min / max / size, every dummy column gets its mean.
bb_aggregations = {
    'MONTHS_BALANCE': ['min', 'max', 'size'],
    **{dummy: ['mean'] for dummy in bb_cat},
}
bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
# Flatten the (column, statistic) pairs into names such as MONTHS_BALANCE_MIN
bb_agg.columns = pd.Index([f"{name}_{stat.upper()}" for name, stat in bb_agg.columns.tolist()])

# Attach the summary to bureau; the credit id is not needed afterwards
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU').drop(columns=['SK_ID_BUREAU'])
del bb, bb_agg
gc.collect()

In [None]:
# Create weighted features.
# weight = 1 - |DAYS_CREDIT| / |sum of DAYS_CREDIT over the same SK_ID_CURR|
# The per-client sum is broadcast back to the rows with transform('sum'), which
# keeps the original row index. The former groupby(...).apply(...).iloc[:, 1]
# relied on apply passing the grouping column along and on the result not being
# prefixed with the group keys, both of which depend on the pandas version.
# NOTE: a client with a single bureau record gets weight 0 by this formula, and
# a zero per-client sum yields NaN.
days_credit_total = bureau.groupby('SK_ID_CURR')['DAYS_CREDIT'].transform('sum')
bureau['weight'] = 1 - bureau['DAYS_CREDIT'].abs() / days_credit_total.abs()

# Each listed column gets a "<name>_WEIGHTED" companion = value * weight
for col in ['CREDIT_DAY_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE', 'AMT_CREDIT_SUM',
            'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_OVERDUE', 'AMT_CREDIT_SUM_LIMIT',
            'AMT_ANNUITY', 'CNT_CREDIT_PROLONG']:
    bureau[col + '_WEIGHTED'] = bureau[col] * bureau['weight']

In [None]:
# Bureau and bureau_balance numeric features, assembled from four groups
date_stats = {
    'DAYS_CREDIT': ['mean', 'var'],
    'DAYS_CREDIT_ENDDATE': ['mean'],
    'DAYS_CREDIT_UPDATE': ['mean'],
}
amount_stats = {
    'CREDIT_DAY_OVERDUE': ['mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM': ['mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'AMT_ANNUITY': ['max', 'mean'],
    'CNT_CREDIT_PROLONG': ['sum'],
}
# every *_WEIGHTED column uses the same statistics as its unweighted source
weighted_stats = {column + '_WEIGHTED': list(stats) for column, stats in amount_stats.items()}
# columns coming from the bureau balance summary
balance_stats = {
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_SIZE': ['mean', 'sum'],
}
num_aggregations = {**date_stats, **amount_stats, **weighted_stats, **balance_stats}

# Bureau and bureau_balance categorical features: mean of every dummy column
cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

# One row per client, columns named BURO_<column>_<STATISTIC>
bureau_agg = bureau.groupby('SK_ID_CURR').agg({**num_aggregations, **cat_aggregations})
bureau_agg.columns = pd.Index([f"BURO_{name}_{stat.upper()}" for name, stat in bureau_agg.columns.tolist()])

In [None]:
# Bureau: repeat the numerical aggregations separately for active and for
# closed credits, and append them with an ACTIVE_ / CLOSED_ prefix.
for prefix, status_flag in (('ACTIVE_', 'CREDIT_ACTIVE_Active'), ('CLOSED_', 'CREDIT_ACTIVE_Closed')):
    status_agg = bureau[bureau[status_flag] == 1].groupby('SK_ID_CURR').agg(num_aggregations)
    status_agg.columns = pd.Index([prefix + name + "_" + stat.upper() for name, stat in status_agg.columns.tolist()])
    bureau_agg = bureau_agg.join(status_agg, how='left')
    del status_agg
    gc.collect()

# The raw bureau table is no longer needed once all aggregates exist
del bureau
gc.collect()

# merge with the main table
df = df.join(bureau_agg, how='left', on='SK_ID_CURR')

## credit_card_balance

In [22]:
# Load the credit card balance table; the relative path is resolved against the working directory.
cc = pd.read_csv("credit_card_balance.csv")

In [39]:
# a glance at the data: first 20 rows ordered by client, previous credit and month
cc.sort_values(by=['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE']).head(20)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
1636141,1489396,100006,-6,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
655566,1489396,100006,-5,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
1399895,1489396,100006,-4,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
1347528,1489396,100006,-3,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
520387,1489396,100006,-2,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
584804,1489396,100006,-1,0.0,270000,,0.0,,,0.0,,0.0,0.0,0.0,0.0,,0,,,0.0,Active,0,0
3131464,1843384,100011,-75,189000.0,180000,180000.0,180000.0,0.0,0.0,,0.0,0.0,180000.0,189000.0,189000.0,4.0,4,0.0,0.0,,Active,0,0
2447092,1843384,100011,-74,184568.85,180000,0.0,0.0,0.0,0.0,9000.0,9000.0,9000.0,180000.0,184568.85,184568.85,0.0,0,0.0,0.0,1.0,Active,0,0
2353190,1843384,100011,-73,181044.54,180000,0.0,0.0,0.0,0.0,9000.0,9000.0,9000.0,175568.85,181044.54,181044.54,0.0,0,0.0,0.0,2.0,Active,0,0
1086495,1843384,100011,-72,177544.35,180000,0.0,0.0,0.0,0.0,9000.0,9000.0,9000.0,172044.54,177544.35,177544.35,0.0,0,0.0,0.0,3.0,Active,0,0


In [24]:
# (rows, columns) of the credit card balance table
cc.shape

(3840312, 23)

In [None]:
# One-hot encode the object columns (with an extra indicator for missing values)
cc, cat_cols = one_hot_encoder(cc, nan_as_category=True)

# General aggregations: every remaining column is summarised per client
cc = cc.drop(columns=['SK_ID_PREV'])
cc_by_client = cc.groupby('SK_ID_CURR')
cc_agg = cc_by_client.agg(['max', 'mean', 'sum', 'var'])
# Flatten the (column, statistic) pairs into names such as CC_AMT_BALANCE_MAX
cc_agg.columns = pd.Index([f"CC_{name}_{stat.upper()}" for name, stat in cc_agg.columns.tolist()])
# Count credit card lines (number of rows per client)
cc_agg['CC_COUNT'] = cc_by_client.size()
del cc, cc_by_client
gc.collect()

# merge with the main table
df = df.join(cc_agg, how='left', on='SK_ID_CURR')

## installments_payments

In [25]:
# Load the installments payments table; the relative path is resolved against the working directory.
ins = pd.read_csv("installments_payments.csv")

In [40]:
# preview: first 20 rows ordered by client, previous credit and installment number
ins.sort_values(by=['SK_ID_CURR', 'SK_ID_PREV', 'NUM_INSTALMENT_NUMBER']).head(20)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
1478621,1369693,100001,1.0,1,-1709.0,-1715.0,3951.0,3951.0
2568722,1369693,100001,1.0,2,-1679.0,-1715.0,3951.0,3951.0
3458712,1369693,100001,1.0,3,-1649.0,-1660.0,3951.0,3951.0
2624024,1369693,100001,2.0,4,-1619.0,-1628.0,17397.9,17397.9
1761012,1851984,100001,1.0,2,-2916.0,-2916.0,3982.05,3982.05
3774071,1851984,100001,1.0,3,-2886.0,-2875.0,3982.05,3982.05
3435373,1851984,100001,1.0,4,-2856.0,-2856.0,3980.925,3980.925
2144879,1038818,100002,1.0,1,-565.0,-587.0,9251.775,9251.775
2163032,1038818,100002,1.0,2,-535.0,-562.0,9251.775,9251.775
1675768,1038818,100002,1.0,3,-505.0,-529.0,9251.775,9251.775


In [27]:
# (rows, columns) of the installments payments table
ins.shape

(13605401, 8)

In [None]:
ins, cat_cols = one_hot_encoder(ins, nan_as_category=True)
# Percentage and difference paid in each installment (amount paid and installment value)
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
ins['PAYMENT_DIFF_PERC'] = ins['PAYMENT_DIFF'] / ins['AMT_INSTALMENT']

# Days past due and days before due (no negative values).
# Vectorised form of "x if x > 0 else 0": where() keeps positive values and
# writes 0 everywhere else, including rows where the difference is NaN.
days_late = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
days_early = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['DPD'] = days_late.where(days_late > 0, 0)
ins['DBD'] = days_early.where(days_early > 0, 0)

# Create weighted features.
# weight = 1 - |DAYS_ENTRY_PAYMENT| / |sum of DAYS_ENTRY_PAYMENT over the same SK_ID_PREV|
# transform('sum') broadcasts the per-credit sum back onto the original row
# index. The former groupby(...).apply(...).iloc[:, 1] relied on apply passing
# the grouping column along and on the result not being prefixed with the
# group keys, both of which depend on the pandas version.
entry_payment_total = ins.groupby('SK_ID_PREV')['DAYS_ENTRY_PAYMENT'].transform('sum')
ins['weight'] = 1 - ins['DAYS_ENTRY_PAYMENT'].abs() / entry_payment_total.abs()
for col in ['DPD', 'DBD', 'PAYMENT_PERC', 'PAYMENT_DIFF', 'PAYMENT_DIFF_PERC']:
    ins[col + '_WEIGHTED'] = ins[col] * ins['weight']

# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max', 'mean', 'sum', 'min', 'std'],
    'DBD': ['max', 'mean', 'sum', 'min', 'std'],
    'PAYMENT_PERC': ['max', 'mean', 'var', 'min', 'std'],
    'PAYMENT_DIFF': ['max', 'mean', 'var', 'min', 'std'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum', 'min', 'std'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum', 'std'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum', 'std'],

    'DPD_WEIGHTED': ['max', 'mean', 'sum', 'min', 'std'],
    'DBD_WEIGHTED': ['max', 'mean', 'sum', 'min', 'std'],
    'PAYMENT_PERC_WEIGHTED': ['max', 'mean', 'var', 'min', 'std'],
    'PAYMENT_DIFF_WEIGHTED': ['max', 'mean', 'var', 'min', 'std'],
    'PAYMENT_DIFF_PERC': ['max', 'mean', 'var', 'min', 'std'],
    'PAYMENT_DIFF_PERC_WEIGHTED': ['max', 'mean', 'var', 'min', 'std']
}

# One row per client, columns named INSTAL_<column>_<STATISTIC>
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
ins_agg.columns = pd.Index(['INSTAL_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Count installment rows per client
ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
del ins
gc.collect()

df = df.join(ins_agg, how='left', on='SK_ID_CURR') # merge with the main table

## previous_application

In [28]:
# Read previous_application.csv from the working directory into `prev`
prev = pd.read_csv("previous_application.csv")

In [41]:
# Peek at the first 20 rows once ordered by client id
prev.sort_values(by='SK_ID_CURR').head(20)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
201668,1369693,100001,Consumer loans,3951.0,24835.5,23787.0,2520.0,24835.5,FRIDAY,13,Y,1,0.104326,,,XAP,Approved,-1740,Cash through the bank,XAP,Family,Refreshed,Mobile,POS,XNA,Country-wide,23,Connectivity,8.0,high,POS mobile with interest,365243.0,-1709.0,-1499.0,-1619.0,-1612.0,0.0
892077,1038818,100002,Consumer loans,9251.775,179055.0,179055.0,0.0,179055.0,SATURDAY,9,Y,1,0.0,,,XAP,Approved,-606,XNA,XAP,,New,Vehicles,POS,XNA,Stone,500,Auto technology,24.0,low_normal,POS other with interest,365243.0,-565.0,125.0,-25.0,-17.0,0.0
575941,1810518,100003,Cash loans,98356.995,900000.0,1035882.0,,900000.0,FRIDAY,12,Y,1,,,,XNA,Approved,-746,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-716.0,-386.0,-536.0,-527.0,1.0
1021650,2636178,100003,Consumer loans,64567.665,337500.0,348637.5,0.0,337500.0,SUNDAY,17,Y,1,0.0,,,XAP,Approved,-828,Cash through the bank,XAP,Family,Refreshed,Furniture,POS,XNA,Stone,1400,Furniture,6.0,middle,POS industry with interest,365243.0,-797.0,-647.0,-647.0,-639.0,0.0
1223745,2396755,100003,Consumer loans,6737.31,68809.5,68053.5,6885.0,68809.5,SATURDAY,15,Y,1,0.100061,,,XAP,Approved,-2341,Cash through the bank,XAP,Family,Refreshed,Consumer Electronics,POS,XNA,Country-wide,200,Consumer electronics,12.0,middle,POS household with interest,365243.0,-2310.0,-1980.0,-1980.0,-1976.0,1.0
935548,1564014,100004,Consumer loans,5357.25,24282.0,20106.0,4860.0,24282.0,FRIDAY,5,Y,1,0.212008,,,XAP,Approved,-815,Cash through the bank,XAP,Unaccompanied,New,Mobile,POS,XNA,Regional / Local,30,Connectivity,4.0,middle,POS mobile without interest,365243.0,-784.0,-694.0,-724.0,-714.0,0.0
1259112,1857999,100005,Cash loans,,0.0,0.0,,,FRIDAY,10,Y,1,,,,XNA,Canceled,-315,XNA,XAP,,Repeater,XNA,XNA,XNA,Credit and cash offices,-1,XNA,,XNA,Cash,,,,,,
1378978,2495675,100005,Consumer loans,4813.2,44617.5,40153.5,4464.0,44617.5,THURSDAY,11,Y,1,0.108964,,,XAP,Approved,-757,Cash through the bank,XAP,,New,Mobile,POS,XNA,Country-wide,37,Connectivity,12.0,high,POS mobile with interest,365243.0,-706.0,-376.0,-466.0,-460.0,0.0
900957,1489396,100006,Revolving loans,13500.0,270000.0,270000.0,,270000.0,THURSDAY,15,Y,1,,,,XAP,Approved,-181,XNA,XAP,Unaccompanied,Repeater,XNA,Cards,x-sell,Credit and cash offices,-1,XNA,0.0,XNA,Card X-Sell,365243.0,365243.0,365243.0,365243.0,365243.0,0.0
1607443,1697039,100006,Cash loans,32696.1,688500.0,906615.0,,688500.0,THURSDAY,15,Y,1,,,,XNA,Refused,-181,Cash through the bank,LIMIT,Unaccompanied,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,48.0,low_normal,Cash X-Sell: low,,,,,,


In [None]:
# One-hot encode the object-dtype columns; `cat_cols` lists the dummy columns created
prev, cat_cols = one_hot_encoder(prev, nan_as_category=True)

# Wrong values: 365243 in these day-offset columns -> NaN.
# Assigning the replaced block back to `prev` (instead of calling
# `prev[col].replace(..., inplace=True)` on a column selection) guarantees the
# change lands in `prev` itself, also when pandas Copy-on-Write is active.
invalid_day_cols = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev[invalid_day_cols] = prev[invalid_day_cols].replace(365243, np.nan)

# Add features
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

# weight = 1 - |DAYS_DECISION| / |sum of DAYS_DECISION over the same SK_ID_CURR|
# `transform('sum')` broadcasts the per-client total back onto every row, so the
# result is aligned with `prev` without a per-group Python lambda.
# NOTE: a client with exactly one previous application gets weight 0 by this formula.
decision_day_total = prev.groupby('SK_ID_CURR')['DAYS_DECISION'].transform('sum')
prev['weight'] = 1 - prev['DAYS_DECISION'].abs() / decision_day_total.abs()

prev['AMT_ANNUITY_WEIGHTED'] = prev['AMT_ANNUITY'] * prev['weight']
prev['AMT_APPLICATION_WEIGHTED'] = prev['AMT_APPLICATION'] * prev['weight']
prev['AMT_CREDIT_WEIGHTED'] = prev['AMT_CREDIT'] * prev['weight']
prev['APP_CREDIT_PERC_WEIGHTED'] = prev['AMT_APPLICATION_WEIGHTED'] / prev['AMT_CREDIT_WEIGHTED']
prev['AMT_DOWN_PAYMENT_WEIGHTED'] = prev['AMT_DOWN_PAYMENT'] * prev['weight']
prev['AMT_GOODS_PRICE_WEIGHTED'] = prev['AMT_GOODS_PRICE'] * prev['weight']
prev['RATE_DOWN_PAYMENT_WEIGHTED'] = prev['RATE_DOWN_PAYMENT'] * prev['weight']
prev['CNT_PAYMENT_WEIGHTED'] = prev['CNT_PAYMENT'] * prev['weight']

In [None]:
# Previous applications numeric features.
# Every column below gets ['max', 'mean'], except the two CNT_PAYMENT columns,
# which get ['mean', 'sum']. Insertion order fixes the output column order.
raw_max_mean_cols = [
    'AMT_ANNUITY',
    'AMT_APPLICATION',
    'AMT_CREDIT',
    'APP_CREDIT_PERC',
    'AMT_DOWN_PAYMENT',
    'AMT_GOODS_PRICE',
    'HOUR_APPR_PROCESS_START',
    'RATE_DOWN_PAYMENT',
    'DAYS_DECISION',
]
# weighted features
weighted_max_mean_cols = [
    'AMT_ANNUITY_WEIGHTED',
    'AMT_APPLICATION_WEIGHTED',
    'AMT_CREDIT_WEIGHTED',
    'APP_CREDIT_PERC_WEIGHTED',
    'AMT_DOWN_PAYMENT_WEIGHTED',
    'AMT_GOODS_PRICE_WEIGHTED',
    'RATE_DOWN_PAYMENT_WEIGHTED',
]

num_aggregations = {col: ['max', 'mean'] for col in raw_max_mean_cols}
num_aggregations['CNT_PAYMENT'] = ['mean', 'sum']
num_aggregations.update({col: ['max', 'mean'] for col in weighted_max_mean_cols})
num_aggregations['CNT_PAYMENT_WEIGHTED'] = ['mean', 'sum']

# Previous applications categorical features: mean of each dummy column
cat_aggregations = {cat: ['mean'] for cat in cat_cols}

# Numeric specs first, then categorical ones
combined_aggregations = dict(num_aggregations)
combined_aggregations.update(cat_aggregations)

prev_agg = prev.groupby('SK_ID_CURR').agg(combined_aggregations)
# Flatten the (column, statistic) MultiIndex into PREV_<COLUMN>_<STATISTIC>
prev_agg.columns = pd.Index(['PREV_{}_{}'.format(col, stat.upper()) for col, stat in prev_agg.columns])

In [None]:
# Previous Applications: Approved, then Refused applications - only numerical features.
# Each pass filters `prev` on one status dummy column, aggregates the numeric
# columns per client and left-joins the prefixed result onto `prev_agg`.
status_prefixes = [
    ('NAME_CONTRACT_STATUS_Approved', 'APPROVED_'),
    ('NAME_CONTRACT_STATUS_Refused', 'REFUSED_'),
]
for status_col, prefix in status_prefixes:
    status_rows = prev[prev[status_col] == 1]
    status_agg = status_rows.groupby('SK_ID_CURR').agg(num_aggregations)
    status_agg.columns = pd.Index([prefix + col + "_" + stat.upper() for col, stat in status_agg.columns])
    prev_agg = prev_agg.join(status_agg, how='left')

del status_rows, status_agg, prev
gc.collect()

df = df.join(prev_agg, how='left', on='SK_ID_CURR') # merge with the main table

## POS_CASH_balance

In [29]:
# Read POS_CASH_balance.csv from the working directory into `pos`
pos = pd.read_csv("POS_CASH_balance.csv")

In [42]:
# Peek at the first 20 rows once ordered by client, previous loan and month
pos.sort_values(by=['SK_ID_CURR', 'SK_ID_PREV', 'MONTHS_BALANCE']).head(20)

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
7167007,1369693,100001,-57,4.0,4.0,Active,0,0
8789081,1369693,100001,-56,4.0,3.0,Active,0,0
7823681,1369693,100001,-55,4.0,2.0,Active,0,0
4704415,1369693,100001,-54,4.0,1.0,Active,0,0
2197888,1369693,100001,-53,4.0,0.0,Completed,0,0
1261679,1851984,100001,-96,4.0,2.0,Active,0,0
1891462,1851984,100001,-95,4.0,1.0,Active,7,7
8531326,1851984,100001,-94,4.0,0.0,Active,0,0
4928574,1851984,100001,-93,4.0,0.0,Completed,0,0
8473918,1038818,100002,-19,24.0,24.0,Active,0,0


In [31]:
# Dimensions (rows, columns) of `pos` before the feature-engineering cell below
pos.shape

(10001358, 8)

In [None]:
# One-hot encode the object-dtype columns; `cat_cols` lists the dummy columns created
pos, cat_cols = one_hot_encoder(pos, nan_as_category=True)

# Features aggregations: fixed numeric specs, then the mean of every dummy column
aggregations = {
    'MONTHS_BALANCE': ['max', 'mean', 'size'],
    'SK_DPD': ['max', 'mean'],
    'SK_DPD_DEF': ['max', 'mean'],
}
aggregations.update({cat: ['mean'] for cat in cat_cols})

pos_by_client = pos.groupby('SK_ID_CURR')
pos_agg = pos_by_client.agg(aggregations)
# Flatten the (column, statistic) MultiIndex into POS_<COLUMN>_<STATISTIC>
pos_agg.columns = pd.Index(['POS_{}_{}'.format(col, stat.upper()) for col, stat in pos_agg.columns])

# Count pos cash accounts (rows per client; the 'size' statistic above is also a per-client row count)
pos_agg['POS_COUNT'] = pos_by_client.size()
# The groupby object keeps a reference to `pos`, so drop both to release the memory
del pos, pos_by_client
gc.collect()

df = df.join(pos_agg, how='left', on='SK_ID_CURR') # merge with the main table

# LightGBM

In [None]:
# The per-table aggregates are already joined into `df`; release them
del bureau_agg, cc_agg, ins_agg, pos_agg, prev_agg

# Columns handed to LightGBM as categorical: every object-dtype column plus two
# engineered flag columns that are listed by name.
categorical_features = [name for name, dtype in zip(df.columns, df.dtypes) if dtype == 'object']
categorical_features += ['NEW_FLAG_DOCUMENT', 'NEW_FLAG_LIVING']

# Replace the values of each categorical column with integer codes
df[categorical_features] = df[categorical_features].apply(categorical_to_numeric, axis=0)

# Rows with a TARGET value form the training set; rows without one form the test set
has_target = df['TARGET'].notnull()
train_df = df[has_target]
test_df = df[~has_target]

print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
del df, has_target
gc.collect()

In [None]:
num_folds = 5 # CV folds
folds = KFold(n_splits=num_folds, shuffle=True, random_state=47)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])   # out-of-fold prediction for every training row
sub_preds = np.zeros(test_df.shape[0])    # test prediction averaged over the folds
# Per-fold test predictions, one column per fold. Created here so the cell does not
# depend on `record` having been defined by some earlier cell.
record = np.zeros((test_df.shape[0], num_folds))
fold_importances = []                     # one importance frame per fold, concatenated once below
feats = [f for f in train_df.columns if f not in ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['TARGET'])):
    train_x, train_y = train_df[feats].iloc[train_idx], train_df['TARGET'].iloc[train_idx]
    valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['TARGET'].iloc[valid_idx]

    t0 = time.time()
    print("fold: ", n_fold)

    # LightGBM parameters found by Bayesian optimization
    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        # is_unbalance=True,
        num_leaves=32,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.073,
        min_split_gain=0.0222415,
        min_child_weight=40,
        silent=-1,
        verbose=-1,
        # scale_pos_weight=11
    )

    # NOTE(review): recent LightGBM releases replaced the `verbose=` and
    # `early_stopping_rounds=` arguments of fit() with callbacks -- confirm against
    # the installed version before upgrading.
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)],
            categorical_feature=categorical_features,
            eval_metric='auc', verbose=100, early_stopping_rounds=100)

    print("time used:",(time.time()-t0)/60)

    # Predict the test set once per fold and reuse the result for both the
    # per-fold record and the running fold average.
    test_fold_preds = clf.predict_proba(test_df[feats], num_iteration=clf.best_iteration_)[:, 1]
    record[:,n_fold] = test_fold_preds
    sub_preds += test_fold_preds / folds.n_splits

    oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = feats
    fold_importance_df["importance"] = clf.feature_importances_
    fold_importance_df["fold"] = n_fold + 1
    fold_importances.append(fold_importance_df)
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    del clf, train_x, train_y, valid_x, valid_y, test_fold_preds
    gc.collect()

# Stack the per-fold importance frames once instead of growing a frame inside the loop
feature_importance_df = pd.concat(fold_importances, axis=0)

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

# Write submission file.
# `test_df` is a row selection of the (now deleted) combined frame, so rebind it to a
# new frame carrying the predictions instead of assigning a column into the slice.
test_df = test_df.assign(TARGET=sub_preds)
test_df[['SK_ID_CURR', 'TARGET']].to_csv("submission.csv", index=False)

In [None]:
def display_importances(feature_importance_df):
    """Plot the top LightGBM features and save the chart to 'lgbm_importances01.png'.

    Parameters
    ----------
    feature_importance_df : pandas.DataFrame
        Must contain the columns "feature" and "importance"; it may hold several
        rows per feature (the training cell stores one row per feature per fold).

    The 70 features with the highest mean importance are selected, and all of
    their rows are passed to a horizontal seaborn bar plot.
    """
    mean_importance = (
        feature_importance_df[["feature", "importance"]]
        .groupby("feature")
        .mean()
    )
    top_features = mean_importance.sort_values(by="importance", ascending=False).head(70).index

    # Keep every row belonging to a selected feature, largest importances first
    is_top = feature_importance_df["feature"].isin(top_features)
    rows_to_plot = feature_importance_df.loc[is_top].sort_values(by="importance", ascending=False)

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=rows_to_plot)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances01.png')

In [32]:
# Plot and save the importances collected during the cross-validation loop
display_importances(feature_importance_df)