In [None]:
%reset -f
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import csv
import re
import gc
import sys, os, random

import matplotlib.pyplot as plt # for plotting

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the random number generators used by this notebook so reruns are repeatable.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it here
# presumably only affects child processes, not this kernel — confirm that is intended.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(37)
random.seed(17)

pd.set_option('display.max_rows',1000)

# Data directory. Set the HOME_CREDIT_DATA_DIR environment variable to point at the
# competition files on another machine; otherwise the original local path is used.
#root = '/Users/schwalmdaniel/github/kaggle/home-credit-default-risk'
root = os.environ.get('HOME_CREDIT_DATA_DIR', 'e:/kaggle/home-credit-default-risk')

train=pd.read_csv(root + "/application_train.csv")
test=pd.read_csv(root + "/application_test.csv")
bureau=pd.read_csv(root + "/bureau.csv")
# Tables that are available in the same directory but not loaded by this notebook:
#bureau_balance=pd.read_csv(root + "/bureau_balance.csv")
#POS_CASH_balance=pd.read_csv(root + "/POS_CASH_balance.csv")
#credit_card_balance=pd.read_csv(root + "/credit_card_balance.csv")
#previous_application=pd.read_csv(root + "/previous_application.csv")
#installments_payments=pd.read_csv(root + "/installments_payments.csv")

# have a look at the ds
train.head()

In [None]:
# Preview the first rows of the bureau table read from bureau.csv.
bureau.head()

In [None]:
# Per-applicant aggregates of the bureau table, left-joined onto train and test.
#
# Each entry is (bureau column, CREDIT_ACTIVE filter or None, [(agg function, output column)]).
# The order of the entries is the order in which the columns are appended to train/test.
# NOTE(review): 'count' counts non-null rows per applicant, not distinct values, so the
# CREDIT_TYPE counts and the currency count are row counts — switch to 'nunique' if the
# number of distinct types/currencies was intended.
BUREAU_FEATURES = [
    ('CREDIT_ACTIVE', 'Active', [('count', 'bureau_credit_active_count')]),
    ('CREDIT_ACTIVE', 'Closed', [('count', 'bureau_credit_closed_count')]),
    ('CREDIT_TYPE', 'Active', [('count', 'bureau_credit_type_active_count')]),
    ('CREDIT_TYPE', 'Closed', [('count', 'bureau_credit_type_closed_count')]),
    ('AMT_CREDIT_SUM_DEBT', 'Active', [('sum', 'bureau_sum_debt_active_sum')]),
    ('AMT_CREDIT_SUM_DEBT', 'Closed', [('sum', 'bureau_sum_debt_closed_sum')]),
    ('CREDIT_CURRENCY', None, [('count', 'bureau_currency_count')]),
    ('AMT_CREDIT_SUM_OVERDUE', None, [('sum', 'bureau_sum_overdue')]),
    ('AMT_CREDIT_MAX_OVERDUE', None, [('min', 'bureau_max_overdue_min'),
                                      ('max', 'bureau_max_overdue_max')]),
    ('CNT_CREDIT_PROLONG', None, [('min', 'bureau_credit_prolong_min'),
                                  ('max', 'bureau_credit_prolong_max'),
                                  ('mean', 'bureau_credit_prolong_mean'),
                                  ('std', 'bureau_credit_prolong_std')]),
    ('CREDIT_DAY_OVERDUE', None, [('min', 'bureau_credit_overdue_min'),
                                  ('max', 'bureau_credit_overdue_max'),
                                  ('mean', 'bureau_credit_overdue_mean'),
                                  ('std', 'bureau_credit_overdue_std')]),
]


def bureau_aggregate(bureau_df, column, outputs, status=None):
    """Aggregate one bureau column per SK_ID_CURR.

    Parameters
    ----------
    bureau_df : DataFrame with at least SK_ID_CURR, CREDIT_ACTIVE and `column`.
    column : name of the bureau column to aggregate.
    outputs : list of (aggregation function name, output column name) pairs.
    status : if given, only rows whose CREDIT_ACTIVE equals this value are used.

    Returns
    -------
    DataFrame with SK_ID_CURR plus one already-named column per entry in `outputs`.
    """
    rows = bureau_df if status is None else bureau_df[bureau_df['CREDIT_ACTIVE'] == status]
    aggregated = rows.groupby('SK_ID_CURR')[column].agg([func for func, _ in outputs])
    # Name the columns before merging so successive merges can never collide on
    # generic names such as 'count' or 'min'.
    aggregated.columns = [name for _, name in outputs]
    return aggregated.reset_index()


bureau_feature_cols = []
for column, status, outputs in BUREAU_FEATURES:
    feature = bureau_aggregate(bureau, column, outputs, status=status)
    train = train.merge(feature, on='SK_ID_CURR', how='left')
    test = test.merge(feature, on='SK_ID_CURR', how='left')
    bureau_feature_cols.extend(name for _, name in outputs)

# Applicants with no matching bureau rows (and single-row std values) come out as NaN;
# treat all of them as 0. Assign the result back instead of calling fillna(inplace=True)
# on a column selection, which does not modify the frame under pandas Copy-on-Write.
train[bureau_feature_cols] = train[bureau_feature_cols].fillna(0)
test[bureau_feature_cols] = test[bureau_feature_cols].fillna(0)

train.head()
                            

In [None]:
# NAME_CONTRACT_TYPE is binary: 'Cash loans' becomes 0, every other value becomes 1.
for frame in (train, test):
    frame['NAME_CONTRACT_TYPE'] = (frame['NAME_CONTRACT_TYPE'] != 'Cash loans').astype('int64')

In [None]:
# CODE_GENDER: drop the few 'XNA' training rows (the original author counted 4), then
# encode 'F' as 0 and everything else as 1.
# .copy() makes the filtered frame independent so the column assignment below does not
# raise SettingWithCopyWarning. The kept rows retain their original index labels, so the
# training index is no longer contiguous after this cell.
train = train[train['CODE_GENDER'] != 'XNA'].copy()
train['CODE_GENDER'] = (train['CODE_GENDER'] != 'F').astype('int64')
test['CODE_GENDER'] = (test['CODE_GENDER'] != 'F').astype('int64')

In [None]:
# FLAG_OWN_CAR: 'Y' -> 1, anything else -> 0.
# The test frame previously mapped every value to 1 ("else 1"), which made the feature
# constant at scoring time; both frames now use the same encoding.
train['FLAG_OWN_CAR'] = (train['FLAG_OWN_CAR'] == 'Y').astype('int64')
test['FLAG_OWN_CAR'] = (test['FLAG_OWN_CAR'] == 'Y').astype('int64')

In [None]:
# FLAG_OWN_REALTY: 'Y' -> 1, anything else -> 0.
# The test frame previously mapped every value to 1 ("else 1"), which made the feature
# constant at scoring time; both frames now use the same encoding.
train['FLAG_OWN_REALTY'] = (train['FLAG_OWN_REALTY'] == 'Y').astype('int64')
test['FLAG_OWN_REALTY'] = (test['FLAG_OWN_REALTY'] == 'Y').astype('int64')

In [None]:
# AMT_ANNUITY: impute missing values as (average annuity/credit ratio in train) * AMT_CREDIT.
# The ratio is learned from train only; each frame is filled from its OWN AMT_CREDIT.
# (Previously test was filled from train['AMT_CREDIT'], which pandas aligns by index label
# and therefore pulled the credit amount of an unrelated training row.)
avgAnnuityRate = (train['AMT_ANNUITY'] / train['AMT_CREDIT']).mean()
train['AMT_ANNUITY'] = train['AMT_ANNUITY'].fillna(avgAnnuityRate * train['AMT_CREDIT'])
test['AMT_ANNUITY'] = test['AMT_ANNUITY'].fillna(avgAnnuityRate * test['AMT_CREDIT'])


In [None]:
# AMT_GOODS_PRICE: fill missing values in both frames with the training mean.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
goodsPriceMean = train['AMT_GOODS_PRICE'].mean()
train['AMT_GOODS_PRICE'] = train['AMT_GOODS_PRICE'].fillna(goodsPriceMean)
test['AMT_GOODS_PRICE'] = test['AMT_GOODS_PRICE'].fillna(goodsPriceMean)

In [None]:
# NAME_TYPE_SUITE is categorical (one-hot encoded later); missing values are replaced
# with the 'Unaccompanied' category.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
train['NAME_TYPE_SUITE'] = train['NAME_TYPE_SUITE'].fillna('Unaccompanied')
test['NAME_TYPE_SUITE'] = test['NAME_TYPE_SUITE'].fillna('Unaccompanied')

In [None]:
# OWN_CAR_AGE: missing values get the sentinel age 100, then the column is negated so
# that larger (less negative) values mean a newer car.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
train['OWN_CAR_AGE'] = train['OWN_CAR_AGE'].fillna(100) * -1
test['OWN_CAR_AGE'] = test['OWN_CAR_AGE'].fillna(100) * -1
train['OWN_CAR_AGE'].describe()

In [None]:
# CNT_FAM_MEMBERS: keep only training rows with a positive value (this also removes
# rows where the value is missing, because NaN > 0 is False).
# .copy() makes the filtered frame independent so later column assignments do not raise
# SettingWithCopyWarning.
train = train[train['CNT_FAM_MEMBERS'] > 0].copy()
# NOTE(review): the scoring frame is deliberately NOT filtered. Removing rows from `test`
# would leave those applicants without a prediction in the submission file written later.
# Missing values are left as NaN, which LightGBM treats as missing by default — confirm
# if a different model is used downstream.

In [None]:
# EXT_SOURCE_1/2/3: missing values are filled with 0.
# (Filling with the per-column training mean was tried earlier and is no longer used.)
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
ext_source_cols = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']
train[ext_source_cols] = train[ext_source_cols].fillna(0)
test[ext_source_cols] = test[ext_source_cols].fillna(0)


In [None]:
# Fill every numeric *_AVG / *_MEDI / *_MODE column with its training mean.
# The four *_MODE columns listed below are excluded here and handled in other cells.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
categorical_mode_cols = ['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
                         'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
for col in train.columns.tolist():
    if col.endswith(('_AVG', '_MEDI', '_MODE')) and col not in categorical_mode_cols:
        mean = train[col].mean()
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)
        

In [None]:
# EMERGENCYSTATE_MODE: positive flag -> 1, anything else (including missing) -> 0.
# The test frame previously mapped every value to 1 ("else 1"); both frames now use the
# same encoding.
# NOTE(review): the public Home Credit data appears to spell this flag 'Yes'/'No' rather
# than 'Y'/'N' — confirm against the raw column. Both spellings are accepted so the
# feature is not silently all-zero.
emergency_positive = ['Y', 'Yes']
train['EMERGENCYSTATE_MODE'] = train['EMERGENCYSTATE_MODE'].isin(emergency_positive).astype('int64')
test['EMERGENCYSTATE_MODE'] = test['EMERGENCYSTATE_MODE'].isin(emergency_positive).astype('int64')


In [None]:
# Fill every *_CIRCLE column with its training mean in both frames.
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
for col in train.columns.tolist():
    if col.endswith('_CIRCLE'):
        mean = train[col].mean()
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# DAYS_LAST_PHONE_CHANGE: missing values are filled with 0 (rows are kept, not dropped).
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
train['DAYS_LAST_PHONE_CHANGE'] = train['DAYS_LAST_PHONE_CHANGE'].fillna(0)
test['DAYS_LAST_PHONE_CHANGE'] = test['DAYS_LAST_PHONE_CHANGE'].fillna(0)


In [None]:
# AMT_REQ_CREDIT_BUREAU_*: fill missing values with the per-column training mean
# (binning these counts is a possible follow-up).
# The result is assigned back rather than using fillna(inplace=True) on a column
# selection, which does not modify the frame under pandas Copy-on-Write.
for col in train.columns.tolist():
    if 'AMT_REQ_CREDIT_BUREAU_' in col:
        mean = train[col].mean()
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# Force a garbage-collection pass; the cell displays the value gc.collect() returns.
gc.collect()

In [None]:
# Display the (rows, columns) shape of the test frame at this point in the notebook.
test.shape

In [None]:
# One-hot encode the categorical columns on train and test stacked together, so both
# end up with the same dummy columns, then cut the stack back apart by row position.
train_objs_num = train.shape[0]
dummy_columns = [
    'NAME_TYPE_SUITE',
    'NAME_INCOME_TYPE',
    'NAME_EDUCATION_TYPE',
    'NAME_FAMILY_STATUS',
    'NAME_HOUSING_TYPE',
    'OCCUPATION_TYPE',
    'ORGANIZATION_TYPE',
    'FONDKAPREMONT_MODE',
    'HOUSETYPE_MODE',
    'WALLSMATERIAL_MODE',
]  # 'WEEKDAY_APPR_PROCESS_START' is intentionally not encoded
dataset = pd.get_dummies(
    pd.concat(objs=[train, test], axis=0),
    columns=dummy_columns,
    prefix_sep='__',
)
train = dataset.iloc[:train_objs_num]
test = dataset.iloc[train_objs_num:]
train.shape

In [None]:
from sklearn.model_selection import train_test_split

# Columns excluded from the model in addition to the id and the target.
low_correlation_cols = [
    'ORGANIZATION_TYPE__Trade: type 5',
    'ORGANIZATION_TYPE__Transport: type 2',
    'NONLIVINGAPARTMENTS_MODE',
    'FLAG_DOCUMENT_12',
    'ORGANIZATION_TYPE__Telecom',
    'ORGANIZATION_TYPE__Industry: type 6',
    'bureau_credit_prolong_min',
    'ORGANIZATION_TYPE__Housing',
    'OCCUPATION_TYPE__Realty agents',
    'NAME_HOUSING_TYPE__Co-op apartment',
    'FLAG_DOCUMENT_5',
    'ORGANIZATION_TYPE__Legal Services',
    'ORGANIZATION_TYPE__Industry: type 7',
    'ORGANIZATION_TYPE__Advertising',
    'FLAG_DOCUMENT_20',
    'ORGANIZATION_TYPE__Business Entity Type 1',
    'FLAG_CONT_MOBILE',
    'NAME_TYPE_SUITE__Group of people',
    'FLAG_MOBIL',
    'WALLSMATERIAL_MODE__Others',
    'AMT_REQ_CREDIT_BUREAU_WEEK',
    'AMT_REQ_CREDIT_BUREAU_HOUR',
    'HOUSETYPE_MODE__terraced house',
    'WEEKDAY_APPR_PROCESS_START',
]

# The same column set is removed from both frames so X and X_test line up.
non_feature_cols = ['SK_ID_CURR', 'TARGET'] + low_correlation_cols
y = train['TARGET']
X = train.drop(non_feature_cols, axis=1)
X_test = test.drop(non_feature_cols, axis=1)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold

import gc
import csv

# Train LightGBM on repeated K-fold splits of X/y, report validation AUC per fold,
# average the test-set predictions over the folds that were run, and write the
# submission CSV.

n_splits = 6
n_repeats = 4
# Number of splits actually trained. 1 reproduces the previous behaviour (stop after
# the first fold); set to None to run all n_splits * n_repeats splits.
max_folds = 1
total_folds = n_splits * n_repeats if max_folds is None else min(max_folds, n_splits * n_repeats)

kf = RepeatedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=0)

params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 12,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'num_threads': 8,
        'lambda_l2': 1.5,
        'min_gain_to_split': 0,
    }

cnt = 0          # number of folds completed
p_buf = None     # running sum of test-set predictions
auc_buf = []     # validation AUC of each completed fold

for train_index, valid_index in kf.split(X):
    if cnt >= total_folds:
        break
    print('Fold {}/{}'.format(cnt + 1, total_folds))

    # kf.split yields POSITIONAL indices. X and y still carry the original row labels,
    # which have gaps after the row filters earlier in the notebook, so select with
    # .iloc; .loc would look the positions up as labels and fail or pick wrong rows.
    X_fold, y_fold = X.iloc[train_index], y.iloc[train_index]
    X_valid, y_valid = X.iloc[valid_index], y.iloc[valid_index]

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of older
    # LightGBM releases; newer releases expect the equivalent callbacks — confirm
    # against the installed version.
    model = lgb.train(
        params,
        lgb.Dataset(X_fold, y_fold, feature_name=X.columns.tolist()),
        num_boost_round=10000,
        valid_sets=[lgb.Dataset(X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    if cnt == 0:
        # Show the non-zero feature importances of the first model, largest first.
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        print(tuples[:200])

    # Validation score for this fold (previously predicted and then discarded).
    p_valid = model.predict(X_valid, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_valid, p_valid)
    print('{} AUC: {}'.format(cnt, auc))
    auc_buf.append(auc)

    # Accumulate test-set predictions; they are averaged after the loop.
    p = np.array(model.predict(X_test, num_iteration=model.best_iteration))
    p_buf = p if p_buf is None else p_buf + p

    cnt += 1

    del model
    gc.collect()  # actually invoke the collector (the bare name `gc.collect` did nothing)

auc_mean = np.mean(auc_buf)
auc_std = np.std(auc_buf)
print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))

preds = p_buf/cnt

# .values detaches SK_ID_CURR from test's row labels so it pairs with preds by position.
subm = pd.DataFrame({'SK_ID_CURR': test['SK_ID_CURR'].values, 'TARGET': preds},
                    columns=['SK_ID_CURR', 'TARGET'])
subm.to_csv('home-default-risk_lgbm.csv', index=False,quoting=csv.QUOTE_NONNUMERIC)
subm.head()

In [None]:
# Builds a second submission file from `probs`.
# NOTE(review): `probs` is not defined in any cell visible in this notebook, so this cell
# raises NameError on a fresh kernel — it looks like a leftover from an earlier model;
# confirm where `probs` should come from or remove the cell.
prd_1 = pd.DataFrame(probs)

# Column-wise concat aligns on index labels: prd_1 gets a default 0..n-1 index while
# test['SK_ID_CURR'] keeps the labels of `test` — TODO confirm these actually line up.
submit = pd.concat([test['SK_ID_CURR'],prd_1],axis=1)

# Prints the bound method object rather than the column list, because tolist is not called.
print (submit.columns.tolist)

# Drops the column at position 1 (the first column contributed by prd_1).
submit = submit.drop(submit.columns[1], axis=1)
#probs.head()
submit.to_csv('home-default-risk.csv',index=False,quoting=csv.QUOTE_NONNUMERIC)
submit.head()

In [None]:
# Distribution of AMT_CREDIT restricted to the rows where TARGET == 1.
credit_target_1 = train.loc[train.TARGET == 1, "AMT_CREDIT"]
ax1 = sns.distplot(credit_target_1, color='y')


In [None]:

# KDE of applicant age, split by TARGET.
# The axis is labelled in years, so convert DAYS_BIRTH (a day count, per its name) to
# years before plotting; previously the raw day values were drawn under a "years" label.
# NOTE(review): DAYS_BIRTH is presumably stored as negative days relative to the
# application date — abs() makes the conversion work for either sign; confirm.
age_years = train['DAYS_BIRTH'].abs() / 365
sns.kdeplot(age_years[train['TARGET'] == 0], label = 'Repaid Loan')
sns.kdeplot(age_years[train['TARGET'] == 1], label = 'Not Repaid Loan')
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
# Explicit legend so the two curve labels are shown regardless of seaborn version.
plt.legend();