In [None]:
%reset -f
import pandas as pd
import numpy as np
import matplotlib as mp
import seaborn as sns
import csv
import re
import gc
import sys, os, random

import matplotlib.pyplot as plt # for plotting

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# --- Notebook configuration --------------------------------------------------
# Show up to 1000 rows when a frame or series is displayed.
pd.set_option('display.max_rows', 1000)

# Seed the random number generators used below so reruns are comparable.
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes -- confirm this is intended.
random.seed(17)
np.random.seed(37)
os.environ['PYTHONHASHSEED'] = '0'

# Directory that holds the competition CSV files.
#root = '/Users/schwalmdaniel/github/kaggle/home-credit-default-risk'
root = 'e:/kaggle/home-credit-default-risk'

# --- Load the tables used in this notebook -----------------------------------
train, test, bureau = [
    pd.read_csv(root + "/" + csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv', 'bureau.csv')
]

# Further tables of the data set; loading is currently disabled.
#bureau_balance=pd.read_csv(root + "/bureau_balance.csv")
#POS_CASH_balance=pd.read_csv(root + "/POS_CASH_balance.csv")
#credit_card_balance=pd.read_csv(root + "/credit_card_balance.csv")
#previous_application=pd.read_csv(root + "/previous_application.csv")
#installments_payments=pd.read_csv(root + "/installments_payments.csv")

# Preview the training table.
train.head()

In [None]:
# Preview the first rows of the bureau table.
bureau.head()

In [None]:
# Two-column slices of the bureau table, one per aggregated attribute.
bureau_credit = pd.DataFrame(bureau[['SK_ID_CURR', 'CREDIT_ACTIVE']])
bureau_overdue = pd.DataFrame(bureau[['SK_ID_CURR', 'CREDIT_DAY_OVERDUE']])
bureau_currency = pd.DataFrame(bureau[['SK_ID_CURR', 'CREDIT_CURRENCY']])

# Per SK_ID_CURR: number of bureau rows whose CREDIT_ACTIVE is 'Active' / 'Closed'.
bureau_credit_active = bureau_credit[bureau_credit['CREDIT_ACTIVE'] == 'Active'].\
    groupby('SK_ID_CURR').CREDIT_ACTIVE.agg(['count']).reset_index()
bureau_credit_closed = bureau_credit[bureau_credit['CREDIT_ACTIVE'] == 'Closed'].\
    groupby('SK_ID_CURR').CREDIT_ACTIVE.agg(['count']).reset_index()

# Per SK_ID_CURR: summary statistics of CREDIT_DAY_OVERDUE.
bureau_credit_overdue = bureau_overdue.groupby('SK_ID_CURR').CREDIT_DAY_OVERDUE.agg(
    ['min', 'max', 'mean', 'std']).reset_index()
# Undefined statistics (e.g. the sample std of a single row) are treated as 0.
bureau_credit_overdue = bureau_credit_overdue.fillna(0)

# Per SK_ID_CURR: number of non-null CREDIT_CURRENCY entries.
bureau_currency_count = bureau_currency.groupby('SK_ID_CURR').CREDIT_CURRENCY.agg(['count']).reset_index()

# (aggregate frame, column renames) in the order the features are appended.
_BUREAU_AGGREGATES = [
    (bureau_credit_active, {'count': 'bureau_credit_active_count'}),
    (bureau_currency_count, {'count': 'bureau_currency_count'}),
    (bureau_credit_closed, {'count': 'bureau_credit_closed_count'}),
    (bureau_credit_overdue, {'min': 'bureau_credit_overdue_min',
                             'max': 'bureau_credit_overdue_max',
                             'mean': 'bureau_credit_overdue_mean',
                             'std': 'bureau_credit_overdue_std'}),
]


def _add_bureau_features(df):
    """Return `df` left-joined on SK_ID_CURR with the bureau aggregates.

    Rows without a matching bureau aggregate get 0 in every added column.
    Feature columns left over from an earlier run of this cell are dropped
    first, so re-running the cell does not create `_x`/`_y` duplicates.
    """
    new_cols = [name for _, renames in _BUREAU_AGGREGATES for name in renames.values()]
    out = df.drop([c for c in new_cols if c in df.columns], axis=1)
    for aggregate, renames in _BUREAU_AGGREGATES:
        # Rename before merging so the generic 'count'/'min'/... names never
        # collide between aggregates.
        out = out.merge(aggregate.rename(columns=renames), on='SK_ID_CURR', how='left')
    # Plain assignment instead of chained `df[col].fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write.
    out[new_cols] = out[new_cols].fillna(0)
    return out


train = _add_bureau_features(train)
test = _add_bureau_features(test)

train.head()
                            




In [None]:
# (rows, columns) of the test frame at this point of the notebook.
test.shape

In [None]:
# Class balance of the label column.
train['TARGET'].value_counts()

# The classes are imbalanced: per the original note, about 8.5% of TARGET values are 1,
# so always predicting 0 already scores roughly 92% accuracy.

In [None]:
# NAME_CONTRACT_TYPE is encoded as 0 for 'Cash loans' and 1 for anything else.
for frame in (train, test):
    is_not_cash_loan = frame['NAME_CONTRACT_TYPE'] != 'Cash loans'
    frame['NAME_CONTRACT_TYPE'] = is_not_cash_loan.astype('int64')

In [None]:
# CODE_GENDER: drop the 'XNA' rows from train (only 4 rows per the original
# note), then encode 'F' as 0 and every other value as 1.
# `.copy()` detaches the filtered frame so the column assignment below writes
# to a real frame and does not trigger a SettingWithCopyWarning.
train = train[train['CODE_GENDER'] != 'XNA'].copy()
train['CODE_GENDER'] = (train['CODE_GENDER'] != 'F').astype('int64')
# NOTE(review): test rows are never dropped, so any non-'F' value (including
# a possible 'XNA') is encoded as 1 there -- confirm this is acceptable.
test['CODE_GENDER'] = (test['CODE_GENDER'] != 'F').astype('int64')

In [None]:
# FLAG_OWN_CAR: encode 'Y' as 1 and every other value as 0.
# The original test-side lambda was `1 if x == 'Y' else 1`, which set the whole
# test column to 1; both frames now use the same mapping.
train['FLAG_OWN_CAR'] = (train['FLAG_OWN_CAR'] == 'Y').astype('int64')
test['FLAG_OWN_CAR'] = (test['FLAG_OWN_CAR'] == 'Y').astype('int64')

In [None]:
# FLAG_OWN_REALTY: encode 'Y' as 1 and every other value as 0.
# The original test-side lambda was `1 if x == 'Y' else 1`, which set the whole
# test column to 1; both frames now use the same mapping.
train['FLAG_OWN_REALTY'] = (train['FLAG_OWN_REALTY'] == 'Y').astype('int64')
test['FLAG_OWN_REALTY'] = (test['FLAG_OWN_REALTY'] == 'Y').astype('int64')

In [None]:
# AMT_ANNUITY: a missing value is estimated as
#   (mean annuity/credit ratio observed in train) * (the row's own AMT_CREDIT).
avgAnnuityRate = (train['AMT_ANNUITY']/train['AMT_CREDIT']).mean()

train['AMT_ANNUITY'] = train['AMT_ANNUITY'].fillna(avgAnnuityRate * train['AMT_CREDIT'])
# The original filled test rows from train['AMT_CREDIT'], i.e. with the credit
# amount of whichever train row shared the index label; use test's own column.
test['AMT_ANNUITY'] = test['AMT_ANNUITY'].fillna(avgAnnuityRate * test['AMT_CREDIT'])


In [None]:
# AMT_GOODS_PRICE: fill missing values in both frames with the train mean.
goodsPriceMean = train['AMT_GOODS_PRICE'].mean()
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column, which does not update the frame under pandas Copy-on-Write.
train['AMT_GOODS_PRICE'] = train['AMT_GOODS_PRICE'].fillna(goodsPriceMean)
test['AMT_GOODS_PRICE'] = test['AMT_GOODS_PRICE'].fillna(goodsPriceMean)

In [None]:
# NAME_TYPE_SUITE (categorical, one-hot encoded later): missing values are
# replaced by the category 'Unaccompanied'.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column, which does not update the frame under pandas Copy-on-Write.
train['NAME_TYPE_SUITE'] = train['NAME_TYPE_SUITE'].fillna('Unaccompanied')
test['NAME_TYPE_SUITE'] = test['NAME_TYPE_SUITE'].fillna('Unaccompanied')

In [None]:
# OWN_CAR_AGE: missing ages are set to 100 and the column is then negated,
# so larger (less negative) values correspond to newer cars.
# NOTE(review): running this cell a second time negates the column again.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column, which does not update the frame under pandas Copy-on-Write.
train['OWN_CAR_AGE'] = train['OWN_CAR_AGE'].fillna(100) * -1
test['OWN_CAR_AGE'] = test['OWN_CAR_AGE'].fillna(100) * -1
train['OWN_CAR_AGE'].describe()

In [None]:
# CNT_FAM_MEMBERS: keep only rows with a strictly positive value; rows where
# it is missing fail the comparison and are dropped as well.
# NOTE(review): this also removes rows from `test`, so a submission built from
# it would lack those SK_ID_CURR values -- confirm that no test row is affected.
train = train.loc[train['CNT_FAM_MEMBERS'].gt(0)]
test = test.loc[test['CNT_FAM_MEMBERS'].gt(0)]

In [None]:
# EXT_SOURCE_1/2/3: fill missing values in both frames with the train mean.
mean1 = train['EXT_SOURCE_1'].mean()
mean2 = train['EXT_SOURCE_2'].mean()
mean3 = train['EXT_SOURCE_3'].mean()

for ext_col, ext_mean in (('EXT_SOURCE_1', mean1),
                          ('EXT_SOURCE_2', mean2),
                          ('EXT_SOURCE_3', mean3)):
    # Assign the result back instead of chained `fillna(..., inplace=True)`,
    # which does not update the frame under pandas Copy-on-Write.
    train[ext_col] = train[ext_col].fillna(ext_mean)
    test[ext_col] = test[ext_col].fillna(ext_mean)


In [None]:
# Columns ending in _AVG, _MEDI or _MODE: fill missing values in both frames
# with the train mean. The four categorical *_MODE columns are skipped.
categorical_mode_cols = ['FONDKAPREMONT_MODE', 'HOUSETYPE_MODE',
                         'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
for col in train.columns.tolist():
    if col.endswith(('_AVG', '_MEDI', '_MODE')) and col not in categorical_mode_cols:
        mean = train[col].mean()
        # Assign the result back instead of chained `fillna(..., inplace=True)`,
        # which does not update the frame under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)
        

In [None]:
# EMERGENCYSTATE_MODE: encode the positive flag as 1 and everything else
# (including missing values) as 0.
# The original test-side lambda was `1 if x == 'Y' else 1`, which set the whole
# test column to 1; both frames now use the same mapping.
# NOTE(review): the original matched only 'Y'. 'Yes' is accepted as well in
# case the raw column spells the flag out -- confirm against the data.
emergency_positive = ['Y', 'Yes']
train['EMERGENCYSTATE_MODE'] = train['EMERGENCYSTATE_MODE'].isin(emergency_positive).astype('int64')
test['EMERGENCYSTATE_MODE'] = test['EMERGENCYSTATE_MODE'].isin(emergency_positive).astype('int64')


In [None]:
# Columns ending in _CIRCLE: fill missing values in both frames with the
# train mean.
for col in train.columns.tolist():
    if col.endswith('_CIRCLE'):
        mean = train[col].mean()
        # Assign the result back instead of chained `fillna(..., inplace=True)`,
        # which does not update the frame under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# DAYS_LAST_PHONE_CHANGE (negative numeric per the original note): missing
# values are filled with 0; the rows are kept, not dropped.
# Assign the result back instead of calling `fillna(..., inplace=True)` on the
# selected column, which does not update the frame under pandas Copy-on-Write.
train['DAYS_LAST_PHONE_CHANGE'] = train['DAYS_LAST_PHONE_CHANGE'].fillna(0)
test['DAYS_LAST_PHONE_CHANGE'] = test['DAYS_LAST_PHONE_CHANGE'].fillna(0)


In [None]:
# AMT_REQ_CREDIT_BUREAU_* columns: fill missing values in both frames with the
# train mean (binning these columns was noted as a possible alternative).
# The original also evaluated train['AMT_REQ_CREDIT_BUREAU_YEAR'].mean() here
# without using or displaying the result; that no-op has been removed.
for col in train.columns.tolist():
    if 'AMT_REQ_CREDIT_BUREAU_' in col:
        mean = train[col].mean()
        # Assign the result back instead of chained `fillna(..., inplace=True)`,
        # which does not update the frame under pandas Copy-on-Write.
        train[col] = train[col].fillna(mean)
        test[col] = test[col].fillna(mean)


In [None]:
# Run a garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

In [None]:
# One-hot encode the categorical columns on train and test together so both
# frames end up with the same dummy columns, then split them apart again.
train_objs_num = len(train)
dataset = pd.concat(objs=[train, test], axis=0)
dataset = pd.get_dummies(
    dataset,
    columns=['NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS',
             'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE',
             'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE'],
    prefix_sep='__')

# Split by position. `reset_index(drop=True)` gives each frame a fresh
# 0..n-1 index: the concatenated index still carries the labels of the two
# inputs (with gaps from dropped rows and overlapping between train and test),
# which breaks label-based selection and index-aligned concatenation later on.
# It also returns new frames rather than slices of `dataset`.
train = dataset.iloc[:train_objs_num].reset_index(drop=True)
test = dataset.iloc[train_objs_num:].reset_index(drop=True)
train.shape

In [None]:
# Correlation of every column with TARGET, sorted from lowest to highest.
corr_matrix = train.corr()
correlations = corr_matrix['TARGET'].sort_values()
correlations

In [None]:
from sklearn.model_selection import train_test_split

# Model inputs: every column except the row identifier and the label.
non_feature_cols = ['SK_ID_CURR', 'TARGET']
y = train['TARGET']
X = train.drop(columns=non_feature_cols)
X_test = test.drop(columns=non_feature_cols)

#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold

import gc
import csv

# --- LightGBM with repeated K-fold cross-validation ---------------------------
# Each fold trains on the fit part, early-stops on the held-out part, reports
# the held-out AUC and predicts the test set. The test predictions of all
# folds are averaged and written to the submission file.
n_splits = 2
n_repeats = 1
n_folds = n_splits * n_repeats  # total number of models trained
kf = RepeatedKFold(
    n_splits=n_splits,
    n_repeats=n_repeats,
    random_state=0)

cnt = 0        # folds completed so far
p_buf = None   # running sum of the test-set predictions
auc_buf = []   # held-out AUC of each fold

params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        'max_depth': 12,
        'num_leaves': 31,
        'learning_rate': 0.025,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5,
        'verbose': 0,
        'num_threads': 8,
        'lambda_l2': 1.5,
        'min_gain_to_split': 0,
    }

feature_names = X.columns.tolist()

for train_index, valid_index in kf.split(X):
    print('Fold {}/{}'.format(cnt + 1, n_folds))

    # kf.split yields row positions, so select with .iloc. The original .loc
    # treated them as index labels, which is wrong whenever X's index is not
    # exactly 0..n-1 (e.g. after rows were filtered out).
    X_fit, y_fit = X.iloc[train_index], y.iloc[train_index]
    X_val, y_val = X.iloc[valid_index], y.iloc[valid_index]

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
    # arguments of older LightGBM releases; newer releases expect callbacks
    # instead -- confirm against the installed version.
    model = lgb.train(
        params,
        lgb.Dataset(X_fit, y_fit, feature_name=feature_names),
        num_boost_round=10000,
        valid_sets=[lgb.Dataset(X_val, y_val)],
        early_stopping_rounds=100,
        verbose_eval=100,
    )

    if cnt == 0:
        # Show the features with non-zero importance for the first model only.
        importance = model.feature_importance()
        model_fnames = model.feature_name()
        tuples = sorted(zip(model_fnames, importance), key=lambda x: x[1])[::-1]
        tuples = [x for x in tuples if x[1] > 0]
        print('Important features:')
        print(tuples[:200])

    # Held-out AUC. The original computed this prediction and then discarded it.
    p_valid = model.predict(X_val, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_val, p_valid)
    print('{} AUC: {}'.format(cnt, auc))
    auc_buf.append(auc)

    # Accumulate this fold's test-set prediction.
    p = np.array(model.predict(X_test, num_iteration=model.best_iteration))
    p_buf = p if p_buf is None else p_buf + p

    cnt += 1

    del model
    gc.collect()  # the original referenced gc.collect without calling it

auc_mean = np.mean(auc_buf)
auc_std = np.std(auc_buf)
print('AUC = {:.6f} +/- {:.6f}'.format(auc_mean, auc_std))

# Average of the per-fold test-set predictions.
preds = p_buf/cnt

# Build the frame from plain arrays so nothing depends on index alignment.
subm = pd.DataFrame(
    {'SK_ID_CURR': test['SK_ID_CURR'].values, 'TARGET': preds},
    columns=['SK_ID_CURR', 'TARGET'])
subm.to_csv('home-default-risk_lgbm.csv', index=False,quoting=csv.QUOTE_NONNUMERIC)
subm.head()

In [None]:
def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search `params` for `model` on (X, y) and print a short report.

    Candidates are scored with ROC AUC; a parameter combination that raises
    while fitting is scored 0 instead of aborting the search.

    Parameters
    ----------
    model : estimator passed to GridSearchCV
    params : parameter grid passed to GridSearchCV
    X, y : training features and labels

    Returns
    -------
    The fitted GridSearchCV object, so callers can read `best_estimator_`,
    `best_params_` or `cv_results_`. (The original returned None.)
    """
    grid = GridSearchCV(model,   # the model to grid search
                        params,  # the parameter set to try
                        error_score=0., scoring='roc_auc')
    grid.fit(X, y)  # fit the model for every parameter combination
    # best_score_ is a ROC AUC because of scoring='roc_auc'; the original
    # message mislabelled it as accuracy.
    print("Best ROC AUC: {}".format(grid.best_score_))
    # the parameters that produced the best score
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    print("Average Time to Fit (s): {}".format(round(grid.cv_results_['mean_fit_time'].mean(), 3)))
    # the average time it took a model to score held-out data (in seconds)
    print("Average Time to Score (s): {}".format(round(grid.cv_results_['mean_score_time'].mean(), 3)))
    return grid

# Parameter grids for get_best_model_and_accuracy (the calls are disabled below).
# Logistic Regression
lr_params = {'C':[1e-1, 1e0, 1e1, 1e2], 'penalty':['l1', 'l2']}

# KNN
knn_params = {'n_neighbors': [1, 3, 5, 7]}

# Decision Tree
tree_params = {'max_depth':[None, 1, 3, 5, 7]}

# Random Forest
forest_params = {'n_estimators': [10, 50, 100], 'max_depth': [None, 1, 3, 5, 7]}

# The L1 penalty needs a solver that supports it. 'liblinear' is named
# explicitly because the default solver of current scikit-learn releases
# rejects penalty='l1'. `n_jobs` was dropped since liblinear does not use it.
lr = LogisticRegression(penalty='l1', solver='liblinear')
knn = KNeighborsClassifier()
d_tree = DecisionTreeClassifier()
forest = RandomForestClassifier()

#get_best_model_and_accuracy(lr, lr_params, X[:3000], y[:3000])
#get_best_model_and_accuracy(knn, knn_params, X[:3000], y[:3000])
#get_best_model_and_accuracy(d_tree, tree_params, X[:3000], y[:3000])
#get_best_model_and_accuracy(forest, forest_params, X[:3000], y[:3000])
print ('Fitting...')
lr.fit(X,y)

# Announce the prediction step before it runs (the original printed afterwards).
print ('Predicting...')
probs = lr.predict_proba(X_test)

# Mean accuracy on the training data itself, not a held-out estimate.
print (lr.score(X,y))

In [None]:
# Build the logistic-regression submission: one row per test SK_ID_CURR with
# the predicted probability of class 1.
prd_1 = pd.DataFrame(probs)

# Combine by position using plain arrays: `test` and `prd_1` do not
# necessarily share index labels. The probability column is named 'TARGET',
# matching the LightGBM submission; the original left it named `1`.
submit = pd.DataFrame(
    {'SK_ID_CURR': test['SK_ID_CURR'].values, 'TARGET': prd_1[1].values},
    columns=['SK_ID_CURR', 'TARGET'])

# The original printed the bound method because the call parentheses were missing.
print (submit.columns.tolist())

submit.to_csv('home-default-risk.csv',index=False,quoting=csv.QUOTE_NONNUMERIC)
submit.head()

In [None]:
# Distribution of AMT_CREDIT over the rows with TARGET == 1, drawn in yellow.
# NOTE(review): distplot is deprecated in recent seaborn releases -- confirm
# against the installed version.
credit_target_1 = train.loc[train['TARGET'] == 1, 'AMT_CREDIT']
ax1 = sns.distplot(credit_target_1, color='y')


In [None]:

# Age distribution split by TARGET.
# The axis is labelled in years, so convert DAYS_BIRTH (a day count) first;
# the original plotted the raw day values under an 'Age (years)' label.
# NOTE(review): abs() assumes the magnitude of DAYS_BIRTH is the age in days
# whatever its sign -- confirm against the data description.
age_years = train['DAYS_BIRTH'].abs() / 365

sns.kdeplot(age_years[train['TARGET'] == 0], label = 'Repaid Loan')
sns.kdeplot(age_years[train['TARGET'] == 1], label = 'Not Repaid Loan')
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
# Draw the legend explicitly so the two curve labels are always shown.
plt.legend();