# Home Credit Default Risk

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
# Show every column when a DataFrame is displayed. The fully-qualified option
# name is used because the bare 'max_columns' pattern can match more than one
# registered option on recent pandas versions, which raises an OptionError.
pd.set_option('display.max_columns', None)

### Import data, basic preprocessing

In [None]:
# Raw application tables. `test` itself is left un-encoded; only the derived
# X_test frame is one-hot encoded.
train = pd.read_csv('../input/application_train.csv')
test = pd.read_csv('../input/application_test.csv')

# One-hot encode the categorical columns, split off the label, then keep only
# the feature columns that exist in both the train and the test frame.
train = pd.get_dummies(train)
y_train = train['TARGET']
X_train, X_test = train.drop(columns='TARGET').align(
    pd.get_dummies(test), join='inner', axis=1
)

### Get baseline prediction

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

try:
    # Current scikit-learn: the mean imputer lives in sklearn.impute.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Old scikit-learn releases that predate sklearn.impute.
    from sklearn.preprocessing import Imputer


def simple_model(X, y):
    """Print the validation ROC AUC of a Gaussian naive Bayes baseline.

    Missing values in ``X`` are filled with the imputer's default strategy,
    the rows are split into a fit part and a validation part with
    ``train_test_split(random_state=42)``, and the score is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its index and column labels are preserved through
        imputation.
    y : array-like
        Binary target aligned row-for-row with ``X``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    imp = Imputer()
    X_bl = pd.DataFrame(imp.fit_transform(X), index=X.index, columns=X.columns)
    # Local names deliberately differ from the notebook-level X_train / y_train
    # so the function body does not shadow them.
    X_fit, X_val, y_fit, y_val = train_test_split(X_bl, y, random_state=42)

    clf = GaussianNB()
    clf.fit(X_fit, y_fit)
    # Column 1 of predict_proba is the probability of the positive class.
    print('roc', roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1]))

simple_model(X_train, y_train)

### Merge bureau data

'bureau_balance.csv' contains multiple rows for each 'SK_ID_BUREAU'. Hence, we take their mean per 'SK_ID_BUREAU' and merge the result with 'bureau.csv'. We then find ourselves with multiple 'SK_ID_BUREAU' records for each 'SK_ID_CURR', so we again take their mean, this time per 'SK_ID_CURR', and merge with the original dataset.

In [None]:
# Helper function
def aggregate_df(file_name, aggregates, drop=None):
    """Load a CSV from ``../input/`` and average it per group.

    The file is read, its categorical columns are one-hot encoded, the columns
    named in ``drop`` (if any) are removed, and the remaining columns are
    averaged within each group.

    Parameters
    ----------
    file_name : str
        File name relative to ``../input/``.
    aggregates : str or list of str
        Column(s) to group by.
    drop : str or list of str, optional
        Column label(s) to remove after encoding and before grouping.

    Returns
    -------
    pandas.DataFrame
        One row per group, with the grouping key(s) restored as ordinary
        columns.
    """
    encoded = pd.get_dummies(pd.read_csv('../input/' + file_name))
    if drop:
        encoded = encoded.drop(drop, axis=1)
    return encoded.groupby(aggregates).mean().reset_index()

In [None]:
# aggregate_df already one-hot encodes bureau_balance and averages it per
# SK_ID_BUREAU, so the result has one numeric row per bureau record and needs
# no second encode / group-by pass.
bb = aggregate_df('bureau_balance.csv', 'SK_ID_BUREAU')

# Attach the balance averages to each bureau record, then collapse the bureau
# records to one averaged row per applicant.
bureau = pd.read_csv('../input/bureau.csv')
bureau_bb = bureau.merge(bb, how='left', on='SK_ID_BUREAU').drop('SK_ID_BUREAU', axis=1)
bureau_bb = pd.get_dummies(bureau_bb)
bureau_bb = bureau_bb.groupby('SK_ID_CURR').mean().reset_index()

# Tag every bureau-derived feature with its origin. Without this, any column
# whose name also exists in the application table would come out of the merge
# as an ambiguous `_x` / `_y` pair.
bureau_bb = bureau_bb.add_prefix('bureau_').rename(columns={'bureau_SK_ID_CURR': 'SK_ID_CURR'})

X_train = X_train.merge(bureau_bb, how='left', on='SK_ID_CURR')
X_test = X_test.merge(bureau_bb, how='left', on='SK_ID_CURR')

# Save on local memory: none of the intermediate frames is needed once the
# features are merged into X_train / X_test.
del bureau, bb, bureau_bb
gc.collect()

### Aggregate the rest

In [None]:
# Average each remaining auxiliary table down to one row per applicant.
pos, instals, cc_bal, prevs = (
    aggregate_df(csv_name, 'SK_ID_CURR')
    for csv_name in (
        'POS_CASH_balance.csv',
        'installments_payments.csv',
        'credit_card_balance.csv',
        'previous_application.csv',
    )
)

In [None]:
# Left-join every per-applicant aggregate onto both feature matrices.
#
# Each table's columns are prefixed with the table's name first. The auxiliary
# tables can share column names with each other; relying on the default
# `_x` / `_y` merge suffixes across several chained merges then yields
# ambiguous or even duplicated column labels. Prefixing keeps every feature
# name unique and shows where it came from. The join key is left unprefixed.
aggregated_tables = [
    ('pos', pos),
    ('instals', instals),
    ('cc_bal', cc_bal),
    ('prevs', prevs),
]
for table_name, table in aggregated_tables:
    prefix = table_name + '_'
    table = table.add_prefix(prefix).rename(columns={prefix + 'SK_ID_CURR': 'SK_ID_CURR'})
    X_train = X_train.merge(table, how='left', on='SK_ID_CURR')
    X_test = X_test.merge(table, how='left', on='SK_ID_CURR')

# Drop the helper references so they do not keep the frames alive.
del aggregated_tables, table_name, table, prefix

In [None]:
# The per-table aggregates are no longer needed as separate frames; release
# them and ask the garbage collector to reclaim the memory.
del pos
del instals
del cc_bal
del prevs
gc.collect()

### Get basic LGBM down

Plenty of data, so a simple validation set will do.

Note: the parameters used in the final cell were produced by the optimisation cell directly below. To verify them for yourself, uncomment the last line of that cell.

In [None]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

def optimise_lgbm(X, y, num_leaves, max_depth, learning_rate, max_bin):
    """Search LightGBM hyper-parameters with Bayesian optimisation.

    The data is split once with ``train_test_split(random_state=42)``; every
    trial trains a binary LightGBM model on the fit part and is scored by ROC
    AUC on the validation part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary target aligned with ``X``.
    num_leaves, max_depth, learning_rate, max_bin : tuple of (low, high)
        Search bounds for the corresponding LightGBM parameter. Sampled values
        of ``num_leaves``, ``max_depth`` and ``max_bin`` are truncated to int
        before being passed to LightGBM.

    Returns
    -------
    dict
        Best parameter values found, keyed by parameter name (raw floats as
        sampled by the optimiser).
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

    # Use the caller-supplied bounds rather than hard-coded ranges.
    bounds = {'num_leaves': num_leaves,
              'max_depth': max_depth,
              'learning_rate': learning_rate,
              'max_bin': max_bin}

    # The argument names below must match the keys of `bounds`: the optimiser
    # passes each sampled point to `target` as keyword arguments.
    def target(num_leaves, max_depth, learning_rate, max_bin):
        """Validation ROC AUC of one LightGBM fit at the sampled point."""
        param = {'num_leaves': int(num_leaves),
                 'objective':'binary',
                 'max_depth': int(max_depth),
                 'learning_rate': learning_rate,
                 'max_bin': int(max_bin),
                 'metric': 'auc'
                }
        # A fresh Dataset per trial: feature binning is fixed when a Dataset is
        # constructed, so one shared Dataset could not honour a different
        # max_bin on later trials.
        train_data = lgb.Dataset(X_train, label=y_train)
        lgbm = lgb.train(param, train_data)
        return roc_auc_score(y_val, lgbm.predict(X_val))

    bo = BayesianOptimization(target, bounds)
    bo.maximize(init_points=15, n_iter=40)
    try:
        # Current bayes_opt releases expose the best point as `bo.max`.
        return bo.max['params']
    except AttributeError:
        # Older bayes_opt releases keep it in the `res` dictionary instead.
        return bo.res['max']['max_params']

#best_params = optimise_lgbm(X_train, y_train, (10, 200), (7, 100), (.05, .2), (50, 200))

In [None]:
# Output of previous cell, handcoded to save time.
best_params = dict(
    learning_rate=0.11137508860829018,
    max_bin=168.1527152156532,
    max_depth=75.57671833903827,
    num_leaves=53.73532416174853,
)

### Make prediction using best LGBM parameters

In [None]:
# The optimiser reports floats; the three structural parameters are truncated
# to int here, while the learning rate is passed through unchanged.
param = {name: int(best_params[name]) for name in ('num_leaves', 'max_depth', 'max_bin')}
param.update(
    objective='binary',
    learning_rate=best_params['learning_rate'],
    metric='auc',
)

# Fit on the complete training matrix (no validation hold-out in this cell).
train_data = lgb.Dataset(X_train, label=y_train)
lgbm = lgb.train(param, train_data)

# Write the submission file: applicant id plus the model's prediction.
test['TARGET'] = lgbm.predict(X_test)
test[['SK_ID_CURR', 'TARGET']].to_csv('preds.csv', index=False)