In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold,cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout, BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.optimizers import Adam
from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.models import load_model
import gc
gc.enable()

Using TensorFlow backend.


In [2]:
def one_hot_encoder(df, nan_as_category = True):
    """One-hot encode every object-dtype column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are expanded into indicator columns.
    nan_as_category : bool, default True
        Forwarded to ``pd.get_dummies`` as ``dummy_na``.

    Returns
    -------
    (pandas.DataFrame, list)
        The encoded frame and the names of the columns that were not present
        in the input frame.

    The list of encoded column names is printed as a side effect.
    """
    columns_before = list(df.columns)
    object_columns = []
    for name in columns_before:
        if df[name].dtype == 'object':
            object_columns.append(name)
    print (object_columns)
    encoded = pd.get_dummies(df, columns=object_columns, dummy_na=nan_as_category)
    added_columns = [name for name in encoded.columns if name not in columns_before]
    return encoded, added_columns

# Application tables: read train and test, stack them, and add ratio features.
print('Read data and test')
data = pd.read_csv('./input/application_train.csv')
test = pd.read_csv('./input/application_test.csv')
print('data test Shapes : ', data.shape, test.shape)

# Detach the label so that train and test carry the same columns when stacked.
y = data.pop('TARGET')
gc.collect()
# Training rows come first in all_data; n_data marks where the test rows begin.
n_data = len(data)
all_data = pd.concat([data, test], ignore_index=True)

# Median AMT_INCOME_TOTAL per ORGANIZATION_TYPE, mapped back onto every row.
inc_by_org = all_data.groupby('ORGANIZATION_TYPE')['AMT_INCOME_TOTAL'].median()
all_data['NEW_INC_BY_ORG'] = all_data['ORGANIZATION_TYPE'].map(inc_by_org)
print('all_data Shape : ', all_data.shape)

# Integer-encode these columns with pd.factorize; the remaining object columns
# are one-hot encoded right after.
for flag_col in ('CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY'):
    all_data[flag_col] = pd.factorize(all_data[flag_col])[0]
all_data, cat_cols = one_hot_encoder(all_data)
print('all_data Shape : ', all_data.shape)
# Some simple new features (percentages)
#all_data['DAYS_EMPLOYED'].replace(365243, np.nan, inplace= True)

# (new column, numerator column, denominator column)
ratio_specs = [
    ('INCOME_DAYS_PERC', 'AMT_INCOME_TOTAL', 'DAYS_BIRTH'),
    ('DAYS_EMPLOYED_PERC', 'DAYS_EMPLOYED', 'DAYS_BIRTH'),
    ('INCOME_CREDIT_PERC', 'AMT_INCOME_TOTAL', 'AMT_CREDIT'),
    ('INCOME_PER_PERSON', 'AMT_INCOME_TOTAL', 'CNT_FAM_MEMBERS'),
    ('ANNUITY_INCOME_PERC', 'AMT_ANNUITY', 'AMT_INCOME_TOTAL'),
    ('PAYMENT_RATE', 'AMT_CREDIT', 'AMT_ANNUITY'),
]
for ratio_name, numerator, denominator in ratio_specs:
    all_data[ratio_name] = all_data[numerator] / all_data[denominator]
print('all_data Shape : ', all_data.shape)
del data,test
gc.collect()

# Bureau and bureau_balance: aggregate the monthly balances per bureau credit,
# attach them to the bureau rows, then aggregate per applicant (SK_ID_CURR).
bureau = pd.read_csv('./input/bureau.csv')
buro_bal = pd.read_csv('./input/bureau_balance.csv')
print('bureau  buro_bal Shape : ', bureau.shape,buro_bal.shape)
buro_bal, bb_cat = one_hot_encoder(buro_bal)
bureau, bureau_cat = one_hot_encoder(bureau)
bb_aggregations = {'MONTHS_BALANCE': ['min', 'max','mean']}
for col in bb_cat:
    bb_aggregations[col] = ['mean']
# Aggregate once, after the spec is complete. Running the groupby inside the
# loop repeated the same work for every dummy column and left bb_agg unbound
# when bb_cat was empty.
bb_agg = buro_bal.groupby('SK_ID_BUREAU').agg(bb_aggregations)
# Flatten the (column, function) MultiIndex into COLUMN_FUNCTION names.
bb_agg.columns = pd.Index([e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
bb_agg['BB_COUNT'] = buro_bal.groupby('SK_ID_BUREAU').size()
print('bb_agg Shape : ', bb_agg.shape)
bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
bureau.drop(columns= 'SK_ID_BUREAU', inplace= True)
print('bureau Shape : ', bureau.shape)
del buro_bal,bb_agg
gc.collect()
bureau['PEC_ANNUITY_DEBT'] = bureau['AMT_ANNUITY']/bureau['AMT_CREDIT_SUM_DEBT']
# Numeric aggregations applied per applicant; reused below for the
# active-only and closed-only subsets.
num_aggregations = {
    'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
    'CREDIT_DAY_OVERDUE': ['max', 'mean'],
    'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
    'AMT_CREDIT_MAX_OVERDUE': ['mean'],
    'CNT_CREDIT_PROLONG': ['sum','mean'],
    'PEC_ANNUITY_DEBT':['min','max','mean'],
    'AMT_CREDIT_SUM': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_DEBT': ['max','mean', 'sum'],
    'AMT_CREDIT_SUM_OVERDUE': ['mean'],
    'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
    'DAYS_CREDIT_UPDATE': ['min','max','mean'],
    'AMT_ANNUITY': ['max', 'mean'],
    'MONTHS_BALANCE_MIN': ['min'],
    'MONTHS_BALANCE_MAX': ['max'],
    'MONTHS_BALANCE_MEAN': ['mean'],
    'BB_COUNT': ['mean', 'sum']
}
# Dummy columns (from bureau itself and the *_MEAN ones joined in from
# bureau_balance) are averaged per applicant.
cat_aggregations = {}
for cat in bureau_cat: cat_aggregations[cat] = ['mean']
for cat in bb_cat: cat_aggregations[cat + "_MEAN"] = ['mean']
bureau_agg = bureau.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
bureau_agg.columns = pd.Index(['BURO_' + e[0] + "_" + e[1].upper() for e in bureau_agg.columns.tolist()])
bureau_agg['BURO_COUNT'] = bureau.groupby('SK_ID_CURR').size()
bureau_agg['BURO_MAX_OVERDUE_DEBT'] = bureau_agg['BURO_AMT_CREDIT_MAX_OVERDUE_MEAN']/bureau_agg['BURO_AMT_CREDIT_SUM_DEBT_MEAN']
print('bureau_agg Shape : ', bureau_agg.shape)

# Same numeric aggregations restricted to rows flagged CREDIT_ACTIVE_Active.
active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
active_agg.columns = pd.Index(['ACT_' + e[0] + "_" + e[1].upper() for e in active_agg.columns.tolist()])
active_agg['ACT_COUNT'] = active.groupby('SK_ID_CURR').size()
# reset_index turns SK_ID_CURR into a column so the joins below can key on it.
bureau_agg = bureau_agg.reset_index().join(active_agg, how='left', on='SK_ID_CURR')
print('bureau_agg Shape : ', bureau_agg.shape)
del active, active_agg
gc.collect()

# Rows flagged CREDIT_ACTIVE_Closed additionally aggregate DAYS_ENDDATE_FACT.
closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
closed_agg = closed.groupby('SK_ID_CURR').agg(dict({'DAYS_ENDDATE_FACT': ['min','max','mean']},**num_aggregations))
closed_agg.columns = pd.Index(['CLS_' + e[0] + "_" + e[1].upper() for e in closed_agg.columns.tolist()])
closed_agg['CLS_COUNT'] = closed.groupby('SK_ID_CURR').size()
bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')
print('bureau_agg Shape : ', bureau_agg.shape)
del closed, closed_agg, bureau
gc.collect()

all_data = all_data.merge(bureau_agg, how='left', on='SK_ID_CURR')
print('all_data Shape : ', all_data.shape)
del bureau_agg
gc.collect()

# Previous applications: engineer per-application features, then aggregate
# per applicant (overall, approved-only and refused-only).
prev = pd.read_csv('./input/previous_application.csv')
print('prev Shape : ', prev.shape)
prev, cat_cols = one_hot_encoder(prev, nan_as_category= True)
# Days 365243 values -> nan.
# Assign the result back instead of calling replace(..., inplace=True) on a
# selected column: that chained in-place form does not update `prev` when
# pandas Copy-on-Write is active.
day_columns = [
    'DAYS_FIRST_DRAWING',
    'DAYS_FIRST_DUE',
    'DAYS_LAST_DUE_1ST_VERSION',
    'DAYS_LAST_DUE',
    'DAYS_TERMINATION',
]
prev[day_columns] = prev[day_columns].replace(365243, np.nan)
# Add feature: value ask / value received percentage
prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']
# Only rows with a positive DAYS_LAST_DUE_1ST_VERSION get a value; the
# assignment aligns on the index, so every other row is left as NaN.
positive_due = prev[prev['DAYS_LAST_DUE_1ST_VERSION'] > 0]
prev['NEW_PA_AMT_UNREPAID'] = positive_due['DAYS_LAST_DUE_1ST_VERSION']/30*positive_due['AMT_ANNUITY']
# Previous applications numeric features
num_aggregations = {
    'AMT_ANNUITY': ['min','max', 'mean'],
    'AMT_APPLICATION': ['min','max', 'mean'],
    'AMT_CREDIT': ['min','max', 'mean'],
    'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
    'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
    'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
    'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
    'DAYS_DECISION': ['min', 'max', 'mean'],
    'SELLERPLACE_AREA': ['min', 'max', 'mean'],
    'DAYS_FIRST_DRAWING': ['min', 'max', 'mean'],
    'DAYS_FIRST_DUE': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE_1ST_VERSION': ['min', 'max', 'mean'],
    'DAYS_LAST_DUE': ['min', 'max', 'mean'],
    'DAYS_TERMINATION': ['min', 'max', 'mean'],
    'CNT_PAYMENT': ['mean', 'sum'],
    'NEW_PA_AMT_UNREPAID':['min','max','mean', 'sum']
}
# Previous applications categorical features
cat_aggregations = {}
for cat in cat_cols:
    cat_aggregations[cat] = ['mean']

prev_agg = prev.groupby('SK_ID_CURR').agg(dict(num_aggregations, **cat_aggregations))
prev_agg.columns = pd.Index(['PREV_' + e[0] + "_" + e[1].upper() for e in prev_agg.columns.tolist()])
prev_agg['PREV_COUNT'] = prev.groupby('SK_ID_CURR').size()
print('prev_agg Shape : ', prev_agg.shape)
# Previous Applications: Approved Applications - only numerical features

# Same spec as num_aggregations minus NEW_PA_AMT_UNREPAID (previously spelled
# out as a duplicate literal).
# NOTE(review): this dict is not used; the approved/refused aggregations below
# pass num_aggregations. Left that way because switching would change the
# engineered feature set -- confirm which spec was intended.
ap_num_aggregations = {
    column: stats
    for column, stats in num_aggregations.items()
    if column != 'NEW_PA_AMT_UNREPAID'
}
approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
approved_agg.columns = pd.Index(['APR_' + e[0] + "_" + e[1].upper() for e in approved_agg.columns.tolist()])
approved_agg['APR_COUNT'] = approved.groupby('SK_ID_CURR').size()
# reset_index turns SK_ID_CURR into a column so the joins below can key on it.
prev_agg = prev_agg.reset_index().join(approved_agg, how='left', on='SK_ID_CURR')
print('prev_agg Shape : ', prev_agg.shape)
# Previous Applications: Refused Applications - only numerical features
refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
refused_agg.columns = pd.Index(['REF_' + e[0] + "_" + e[1].upper() for e in refused_agg.columns.tolist()])
refused_agg['REF_COUNT'] = refused.groupby('SK_ID_CURR').size()
prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
print('prev_agg Shape : ', prev_agg.shape)
del refused, refused_agg, approved, approved_agg, prev
gc.collect()

all_data = all_data.merge(prev_agg, how='left', on='SK_ID_CURR')
print('all_data Shape : ', all_data.shape)
del prev_agg
gc.collect()

# POS_CASH balance: per-applicant aggregates joined onto all_data.
pos = pd.read_csv('./input/POS_CASH_balance.csv')
print('pos Shape : ', pos.shape)
pos, cat_cols = one_hot_encoder(pos, nan_as_category= True)
# Numeric columns and the statistics taken for each of them.
dpd_stats = ['sum', 'max', 'min', 'mean', 'size']
aggregations = {
    'MONTHS_BALANCE': ['min', 'max', 'mean'],
    'CNT_INSTALMENT': ['sum', 'mean'],
    'SK_DPD': list(dpd_stats),
    'SK_DPD_DEF': list(dpd_stats),
}
# Every one-hot column contributes its per-applicant mean.
aggregations.update({cat: ['mean'] for cat in cat_cols})

pos_by_client = pos.groupby('SK_ID_CURR')
pos_agg = pos_by_client.agg(aggregations)
# Flatten (column, function) pairs into POS_<COLUMN>_<FUNCTION> names.
pos_agg.columns = pd.Index(
    ['POS_{}_{}'.format(column, func.upper()) for column, func in pos_agg.columns.tolist()]
)
# Number of POS_CASH rows per applicant.
pos_agg['POS_COUNT'] = pos_by_client.size()
print('pos_agg Shape : ', pos_agg.shape)
del pos_by_client, pos
gc.collect()

all_data = all_data.join(pos_agg, how='left', on='SK_ID_CURR')
print('all_data Shape : ', all_data.shape)
del pos_agg
gc.collect()

# Installment payments: per-row payment features, aggregated per applicant.
ins = pd.read_csv('./input/installments_payments.csv')
print('ins Shape : ', ins.shape)
ins, cat_cols = one_hot_encoder(ins, nan_as_category= True)
# Percentage and difference paid in each installment (amount paid and installment value)
ins['PAYMENT_PERC'] = ins['AMT_PAYMENT'] / ins['AMT_INSTALMENT']
ins['PAYMENT_DIFF'] = ins['AMT_INSTALMENT'] - ins['AMT_PAYMENT']
# Days past due and days before due (no negative values)
ins['DPD'] = ins['DAYS_ENTRY_PAYMENT'] - ins['DAYS_INSTALMENT']
ins['NEW_IP_DPD_RATIO'] = ins['DAYS_ENTRY_PAYMENT'] / ins['DAYS_INSTALMENT']
ins['DBD'] = ins['DAYS_INSTALMENT'] - ins['DAYS_ENTRY_PAYMENT']
ins['NEW_IP_DBD_RATIO'] = ins['DAYS_INSTALMENT'] / ins['DAYS_ENTRY_PAYMENT']
# Keep strictly positive values and set everything else (including NaN, for
# which the comparison is False) to 0. Vectorised equivalent of
# apply(lambda x: x if x > 0 else 0), without a Python call per row.
ins['DPD'] = ins['DPD'].where(ins['DPD'] > 0, 0)
ins['DBD'] = ins['DBD'].where(ins['DBD'] > 0, 0)
# Features: Perform aggregations
aggregations = {
    'NUM_INSTALMENT_VERSION': ['nunique'],
    'DPD': ['max','mean', 'sum'],
    'NEW_IP_DPD_RATIO':['max', 'mean','min'],
    'NEW_IP_DBD_RATIO':['max', 'mean','min'],
    'DBD': ['max', 'mean', 'sum'],
    'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
    'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
    'AMT_INSTALMENT': ['max', 'mean', 'sum'],
    'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
    'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum'],
    'DAYS_INSTALMENT': ['max', 'mean', 'sum']
}
for cat in cat_cols:
    aggregations[cat] = ['mean']
ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
# Flatten (column, function) pairs into INS_<COLUMN>_<FUNCTION> names.
ins_agg.columns = pd.Index(['INS_' + e[0] + "_" + e[1].upper() for e in ins_agg.columns.tolist()])
# Count installments accounts
ins_agg['INS_COUNT'] = ins.groupby('SK_ID_CURR').size()
print('ins_agg Shape : ', ins_agg.shape)
del ins
gc.collect()

all_data = all_data.join(ins_agg, how='left', on='SK_ID_CURR')
print('all_data Shape : ', all_data.shape)
del ins_agg
gc.collect()

# Credit card balance: per-applicant aggregates joined onto all_data.
cc = pd.read_csv('./input/credit_card_balance.csv')
print('cc Shape : ', cc.shape)
cc, cat_cols = one_hot_encoder(cc, nan_as_category= True)
# Statistic sets shared by several columns of the aggregation spec.
stats_basic = ['min', 'max', 'mean']
stats_with_sum = stats_basic + ['sum']
stats_with_var = stats_with_sum + ['var']
stats_count = ['mean', 'sum']
stats_dpd = ['max', 'mean', 'sum']
# Features: numeric columns and the statistics computed for each.
num_aggregations = {
    'MONTHS_BALANCE': stats_basic,
    'AMT_BALANCE': stats_with_sum,
    'AMT_CREDIT_LIMIT_ACTUAL': stats_with_var,
    'AMT_DRAWINGS_ATM_CURRENT': stats_with_sum,
    'AMT_DRAWINGS_CURRENT': stats_with_sum,
    'AMT_DRAWINGS_OTHER_CURRENT': stats_with_sum,
    'AMT_DRAWINGS_POS_CURRENT': stats_with_sum,
    'AMT_INST_MIN_REGULARITY': stats_with_var,
    'AMT_PAYMENT_CURRENT': stats_with_sum,
    'AMT_PAYMENT_TOTAL_CURRENT': stats_with_sum,
    'AMT_RECEIVABLE_PRINCIPAL': stats_with_var,
    'AMT_RECIVABLE': stats_with_var,
    'AMT_TOTAL_RECEIVABLE': stats_with_var,
    'CNT_DRAWINGS_ATM_CURRENT': stats_count,
    'CNT_DRAWINGS_CURRENT': stats_count,
    'CNT_DRAWINGS_OTHER_CURRENT': stats_count,
    'CNT_DRAWINGS_POS_CURRENT': stats_count,
    'CNT_INSTALMENT_MATURE_CUM': stats_count,
    'SK_DPD': stats_dpd,
    'SK_DPD_DEF': stats_dpd,
}
# One-hot columns are averaged per applicant.
cat_aggregations = {cat: ['mean'] for cat in cat_cols}
# General aggregations: numeric spec first, then the one-hot means.
cc_spec = dict(num_aggregations)
cc_spec.update(cat_aggregations)
cc_by_client = cc.groupby('SK_ID_CURR')
cc_agg = cc_by_client.agg(cc_spec)
# Flatten (column, function) pairs into CC_<COLUMN>_<FUNCTION> names.
cc_agg.columns = pd.Index(
    ['CC_{}_{}'.format(column, func.upper()) for column, func in cc_agg.columns.tolist()]
)
# Count credit card lines
cc_agg['CC_COUNT'] = cc_by_client.size()

#cc_agg['INSTALLMENTS_PER_LOAN'] = cc_agg['CNT_INSTALMENT_MATURE_CUM_SUM'] / cc_agg['CC_COUNT']
del cc_by_client, cc
print('cc_agg Shape : ', cc_agg.shape)
gc.collect()

all_data = all_data.join(cc_agg, how='left', on='SK_ID_CURR')
print('all_data Shape : ', all_data.shape)
del cc_agg
gc.collect()

Read data and test
data test Shapes :  (307511, 122) (48744, 121)
all_data Shape :  (356255, 122)
['NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE', 'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE']
all_data Shape :  (356255, 255)
all_data Shape :  (356255, 261)
bureau  buro_bal Shape :  (1716428, 17) (27299925, 3)
['STATUS']
['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']
bb_agg Shape :  (817395, 13)
bureau Shape :  (1716428, 52)
bureau_agg Shape :  (305811, 71)
bureau_agg Shape :  (305811, 107)
bureau_agg Shape :  (305811, 145)
all_data Shape :  (356255, 405)
prev Shape :  (1670214, 37)
['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'N

7

In [4]:
# Percentage of missing values per column, keeping only columns that have any.
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})

# Columns that are entirely missing are dropped below.
drop_col = missing_data[missing_data['Missing Ratio']==100]
missing_data = missing_data[missing_data['Missing Ratio']<100]

# Use the `columns=` keyword: passing the axis positionally (drop(labels, 1))
# is rejected by pandas >= 2.0.
all_data = all_data.drop(columns=drop_col.index.values)
# Keep the applicant ids for the output files, then remove them from the features.
ID = all_data.SK_ID_CURR
all_data = all_data.drop(columns='SK_ID_CURR')
# Mean-impute the remaining gaps and bound the values to [-1e11, 1e11].
all_data = all_data.fillna(all_data.mean()).clip(-1e11,1e11)
# The scaler is fitted on train and test rows together, then applied to each part.
scaler = MinMaxScaler()
scaler.fit(all_data)
data = scaler.transform(all_data.iloc[:n_data])
test = scaler.transform(all_data.iloc[n_data:])
del all_data
gc.collect()
print (data.shape)
print (test.shape)


(307511, 846)
(48744, 846)


In [13]:
# Cross-validation configuration shared by get_nn_oof.
NFOLDS=5
SEED = 1700
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)


def _build_nn(input_dim):
    """Build and compile the fully connected binary classifier.

    The first hidden layer (800 units) is followed by PReLU only; the
    320/128/32/16-unit layers are each followed by PReLU and
    BatchNormalization. The output is a single sigmoid unit trained with
    binary cross-entropy and the 'adam' optimizer.
    """
    nn = Sequential()
    nn.add(Dense(units = 800 , kernel_initializer = 'normal', input_dim = input_dim))
    nn.add(PReLU())
    for units in (320, 128, 32, 16):
        nn.add(Dense(units = units, kernel_initializer = 'normal'))
        nn.add(PReLU())
        nn.add(BatchNormalization())
    nn.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    nn.compile(loss='binary_crossentropy', optimizer='adam')
    return nn


def get_nn_oof(x_train, y_train, x_test):
    """Train one network per fold and return out-of-fold / test predictions.

    Parameters
    ----------
    x_train : 2-D array indexed positionally (``x_train[index_array]``).
    y_train : object supporting ``.iloc`` (e.g. a pandas Series) aligned
        row-for-row with ``x_train``.
    x_test : 2-D array with the same number of columns as ``x_train``.

    Returns
    -------
    (oof_train, oof_test)
        ``oof_train`` has shape (len(x_train), 1) and holds, for every training
        row, the prediction of the model that did not train on it.
        ``oof_test`` has shape (len(x_test), 1) and is the mean of the per-fold
        test predictions.

    Side effects: writes the best checkpoint of each fold to "./keras.model"
    (overwritten fold after fold) and prints per-fold AUC values.
    """
    oof_train = np.zeros((x_train.shape[0],))
    # One row of test predictions per fold; zero-initialised and assigned
    # (not accumulated) so no uninitialised memory leaks into the result.
    oof_test_skf = np.zeros((folds.n_splits, x_test.shape[0]))

    for i, (train_index, test_index) in enumerate(folds.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train[test_index]
        y_te = y_train.iloc[test_index]
        print( 'Setting up neural network...' )
        K.clear_session()
        # Size the input layer from the data instead of a hard-coded width.
        nn = _build_nn(x_train.shape[1])
        early_stopping = EarlyStopping(patience=5, verbose=1)
        model_checkpoint = ModelCheckpoint("./keras.model", save_best_only=True, verbose=1)
        reduce_lr = ReduceLROnPlateau(factor=0.1, patience=3, min_lr=0.00001, verbose=1)
        print( 'Fitting neural network...' )
        nn.fit(x_tr, y_tr, validation_data = (x_te, y_te), epochs=100,batch_size=512,
              callbacks=[early_stopping, model_checkpoint, reduce_lr])
        # Predict with the checkpointed (best) model rather than the
        # last-epoch weights.
        nn = load_model("./keras.model")
        print( 'Predicting...' )
        oof_train[test_index] = nn.predict(x_te).flatten().clip(0,1)
        # Store the raw fold prediction; the averaging happens exactly once,
        # in the mean over folds below.
        oof_test_skf[i, :] = nn.predict(x_test).flatten().clip(0,1)

        print('Fold %2d Train AUC : %.6f' % (i + 1, roc_auc_score(y_tr, nn.predict(x_tr).flatten().clip(0,1))))
        print('Fold %2d Test AUC : %.6f' % (i + 1, roc_auc_score(y_te, oof_train[test_index])))
        del nn,x_tr,y_tr,x_te,y_te
        gc.collect()
    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [14]:
# Run the K-fold training on the scaled train/test matrices; both results
# come back as column vectors (get_nn_oof reshapes them to (-1, 1)).
oof_train, oof_test = get_nn_oof(data, y, test)

Setting up neural network...
Fitting neural network...
Train on 246008 samples, validate on 61503 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.25050, saving model to ./keras.model
Epoch 2/100

Epoch 00002: val_loss did not improve from 0.25050
Epoch 3/100

Epoch 00003: val_loss improved from 0.25050 to 0.24810, saving model to ./keras.model
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.24810
Epoch 5/100

Epoch 00005: val_loss improved from 0.24810 to 0.24551, saving model to ./keras.model
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.24551
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.24551
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.24551

Epoch 00008: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 9/100

Epoch 00009: val_loss improved from 0.24551 to 0.24108, saving model to ./keras.model
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.24108
Epoch 11/100

Epoch 00011: val_loss


Epoch 00004: val_loss did not improve from 0.24354
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.24354
Epoch 6/100

Epoch 00006: val_loss did not improve from 0.24354

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/100

Epoch 00007: val_loss improved from 0.24354 to 0.24249, saving model to ./keras.model
Epoch 8/100

Epoch 00008: val_loss improved from 0.24249 to 0.24158, saving model to ./keras.model
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.24158
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.24158
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.24158

Epoch 00011: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 12/100

Epoch 00012: val_loss did not improve from 0.24158
Epoch 13/100

Epoch 00013: val_loss did not improve from 0.24158
Epoch 00013: early stopping
Predicting...
Fold  4 Train AUC : 0.789800
Fold  4 Test AUC : 0.770531
Setting up neural network...
Fittin

In [15]:
# Out-of-fold predictions for the training rows, keyed by applicant id.
oof_train_csv = pd.DataFrame({
    'ID': ID[:n_data],
    'nn_oof_train': np.ravel(oof_train),
})
oof_train_csv.to_csv('./output/nn_oof_train.csv', index=False)

# Fold-averaged predictions for the test rows, keyed by applicant id.
oof_test_csv = pd.DataFrame({
    'ID': ID[n_data:],
    'nn_oof_test': np.ravel(oof_test),
})
oof_test_csv.to_csv('./output/nn_oof_test.csv', index=False)

In [16]:
# Submission file: test applicant ids with their predicted TARGET.
submission = pd.DataFrame({
    'SK_ID_CURR': ID[n_data:],
    'TARGET': np.ravel(oof_test),
})
submission.to_csv('nn_predict.csv', index=False)