In [None]:
# Solution attempt 11, part 1: neural network with improvements
# Final submission: 
# Submission score: 0.762

# This neural network is a revival of v05 code in order to act as part of a multi-model prediction.

In [24]:
import v11_common as com

import numpy as np
import pandas as pd
import feather
import re
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, AlphaDropout

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
all_data = pd.read_feather("all_data.feather")
target_df = pd.read_feather("target.feather")

train_IDs = all_data[:len(target_df)]["SK_ID_CURR"]
test_IDs = all_data.loc[len(target_df):,"SK_ID_CURR"]

In [3]:
all_data.drop([c for c in all_data.columns if re.search("^ORGANIZATION_", c)], axis = 1, inplace = True)

In [4]:
bureau_df = pd.read_feather("bureau_sub.feather").set_index("SK_ID_CURR")
prev_app_df = pd.read_feather("previous_application_sub.feather").set_index("SK_ID_CURR")
cc_df = pd.read_feather("credit_card_sub.feather").set_index("SK_ID_CURR")
install_payment_df = pd.read_feather("installments_payments_sub.feather").set_index("SK_ID_CURR")
POS_cash_df = pd.read_feather("POS_cash_sub.feather").set_index("SK_ID_CURR")

In [5]:
supplemental_file_df = bureau_df.join(prev_app_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(cc_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(install_payment_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(POS_cash_df, how = "outer")
supplemental_cols = supplemental_file_df.columns

In [15]:
merged_data = all_data.join(supplemental_file_df, how = "left", on = "SK_ID_CURR")

In [16]:
merged_data[supplemental_cols] = merged_data[supplemental_cols].fillna(0).copy()

In [17]:
merged_data.dropna(axis = 1, inplace = True)
merged_data.drop("SK_ID_CURR", axis = 1, inplace = True)
merged_data.drop(["PERCENT_AMT_ATM_DRAWINGS_MAX","PERCENT_AMT_ATM_DRAWINGS_AMEAN"], axis = 1, inplace = True)

In [18]:
sc = StandardScaler()
sc.fit(merged_data)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [19]:
train_orig = merged_data.iloc[:len(target_df),:].copy()
test = merged_data.iloc[len(target_df):,:].copy()

In [20]:
# Artificially increase the number of TARGET = 1 cases
train = train_orig.copy()
train = pd.concat([train, target_df["TARGET"]], axis = 1)
train_real = train.loc[target_df["TARGET"] == 1,:]
train = pd.concat([train_real,train,train_real])
train = train.sample(frac = 1).reset_index(drop = True)
target2 = train["TARGET"]
train.drop("TARGET", axis = 1, inplace = True)

In [21]:
train = sc.transform(train)
test = sc.transform(test)

In [22]:
def keras_nn_classifier():
    classifier = Sequential()
    classifier.add(Dense(units = 350, kernel_initializer = 'uniform', activation = 'elu', input_dim = train.shape[1]))
    #classifier.add(BatchNormalization())
    classifier.add(Dropout(rate = 0.3))
    classifier.add(Dense(units = 200, kernel_initializer = 'uniform', activation = 'elu'))
    #classifier.add(BatchNormalization())
    classifier.add(Dropout(rate = 0.3))
    classifier.add(Dense(units = 150, kernel_initializer = 'uniform', activation = 'elu'))
    #classifier.add(BatchNormalization())
    classifier.add(Dropout(rate = 0.3))
    classifier.add(Dense(units = 85, kernel_initializer = 'uniform', activation = 'elu'))
    #classifier.add(BatchNormalization())
    classifier.add(Dropout(rate = 0.3))
    classifier.add(Dense(units = 40, kernel_initializer = 'uniform', activation = 'elu'))
    #classifier.add(BatchNormalization())
    classifier.add(Dropout(rate = 0.3))
    classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return classifier

In [25]:
#import keras.backend as K
# may be able to use the backend to calculate AUC

N_ITER = 8

val_aucs = []

for _ in range(N_ITER):
    train_train, train_val, target_train, target_val = train_test_split(train, target2)
    
    classifier = keras_nn_classifier()
    
    with com.timer("NN Training"):
        classifier.fit(train_train, target_train, batch_size = 5000, epochs = 15, verbose = 0)
    
    val_predictions = classifier.predict(train_val)
    auc = roc_auc_score(target_val, val_predictions)
    val_aucs.append(auc)
    print(auc)
print(f"Average AUC: {np.mean(val_aucs)}")

NN Training -- done in 14.997320890426636 sec
0.7833562998792846
NN Training -- done in 13.88657522201538 sec
0.7831617124575226
NN Training -- done in 15.36235237121582 sec
0.7824977028096484
NN Training -- done in 15.411657571792603 sec
0.7811878222416998
NN Training -- done in 13.577293872833252 sec
0.7810829918562088
NN Training -- done in 13.650235891342163 sec
0.7788608715288631
NN Training -- done in 14.8536958694458 sec
0.7814507520047171
NN Training -- done in 14.752395629882812 sec
0.7803443815664097
Average AUC: 0.7814928167930443


In [19]:
train_predictions = classifier.predict(train_train)
test_predictions = classifier.predict(train_val)
roc_train = roc_auc_score(target_train, train_predictions)
roc_test = roc_auc_score(target_val, test_predictions)
print(f"Train ROC: {roc_train}\nTest ROC: {roc_test}")

Train ROC: 0.7966232662574978
Test ROC: 0.7796471295328824


In [26]:
clf2 = keras_nn_classifier()
clf2.fit(train, target2, batch_size = 5000, epochs = 15, verbose = 1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7efeadb8ed30>

In [28]:
data_predictions = clf2.predict(test)
train_predictions = clf2.predict(train_orig)

In [29]:
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":np.reshape(data_predictions, newshape = (-1))})
train_preds = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":np.reshape(train_predictions, newshape = (-1))})

In [30]:
submission.to_csv("v11_predictions_Keras.csv", index = False)
train_preds.to_csv("v11_predictions_Keras_train.csv", index = False)