In [7]:
# Solution attempt 5: neural network with improvements
# Final submission: 2018-06-21
# Submission score: 0.719

# This is intended to be a basic neural network thrown at the problem, with
# about the same level of preprocessing as the v01 logistic regression model did.

In [1]:
import numpy as np
#import keras
import pandas as pd
import feather
import re
from keras.models import Sequential
from keras.layers import Dense, Dropout

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def has_NAs(pd_series):
    return pd_series.isnull().values.any()

# This is deleting stuff that it shouldn't
def string_col_to_onehot(df, col_name):
    dummy_cols = pd.get_dummies(df[col_name], drop_first = True, prefix = col_name)
    df.drop(columns = [col_name], inplace = True)
    return(pd.concat([df, dummy_cols], axis = 1))

In [103]:
train = pd.read_feather("./../Data Files/application_train.feather")
test = pd.read_feather("./../Data Files/application_test.feather")

In [104]:
target = train["TARGET"]
train.drop("TARGET", inplace = True, axis = 1)
train_rows = len(train)
test_IDs = test["SK_ID_CURR"]

In [114]:
all_data = pd.concat([train,test], ignore_index = True)

In [115]:
# These operations are a bit more consistent if they're done on the training & test sets together
building_info_columns = [c for c in train.columns.tolist() if re.search("_AVG$|_MODE$|_MEDI$", c)]
other_columns_to_remove = ["EXT_SOURCE_1","EXT_SOURCE_3","OWN_CAR_AGE"]#,"NAME_TYPE_SUITE"]

all_data.drop(building_info_columns + other_columns_to_remove, inplace = True, axis = 1)

# NEW STUFF
all_data["CREDIT_INCOME_RATIO"] = all_data["AMT_CREDIT"] / all_data["AMT_INCOME_TOTAL"]
all_data["INCOME_PER_HEAD"] = all_data["AMT_INCOME_TOTAL"] / all_data["CNT_FAM_MEMBERS"]
all_data["ANNUITY_INCOME_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_INCOME_TOTAL"]

all_data["AMT_INCOME_TOTAL"] = np.log10(all_data["AMT_INCOME_TOTAL"])

conditions = [all_data["CNT_CHILDREN"] == 0, all_data["CNT_CHILDREN"] == 1, all_data["CNT_CHILDREN"] >= 2]
choices = ["0","1","2+"]
all_data["CNT_CHILDREN"] = np.select(conditions, choices)

object_cols = [col for col in all_data.columns if all_data[col].dtype == "O"]
for oc in object_cols:
    all_data = string_col_to_onehot(all_data, oc)

In [116]:
# Add data from bureau file
bureau = pd.read_feather("./../Data Files/bureau.feather")
bureau_sub = bureau.groupby("SK_ID_CURR").agg({"CREDIT_DAY_OVERDUE":lambda x: 1*any(x > 0)})
bureau_sub["CREDIT_COUNT"] = bureau.groupby("SK_ID_CURR").size()
bureau_sub.reset_index(inplace = True)
bureau_sub.rename(columns = {"CREDIT_DAY_OVERDUE":"ANY_OVERDUE"}, inplace = True)

all_data = all_data.join(bureau_sub.set_index("SK_ID_CURR"), on = "SK_ID_CURR").fillna(0)
del bureau, bureau_sub

In [36]:
sum(all_data["AMT_GOODS_PRICE"] == 0)

278

In [117]:
# Add data from previous_application
prev_application = pd.read_feather("./../Data Files/previous_application.feather")

prev_app_sub = prev_application.groupby("SK_ID_CURR").NAME_CONTRACT_STATUS.value_counts()
prev_app_sub = prev_app_sub.unstack("NAME_CONTRACT_STATUS")
prev_app_sub.drop(["Unused offer", "Canceled"], inplace = True, axis = 1)
prev_app_sub.fillna(value = 0, inplace = True)
prev_app_sub.rename(columns = {"Approved":"NUMBER_APPROVED", "Refused":"NUMBER_REFUSED"}, inplace = True)
prev_app_sub["NUMBER_APPLICATIONS"] = prev_application.groupby("SK_ID_CURR").size()

all_data = all_data.join(prev_app_sub, on = "SK_ID_CURR").fillna(0)
del prev_application, prev_app_sub

In [118]:
# Add credit card balance data
cc_balance = pd.read_feather("./../Data Files/credit_card_balance.feather")
cc_balance_grouped = cc_balance.groupby("SK_ID_CURR")
cc_balance_sub = cc_balance_grouped.agg({"SK_DPD_DEF":lambda x: sum(x != 0), "AMT_CREDIT_LIMIT_ACTUAL":max,
                                         "SK_ID_PREV":lambda x: len(x.unique()), "AMT_BALANCE":max})
cc_balance_sub.rename(columns = {"SK_DPD_DEF":"NUM_LATE_CC_PAYMENTS",
                                 "AMT_CREDIT_LIMIT_ACTUAL":"MAX_CREDIT_LIMIT",
                                 "SK_ID_PREV":"NUM_PREV_CC_LOANS",
                                 "AMT_BALANCE":"MAX_BALANCE"}, inplace = True)
cc_balance_sub["AVG_CC_BALANCE"] = cc_balance_grouped["AMT_BALANCE"].mean()

all_data = all_data.join(cc_balance_sub, on = "SK_ID_CURR").fillna(0)
del cc_balance, cc_balance_grouped, cc_balance_sub

In [119]:
# Add installments_payments.csv data
install_paym = pd.read_feather("./../Data Files/installments_payments.feather")
install_paym["AMT_PAYMENT"].fillna(value = install_paym["AMT_INSTALMENT"], inplace = True)
install_paym["DAYS_ENTRY_PAYMENT"].fillna(value = install_paym["DAYS_INSTALMENT"], inplace = True)

install_paym["PAYMENT_UNDER"] = install_paym["AMT_PAYMENT"] < install_paym["AMT_INSTALMENT"]
install_paym["DAYS_OFF_PAYMENT"] = install_paym["DAYS_INSTALMENT"] - install_paym["DAYS_ENTRY_PAYMENT"]
install_paym["PAYMENT_LATE"] = install_paym["DAYS_ENTRY_PAYMENT"] > install_paym["DAYS_INSTALMENT"]


install_paym_sub = install_paym.groupby("SK_ID_CURR").agg({"PAYMENT_UNDER":sum, "PAYMENT_LATE":sum, "AMT_PAYMENT":[min,max],"DAYS_OFF_PAYMENT":[min,max]})
install_paym_sub.columns = ["NUM_PAYMENTS_UNDER","NUM_PAYMENTS_LATE","MIN_PAYMENT","MAX_PAYMENT","BEST_PAYMENT_DATE","WORST_PAYMENT_DATE"]

all_data = all_data.join(install_paym_sub, on = "SK_ID_CURR").fillna(0)

del install_paym, install_paym_sub

In [120]:
# Add POS_CASH_balance.csv data
POS_cash = pd.read_feather("./../Data Files/POS_CASH_balance.feather")
POS_cash_sub = POS_cash.groupby("SK_ID_CURR").agg({"SK_DPD_DEF":lambda x: sum(x == 1)})
POS_cash_sub.columns = ["NUM_LATE_POS_PAYMENTS"]
POS_cash_sub.head()

all_data = all_data.join(POS_cash_sub, on = "SK_ID_CURR").fillna(0)

del POS_cash
del POS_cash_sub

In [61]:
all_data.head()

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,100002,5.306425,406597.5,24700.5,351000.0,0.018801,-9461,-637,-3648.0,-2120,...,0,0,0,0,0,0,0,0,0,0
1,100003,5.431364,1293502.5,35698.5,1129500.0,0.003541,-16765,-1188,-1186.0,-291,...,0,0,0,0,0,0,0,0,0,0
2,100004,4.829304,135000.0,6750.0,135000.0,0.010032,-19046,-225,-4260.0,-2531,...,0,0,0,0,0,0,0,0,0,0
3,100006,5.130334,312682.5,29686.5,297000.0,0.008019,-19005,-3039,-9833.0,-2437,...,0,0,0,0,0,0,0,0,0,0
4,100007,5.084576,513000.0,21865.5,513000.0,0.028663,-19932,-3038,-4311.0,-3458,...,0,0,0,0,0,0,0,0,0,0


In [63]:
[x for x in all_data.columns.tolist() if re.search("CNT",x)]

['CNT_FAM_MEMBERS',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'CNT_CHILDREN_1',
 'CNT_CHILDREN_2+']

In [121]:
all_data.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [None]:
# Here's where the data processing ends and the NN training begins

In [122]:
train = all_data.iloc[:train_rows,:].copy()
test = all_data.iloc[train_rows:,:].copy()

In [123]:
train.dropna(inplace = True)
target = target[train.index]

In [18]:
for col in test.columns:
    if test[col].isnull().sum() > 0:
        if test[col].dtype == "O":
            test[col] = test[col].fillna("Unknown")
        else:
            test.loc[test[col].isnull(),col] = test[col].median(skipna = True)

In [124]:
for col in test.columns:
    print(test[col].dtype)

float64
float64
float64
float64
float64
int64
int64
float64
int64
int64
int64
int64
int64
int64
int64
float64
int64
int64
int64
int64
int64
int64
int64
int64
int64
float64
float64
float64
float64
float64
float64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
float64
float64
float64
float64
float64
float64
float64
float64
float64
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
uint8
ui

In [125]:
# Artificially increase the number of TARGET = 1 cases
train2 = train.copy()
train2 = pd.concat([train2, target], axis = 1)
train2_real = train2.loc[target == 1,:]
train2 = pd.concat([train2_real,train2,train2_real])
train2 = train2.sample(frac = 1).reset_index(drop = True)
target2 = train2["TARGET"]
train2.drop("TARGET", axis = 1, inplace = True)

In [126]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
train2 = sc.fit_transform(train2)

In [127]:
split = int(len(train2)*0.8)
train2_train, train2_test, target2_train, target2_test = train2[:split], train2[split:], target2[:split], target2[split:]

In [134]:
#Best so far: 85-85-40 units, 0.35-0.35-0.25 dropout, batch size 2500, 35 epochs
#AUC: 0.774799 train_train / 0.752438 train_test
classifier = Sequential()
classifier.add(Dense(units = 85, kernel_initializer = 'uniform', activation = 'relu', input_dim = train2.shape[1]))
classifier.add(Dropout(rate = 0.35))
classifier.add(Dense(units = 85, kernel_initializer = 'uniform', activation = 'relu', input_dim = train2.shape[1]))
classifier.add(Dropout(rate = 0.35))
classifier.add(Dense(units = 40, kernel_initializer = 'uniform', activation = 'relu', input_dim = train2.shape[1]))
classifier.add(Dropout(rate = 0.25))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics = ['accuracy'])

In [135]:
classifier.fit(train2_train, target2_train, batch_size = 2500, epochs = 35)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.callbacks.History at 0x7ff01ab34160>

In [136]:
train2_test2 = sc.fit_transform(train2_test)
train_predictions = classifier.predict(train2_train)
test_predictions = classifier.predict(train2_test2)
roc_train = roc_auc_score(target2_train, train_predictions)
roc_test = roc_auc_score(target2_test, test_predictions)
print(f"Train ROC: {roc_train}\nTest ROC: {roc_test}")

Train ROC: 0.7747986222907894
Test ROC: 0.7524384601382785


In [26]:
from sklearn.metrics import roc_auc_score

In [137]:
data_predictions = classifier.predict(sc.fit_transform(test))

In [138]:
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":np.reshape(data_predictions, newshape = (-1))})

In [139]:
submission.to_csv("v05_predictions.csv", index = False)