In [None]:
# Solution attempt 10: LightGBM, Stacking with Improved Logistic Regression
# Final submission: 2018-08-18
# Submission score: 0.792

# Further work with lightgbm, including stacking of other models as well.
# Started this mostly to snapshot the previous solution.

In [None]:
# General stuff to work on:
# - figure out where I could stick harmonic/geometric means
# - dimensionality reduction on the _MODE/_MEAN features?

In [1]:
import v10_common as com

In [2]:
import feather
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd

<h2> Load up the data and start combining it all here. </h2>

In [3]:
all_data = pd.read_feather("all_data.feather")
bureau_df = pd.read_feather("bureau_sub.feather").set_index("SK_ID_CURR")
prev_app_df = pd.read_feather("previous_application_sub.feather").set_index("SK_ID_CURR")
cc_df = pd.read_feather("credit_card_sub.feather").set_index("SK_ID_CURR")
install_payment_df = pd.read_feather("installments_payments_sub.feather").set_index("SK_ID_CURR")
POS_cash_df = pd.read_feather("POS_cash_sub.feather").set_index("SK_ID_CURR")
target_df = pd.read_feather("target.feather")

In [4]:
supplemental_file_df = bureau_df.join(prev_app_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(cc_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(install_payment_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(POS_cash_df, how = "outer")

In [5]:
merged_data = all_data.join(supplemental_file_df, how = "left", on = "SK_ID_CURR")

In [6]:
train = merged_data.iloc[:len(target_df),:].copy()
test = merged_data.iloc[len(target_df):,:].copy()
train.reset_index(drop = True).to_feather("v10 train data.feather")
test.reset_index(drop = True).to_feather("v10 test data.feather")

In [7]:
test_IDs = test["SK_ID_CURR"]

train.drop(["SK_ID_CURR"], axis = 1, inplace = True)
test.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [9]:
lgbm_parameters = {"n_estimators":6000,
                   "learning_rate":0.01,     #previous: 0.01
                   "num_leaves":32,          #previous: 32, 24 (opt)
                   "colsample_by_tree":0.8,  #previous: 0.8
                   "subsample":0.771,        #previous: 0.9, 0.771 (opt)
                   "max_depth":7,            #previous: 7, 4 (opt)
                   "reg_alpha":0.084,        #previous: 0.15, 0.084 (opt)
                   "reg_lambda":0.091,       #previous: 0.01, 0.091 (opt)
                   "min_split_gain":0.093,   #previous: 0.1, 0.093 (opt)
                   "min_child_weight":2}     #previous: 2

In [9]:
# Train it up!
# 2018-08-14: Removed a bunch of unimportant cols from all_data;
#             added number of apps in past N days to bureau;
#             added num/frac cols for NAME_CONTRACT_TYPE in prev_app;
#             added AMT_DOWN_PAYMENT aggs to prev_app;
#             added AMT_RECEIVABLE_PRINCIPAL, AMT_RECEIVABLE, AMT_RECEIVABLE_TOTAL aggs to credit_card


# Previous val AUC: 0.790933
# Best AUC: 0.791234

from sklearn.model_selection import KFold

folds = KFold(n_splits = 8, shuffle = True)
auc_scores = []

feature_importance_df = pd.DataFrame({"Features":train.columns})

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_test = train.iloc[val_ids]
    target_train = target_df["TARGET"][train_ids]
    target_test = target_df["TARGET"][val_ids]
    
    clf = LGBMClassifier(**lgbm_parameters, device = "gpu")
    clf.fit(train_train, target_train, eval_set = [(train_train, target_train), (train_test, target_test)], eval_metric = "auc", early_stopping_rounds = 50, verbose = 100)
    auc_scores.append(clf._best_score["valid_1"]["auc"])
    feature_importance_df["Importance Fold " + str(n_fold + 1)] = pd.Series(clf.feature_importances_)

print("Done.")

#### FOLD NUMBER 1 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.769697	valid_1's auc: 0.758266
[200]	training's auc: 0.781813	valid_1's auc: 0.765949
[300]	training's auc: 0.792121	valid_1's auc: 0.771756
[400]	training's auc: 0.800171	valid_1's auc: 0.775919
[500]	training's auc: 0.806851	valid_1's auc: 0.778617
[600]	training's auc: 0.812594	valid_1's auc: 0.780668
[700]	training's auc: 0.817951	valid_1's auc: 0.782201
[800]	training's auc: 0.822726	valid_1's auc: 0.783319
[900]	training's auc: 0.827169	valid_1's auc: 0.78411
[1000]	training's auc: 0.831322	valid_1's auc: 0.784781
[1100]	training's auc: 0.835288	valid_1's auc: 0.78532
[1200]	training's auc: 0.839049	valid_1's auc: 0.785684
[1300]	training's auc: 0.842544	valid_1's auc: 0.786053
[1400]	training's auc: 0.845996	valid_1's auc: 0.786328
[1500]	training's auc: 0.849254	valid_1's auc: 0.786506
[1600]	training's auc: 0.852452	valid_1's auc: 0.786694
[1700]	training's auc: 0.8554

[1600]	training's auc: 0.850902	valid_1's auc: 0.793896
Early stopping, best iteration is:
[1576]	training's auc: 0.850098	valid_1's auc: 0.793946
#### FOLD NUMBER 8 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.769718	valid_1's auc: 0.758576
[200]	training's auc: 0.781697	valid_1's auc: 0.767486
[300]	training's auc: 0.791904	valid_1's auc: 0.774185
[400]	training's auc: 0.799495	valid_1's auc: 0.778592
[500]	training's auc: 0.806056	valid_1's auc: 0.781794
[600]	training's auc: 0.811872	valid_1's auc: 0.784394
[700]	training's auc: 0.817071	valid_1's auc: 0.786521
[800]	training's auc: 0.821903	valid_1's auc: 0.788097
[900]	training's auc: 0.8264	valid_1's auc: 0.789258
[1000]	training's auc: 0.830598	valid_1's auc: 0.790092
[1100]	training's auc: 0.834601	valid_1's auc: 0.790794
[1200]	training's auc: 0.838407	valid_1's auc: 0.791201
[1300]	training's auc: 0.842061	valid_1's auc: 0.791636
[1400]	training's auc: 0.845537	valid_1's auc: 0.7

In [10]:
feature_importance_df["Average Importance"] = feature_importance_df.iloc[:,1:].mean(axis=1)
feature_importance_df.to_csv("v10 Feature Importance.csv", index = False)

In [11]:
print(auc_scores)
print(np.mean(auc_scores))

[0.7869814999730124, 0.7875365208437511, 0.7841016129295512, 0.7961649897943535, 0.7950733593522961, 0.7894316915801407, 0.7939455455138253, 0.7942320268633462]
0.7909334058562846


In [10]:
# Last AUC @ 000 iterations: 0.925479

clf2 = LGBMClassifier(**lgbm_parameters)
clf2.fit(train,
         target_df["TARGET"],
         eval_set = [(train, target_df["TARGET"])],
         eval_metric = "auc",
         early_stopping_rounds = 150,
         verbose = 100)

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.768876
[200]	training's auc: 0.780286
[300]	training's auc: 0.789838
[400]	training's auc: 0.797285
[500]	training's auc: 0.803412
[600]	training's auc: 0.809166
[700]	training's auc: 0.81405
[800]	training's auc: 0.81842
[900]	training's auc: 0.822487
[1000]	training's auc: 0.826238
[1100]	training's auc: 0.829689
[1200]	training's auc: 0.833029
[1300]	training's auc: 0.836336
[1400]	training's auc: 0.839437
[1500]	training's auc: 0.842557
[1600]	training's auc: 0.845602
[1700]	training's auc: 0.848544
[1800]	training's auc: 0.851321
[1900]	training's auc: 0.854017
[2000]	training's auc: 0.856646
[2100]	training's auc: 0.859216
[2200]	training's auc: 0.861688
[2300]	training's auc: 0.864116
[2400]	training's auc: 0.86644
[2500]	training's auc: 0.868723
[2600]	training's auc: 0.87094
[2700]	training's auc: 0.873211
[2800]	training's auc: 0.875142
[2900]	training's auc: 0.877117
[3000]	training's auc:

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_by_tree=0.8,
        colsample_bytree=1.0, importance_type='split', learning_rate=0.01,
        max_depth=7, min_child_samples=20, min_child_weight=2,
        min_split_gain=0.093, n_estimators=6000, n_jobs=-1, num_leaves=32,
        objective=None, random_state=None, reg_alpha=0.084,
        reg_lambda=0.091, silent=True, subsample=0.771,
        subsample_for_bin=200000, subsample_freq=0)

In [11]:
data_predictions = clf2.predict_proba(test, num_iteration = clf2.best_iteration_)
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":data_predictions[:,1]})
submission.to_csv("v10_predictions.csv", index = False)