In [1]:
# Solution attempt 11: LightGBM Stacked with Logistic Regression
# Final submission: 
# Submission score: 0.792

# Additional tweaking with LGBM.  Should stay largely the same as
# v10's version, since I think I've taken it about as far as I can
# understand right now.

In [2]:
# General stuff to work on:
# - figure out where I could stick harmonic/geometric means
# - dimensionality reduction on the _MODE/_MEAN features?

In [1]:
import v11_common as com

In [2]:
import feather
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd

<h2> Load up the data and start combining it all here. </h2>

In [3]:
all_data = pd.read_feather("all_data.feather")
bureau_df = pd.read_feather("bureau_sub.feather").set_index("SK_ID_CURR")
prev_app_df = pd.read_feather("previous_application_sub.feather").set_index("SK_ID_CURR")
cc_df = pd.read_feather("credit_card_sub.feather").set_index("SK_ID_CURR")
install_payment_df = pd.read_feather("installments_payments_sub.feather").set_index("SK_ID_CURR")
POS_cash_df = pd.read_feather("POS_cash_sub.feather").set_index("SK_ID_CURR")
target_df = pd.read_feather("target.feather")

In [4]:
supplemental_file_df = bureau_df.join(prev_app_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(cc_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(install_payment_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(POS_cash_df, how = "outer")

In [5]:
merged_data = all_data.join(supplemental_file_df, how = "left", on = "SK_ID_CURR")

In [6]:
train = merged_data.iloc[:len(target_df),:].copy()
test = merged_data.iloc[len(target_df):,:].copy()
train.reset_index(drop = True).to_feather("v11 train data.feather")
test.reset_index(drop = True).to_feather("v11 test data.feather")

In [7]:
train_IDs = train["SK_ID_CURR"]
test_IDs = test["SK_ID_CURR"]

train.drop(["SK_ID_CURR"], axis = 1, inplace = True)
test.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [8]:
lgbm_parameters = {"n_estimators":6000,
                   "learning_rate":0.01,      #previous: 0.01
                   "num_leaves":20,           #previous: 32, 20 (opt)
                   "colsample_by_tree":0.932, #previous: 0.9315 (opt)
                   "subsample":0.6124,         #previous: 0.771, 0.6124 (opt)
                   "max_depth":4,             #previous: 7, 4 (opt)
                   "reg_alpha":0.0586,        #previous: 0.15, 0.0586 (opt)
                   "reg_lambda":0.2184,       #previous: 0.091, 0.2184 (opt)
                   "min_split_gain":0.0913,   #previous: 0.1, 0.0913 (opt)
                   "min_child_weight":2}      #previous: 2, 49 (opt)

In [24]:
# Train it up!
# 2018-08-14: Removed a bunch of unimportant cols from all_data;
#             added number of apps in past N days to bureau;
#             added num/frac cols for NAME_CONTRACT_TYPE in prev_app;
#             added AMT_DOWN_PAYMENT aggs to prev_app;
#             added AMT_RECEIVABLE_PRINCIPAL, AMT_RECEIVABLE, AMT_RECEIVABLE_TOTAL aggs to credit_card


# Previous val AUC: 0.791502
# Best AUC: 0.791502

from sklearn.model_selection import KFold

folds = KFold(n_splits = 8, shuffle = True)
auc_scores = []

feature_importance_df = pd.DataFrame({"Features":train.columns})

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_test = train.iloc[val_ids]
    target_train = target_df["TARGET"][train_ids]
    target_test = target_df["TARGET"][val_ids]
    
    clf = LGBMClassifier(**lgbm_parameters, device = "gpu")
    clf.fit(train_train, target_train, eval_set = [(train_train, target_train), (train_test, target_test)],
            eval_metric = "auc", early_stopping_rounds = 50, verbose = 100)
    auc_scores.append(clf._best_score["valid_1"]["auc"])
    feature_importance_df["Importance Fold " + str(n_fold + 1)] = pd.Series(clf.feature_importances_)

print("Done.")

#### FOLD NUMBER 1 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.760637	valid_1's auc: 0.753878
[200]	training's auc: 0.770123	valid_1's auc: 0.760497
[300]	training's auc: 0.778169	valid_1's auc: 0.766785
[400]	training's auc: 0.783932	valid_1's auc: 0.77104
[500]	training's auc: 0.788379	valid_1's auc: 0.774109
[600]	training's auc: 0.791948	valid_1's auc: 0.776424
[700]	training's auc: 0.794996	valid_1's auc: 0.778158
[800]	training's auc: 0.797775	valid_1's auc: 0.779573
[900]	training's auc: 0.800345	valid_1's auc: 0.780904
[1000]	training's auc: 0.802808	valid_1's auc: 0.782243
[1100]	training's auc: 0.80508	valid_1's auc: 0.783324
[1200]	training's auc: 0.807133	valid_1's auc: 0.784168
[1300]	training's auc: 0.809207	valid_1's auc: 0.784894
[1400]	training's auc: 0.811097	valid_1's auc: 0.785609
[1500]	training's auc: 0.812884	valid_1's auc: 0.786132
[1600]	training's auc: 0.814556	valid_1's auc: 0.786642
[1700]	training's auc: 0.8161

[1600]	training's auc: 0.814029	valid_1's auc: 0.785462
[1700]	training's auc: 0.815652	valid_1's auc: 0.785838
[1800]	training's auc: 0.817194	valid_1's auc: 0.786138
[1900]	training's auc: 0.818752	valid_1's auc: 0.786424
[2000]	training's auc: 0.820242	valid_1's auc: 0.786644
[2100]	training's auc: 0.821723	valid_1's auc: 0.786844
[2200]	training's auc: 0.823086	valid_1's auc: 0.786982
[2300]	training's auc: 0.824379	valid_1's auc: 0.787127
[2400]	training's auc: 0.825615	valid_1's auc: 0.787263
[2500]	training's auc: 0.826813	valid_1's auc: 0.787406
[2600]	training's auc: 0.828125	valid_1's auc: 0.78752
[2700]	training's auc: 0.829374	valid_1's auc: 0.787627
[2800]	training's auc: 0.830536	valid_1's auc: 0.787711
[2900]	training's auc: 0.831732	valid_1's auc: 0.787808
[3000]	training's auc: 0.832882	valid_1's auc: 0.787882
[3100]	training's auc: 0.834006	valid_1's auc: 0.787962
Early stopping, best iteration is:
[3115]	training's auc: 0.83417	valid_1's auc: 0.787979
#### FOLD NUMBE

[100]	training's auc: 0.76031	valid_1's auc: 0.755609
[200]	training's auc: 0.769771	valid_1's auc: 0.763578
[300]	training's auc: 0.77789	valid_1's auc: 0.769645
[400]	training's auc: 0.783686	valid_1's auc: 0.773836
[500]	training's auc: 0.788031	valid_1's auc: 0.776667
[600]	training's auc: 0.79161	valid_1's auc: 0.778722
[700]	training's auc: 0.794698	valid_1's auc: 0.78042
[800]	training's auc: 0.797513	valid_1's auc: 0.781755
[900]	training's auc: 0.800151	valid_1's auc: 0.782969
[1000]	training's auc: 0.802543	valid_1's auc: 0.783937
[1100]	training's auc: 0.804822	valid_1's auc: 0.784907
[1200]	training's auc: 0.806905	valid_1's auc: 0.785717
[1300]	training's auc: 0.808824	valid_1's auc: 0.786467
[1400]	training's auc: 0.810647	valid_1's auc: 0.787172
[1500]	training's auc: 0.812343	valid_1's auc: 0.787789
[1600]	training's auc: 0.813995	valid_1's auc: 0.788132
[1700]	training's auc: 0.815591	valid_1's auc: 0.788426
[1800]	training's auc: 0.817129	valid_1's auc: 0.78872
[1900]

In [26]:
feature_importance_df["Average Importance"] = feature_importance_df.iloc[:,1:].mean(axis=1)
feature_importance_df.to_csv("v11 Feature Importance LGBM.csv", index = False)

In [25]:
print(auc_scores)
print(np.mean(auc_scores))

[0.7912602703806356, 0.7903675326201419, 0.7893833077839807, 0.7879794983564918, 0.7900246389702066, 0.7964600850623501, 0.7938164487641909, 0.792462169567831]
0.7914692439382285


In [9]:
# Last AUC @ 6000 iterations: 0.854995
with com.timer("TEST"):
    clf2 = LGBMClassifier(**lgbm_parameters)
    clf2.fit(train,
             target_df["TARGET"],
             eval_set = [(train, target_df["TARGET"])],
             eval_metric = "auc",
             early_stopping_rounds = 150,
             verbose = 100)

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.760213
[200]	training's auc: 0.769238
[300]	training's auc: 0.777003
[400]	training's auc: 0.782602
[500]	training's auc: 0.786822
[600]	training's auc: 0.790227
[700]	training's auc: 0.793171
[800]	training's auc: 0.795935
[900]	training's auc: 0.798404
[1000]	training's auc: 0.800619
[1100]	training's auc: 0.802772
[1200]	training's auc: 0.804676
[1300]	training's auc: 0.806442
[1400]	training's auc: 0.808224
[1500]	training's auc: 0.80981
[1600]	training's auc: 0.811356
[1700]	training's auc: 0.812787
[1800]	training's auc: 0.814223
[1900]	training's auc: 0.815566
[2000]	training's auc: 0.81685
[2100]	training's auc: 0.818109
[2200]	training's auc: 0.819396
[2300]	training's auc: 0.820634
[2400]	training's auc: 0.82189
[2500]	training's auc: 0.823052
[2600]	training's auc: 0.824198
[2700]	training's auc: 0.825344
[2800]	training's auc: 0.826466
[2900]	training's auc: 0.827487
[3000]	training's auc

In [10]:
train_predictions = clf2.predict_proba(train, num_iteration = clf2.best_iteration_)
data_predictions = clf2.predict_proba(test, num_iteration = clf2.best_iteration_)

In [11]:
train_preds = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":train_predictions[:,1]})
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":data_predictions[:,1]})

train_preds.to_csv("v11_predictions_LGBM_train.csv", index = False)
submission.to_csv("v11_predictions_LGBM.csv", index = False)