In [None]:
# Solution attempt 7: lightgbm
# Final submission: 2018-07-01
# Submission score: 0.767

# A first attempt at using the lightgbm package for prediction.  Will also be conducting feature
# importance checks.

In [1]:
import numpy as np
import pandas as pd
import feather
import re
from lightgbm import LGBMClassifier

In [2]:
def has_NAs(pd_series):
    return pd_series.isnull().values.any()

# This is deleting stuff that it shouldn't
def string_col_to_onehot(df, col_name):
    dummy_cols = pd.get_dummies(df[col_name], drop_first = True, prefix = col_name)
    df.drop(columns = [col_name], inplace = True)
    return(pd.concat([df, dummy_cols], axis = 1))

In [76]:
# Load the main data files; don't load anything else yet, 'cause there's a lot
train = pd.read_feather("./../Data Files/application_train.feather")
test = pd.read_feather("./../Data Files/application_test.feather")

# Split off TARGET and establish a couple variables we'll need later
target = train["TARGET"]
train_IDs = train["SK_ID_CURR"]
train.drop("TARGET", inplace = True, axis = 1)
train_rows = len(train)
test_IDs = test["SK_ID_CURR"]

In [31]:
#train["ORGANIZATION_TYPE"].value_counts()

In [78]:
# Merge everything into a single dataset; this'll make processing easier.
all_data = pd.concat([train,test], ignore_index = True)

# These operations are a bit more consistent if they're done on the training & test sets together
building_info_columns = [c for c in train.columns.tolist() if re.search("_AVG$|_MODE$", c)]#("_AVG$|_MODE$|_MEDI$", c)]

# Columns removed due to low importance in lightgbm model
other_columns_to_remove = ["FLAG_MOBIL","FLAG_DOCUMENT_2","FLAG_DOCUMENT_4","FLAG_DOCUMENT_7","FLAG_DOCUMENT_9",
                           "FLAG_DOCUMENT_10","FLAG_DOCUMENT_12","FLAG_DOCUMENT_17","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20"]

all_data.drop(building_info_columns + other_columns_to_remove, inplace = True, axis = 1)

# NEW STUFF
all_data["CREDIT_INCOME_RATIO"] = all_data["AMT_CREDIT"] / all_data["AMT_INCOME_TOTAL"]
all_data["INCOME_PER_HEAD"] = all_data["AMT_INCOME_TOTAL"] / all_data["CNT_FAM_MEMBERS"]
all_data["ANNUITY_INCOME_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_INCOME_TOTAL"]

all_data["AMT_INCOME_TOTAL"] = np.log10(all_data["AMT_INCOME_TOTAL"])
all_data.loc[all_data["CODE_GENDER"] == "XNA", "CODE_GENDER"] = "F"
all_data.loc[all_data["DAYS_EMPLOYED"] == 365243, "DAYS_EMPLOYED"] = np.NaN

conditions = [all_data["CNT_CHILDREN"] == 0, all_data["CNT_CHILDREN"] == 1, all_data["CNT_CHILDREN"] >= 2]
choices = ["0","1","2+"]
all_data["CNT_CHILDREN"] = np.select(conditions, choices)

object_cols = [col for col in all_data.columns if all_data[col].dtype == "O"]
for oc in object_cols:
    all_data = string_col_to_onehot(all_data, oc)

In [79]:
# Add data from bureau file
bureau = pd.read_feather("./../Data Files/bureau.feather")
bureau_sub = bureau.groupby("SK_ID_CURR").agg({"CREDIT_DAY_OVERDUE":lambda x: 1*any(x > 0)})
bureau_sub["CREDIT_COUNT"] = bureau.groupby("SK_ID_CURR").size()
bureau_sub.reset_index(inplace = True)
bureau_sub.rename(columns = {"CREDIT_DAY_OVERDUE":"ANY_OVERDUE"}, inplace = True)

all_data = all_data.join(bureau_sub.set_index("SK_ID_CURR"), on = "SK_ID_CURR")#.fillna(0)
del bureau, bureau_sub

In [80]:
# Add data from previous_application
prev_application = pd.read_feather("./../Data Files/previous_application.feather")

prev_app_grouped = prev_application.groupby("SK_ID_CURR")
prev_app_sub = prev_app_grouped["NAME_CONTRACT_STATUS"].value_counts()
prev_app_sub = prev_app_sub.unstack("NAME_CONTRACT_STATUS")
#prev_app_sub.drop(["Unused offer", "Canceled"], inplace = True, axis = 1)
prev_app_sub.fillna(value = 0, inplace = True)
prev_app_sub.rename(columns = {"Approved":"NUMBER_APPROVED", "Canceled":"NUMBER_CANCELED",
                               "Refused":"NUMBER_REFUSED", "Unused offer":"NUMBER_UNUSED"},
                    inplace = True)
prev_app_sub["NUMBER_APPLICATIONS"] = prev_app_grouped.size()
prev_app_sub["LAST_DECISION_DATE"] = prev_app_grouped.agg({"DAYS_DECISION":min})["DAYS_DECISION"]

all_data = all_data.join(prev_app_sub, on = "SK_ID_CURR")#.fillna(0)
del prev_application, prev_app_sub

In [81]:
# Add credit card balance data
cc_balance = pd.read_feather("./../Data Files/credit_card_balance.feather")
cc_balance_grouped = cc_balance.groupby("SK_ID_CURR")
cc_balance_sub = cc_balance_grouped.agg({"SK_DPD_DEF":lambda x: sum(x != 0), "AMT_CREDIT_LIMIT_ACTUAL":max,
                                         "SK_ID_PREV":lambda x: len(x.unique()), "AMT_BALANCE":max,
                                         "AMT_PAYMENT_TOTAL_CURRENT":max, "CNT_DRAWINGS_CURRENT":max})
cc_balance_sub.rename(columns = {"SK_DPD_DEF":"NUM_LATE_CC_PAYMENTS",
                                 "AMT_CREDIT_LIMIT_ACTUAL":"MAX_CREDIT_LIMIT",
                                 "SK_ID_PREV":"NUM_PREV_CC_LOANS",
                                 "AMT_BALANCE":"MAX_BALANCE",
                                 "AMT_PAYMENT_TOTAL_CURRENT":"MAX_PAID_ON_CREDIT",
                                 "CNT_DRAWINGS_CURRENT":"MAX_DRAWINGS_IN_MONTH", }, inplace = True)
cc_balance_sub["AVG_CC_BALANCE"] = cc_balance_grouped["AMT_BALANCE"].mean()
cc_balance_sub["AVG_DRAWINGS_PER_MONTH"] = cc_balance_grouped["CNT_DRAWINGS_CURRENT"].mean()


cc_balance_sub.head()
all_data = all_data.join(cc_balance_sub, on = "SK_ID_CURR")#.fillna(0)
del cc_balance, cc_balance_grouped, cc_balance_sub

In [82]:
# Add installments_payments.csv data
install_paym = pd.read_feather("./../Data Files/installments_payments.feather")
install_paym["AMT_PAYMENT"].fillna(value = install_paym["AMT_INSTALMENT"], inplace = True)
install_paym["DAYS_ENTRY_PAYMENT"].fillna(value = install_paym["DAYS_INSTALMENT"], inplace = True)

install_paym["PAYMENT_UNDER"] = install_paym["AMT_PAYMENT"] < install_paym["AMT_INSTALMENT"]
install_paym["DAYS_OFF_PAYMENT"] = install_paym["DAYS_INSTALMENT"] - install_paym["DAYS_ENTRY_PAYMENT"]
install_paym["PAYMENT_LATE"] = install_paym["DAYS_ENTRY_PAYMENT"] > install_paym["DAYS_INSTALMENT"]

install_paym_sub = install_paym.groupby("SK_ID_CURR").agg({"PAYMENT_UNDER":sum, "PAYMENT_LATE":sum, "AMT_PAYMENT":[min,max],"DAYS_OFF_PAYMENT":[min,max]})
install_paym_sub.columns = ["NUM_PAYMENTS_UNDER","NUM_PAYMENTS_LATE","MIN_PAYMENT","MAX_PAYMENT","BEST_PAYMENT_DATE","WORST_PAYMENT_DATE"]

all_data = all_data.join(install_paym_sub, on = "SK_ID_CURR")#.fillna(0)

del install_paym, install_paym_sub

In [83]:
# Add POS_CASH_balance.csv data
POS_cash = pd.read_feather("./../Data Files/POS_CASH_balance.feather")
POS_cash_sub = POS_cash.groupby("SK_ID_CURR").agg({"SK_DPD_DEF":lambda x: sum(x == 1)})
POS_cash_sub.columns = ["NUM_LATE_POS_PAYMENTS"]
POS_cash_sub.head()

all_data = all_data.join(POS_cash_sub, on = "SK_ID_CURR")#.fillna(0)

del POS_cash, POS_cash_sub

In [84]:
all_data.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [65]:
train.columns.tolist()

['AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'CNT_FAM_MEMBERS',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'REG_CITY_NOT_LIVE_CITY',
 'REG_CITY_NOT_WORK_CITY',
 'LIVE_CITY_NOT_WORK_CITY',
 'EXT_SOURCE_1',
 'EXT_SOURCE_2',
 'EXT_SOURCE_3',
 'APARTMENTS_MEDI',
 'BASEMENTAREA_MEDI',
 'YEARS_BEGINEXPLUATATION_MEDI',
 'YEARS_BUILD_MEDI',
 'COMMONAREA_MEDI',
 'ELEVATORS_MEDI',
 'ENTRANCES_MEDI',
 'FLOORSMAX_MEDI',
 'FLOORSMIN_MEDI',
 'LANDAREA_MEDI',
 'LIVINGAPARTMENTS_MEDI',
 'LIVINGAREA_MEDI',
 'NONLIVINGAPARTMENTS_MEDI',
 'NONLIVINGAREA_MEDI',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIA

In [94]:
train = all_data.iloc[:train_rows,:].copy()
test = all_data.iloc[train_rows:,:].copy()
train.reset_index(drop = True).to_feather("./../Solution attempts/v07 train data.feather")
test.reset_index(drop = True).to_feather("./../Solution attempts/v07 test data.feather")

In [40]:
pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target}).to_feather("./../Solution attempts/v07 target.feather")

In [86]:
# Train it up!
# Last validation AUC: 0.772867 (averaged over 5 folds)
# Best validation AUC: 0.772867 (averaged over 5 folds)

from sklearn.model_selection import KFold

folds = KFold(n_splits = 8, shuffle = True)
auc_scores = []

feature_importance_df = pd.DataFrame({"Features":train.columns})

lgbm_parameters = {"n_estimators":5000,
                   "learning_rate":0.01,     #previous: 0.03
                   "num_leaves":25,          #previous: 30
                   "colsample_by_tree":0.8,  #previous: 0.8
                   "subsample":0.9,          #previous: 0.9
                   "max_depth":7,            #previous: 7
                   "reg_alpha":0.15,         #previous: 0.1
                   "reg_lambda":0.01,        #previous: 0.1
                   "min_split_gain":0.1,     #previous: 0.1
                   "min_child_weight":2}     #previous: 0.1

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_test = train.iloc[val_ids]
    target_train = target[train_ids]
    target_test = target[val_ids]
    
    clf = LGBMClassifier(**lgbm_parameters)
    #clf = LGBMClassifier(n_estimators = 4000, learning_rate = 0.03, num_leaves = 30, colsample_by_tree = 0.75, subsample = 0.8, max_depth = 7, reg_alpha = 0.1, reg_lambda = 0.1, min_split_gain = 0.1, min_child_weight = 2)
    clf.fit(train_train, target_train, eval_set = [(train_train, target_train), (train_test, target_test)], eval_metric = "auc", early_stopping_rounds = 50, verbose = 100)
    auc_scores.append(clf._best_score["valid_1"]["auc"])
    feature_importance_df["Importance Fold " + str(n_fold + 1)] = pd.Series(clf.feature_importances_)

#### FOLD NUMBER 1 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.733496	valid_1's auc: 0.718445
[200]	training's auc: 0.742186	valid_1's auc: 0.724738
[300]	training's auc: 0.754067	valid_1's auc: 0.735034
[400]	training's auc: 0.765206	valid_1's auc: 0.744907
[500]	training's auc: 0.773674	valid_1's auc: 0.751301
[600]	training's auc: 0.780001	valid_1's auc: 0.755648
[700]	training's auc: 0.785112	valid_1's auc: 0.758546
[800]	training's auc: 0.789459	valid_1's auc: 0.760508
[900]	training's auc: 0.793169	valid_1's auc: 0.761986
[1000]	training's auc: 0.796541	valid_1's auc: 0.763188
[1100]	training's auc: 0.799828	valid_1's auc: 0.764256
[1200]	training's auc: 0.803034	valid_1's auc: 0.764922
[1300]	training's auc: 0.806082	valid_1's auc: 0.765393
[1400]	training's auc: 0.808989	valid_1's auc: 0.765846
[1500]	training's auc: 0.811824	valid_1's auc: 0.766211
[1600]	training's auc: 0.814529	valid_1's auc: 0.766452
[1700]	training's auc: 0.81

[1200]	training's auc: 0.802751	valid_1's auc: 0.770208
[1300]	training's auc: 0.805871	valid_1's auc: 0.770771
[1400]	training's auc: 0.808832	valid_1's auc: 0.771166
[1500]	training's auc: 0.811653	valid_1's auc: 0.771465
[1600]	training's auc: 0.814407	valid_1's auc: 0.771776
[1700]	training's auc: 0.817112	valid_1's auc: 0.772044
[1800]	training's auc: 0.819812	valid_1's auc: 0.772263
[1900]	training's auc: 0.822395	valid_1's auc: 0.772556
[2000]	training's auc: 0.824895	valid_1's auc: 0.772662
[2100]	training's auc: 0.827321	valid_1's auc: 0.772696
[2200]	training's auc: 0.82981	valid_1's auc: 0.772847
Early stopping, best iteration is:
[2168]	training's auc: 0.828993	valid_1's auc: 0.772857
#### FOLD NUMBER 7 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.732342	valid_1's auc: 0.728704
[200]	training's auc: 0.740805	valid_1's auc: 0.735966
[300]	training's auc: 0.752359	valid_1's auc: 0.746598
[400]	training's auc: 0.763587	valid_1's au

In [97]:
feature_importance_df["Average Importance"] = feature_importance_df.iloc[:,1:].mean(axis=1)
feature_importance_df.to_csv("v07 Feature Importance3.csv", index = False)
feature_importance_df

Unnamed: 0,Features,Importance Fold 1,Importance Fold 2,Importance Fold 3,Importance Fold 4,Importance Fold 5,Importance Fold 6,Importance Fold 7,Importance Fold 8,Average Importance
0,AMT_INCOME_TOTAL,474,908,557,699,495,479,480,702,599.250
1,AMT_CREDIT,1388,1996,1473,1766,1380,1436,1472,1803,1589.250
2,AMT_ANNUITY,1293,2016,1395,1703,1193,1367,1404,1717,1511.000
3,AMT_GOODS_PRICE,1162,1541,1171,1534,1140,1173,1263,1458,1305.250
4,REGION_POPULATION_RELATIVE,605,1270,755,1037,635,780,815,960,857.125
5,DAYS_BIRTH,1635,2444,1706,2193,1541,1790,1889,2237,1929.375
6,DAYS_EMPLOYED,1885,2896,2033,2505,1903,2041,2039,2533,2229.375
7,DAYS_REGISTRATION,920,1786,1093,1493,892,993,1164,1394,1216.875
8,DAYS_ID_PUBLISH,1076,1805,1148,1560,1076,1261,1270,1545,1342.625
9,OWN_CAR_AGE,752,1217,903,1038,780,809,876,1056,928.875


In [98]:
print(auc_scores)
print(np.mean(auc_scores))

[0.7673932855691492, 0.7763848580148329, 0.7765933836344162, 0.779697633926215, 0.7693112899280621, 0.772856955361957, 0.7782752791047841, 0.778328210323834]
0.7748551119829064


In [90]:
# Below code is for second training regimen, should it be useful.
avg_importance = feature_importance_df["Average Importance"].tolist()
important_columns = [col for col,imp in zip(train.columns,avg_importance) if imp > 20]
len(important_columns)
train2 = train[important_columns]

In [92]:
clf2 = LGBMClassifier(**lgbm_parameters)
clf2.fit(train2, target, eval_set = [(train2, target)], eval_metric = "auc", early_stopping_rounds = 150, verbose = 50)

Training until validation scores don't improve for 150 rounds.
[50]	training's auc: 0.726
[100]	training's auc: 0.732058
[150]	training's auc: 0.7363
[200]	training's auc: 0.740857
[250]	training's auc: 0.746216
[300]	training's auc: 0.752102
[350]	training's auc: 0.757798
[400]	training's auc: 0.763153
[450]	training's auc: 0.767654
[500]	training's auc: 0.771383
[550]	training's auc: 0.774602
[600]	training's auc: 0.777527
[650]	training's auc: 0.780082
[700]	training's auc: 0.782393
[750]	training's auc: 0.784424
[800]	training's auc: 0.786407
[850]	training's auc: 0.788285
[900]	training's auc: 0.790027
[950]	training's auc: 0.79174
[1000]	training's auc: 0.793311
[1050]	training's auc: 0.794866
[1100]	training's auc: 0.796295
[1150]	training's auc: 0.797736
[1200]	training's auc: 0.799159
[1250]	training's auc: 0.800512
[1300]	training's auc: 0.801903
[1350]	training's auc: 0.803219
[1400]	training's auc: 0.804556
[1450]	training's auc: 0.805877
[1500]	training's auc: 0.807146
[15

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_by_tree=0.8,
        colsample_bytree=1.0, learning_rate=0.01, max_depth=7,
        min_child_samples=20, min_child_weight=2, min_split_gain=0.1,
        n_estimators=5000, n_jobs=-1, num_leaves=25, objective=None,
        random_state=None, reg_alpha=0.15, reg_lambda=0.01, silent=True,
        subsample=0.9, subsample_for_bin=200000, subsample_freq=1)

In [96]:
data_predictions = clf2.predict_proba(test[important_columns], num_iteration = clf2.best_iteration_)
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":data_predictions[:,1]})
submission.to_csv("v07_predictions.csv", index = False)