In [None]:
# Solution attempt 8: further lightgbm
# Final submission: 2018-07-08 
# Submission score: 0.788

# Further work with lightgbm, mostly feature engineering.
# Started this partially to snapshot the previous solution.

In [1]:
import numpy as np
import pandas as pd
import feather
import re
from lightgbm import LGBMClassifier

In [11]:
def has_NAs(pd_series):
    return pd_series.isnull().values.any()

# This is deleting stuff that it shouldn't
def string_col_to_onehot(df, col_name):
    dummy_cols = pd.get_dummies(df[col_name], drop_first = True, prefix = col_name)
    df.drop(columns = [col_name], inplace = True)
    return(pd.concat([df, dummy_cols], axis = 1))

In [12]:
# Load the main data files; don't load anything else yet, 'cause there's a lot
train = pd.read_feather("./../Data Files/application_train.feather")
test = pd.read_feather("./../Data Files/application_test.feather")

# Split off TARGET and establish a couple variables we'll need later
target = train["TARGET"]
train_IDs = train["SK_ID_CURR"]
train.drop("TARGET", inplace = True, axis = 1)
train_rows = len(train)
test_IDs = test["SK_ID_CURR"]

In [13]:
# Merge everything into a single dataset; this'll make processing easier.
all_data = pd.concat([train,test], ignore_index = True)

# These operations are a bit more consistent if they're done on the training & test sets together
building_info_columns = [c for c in train.columns.tolist() if re.search("_AVG$|_MODE$", c)]#("_AVG$|_MODE$|_MEDI$", c)]

# Columns removed due to low importance in lightgbm model
other_columns_to_remove = ["FLAG_MOBIL","FLAG_DOCUMENT_2","FLAG_DOCUMENT_4","FLAG_DOCUMENT_7","FLAG_DOCUMENT_9",
                           "FLAG_DOCUMENT_10","FLAG_DOCUMENT_12","FLAG_DOCUMENT_17","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20",
                           "FLAG_CONT_MOBILE", "FLAG_EMP_PHONE"]

all_data.drop(building_info_columns + other_columns_to_remove, inplace = True, axis = 1)

# NEW STUFF
all_data["CREDIT_INCOME_RATIO"] = all_data["AMT_CREDIT"] / all_data["AMT_INCOME_TOTAL"]
all_data["INCOME_PER_HEAD"] = all_data["AMT_INCOME_TOTAL"] / all_data["CNT_FAM_MEMBERS"]
all_data["ANNUITY_INCOME_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_INCOME_TOTAL"]
all_data["ANNUITY_CREDIT_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_CREDIT"]

all_data["AMT_INCOME_TOTAL"] = np.log10(all_data["AMT_INCOME_TOTAL"])
all_data.loc[all_data["CODE_GENDER"] == "XNA", "CODE_GENDER"] = "F"
all_data.loc[all_data["DAYS_EMPLOYED"] == 365243, "DAYS_EMPLOYED"] = np.NaN

conditions = [all_data["CNT_CHILDREN"] == 0, all_data["CNT_CHILDREN"] == 1, all_data["CNT_CHILDREN"] >= 2]
choices = ["0","1","2+"]
all_data["CNT_CHILDREN"] = np.select(conditions, choices)

name_type_suite = all_data["NAME_TYPE_SUITE"]
all_data.loc[(name_type_suite != "Unaccompanied") & (~name_type_suite.isnull()), "NAME_TYPE_SUITE"] = "Accompanied"

object_cols = [col for col in all_data.columns if all_data[col].dtype == "O"]
for oc in object_cols:
    all_data = string_col_to_onehot(all_data, oc)

del name_type_suite

In [14]:
# Add data from bureau file
bureau = pd.read_feather("./../Data Files/bureau.feather")
bureau_balance = pd.read_feather("./../Data Files/bureau_balance.feather")
bureau = bureau.join(bureau_balance, on = "SK_ID_BUREAU", rsuffix = "bb")

bureau_grouped = bureau.groupby("SK_ID_CURR")
bureau_sub = bureau_grouped.agg({"CREDIT_DAY_OVERDUE":[lambda x: 1*any(x > 0),max],
                                 "AMT_CREDIT_SUM_OVERDUE":[max,sum],
                                 "CNT_CREDIT_PROLONG":sum,
                                 "AMT_CREDIT_SUM":sum,
                                 "AMT_CREDIT_SUM_DEBT":sum})
bureau_sub.columns = ["ANY_OVERDUE", "MAX_DAYS_OVERDUE", "MAX_CREDIT_OVERDUE", "NUM_TIMES_OVERDUE", "NUM_TIMES_PROLONGED",
                      "TOTAL_CURRENT_CREDIT_AMT", "TOTAL_CURRENT_CREDIT_DEBT"]
bureau_sub["CREDIT_COUNT"] = bureau_grouped.size()


all_data = all_data.join(bureau_sub, on = "SK_ID_CURR")#.fillna(0)
del bureau, bureau_sub, bureau_grouped, bureau_balance

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY,SK_ID_BUREAUbb,MONTHS_BALANCE,STATUS
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,,6066882,-24,0
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,,6066882,-25,0
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,,6066882,-26,0
3,215354,5714465,Active,currency 1,-203,0,,,,0,90000.0,,,0.0,Credit card,-16,,6066884,0,C
4,215354,5714466,Active,currency 1,-629,0,1197.0,,77674.5,0,2700000.0,,,0.0,Consumer credit,-21,,6066884,-1,C


In [15]:
# Add data from previous_application
prev_application = pd.read_feather("./../Data Files/previous_application.feather")

app_credit_ratio = prev_application["AMT_APPLICATION"] - prev_application["AMT_CREDIT"]
prev_application["APP_CREDIT_RATIO"] = app_credit_ratio
conditions = [app_credit_ratio < 0, app_credit_ratio == 0, app_credit_ratio > 0]
choices = ["MORE_CREDIT_THAN_ASKED","EQUAL_TO_CREDIT_ASKED","LESS_CREDIT_THAN_ASKED"]
prev_application["RECEIVED_VS_APPLIED_CREDIT"] = np.select(conditions, choices)
del app_credit_ratio

prev_app_grouped = prev_application.groupby("SK_ID_CURR")
prev_app_sub = prev_app_grouped["NAME_CONTRACT_STATUS"].value_counts().unstack("NAME_CONTRACT_STATUS")

prev_app_sub.rename(columns = {"Approved":"NUMBER_APPROVED", "Canceled":"NUMBER_CANCELED",
                               "Refused":"NUMBER_REFUSED", "Unused offer":"NUMBER_UNUSED"},
                    inplace = True)
prev_app_sub["NUMBER_APPLICATIONS"] = prev_app_grouped.size()
prev_app_sub["LAST_DECISION_DATE"] = prev_app_grouped.agg({"DAYS_DECISION":min})["DAYS_DECISION"]

prev_app_sub2 = prev_app_grouped.agg({"AMT_CREDIT":[max,min,np.mean], "APP_CREDIT_RATIO":[max,min]})
prev_app_sub2.columns = ["MAX_CREDIT_REQUESTED","MIN_CREDIT_REQUESTED","AVG_CREDIT_REQUESTED","MAX_APP_CREDIT_RATIO",
                         "MIN_APP_CREDIT_RATIO"]

prev_app_sub3 = prev_app_grouped["RECEIVED_VS_APPLIED_CREDIT"].value_counts().unstack("RECEIVED_VS_APPLIED_CREDIT")
prev_app_sub3 = prev_app_sub3.iloc[:,1:]

prev_app_data = pd.concat([prev_app_sub, prev_app_sub2, prev_app_sub3], axis = 1).fillna(value = 0)

all_data = all_data.join(prev_app_data, on = "SK_ID_CURR")#.fillna(0)
del prev_application, prev_app_sub, prev_app_sub2, prev_app_sub3, prev_app_data

In [16]:
# Add credit card balance data
cc_balance = pd.read_feather("./../Data Files/credit_card_balance.feather")
cc_balance_grouped = cc_balance.groupby("SK_ID_CURR")
cc_balance_sub = cc_balance_grouped.agg({"SK_DPD_DEF":lambda x: sum(x != 0), "AMT_CREDIT_LIMIT_ACTUAL":max,
                                         "SK_ID_PREV":lambda x: len(x.unique()), "AMT_BALANCE":max,
                                         "AMT_PAYMENT_TOTAL_CURRENT":max, "CNT_DRAWINGS_CURRENT":max})
cc_balance_sub.rename(columns = {"SK_DPD_DEF":"NUM_LATE_CC_PAYMENTS",
                                 "AMT_CREDIT_LIMIT_ACTUAL":"MAX_CREDIT_LIMIT",
                                 "SK_ID_PREV":"NUM_PREV_CC_LOANS",
                                 "AMT_BALANCE":"MAX_BALANCE",
                                 "AMT_PAYMENT_TOTAL_CURRENT":"MAX_PAID_ON_CREDIT",
                                 "CNT_DRAWINGS_CURRENT":"MAX_DRAWINGS_IN_MONTH", }, inplace = True)
cc_balance_sub["AVG_CC_BALANCE"] = cc_balance_grouped["AMT_BALANCE"].mean()
cc_balance_sub["AVG_DRAWINGS_PER_MONTH"] = cc_balance_grouped["CNT_DRAWINGS_CURRENT"].mean()


all_data = all_data.join(cc_balance_sub, on = "SK_ID_CURR")#.fillna(0)
del cc_balance, cc_balance_grouped, cc_balance_sub

In [19]:
# Add installments_payments.csv data
install_paym = pd.read_feather("./../Data Files/installments_payments.feather")
install_paym["AMT_PAYMENT"].fillna(value = install_paym["AMT_INSTALMENT"], inplace = True)
install_paym["DAYS_ENTRY_PAYMENT"].fillna(value = install_paym["DAYS_INSTALMENT"], inplace = True)

install_paym["FRACTION_INSTALLMENT_PAID"] = install_paym["AMT_PAYMENT"] / install_paym["AMT_INSTALMENT"]
install_paym["PAYMENT_UNDER"] = install_paym["FRACTION_INSTALLMENT_PAID"] < 1
install_paym["DAYS_OFF_PAYMENT"] = install_paym["DAYS_INSTALMENT"] - install_paym["DAYS_ENTRY_PAYMENT"]
install_paym["PAYMENT_LATE"] = install_paym["DAYS_ENTRY_PAYMENT"] > install_paym["DAYS_INSTALMENT"]

install_paym_sub = install_paym.groupby("SK_ID_CURR").agg({"PAYMENT_UNDER":sum, "PAYMENT_LATE":sum,
                                                           "AMT_PAYMENT":[min,max],"DAYS_OFF_PAYMENT":[min,max],
                                                           "FRACTION_INSTALLMENT_PAID":[min,np.mean,max]})
install_paym_sub.columns = ["NUM_PAYMENTS_UNDER","NUM_PAYMENTS_LATE","MIN_PAYMENT","MAX_PAYMENT","BEST_PAYMENT_DATE",
                            "WORST_PAYMENT_DATE","MIN_INSTALL_FRAC_PAID","AVG_INSTALL_FRAC_PAID","MAX_INSTALL_FRAC_PAID"]

all_data = all_data.join(install_paym_sub, on = "SK_ID_CURR")#.fillna(0)

del install_paym, install_paym_sub

In [21]:
# Add POS_CASH_balance.csv data
POS_cash = pd.read_feather("./../Data Files/POS_CASH_balance.feather")
POS_cash_sub = POS_cash.groupby("SK_ID_CURR").agg({"SK_DPD_DEF":[lambda x: sum(x == 1),max,np.mean]})
POS_cash_sub.columns = ["NUM_LATE_POS_PAYMENTS", "MAX_DPD", "AVG_DPD"]
POS_cash_sub.head()

all_data = all_data.join(POS_cash_sub, on = "SK_ID_CURR")#.fillna(0)

del POS_cash, POS_cash_sub

In [22]:
all_data.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [23]:
train = all_data.iloc[:train_rows,:].copy()
test = all_data.iloc[train_rows:,:].copy()
train.reset_index(drop = True).to_feather("./../Solution attempts/v08 train data.feather")
test.reset_index(drop = True).to_feather("./../Solution attempts/v08 test data.feather")

In [40]:
pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target}).to_feather("./../Solution attempts/v07 target.feather")

In [25]:
# Train it up!
# 2018-07-08: Added "ANNUITY_CREDIT_RATIO", "MAX_DAYS_OVERDUE", "TOTAL_CURRENT_CREDIT", "TOTAL_CURRENT_CREDIT_DEBT", "AVG_CREDIT_REQUESTED","MAX_APP_CREDIT_RATIO",
# "MIN_APP_CREDIT_RATIO", "MAX_DPD", "AVG_DPD", "MIN_INSTALL_FRAC_PAID","AVG_INSTALL_FRAC_PAID","MAX_INSTALL_FRAC_PAID"; increased num_leaves parameter from 25 back to 30
# Previous val AUC: 0.783344

from sklearn.model_selection import KFold

folds = KFold(n_splits = 8, shuffle = True)
auc_scores = []

feature_importance_df = pd.DataFrame({"Features":train.columns})

lgbm_parameters = {"n_estimators":5000,
                   "learning_rate":0.01,     #previous: 0.03
                   "num_leaves":30,          #previous: 30
                   "colsample_by_tree":0.8,  #previous: 0.8
                   "subsample":0.9,          #previous: 0.9
                   "max_depth":7,            #previous: 7
                   "reg_alpha":0.15,         #previous: 0.1
                   "reg_lambda":0.01,        #previous: 0.1
                   "min_split_gain":0.1,     #previous: 0.1
                   "min_child_weight":2}     #previous: 0.1

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_test = train.iloc[val_ids]
    target_train = target[train_ids]
    target_test = target[val_ids]
    
    clf = LGBMClassifier(**lgbm_parameters)
    #clf = LGBMClassifier(n_estimators = 4000, learning_rate = 0.03, num_leaves = 30, colsample_by_tree = 0.75, subsample = 0.8, max_depth = 7, reg_alpha = 0.1, reg_lambda = 0.1, min_split_gain = 0.1, min_child_weight = 2)
    clf.fit(train_train, target_train, eval_set = [(train_train, target_train), (train_test, target_test)], eval_metric = "auc", early_stopping_rounds = 50, verbose = 100)
    auc_scores.append(clf._best_score["valid_1"]["auc"])
    feature_importance_df["Importance Fold " + str(n_fold + 1)] = pd.Series(clf.feature_importances_)

#### FOLD NUMBER 1 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.737406	valid_1's auc: 0.721216
[200]	training's auc: 0.747471	valid_1's auc: 0.729946
[300]	training's auc: 0.759985	valid_1's auc: 0.740533
[400]	training's auc: 0.772941	valid_1's auc: 0.752656
[500]	training's auc: 0.782117	valid_1's auc: 0.759772
[600]	training's auc: 0.789116	valid_1's auc: 0.764436
[700]	training's auc: 0.794952	valid_1's auc: 0.767841
[800]	training's auc: 0.800054	valid_1's auc: 0.770095
[900]	training's auc: 0.804567	valid_1's auc: 0.771888
[1000]	training's auc: 0.808742	valid_1's auc: 0.773171
[1100]	training's auc: 0.812824	valid_1's auc: 0.774321
[1200]	training's auc: 0.816715	valid_1's auc: 0.775279
[1300]	training's auc: 0.820366	valid_1's auc: 0.77591
[1400]	training's auc: 0.823767	valid_1's auc: 0.776447
[1500]	training's auc: 0.827174	valid_1's auc: 0.776898
[1600]	training's auc: 0.830373	valid_1's auc: 0.777245
[1700]	training's auc: 0.833

[700]	training's auc: 0.794353	valid_1's auc: 0.776645
[800]	training's auc: 0.799756	valid_1's auc: 0.779198
[900]	training's auc: 0.804484	valid_1's auc: 0.780888
[1000]	training's auc: 0.808702	valid_1's auc: 0.782017
[1100]	training's auc: 0.812751	valid_1's auc: 0.783214
[1200]	training's auc: 0.816474	valid_1's auc: 0.783876
[1300]	training's auc: 0.820154	valid_1's auc: 0.784559
[1400]	training's auc: 0.823534	valid_1's auc: 0.784992
[1500]	training's auc: 0.826916	valid_1's auc: 0.785373
[1600]	training's auc: 0.83009	valid_1's auc: 0.785924
[1700]	training's auc: 0.833157	valid_1's auc: 0.786317
[1800]	training's auc: 0.836187	valid_1's auc: 0.78664
[1900]	training's auc: 0.839083	valid_1's auc: 0.786938
[2000]	training's auc: 0.841925	valid_1's auc: 0.787254
[2100]	training's auc: 0.844743	valid_1's auc: 0.787475
[2200]	training's auc: 0.847471	valid_1's auc: 0.787665
[2300]	training's auc: 0.850115	valid_1's auc: 0.787756
[2400]	training's auc: 0.852694	valid_1's auc: 0.7879

In [27]:
feature_importance_df["Average Importance"] = feature_importance_df.iloc[:,1:].mean(axis=1)
feature_importance_df.to_csv("v08 Feature Importance.csv", index = False)
feature_importance_df

Unnamed: 0,Features,Importance Fold 1,Importance Fold 2,Importance Fold 3,Importance Fold 4,Importance Fold 5,Importance Fold 6,Importance Fold 7,Importance Fold 8,Average Importance
0,AMT_INCOME_TOTAL,717,703,466,429,698,767,570,721,633.875
1,AMT_CREDIT,1137,1121,867,913,1169,1314,998,1149,1083.500
2,AMT_ANNUITY,1608,1518,1199,1191,1625,1669,1258,1438,1438.250
3,AMT_GOODS_PRICE,1168,1311,994,954,1258,1317,1152,1218,1171.500
4,REGION_POPULATION_RELATIVE,1008,1047,700,719,1082,1251,852,1042,962.625
5,DAYS_BIRTH,2145,2225,1828,1901,2291,2454,2031,2153,2128.500
6,DAYS_EMPLOYED,2590,2586,1990,2070,2730,2847,2156,2610,2447.375
7,DAYS_REGISTRATION,1361,1506,1075,1075,1509,1712,1175,1581,1374.250
8,DAYS_ID_PUBLISH,1598,1616,1128,1222,1686,1795,1479,1659,1522.875
9,OWN_CAR_AGE,1060,1047,790,819,1089,1089,985,1084,995.375


In [26]:
print(auc_scores)
print(np.mean(auc_scores))

[0.7794146060820832, 0.7805830642928375, 0.7864012967131647, 0.777448980383821, 0.7848239891889099, 0.7888603200335452, 0.786411450300381, 0.782805929329628]
0.7833437045405464


In [28]:
# Below code is for second training regimen, should it be useful.
avg_importance = feature_importance_df["Average Importance"].tolist()
important_columns = [col for col,imp in zip(train.columns,avg_importance) if imp > 20]
len(important_columns)
train2 = train[important_columns]

In [29]:
clf2 = LGBMClassifier(**lgbm_parameters)
clf2.fit(train2, target, eval_set = [(train2, target)], eval_metric = "auc", early_stopping_rounds = 150, verbose = 50)

Training until validation scores don't improve for 150 rounds.
[50]	training's auc: 0.730812
[100]	training's auc: 0.736344
[150]	training's auc: 0.740771
[200]	training's auc: 0.745989
[250]	training's auc: 0.75184
[300]	training's auc: 0.758559
[350]	training's auc: 0.764983
[400]	training's auc: 0.770935
[450]	training's auc: 0.775912
[500]	training's auc: 0.780049
[550]	training's auc: 0.783606
[600]	training's auc: 0.786798
[650]	training's auc: 0.789738
[700]	training's auc: 0.792464
[750]	training's auc: 0.794995
[800]	training's auc: 0.797378
[850]	training's auc: 0.799476
[900]	training's auc: 0.801581
[950]	training's auc: 0.803597
[1000]	training's auc: 0.805485
[1050]	training's auc: 0.807299
[1100]	training's auc: 0.809087
[1150]	training's auc: 0.810789
[1200]	training's auc: 0.812499
[1250]	training's auc: 0.81417
[1300]	training's auc: 0.815871
[1350]	training's auc: 0.817425
[1400]	training's auc: 0.818988
[1450]	training's auc: 0.820586
[1500]	training's auc: 0.822123

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_by_tree=0.8,
        colsample_bytree=1.0, learning_rate=0.01, max_depth=7,
        min_child_samples=20, min_child_weight=2, min_split_gain=0.1,
        n_estimators=5000, n_jobs=-1, num_leaves=30, objective=None,
        random_state=None, reg_alpha=0.15, reg_lambda=0.01, silent=True,
        subsample=0.9, subsample_for_bin=200000, subsample_freq=1)

In [30]:
data_predictions = clf2.predict_proba(test[important_columns], num_iteration = clf2.best_iteration_)
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":data_predictions[:,1]})
submission.to_csv("v08_predictions.csv", index = False)