In [None]:
# Solution attempt 9: even further lightgbm + stacking
# Final submission: 0.789
# Submission score: 2018-07-29

# Further work with lightgbm, including stacking of other models as well.
# Started this mostly to snapshot the previous solution.

In [1]:
import re

import feather
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import scipy.stats as sstats

In [2]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

  from ._conv import register_converters as _register_converters


In [3]:
def has_NAs(pd_series):
    return pd_series.isnull().values.any()

# adjust function so that two-column results come back as one (redundant info) but three or more are
# all kept
def string_col_to_onehot(df, col_name):
    dummy_cols = pd.get_dummies(df[col_name], drop_first = False, prefix = col_name)
    if len(dummy_cols.columns) == 2:
        df[col_name] = dummy_cols.iloc[:,0]
    else:
        df.drop(columns = [col_name], inplace = True)
        df = pd.concat([df, dummy_cols], axis = 1)
    return df

def log_average(pd_series):
    return np.exp(np.mean(np.log(pd_series)))

In [39]:
# takes a grouped dataframe and a list of columns containing character values
# returns a DF containing counts for all items in each group
def count_frac_cols(df_gr, columns: list, middle_string = ""):
    counted_dfs = []
    group_sizes = df_gr.size()
    for col in columns:
        count_df = df_gr[col].value_counts().unstack(col)
        count_df.fillna(0, inplace = True)
        
        item_names = count_df.columns
        if middle_string != "":
            item_names = [middle_string + "_" + item for item in item_names]
        count_names = ["NUM_" + item for item in item_names]
        frac_names = ["FRAC_" + item for item in item_names]
        
        count_df.rename(columns = {i:c for i,c in zip(item_names, count_names)}, inplace = True)
        count_df[frac_names] = count_df[count_names].div(group_sizes, axis = 0)
        counted_dfs.append(count_df)
    
    return pd.concat(counted_dfs, axis = 1)

In [5]:
# Takes a data frame and a dict of col names as keys and a list of polynomial degrees as arguments
def add_polynomial_terms(df, polynomials = {}):
    for col, degrees in polynomials.items():
        for deg in degrees:
            new_col_name = col + "_DEG_" + str(deg)
            df[new_col_name] = df[col] ** deg
    return df

In [6]:
def tensorflow_logistic_regression(train_data, target, test_df = None, training_epochs = 10, learning_rate = 0.005):
    """
    Performs the tensorflow operations for training a basic logistic regression model on the specified
    dataframe, and returns the model's predictions for the full data set.  It is intended to be used with
    the supplemental files for Kaggle's Credit Risk competition, in order to generate some new features/
    implement model stacking.
    """
    ncol = train_data.shape[1]
    
    X = tf.placeholder(tf.float32, [None, ncol], name = "X")
    Y = tf.placeholder(tf.float32, [None, 1], name = "Y")
    weights = tf.Variable(tf.zeros([ncol,1]))
    bias = tf.Variable(tf.zeros([1]))
    
    pred = tf.sigmoid(tf.add(tf.matmul(X, weights), bias))
    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = pred, labels = Y))
    auc = tf.metrics.auc(labels = Y, predictions = pred, name = "auc")
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)
    
    init = tf.global_variables_initializer()
    init_loc = tf.local_variables_initializer()
    auc_history = []
    
    with tf.Session() as sess:
        sess.run(init)
        sess.run(init_loc)
        
        for epoch in range(training_epochs):
            train_train, train_val, target_train, target_val = train_test_split(train_data,target)
            
            c,_ = sess.run([cost, optimizer], feed_dict = {X:train_train, Y: target_train})
            predictions = sess.run(pred, feed_dict = {X:train_val, Y: target_val})
            auc_history.append(sess.run(auc, feed_dict = {X:train_val, Y: target_val})[1])
        
        if test_df is not None:
            full_train_predictions = sess.run(pred, feed_dict = {X:test_df, Y:target})
        else:
            full_train_predictions = sess.run(pred, feed_dict = {X:train_data, Y:target})
    
    return full_train_predictions, auc_history

In [7]:
def log_regress_other_files(train_IDs, target_df, file_df, epochs):
    sc = StandardScaler()
    file_df = file_df.dropna(axis = 1).copy()
    file_df.iloc[:,1:] = sc.fit_transform(file_df.iloc[:,1:])
    
    full_file_data = file_df.drop("SK_ID_CURR", axis = 1).values.astype("float32")
    
    file_df = file_df.loc[file_df["SK_ID_CURR"].isin(train_IDs),:]
    target = target_df[target_df["SK_ID_CURR"].isin(file_df["SK_ID_CURR"])]["TARGET"]
    
    file_train = file_df.drop("SK_ID_CURR", axis = 1).values.astype("float32")
    target = target.values.reshape([len(target),1]).astype("float32")
    
    predictions, auc_scores = tensorflow_logistic_regression(file_train, target,
                                                             test_df = full_file_data,
                                                             training_epochs = epochs)
    
    return predictions, auc_scores

In [8]:
# Add data from bureau file
# future improvements:
# - sort bureau_balance by ID & MONTHS_BALANCE, get last STATUS for each
# - either ditch STATUS > 2 or fold all into STATUS = 3
def create_bureau_df(bureau_file_path, bureau_bal_file_path):
    bureau = pd.read_feather(bureau_file_path)
    
    bureau_balance = pd.read_feather(bureau_bal_file_path)
    bureau_balance["STATUS"] = bureau_balance["STATUS"].astype(str)
    
    bureau_balance_gr = bureau_balance.groupby("SK_ID_BUREAU")
    bureau_balance_sub = bureau_balance_gr["STATUS"].value_counts().unstack("STATUS")
    bureau_balance_sub.fillna(0, inplace = True)
    
    bureau_balance_count = bureau_balance_gr.size()
    #for sc in bureau_balance_sub.columns.tolist():
    #    bureau_balance_sub["BUREAU_STATUS_FRAC_" + sc] = bureau_balance_sub[sc]/bureau_balance_count
    #    bureau_balance_sub.rename(columns = {sc:"BUREAU_STATUS_COUNT_" + sc}, inplace = True)
        
    bureau_balance_sub["BUREAU_BAL_COUNT"] = bureau_balance_count
    bureau_balance_sub["MONTHS_BALANCE_MIN"] = bureau_balance_gr.agg({"MONTHS_BALANCE":min})
    bureau_balance_sub["MONTHS_BALANCE_MAX"] = bureau_balance_gr.agg({"MONTHS_BALANCE":max})
    
    bureau = bureau.join(bureau_balance_sub, on = "SK_ID_BUREAU", rsuffix = "bb")
    
    bureau_grouped = bureau.groupby("SK_ID_CURR")
    bureau_sub = bureau_grouped.agg({"CREDIT_DAY_OVERDUE":[max,np.mean],
                                     "AMT_CREDIT_SUM_OVERDUE":[max,np.mean],
                                     "CNT_CREDIT_PROLONG":sum,
                                     "AMT_CREDIT_SUM":sum,
                                     "AMT_CREDIT_SUM_DEBT":sum,
                                     "MONTHS_BALANCE_MIN":min,
                                     "MONTHS_BALANCE_MAX":max,
                                     "DAYS_CREDIT":[min,max,np.mean],
                                     "DAYS_CREDIT_UPDATE":[min,max],
                                     "DAYS_CREDIT_ENDDATE":[min,max]})
    bureau_sub.columns = ["MAX_DAYS_OVERDUE", "AVG_DAYS_OVERDUE", "MAX_CREDIT_OVERDUE", "AVG_CREDIT_OVERDUE",
                          "NUM_TIMES_PROLONGED", "TOTAL_CURRENT_CREDIT_AMT", "TOTAL_CURRENT_CREDIT_DEBT",
                          "BUREAU_OLDEST_MONTH","BUREAU_NEWEST_MONTH", "BUREAU_OLDEST_APPL",
                          "BUREAU_NEWEST_APP","BUREAU_AVG_APP_AGE","BUREAU_OLDEST_UPDATE",
                          "BUREAU_NEWEST_UPDATE","OLDEST_CREDIT_ENDDATE","NEWEST_CREDIT_ENDDATE"]
    #for bbs_col in bureau_balance_sub.columns.tolist():
    #    new_cols = [bbs_col + "_MIN", bbs_col + "_AVG", bbs_col + "_MAX"]
    #    bureau_sub[new_cols] = bureau_grouped.agg({bbs_col:[min,np.mean,max]})
    bureau_sub["BUREAU_CREDIT_COUNT"] = bureau_grouped.size()
    
    # useful credit types based on feature importance in lgbm; excluded ones have (near-)zero importance
    useful_credit_types = ["Another type of loan", "Car loan", "Consumer credit", "Credit card",
                           "Microloan", "Mortgage"]
    bureau_credit_types = bureau_grouped["CREDIT_TYPE"].value_counts().unstack("CREDIT_TYPE")
    bureau_credit_types = bureau_credit_types[useful_credit_types]
    bureau_credit_types.columns = ["CREDIT_TYPE_" + col for col in bureau_credit_types.columns]
    bureau_credit_types.fillna(0, inplace = True)
    bureau_sub = pd.concat([bureau_sub, bureau_credit_types], axis = 1)
    
    bureau_status = bureau_grouped["CREDIT_ACTIVE"].value_counts().unstack("CREDIT_ACTIVE").fillna(0)
    bureau_sub[["NUM_CREDIT_ACTIVE","NUM_CREDIT_CLOSED"]] = bureau_status[["Active","Closed"]]
    
    return bureau_sub

In [9]:
# Add data from previous_application
# future ideas:
# - add FRAC versions of NAME_CONTRACT_STATUS columns
def create_prev_app_df(file_path):
    prev_application = pd.read_feather(file_path)
    
    app_credit_diff = prev_application["AMT_APPLICATION"] - prev_application["AMT_CREDIT"]
    prev_application["APP_CREDIT_DIFF"] = app_credit_diff
    app_credit_ratio = prev_application["AMT_APPLICATION"] / prev_application["AMT_CREDIT"]
    prev_application["APP_CREDIT_RATIO"] = app_credit_ratio.replace([np.inf,-np.inf],np.nan)
    conditions = [app_credit_diff < 0, app_credit_diff == 0, app_credit_diff > 0]
    choices = ["MORE_CREDIT_THAN_ASKED","EQUAL_TO_CREDIT_ASKED","LESS_CREDIT_THAN_ASKED"]
    prev_application["RECEIVED_VS_APPLIED_CREDIT"] = np.select(conditions, choices)
    
    prev_app_grouped = prev_application.groupby("SK_ID_CURR")
    prev_app_sub = prev_app_grouped["NAME_CONTRACT_STATUS"].value_counts().unstack("NAME_CONTRACT_STATUS")
    
    prev_app_sub.rename(columns = {"Approved":"NUMBER_APPROVED", "Canceled":"NUMBER_CANCELED",
                                   "Refused":"NUMBER_REFUSED", "Unused offer":"NUMBER_UNUSED"},
                        inplace = True)
    prev_app_sub["NUMBER_APPLICATIONS"] = prev_app_grouped.size()
    
    for col in ["APPROVED","CANCELED","REFUSED","UNUSED"]:
        prev_app_sub["FRAC_" + col] = prev_app_sub["NUMBER_" + col] / prev_app_sub["NUMBER_APPLICATIONS"]
    
    prev_app_sub["LAST_DECISION_DATE"] = prev_app_grouped.agg({"DAYS_DECISION":min})["DAYS_DECISION"]
    
    prev_app_sub2 = prev_app_grouped.agg({"AMT_CREDIT":[max,min,np.mean],
                                          "APP_CREDIT_DIFF":[max,min],
                                          "APP_CREDIT_RATIO":[min,np.mean,max]})
    prev_app_sub2.columns = ["MAX_CREDIT_REQUESTED","MIN_CREDIT_REQUESTED","AVG_CREDIT_REQUESTED","MAX_APP_CREDIT_DIFF",
                             "MIN_APP_CREDIT_DIFF", "MIN_APP_CREDIT_RATIO","AMEAN_APP_CREDIT_RATIO",
                             "MAX_APP_CREDIT_RATIO"]
    
    grouped_cols = ["MORE_CREDIT_THAN_ASKED","EQUAL_TO_CREDIT_ASKED","LESS_CREDIT_THAN_ASKED"]
    count_cols = ["NUMBER_MORE_CREDIT_THAN_ASKED","NUMBER_EQUAL_TO_CREDIT_ASKED","NUMBER_LESS_THAN_CREDIT_ASKED"]
    frac_cols = ["FRAC_MORE_CREDIT_THAN_ASKED","FRAC_EQUAL_TO_CREDIT_ASKED","FRAC_LESS_THAN_CREDIT_ASKED"]
    prev_app_sub3 = prev_app_grouped["RECEIVED_VS_APPLIED_CREDIT"].value_counts().unstack("RECEIVED_VS_APPLIED_CREDIT")
    prev_app_sub3 = prev_app_sub3.iloc[:,1:]
    prev_app_sub[count_cols] = prev_app_sub3[grouped_cols]
    prev_app_sub[frac_cols] = prev_app_sub3[grouped_cols].div(prev_app_sub["NUMBER_APPLICATIONS"], axis = 0)
    
    prev_app_data = pd.concat([prev_app_sub, prev_app_sub2], axis = 1).fillna(value = 0)
    
    return prev_app_data

In [46]:
# Add credit card balance data
def create_cc_df(file_path):
    cc_balance = pd.read_feather(file_path)
    
    cc_balance["DRAWINGS_TOTAL_PAY_RATIO"] = cc_balance["AMT_DRAWINGS_CURRENT"] / cc_balance["AMT_PAYMENT_TOTAL_CURRENT"]
    cc_balance["AVG_PER_DRAWING"] = cc_balance["AMT_DRAWINGS_CURRENT"] / cc_balance["CNT_DRAWINGS_CURRENT"]
    cc_balance["AVG_PER_DRAWING"].replace([np.inf,-np.inf],0, inplace = True)
    cc_balance["AVG_PER_ATM_DRAWING"] = cc_balance["AMT_DRAWINGS_ATM_CURRENT"] / cc_balance["CNT_DRAWINGS_ATM_CURRENT"]
    cc_balance["AVG_PER_ATM_DRAWING"].replace([np.inf,-np.inf],0, inplace = True)

    cc_balance_grouped = cc_balance.groupby("SK_ID_CURR")
    cc_balance_sub = cc_balance_grouped.agg({"SK_DPD_DEF":[lambda x: sum(x != 0),max],
                                             "AMT_CREDIT_LIMIT_ACTUAL":[np.mean,max],
                                             "SK_ID_PREV":lambda x: len(x.unique()),
                                             "AMT_BALANCE":[max,np.mean,np.std],
                                             "AMT_PAYMENT_TOTAL_CURRENT":[max,np.mean,np.std],
                                             "CNT_DRAWINGS_CURRENT":[max,np.mean,sum],
                                             "AMT_DRAWINGS_CURRENT":[min,max,np.mean,sum],
                                             "DRAWINGS_TOTAL_PAY_RATIO":[min,max,np.mean],
                                             "AMT_DRAWINGS_ATM_CURRENT":[np.mean,sum],
                                             "CNT_DRAWINGS_ATM_CURRENT":[np.mean,sum]})
    cc_balance_sub.columns = ["NUM_LATE_CC_PAYMENTS","MOST_OVERDUE_CC",
                              "AVG_CREDIT_LIMIT_ACTUAL","MAX_CREDIT_LIMIT",
                              "NUM_PREV_CC_LOANS",
                              "MAX_BALANCE","AVG_CC_BALANCE","AMT_BALANCE_STDEV",
                              "MAX_PAID_ON_CREDIT","AVG_PAID_ON_CREDIT","AMT_PAID_ON_CREDIT_STDEV",
                              "MAX_DRAWINGS_IN_MONTH","AVG_DRAWINGS_PER_MONTH","TOTAL_NUM_ALL_DRAWINGS",
                              "MIN_MONTHLY_DRAWINGS","MAX_MONTHLY_DRAWINGS","AVG_MONTHLY_DRAWINGS","TOTAL_AMT_ALL_DRAWINGS",
                              "MIN_DRAW_TO_TOTAL_PAY","MAX_DRAW_TO_TOTAL_PAY","AVG_DRAW_TO_TOTAL_PAY",
                              "AVG_AMT_ATM_DRAWINGS", "TOTAL_AMT_ATM_DRAWINGS",
                              "AVG_NUM_ATM_DRAWINGS", "TOTAL_NUM_ATM_DRAWINGS"]
    
    cc_balance_sub["OVERALL_AVG_ALL_DRAWING_AMT"] = cc_balance_sub["TOTAL_AMT_ALL_DRAWINGS"] / cc_balance_sub["TOTAL_NUM_ALL_DRAWINGS"]
    cc_balance_sub["OVERALL_AVG_ATM_DRAWING_AMT"] = cc_balance_sub["TOTAL_AMT_ATM_DRAWINGS"] / cc_balance_sub["TOTAL_NUM_ATM_DRAWINGS"]
    
    # add data for last N months
    for month in [1,6,12,24]:
        cc_recent = cc_balance[cc_balance["MONTHS_BALANCE"] >= -month]
        cc_recent_gr = cc_recent.groupby("SK_ID_CURR")
        cc_recent_sub = cc_recent_gr.agg({"SK_DPD_DEF":lambda x: sum(x != 0),
                                       "AMT_PAYMENT_TOTAL_CURRENT":np.mean,
                                       "AMT_BALANCE":np.mean,
                                       "CNT_DRAWINGS_CURRENT":sum})
                                       
        cc_recent_sub.columns = [f"NUM_LATE_CC_PAYMENTS_{month}_MONTHS",
                                 f"AVG_PAID_ON_CREDIT_{month}_MONTHS",
                                 f"AVG_BALANCE_{month}_MONTHS",
                                 f"NUM_DRAWINGS_{month}_MONTHS"]
        
        cc_balance_sub = cc_balance_sub.join(cc_recent_sub, how = "left")
    
    return cc_balance_sub

In [11]:
# Add installments_payments.csv data
def create_install_payment_df(file_path):
    install_paym = pd.read_feather(file_path)
    install_paym["AMT_PAYMENT"].fillna(value = install_paym["AMT_INSTALMENT"], inplace = True)
    install_paym["DAYS_ENTRY_PAYMENT"].fillna(value = install_paym["DAYS_INSTALMENT"], inplace = True)

    install_paym["FRACTION_INSTALLMENT_PAID"] = install_paym["AMT_PAYMENT"] / install_paym["AMT_INSTALMENT"]
    install_paym["FRACTION_INSTALLMENT_PAID"] = install_paym["FRACTION_INSTALLMENT_PAID"].replace([np.inf,-np.inf],np.nan)
    install_paym["DAYS_OFF_PAYMENT"] = install_paym["DAYS_INSTALMENT"] - install_paym["DAYS_ENTRY_PAYMENT"]
    
    conditions = [install_paym["FRACTION_INSTALLMENT_PAID"] < 1,
                  install_paym["FRACTION_INSTALLMENT_PAID"] == 1,
                  install_paym["FRACTION_INSTALLMENT_PAID"] > 1]
    choices = ["UNDER", "EXACT_AMT", "OVER"]
    install_paym["PAYMENT_LEVEL"] = np.select(conditions, choices)
    
    conditions = [install_paym["DAYS_ENTRY_PAYMENT"] > install_paym["DAYS_INSTALMENT"],
                  install_paym["DAYS_ENTRY_PAYMENT"] == install_paym["DAYS_INSTALMENT"],
                  install_paym["DAYS_ENTRY_PAYMENT"] < install_paym["DAYS_INSTALMENT"]]
    choices = ["LATE", "ON_TIME", "EARLY"]
    install_paym["PAYMENT_TIME"] = np.select(conditions, choices)
    
    install_paym_grouped = install_paym.groupby("SK_ID_CURR")
    install_paym_sub = install_paym_grouped.agg({"AMT_PAYMENT":[min, np.mean, max, log_average],
                                                 "DAYS_OFF_PAYMENT":[min,np.mean,max],
                                                 "FRACTION_INSTALLMENT_PAID":[min,np.mean,max]})
    install_paym_sub.columns = ["MIN_PAYMENT", "AVG_PAYMENT", "MAX_PAYMENT", "LOG_AVG_PAYMENT",
                                "BEST_PAYMENT_DATE", "AVG_DAYS_OFF_PAYMENT", "WORST_PAYMENT_DATE","MIN_INSTALL_FRAC_PAID",
                                "AVG_INSTALL_FRAC_PAID","MAX_INSTALL_FRAC_PAID"]
    
    num_payments_per_prev = install_paym_grouped.size()
    
    count_cols = ["NUM_PAYMENTS_UNDER","NUM_PAYMENTS_EXACT_AMT","NUM_PAYMENTS_OVER_AMT"]
    frac_cols = ["FRAC_PAYMENTS_UNDER","FRAC_PAYMENTS_EXACT_AMT","FRAC_PAYMENTS_OVER_AMT"]
    install_paym_pay_levels = install_paym_grouped["PAYMENT_LEVEL"].value_counts().unstack("PAYMENT_LEVEL")
    install_paym_pay_levels.fillna(0, inplace = True)
    install_paym_sub[count_cols] = install_paym_pay_levels[["UNDER","EXACT_AMT","OVER"]]
    install_paym_sub[frac_cols] = install_paym_pay_levels[["UNDER","EXACT_AMT","OVER"]].div(num_payments_per_prev, axis = 0)
    
    count_cols = ["NUM_PAYMENTS_LATE","NUM_PAYMENTS_ON_TIME","NUM_PAYMENTS_EARLY"]
    frac_cols = ["FRAC_PAYMENTS_LATE","FRAC_PAYMENTS_ON_TIME","FRAC_PAYMENTS_EARLY"]
    install_paym_pay_times = install_paym_grouped["PAYMENT_TIME"].value_counts().unstack("PAYMENT_TIME")
    install_paym_pay_times.fillna(0, inplace = True)
    install_paym_sub[count_cols] = install_paym_pay_times[["LATE", "ON_TIME", "EARLY"]]
    install_paym_sub[frac_cols] = install_paym_pay_times[["LATE", "ON_TIME", "EARLY"]].div(num_payments_per_prev, axis = 0)
    
    # checked in R - there's only one SK_ID_CURR per SK_ID_PREV
    #install_paym_sub_prev = install_paym.groupby("SK_ID_PREV").agg({"NUM_INSTALMENT_VERSION":max})
    
    return install_paym_sub

In [12]:
# Add POS_CASH_balance.csv data
# future additions: 
# - Some columns have NAME_CONTRACT_STATUS = "Completed" despite having non-zero CNT_INSTALMENT_FUTURE
# - operations on number of installments for each SK_ID_REV; should probably .agg with mode() to get it
def create_POS_cash_df(file_path):
    POS_cash = pd.read_feather(file_path)
    POS_cash_grouped = POS_cash.sort_values(["SK_ID_CURR", "MONTHS_BALANCE"]).groupby("SK_ID_CURR")
    POS_cash_sub = POS_cash.groupby("SK_ID_CURR").agg({"SK_DPD_DEF":[lambda x: sum(x == 1),max,np.mean]})
    POS_cash_sub.columns = ["NUM_LATE_POS_PAYMENTS", "MAX_DPD", "AVG_DPD"]
    
    POS_cash_status = POS_cash_grouped["NAME_CONTRACT_STATUS"].value_counts().unstack("NAME_CONTRACT_STATUS").fillna(0)
    POS_cash_sub["NUM_CONTRACTS_COMPLETED"] = POS_cash_status["Completed"]
    
    POS_last_month = POS_cash[POS_cash["MONTHS_BALANCE"] == -1]
    POS_last_month_agg = POS_last_month.groupby("SK_ID_CURR").agg({"CNT_INSTALMENT_FUTURE":[sum,lambda x: sum(x != 0)]})
    POS_last_month_agg.columns = ["NUM_INSTALMENTS_PENDING","NUM_ACCOUNTS_ACTIVE"]
    POS_cash_sub = POS_cash_sub.join(POS_last_month_agg)
    
    POS_cash_sub.fillna(value = {"NUM_INSTALMENTS_PENDING":0,"NUM_ACCOUNTS_ACTIVE":0}, inplace = True)
    
    return POS_cash_sub

In [24]:
# Load the main data files; don't load anything else yet, 'cause there's a lot
train = pd.read_feather("./../Data Files/application_train.feather")
test = pd.read_feather("./../Data Files/application_test.feather")

# Split off TARGET and establish a couple variables we'll need later
target = train["TARGET"]
train_IDs = train["SK_ID_CURR"]
target_df = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target})
train.drop("TARGET", inplace = True, axis = 1)
train_rows = len(train)
test_IDs = test["SK_ID_CURR"]

In [25]:
# Merge everything into a single dataset; this'll make processing easier.
all_data = pd.concat([train,test], ignore_index = True)

# These operations are a bit more consistent if they're done on the training & test sets together
building_info_columns = [c for c in train.columns.tolist() if re.search("_AVG$|_MODE$", c)]#("_AVG$|_MODE$|_MEDI$", c)]

# Columns removed due to low importance in lightgbm model
other_columns_to_remove = ["FLAG_MOBIL","FLAG_DOCUMENT_2","FLAG_DOCUMENT_4","FLAG_DOCUMENT_7","FLAG_DOCUMENT_9",
                           "FLAG_DOCUMENT_10","FLAG_DOCUMENT_12","FLAG_DOCUMENT_17","FLAG_DOCUMENT_19","FLAG_DOCUMENT_20",
                           "FLAG_CONT_MOBILE", "FLAG_EMP_PHONE"]

all_data.drop(building_info_columns + other_columns_to_remove, inplace = True, axis = 1)

# NEW STUFF
all_data["CREDIT_INCOME_RATIO"] = all_data["AMT_CREDIT"] / all_data["AMT_INCOME_TOTAL"]
all_data["INCOME_PER_HEAD"] = all_data["AMT_INCOME_TOTAL"] / all_data["CNT_FAM_MEMBERS"]
all_data["ANNUITY_INCOME_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_INCOME_TOTAL"]
all_data["ANNUITY_CREDIT_RATIO"] = all_data["AMT_ANNUITY"] / all_data["AMT_CREDIT"]
all_data["GOODS_INCOME_RATIO"] = all_data["AMT_GOODS_PRICE"] / all_data["AMT_INCOME_TOTAL"]
all_data["FRAC_DAYS_EMPLOYED"] = all_data["DAYS_EMPLOYED"] / all_data["DAYS_BIRTH"]
all_data["FRAC_CHILDREN"] = all_data["CNT_CHILDREN"] / all_data["CNT_FAM_MEMBERS"]

ext_sources = all_data[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]
for func_name in ["SUM","MEAN","MIN","MAX"]:
    all_data["EXT_SOURCE_" + func_name] = eval(f"np.{func_name.lower()}")(ext_sources, axis = 1)

all_data["AMT_INCOME_TOTAL"] = np.log10(all_data["AMT_INCOME_TOTAL"])
all_data.loc[all_data["CODE_GENDER"] == "XNA", "CODE_GENDER"] = "F"
all_data.loc[all_data["DAYS_EMPLOYED"] == 365243, "DAYS_EMPLOYED"] = np.NaN

conditions = [all_data["CNT_CHILDREN"] == 0, all_data["CNT_CHILDREN"] == 1, all_data["CNT_CHILDREN"] >= 2]
choices = ["0","1","2+"]
all_data["CNT_CHILDREN"] = np.select(conditions, choices)

name_type_suite = all_data["NAME_TYPE_SUITE"]
all_data.loc[(name_type_suite != "Unaccompanied") & (~name_type_suite.isnull()), "NAME_TYPE_SUITE"] = "Accompanied"

object_cols = [col for col in all_data.columns if all_data[col].dtype == "O"]
for oc in object_cols:
    all_data = string_col_to_onehot(all_data, oc)

del name_type_suite, ext_sources

In [47]:
#bureau_df = create_bureau_df("./../Data Files/bureau.feather", "./../Data Files/bureau_balance.feather"); print("Bureau DF complete")
#prev_app_df = create_prev_app_df("./../Data Files/previous_application.feather"); print("Prevapp DF complete")
cc_df = create_cc_df("./../Data Files/credit_card_balance.feather"); print("Credit card DF complete")
#install_payment_df = create_install_payment_df("./../Data Files/installments_payments.feather"); print("Inst paym DF complete")
#POS_cash_df = create_POS_cash_df("./../Data Files/POS_CASH_balance.feather"); print("POS cash DF complete")

Credit card DF complete


In [22]:
all_data.to_feather("./../Data Files/all_data.feather")
bureau_df.reset_index().to_feather("./../Data Files/bureau_sub.feather")
prev_app_df.reset_index().to_feather("./../Data Files/previous_application_sub.feather")
cc_df.reset_index().to_feather("./../Data Files/credit_card_balance_sub.feather")
install_payment_df.reset_index().to_feather("./../Data Files/installments_payments_sub.feather")
POS_cash_df.reset_index().to_feather("./../Data Files/POS_CASH_balance_sub.feather")

In [None]:
#all_data.drop(["TRAIN_DATA_LR_PREDS"], inplace = True, axis = 1)
bureau_df.drop(["BUREAU_LR_PREDS"], inplace = True, axis = 1)
prev_app_df.drop(["PREV_APP_LR_PREDS"], inplace = True, axis = 1)
cc_df.drop(["CC_LR_PREDS"], inplace = True, axis = 1)
install_payment_df.drop(["INST_PAY_LR_PREDS"], inplace = True, axis = 1)
POS_cash_df.drop(["POS_CASH_LR_PREDS"], inplace = True, axis = 1)

In [None]:
"""
AUC values for logistic regressions on individual files:
all_data: AUC=0.6407, 20 epochs
bureau: AUC=0.6047, 15 epochs
prev_app_df: AUC=0.5755, 5 epochs
cc_df: AUC = 0.6004, 4 epochs
install_payment_df: AUC = 0.5832, 4 epochs
POS_cash: AUC = 0.5209, 12 epochs
"""

In [48]:
all_data_pred, auc_list = log_regress_other_files(train_IDs, target_df, all_data, epochs = 20)
all_data["TRAIN_DATA_LR_PREDS"] = all_data_pred; print("all_data AUC: " + str(auc_list[-1]))

bureau_pred, auc_list = log_regress_other_files(train_IDs, target_df, bureau_df.reset_index(), epochs = 15)
bureau_df["BUREAU_LR_PREDS"] = bureau_pred; print("bureau AUC: " + str(auc_list[-1]))

prev_app_pred, auc_list = log_regress_other_files(train_IDs, target_df, prev_app_df.reset_index(), epochs = 5)
prev_app_df["PREV_APP_LR_PREDS"] = prev_app_pred; print("prev_app AUC: " + str(auc_list[-1]))

cc_pred, auc_list = log_regress_other_files(train_IDs, target_df, cc_df.reset_index(), epochs = 4)
cc_df["CC_LR_PREDS"] = cc_pred; print("cc AUC: " + str(auc_list[-1]))

install_payment_pred, auc_list = log_regress_other_files(train_IDs, target_df, install_payment_df.reset_index(), epochs = 4)
install_payment_df["INST_PAY_LR_PREDS"] = install_payment_pred; print("inst_pay AUC: " + str(auc_list[-1]))

POS_cash_pred, auc_list = log_regress_other_files(train_IDs, target_df, POS_cash_df.reset_index(), epochs = 12)
POS_cash_df["POS_CASH_LR_PREDS"] = POS_cash_pred; print("POS_cash AUC: " + str(auc_list[-1]))

all_data AUC: 0.6402261
bureau AUC: 0.6052712
prev_app AUC: 0.5715725
cc AUC: 0.60560316
inst_pay AUC: 0.58530426
POS_cash AUC: 0.52247834


In [49]:
supplemental_file_df = bureau_df.join(prev_app_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(cc_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(install_payment_df, how = "outer")
supplemental_file_df = supplemental_file_df.join(POS_cash_df, how = "outer")

In [50]:
all_data = all_data.join(supplemental_file_df, how = "left", on = "SK_ID_CURR")

In [51]:
train = all_data.iloc[:train_rows,:].copy()
test = all_data.iloc[train_rows:,:].copy()
train.reset_index(drop = True).to_feather("./../Solution attempts/v09 train data.feather")
test.reset_index(drop = True).to_feather("./../Solution attempts/v09 test data.feather")

In [52]:
train.drop(["SK_ID_CURR"], axis = 1, inplace = True)
test.drop(["SK_ID_CURR"], axis = 1, inplace = True)

In [21]:
pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target}).to_feather("./../Solution attempts/v09 target.feather")

In [53]:
# Train it up!
# Best AUC: 0.787882

from sklearn.model_selection import KFold

folds = KFold(n_splits = 8, shuffle = True)
auc_scores = []

feature_importance_df = pd.DataFrame({"Features":train.columns})

lgbm_parameters = {"n_estimators":7000,
                   "learning_rate":0.01,     #previous: 0.01
                   "num_leaves":28,          #previous: 32, 24 (opt)
                   "colsample_by_tree":0.8,  #previous: 0.8
                   "subsample":0.771,        #previous: 0.9, 0.771 (opt)
                   "max_depth":6,            #previous: 7, 4 (opt)
                   "reg_alpha":0.084,        #previous: 0.15, 0.084 (opt)
                   "reg_lambda":0.091,       #previous: 0.01, 0.091 (opt)
                   "min_split_gain":0.093,   #previous: 0.1, 0.093 (opt)
                   "min_child_weight":2}     #previous: 2

for n_fold, (train_ids, val_ids) in enumerate(folds.split(train)):
    print("#### FOLD NUMBER " + str(n_fold + 1) + " ####")
    train_train = train.iloc[train_ids]
    train_test = train.iloc[val_ids]
    target_train = target[train_ids]
    target_test = target[val_ids]
    
    clf = LGBMClassifier(**lgbm_parameters, device = "gpu")
    clf.fit(train_train, target_train, eval_set = [(train_train, target_train), (train_test, target_test)], eval_metric = "auc", early_stopping_rounds = 50, verbose = 100)
    auc_scores.append(clf._best_score["valid_1"]["auc"])
    feature_importance_df["Importance Fold " + str(n_fold + 1)] = pd.Series(clf.feature_importances_)

print("Done.")

#### FOLD NUMBER 1 ####
Training until validation scores don't improve for 50 rounds.
[100]	training's auc: 0.756955	valid_1's auc: 0.751152
[200]	training's auc: 0.770081	valid_1's auc: 0.762616
[300]	training's auc: 0.781246	valid_1's auc: 0.771429
[400]	training's auc: 0.789295	valid_1's auc: 0.77676
[500]	training's auc: 0.795303	valid_1's auc: 0.780162
[600]	training's auc: 0.800744	valid_1's auc: 0.782805
[700]	training's auc: 0.805333	valid_1's auc: 0.784604
[800]	training's auc: 0.809384	valid_1's auc: 0.786149
[900]	training's auc: 0.813186	valid_1's auc: 0.787273
[1000]	training's auc: 0.816626	valid_1's auc: 0.788158
[1100]	training's auc: 0.819982	valid_1's auc: 0.789146
[1200]	training's auc: 0.823089	valid_1's auc: 0.789934
[1300]	training's auc: 0.826168	valid_1's auc: 0.790637
[1400]	training's auc: 0.829251	valid_1's auc: 0.791295
[1500]	training's auc: 0.832027	valid_1's auc: 0.791734
[1600]	training's auc: 0.834724	valid_1's auc: 0.792161
[1700]	training's auc: 0.837

[100]	training's auc: 0.757577	valid_1's auc: 0.747425
[200]	training's auc: 0.771038	valid_1's auc: 0.75747
[300]	training's auc: 0.781803	valid_1's auc: 0.764977
[400]	training's auc: 0.7898	valid_1's auc: 0.77066
[500]	training's auc: 0.795945	valid_1's auc: 0.774364
[600]	training's auc: 0.801448	valid_1's auc: 0.777009
[700]	training's auc: 0.806116	valid_1's auc: 0.779004
[800]	training's auc: 0.81019	valid_1's auc: 0.780553
[900]	training's auc: 0.813904	valid_1's auc: 0.781698
[1000]	training's auc: 0.817389	valid_1's auc: 0.782714
[1100]	training's auc: 0.820667	valid_1's auc: 0.783642
[1200]	training's auc: 0.823893	valid_1's auc: 0.784361
[1300]	training's auc: 0.826927	valid_1's auc: 0.785076
[1400]	training's auc: 0.829824	valid_1's auc: 0.785492
[1500]	training's auc: 0.832493	valid_1's auc: 0.785785
[1600]	training's auc: 0.835164	valid_1's auc: 0.78619
[1700]	training's auc: 0.837887	valid_1's auc: 0.786584
[1800]	training's auc: 0.840322	valid_1's auc: 0.786816
[1900]	

In [55]:
feature_importance_df["Average Importance"] = feature_importance_df.iloc[:,1:].mean(axis=1)
feature_importance_df.to_csv("v09 Feature Importance.csv", index = False)
feature_importance_df

Unnamed: 0,Features,Importance Fold 1,Importance Fold 2,Importance Fold 3,Importance Fold 4,Importance Fold 5,Importance Fold 6,Importance Fold 7,Importance Fold 8,Average Importance
0,NAME_CONTRACT_TYPE,94,114,111,110,119,112,113,134,113.375
1,CODE_GENDER,364,344,336,341,308,337,321,348,337.375
2,FLAG_OWN_CAR,2,0,0,12,10,3,1,10,4.750
3,FLAG_OWN_REALTY,74,114,72,72,116,83,72,108,88.875
4,AMT_INCOME_TOTAL,414,444,318,370,375,363,251,613,393.500
5,AMT_CREDIT,717,925,674,724,725,828,566,934,761.625
6,AMT_ANNUITY,1049,1233,1027,988,967,1087,779,1401,1066.375
7,AMT_GOODS_PRICE,764,795,786,665,774,846,564,758,744.000
8,NAME_TYPE_SUITE,10,15,27,26,18,31,13,55,24.375
9,REGION_POPULATION_RELATIVE,689,725,704,619,686,705,525,879,691.500


In [54]:
print(auc_scores)
print(np.mean(auc_scores))
#785674

[0.7940682234835258, 0.7841439698910254, 0.7904556656997856, 0.788800528540201, 0.7865488807940229, 0.7881012916783382, 0.7879164425333838, 0.7830232890341303]
0.7878822864568016


In [56]:
# Last AUC @ 7000 iterations: 0.913970
clf2 = LGBMClassifier(**lgbm_parameters)
clf2.fit(train, target, eval_set = [(train, target)], eval_metric = "auc", early_stopping_rounds = 150, verbose = 100)

Training until validation scores don't improve for 150 rounds.
[100]	training's auc: 0.756434
[200]	training's auc: 0.769262
[300]	training's auc: 0.779958
[400]	training's auc: 0.787741
[500]	training's auc: 0.793565
[600]	training's auc: 0.798644
[700]	training's auc: 0.80303
[800]	training's auc: 0.806978
[900]	training's auc: 0.810546
[1000]	training's auc: 0.813851
[1100]	training's auc: 0.816998
[1200]	training's auc: 0.819952
[1300]	training's auc: 0.822786
[1400]	training's auc: 0.825463
[1500]	training's auc: 0.828017
[1600]	training's auc: 0.830407
[1700]	training's auc: 0.832711
[1800]	training's auc: 0.834945
[1900]	training's auc: 0.837211
[2000]	training's auc: 0.839459
[2100]	training's auc: 0.841654
[2200]	training's auc: 0.843927
[2300]	training's auc: 0.846043
[2400]	training's auc: 0.848232
[2500]	training's auc: 0.850157
[2600]	training's auc: 0.852101
[2700]	training's auc: 0.854064
[2800]	training's auc: 0.856014
[2900]	training's auc: 0.857971
[3000]	training's a

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_by_tree=0.8,
        colsample_bytree=1.0, importance_type='split', learning_rate=0.01,
        max_depth=6, min_child_samples=20, min_child_weight=2,
        min_split_gain=0.093, n_estimators=7000, n_jobs=-1, num_leaves=28,
        objective=None, random_state=None, reg_alpha=0.084,
        reg_lambda=0.091, silent=True, subsample=0.771,
        subsample_for_bin=200000, subsample_freq=0)

In [57]:
data_predictions = clf2.predict_proba(test, num_iteration = clf2.best_iteration_)
submission = pd.DataFrame({"SK_ID_CURR":test_IDs, "TARGET":data_predictions[:,1]})
submission.to_csv("v09_predictions.csv", index = False)