<h2>This file is for processing the <TT>application_train</TT> and <TT>application_test</TT> data files.</h2>

In [1]:
import v10_common as com

import feather
import numpy as np
import pandas as pd
import scipy.stats as sstats
import re

In [2]:
# Further work:
# - try making REGION_RATING_CLIENT a three-level factor
# - NAME_FAMILY_STATUS: "Unknown" should be NA, try dividing rest into "Married" and "Not Married"
# - try keeping  (TODO: this note was left unfinished -- fill in what should be kept)

In [3]:
# Read the raw application tables (feather format) from the folder named by com.DATA_FILE_FOLDER.
train_path = com.DATA_FILE_FOLDER + "application_train.feather"
test_path = com.DATA_FILE_FOLDER + "application_test.feather"
train, test = pd.read_feather(train_path), pd.read_feather(test_path)

In [4]:
# Set aside the label column and the training-row IDs while they are still
# attached to the training frame; both are written out at the end of the notebook.
target, train_IDs = train["TARGET"], train["SK_ID_CURR"]

In [5]:
# The label must not travel with the features; `target` already holds a copy of it.
train = train.drop(columns = "TARGET")

<h4> Engineering of the train and test data.  For consistency's sake, the train and test data are combined into a single dataframe, the needed operations are run on it, and then they are split apart. </h4>

In [6]:
# Stack the training rows on top of the test rows under a fresh 0..n-1 index,
# so that every later transformation is applied identically to both.
all_data = pd.concat((train, test), axis = 0, ignore_index = True)

In [7]:
# Row-wise sum of FLAG_DOCUMENT_2 .. FLAG_DOCUMENT_21, i.e. the total number of
# documents provided (assuming the flags are 0/1 indicators).
document_columns = [f"FLAG_DOCUMENT_{doc_num}" for doc_num in range(2, 22)]
all_data["NUM_DOCUMENTS_PROVIDED"] = all_data.loc[:, document_columns].sum(axis = "columns")

In [8]:
# Simplify ORGANIZATION_TYPE a bit; see if this helps anything
#   - the numbered "Industry: type N", "Trade: type N" and "Transport: type N"
#     sub-categories are each collapsed into a single level
#   - the "XNA" placeholder and a handful of categories judged useless become NaN
industry_types = ["Industry: type " + str(x) for x in range(1,14)]
trade_types = ["Trade: type " + str(x) for x in range(1,8)]
transport_types = ["Transport: type " + str(x) for x in [1,2,3,4]]
useless_ORGANIZATION_TYPE = ["Culture","Electricity","Emergency","Hotel","Insurance","Legal Services",
                             "Mobile", "Religion", "Telecom"]

# Build the cleaned column on a local Series and assign it back in one step.
# Calling .replace(..., inplace = True) on all_data["ORGANIZATION_TYPE"] operates
# on a column selection, which pandas does not guarantee will write through to
# the frame (it never does under Copy-on-Write).
organization_type = all_data["ORGANIZATION_TYPE"]
organization_type = organization_type.replace(industry_types, "Industry")
organization_type = organization_type.replace(trade_types, "Trade")
organization_type = organization_type.replace(transport_types, "Transport")
organization_type = organization_type.replace("XNA", np.nan)
organization_type = organization_type.replace(useless_ORGANIZATION_TYPE, np.nan)
all_data["ORGANIZATION_TYPE"] = organization_type

In [9]:
# Transform some original columns

# Fold the "XNA" gender placeholder into "F".
all_data.loc[all_data["CODE_GENDER"] == "XNA", "CODE_GENDER"] = "F"

# 365243 in DAYS_EMPLOYED is treated as a missing-value sentinel.  Series.mask
# replaces the flagged rows with NaN and upcasts the column as needed, which
# avoids writing a float NaN into an integer column through .loc.
all_data["DAYS_EMPLOYED"] = all_data["DAYS_EMPLOYED"].mask(all_data["DAYS_EMPLOYED"] == 365243)

# Collapse NAME_TYPE_SUITE to two levels: every non-missing value other than
# "Unaccompanied" becomes "Accompanied"; missing values stay missing.
name_type_suite = all_data["NAME_TYPE_SUITE"]
is_accompanied = name_type_suite.notnull() & (name_type_suite != "Unaccompanied")
all_data.loc[is_accompanied, "NAME_TYPE_SUITE"] = "Accompanied"

# Remove useless NAME_INCOME_TYPE values (assigned back rather than replaced
# in place on a column selection, which is not guaranteed to update the frame)
useless_NAME_INCOME_TYPE = ["Businessman","Pensioner","Student","Unemployed"]
all_data["NAME_INCOME_TYPE"] = all_data["NAME_INCOME_TYPE"].replace(useless_NAME_INCOME_TYPE, np.nan)

# Remove useless OCCUPATION_TYPE values
# NOTE(review): the original comment said to hang on to 'Private service staff'
# since LR might find it useful, yet the value is in this list and is therefore
# blanked out.  Behaviour is left unchanged here -- confirm which one is intended.
useless_OCCUPATION_TYPE = ["Cooking staff", "HR staff", "Private service staff", "Realty agents"]
all_data["OCCUPATION_TYPE"] = all_data["OCCUPATION_TYPE"].replace(useless_OCCUPATION_TYPE, np.nan)

In [10]:
# All-new features
# Each entry is (new column, numerator column, denominator column).  The list
# order is the order in which the columns are appended to all_data.
# These ratios use the raw AMT_INCOME_TOTAL, so this cell has to run before the
# cell further down that log10-transforms that column.
ratio_features = [
    ("CREDIT_INCOME_RATIO",  "AMT_CREDIT",      "AMT_INCOME_TOTAL"),
    ("INCOME_PER_HEAD",      "AMT_INCOME_TOTAL", "CNT_FAM_MEMBERS"),
    ("ANNUITY_INCOME_RATIO", "AMT_ANNUITY",     "AMT_INCOME_TOTAL"),
    ("ANNUITY_CREDIT_RATIO", "AMT_ANNUITY",     "AMT_CREDIT"),
    ("GOODS_INCOME_RATIO",   "AMT_GOODS_PRICE", "AMT_INCOME_TOTAL"),
    ("GOODS_CREDIT_RATIO",   "AMT_GOODS_PRICE", "AMT_CREDIT"),
    ("FRAC_DAYS_EMPLOYED",   "DAYS_EMPLOYED",   "DAYS_BIRTH"),
    ("FRAC_CHILDREN",        "CNT_CHILDREN",    "CNT_FAM_MEMBERS"),
]
for feature_name, numerator, denominator in ratio_features:
    all_data[feature_name] = all_data[numerator] / all_data[denominator]

# Row-wise summaries of the three external scores: EXT_SOURCE_SUM / _MEAN / _MIN / _MAX.
ext_sources = all_data[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]
for func_name in ["SUM","MEAN","MIN","MAX"]:
    # Look the reducer up by attribute (np.sum, np.mean, np.min, np.max)
    # instead of eval()-ing a code string.
    reducer = getattr(np, func_name.lower())
    all_data["EXT_SOURCE_" + func_name] = reducer(ext_sources, axis = 1)

In [11]:
# Bucket CNT_CHILDREN into the string categories "0", "1" and "2+".
conditions = [all_data["CNT_CHILDREN"] == 0, all_data["CNT_CHILDREN"] == 1, all_data["CNT_CHILDREN"] >= 2]
choices = ["0","1","2+"]
# np.select's implicit default is the integer 0, which recent NumPy versions
# refuse to combine with string choices (TypeError: no common dtype).  Older
# versions silently turned it into the string "0", so pass that explicitly:
# rows matching none of the conditions keep landing in the "0" bucket.
all_data["CNT_CHILDREN"] = np.select(conditions, choices, default = "0")

# Replace total income with its base-10 logarithm.
all_data["AMT_INCOME_TOTAL"] = np.log10(all_data["AMT_INCOME_TOTAL"])

In [12]:
# Columns that matter to neither the logistic regression nor LGBM are removed
unimportant_columns = [
    "FLAG_DOCUMENT_4",
    "FLAG_DOCUMENT_7",
    "FLAG_DOCUMENT_10",
    "FLAG_DOCUMENT_12",
    "FLAG_DOCUMENT_21",
    "FONDKAPREMONT_MODE",
    "FLAG_MOBIL",
    "FLAG_EMP_PHONE",
    "AMT_REQ_CREDIT_BUREAU_HOUR",
    "HOUSETYPE_MODE",
    "EMERGENCYSTATE_MODE",
]
all_data = all_data.drop(columns = unimportant_columns)

In [13]:
# One-hot encode every column that is still of object (string) dtype.
# The encoding itself is delegated to com.string_col_to_onehot; the encoded
# frames are collected in onehot_dfs and the source columns are then removed.
object_cols = [name for name, col_dtype in all_data.dtypes.items() if col_dtype == "O"]
onehot_dfs = [com.string_col_to_onehot(all_data, name) for name in object_cols]
all_data = all_data.drop(columns = object_cols)

In [14]:
# Append the one-hot frames to the right of the remaining columns.
all_data = pd.concat([all_data, *onehot_dfs], axis = "columns")

<h4> Logistic regression predictions.  Several columns are stripped out due to multicollinearity issues (the threshold point for this is a correlation of greater than 0.75 between two variables).  A few other columns are transformed so that they have higher correlations with the TARGET variable (which seems to help the models along); since the ranking order of the points stays the same in these circumstances, the gradient boosting shouldn't be particularly affected. </h4>

<h4> LAST AUC VALUE: 0.7160 </h4>

High Correlation Pairs:
    
NOT CHECKED YET

In [15]:
# Objects needed for logistic regression

# ID/label pairs for the training rows.  Built in memory from the values set
# aside at the top of the notebook rather than read from "target.feather":
# that file is only written by the final cell, so reading it here makes a
# fresh Restart & Run All fail (or silently pick up a stale file).
target_df = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target})

# Passed to com.add_polynomial_terms below.
# NOTE(review): presumably column name -> polynomial degree; confirm in v10_common.
all_data_poly = {"DAYS_BIRTH":2,"DAYS_REGISTRATION":2,"DAYS_ID_PUBLISH":2}

# Passed to com.log_regress_other_files below.  Empty for now: the
# high-correlation pairs have not been checked yet for this file.
high_cor_columns = []

In [16]:
# Fit the logistic regression several times, printing each AUC and then the average.
# `pred` keeps the predictions of the final run once the loop has finished.
N_LR_RUNS = 5
test_aucs = []
for _run in range(N_LR_RUNS):
    # A fresh copy with polynomial terms is built for every run.
    lr_input = com.add_polynomial_terms(all_data.copy(), all_data_poly)
    pred, auc = com.log_regress_other_files(lr_input, target_df, high_cor_columns)
    test_aucs.append(auc)
    print(auc)
print("Avg AUC: " + str(np.mean(test_aucs)))

0.7172525981704014
0.7159998776126388
0.715340976041085
0.7128817480973778
0.7149935648610101
Avg AUC: 0.7152937529565027


In [17]:
# Store the logistic-regression predictions as a new feature column.
# `pred` is whatever the last iteration of the loop above returned; the
# predictions from the earlier runs are discarded.
# NOTE(review): the column is named TRAIN_DATA_... but is set on the combined
# train+test frame -- confirm that `pred` covers every row of all_data.
all_data["TRAIN_DATA_LR_PREDS"] = pred

In [18]:
# Persist the engineered feature frame, then the ID/label pairs of the training rows.
all_data.to_feather("all_data.feather")
id_and_label = pd.DataFrame({"SK_ID_CURR":train_IDs, "TARGET":target})
id_and_label.to_feather("target.feather")