<h2>This notebook processes the supplementary <code>installments_payments</code> data file.</h2>

In [1]:
import v11_common as com

import feather
import numpy as np
import pandas as pd
import re

In [2]:
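# Future work:
#  - track down the divide-by-zero RuntimeWarning raised by np.log while building
#    the aggregation DataFrame (see the `log_a = np.log(a)` warning under that cell)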

In [3]:
# Location of the raw installments_payments table inside the shared data folder.
IP_FILE_PATH = f"{com.DATA_FILE_FOLDER}installments_payments.feather"

In [4]:
# Read the raw installment/payment rows into memory.
install_pay = pd.read_feather(path=IP_FILE_PATH)

In [5]:
# New features
# A missing payment amount / entry date is filled with the scheduled instalment
# amount / date. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is a chained in-place update that
# leaves the parent frame untouched when pandas Copy-on-Write is enabled.
install_pay["AMT_PAYMENT"] = install_pay["AMT_PAYMENT"].fillna(install_pay["AMT_INSTALMENT"])
install_pay["DAYS_ENTRY_PAYMENT"] = install_pay["DAYS_ENTRY_PAYMENT"].fillna(install_pay["DAYS_INSTALMENT"])

# Ratio of the amount paid to the amount due. Dividing by a zero instalment
# yields +/-inf, which is stored as NaN instead.
install_pay["FRACTION_INSTALLMENT_PAID"] = (
    (install_pay["AMT_PAYMENT"] / install_pay["AMT_INSTALMENT"])
    .replace([np.inf, -np.inf], np.nan)
)

# Positive when the entry date precedes the instalment date (the EARLY case
# below), negative when it follows it (the LATE case).
install_pay["DAYS_OFF_PAYMENT"] = install_pay["DAYS_INSTALMENT"] - install_pay["DAYS_ENTRY_PAYMENT"]

# Rows matching none of the conditions (NaN on either side) get the label "0".
# The default is spelled out as a string because np.select's implicit integer
# default 0 has no common dtype with the string choices and raises TypeError on
# current NumPy; "0" is the value older NumPy produced by casting that default,
# so the resulting categories are unchanged.
conditions = [install_pay["AMT_PAYMENT"] < install_pay["AMT_INSTALMENT"],
              install_pay["AMT_PAYMENT"] == install_pay["AMT_INSTALMENT"],
              install_pay["AMT_PAYMENT"] > install_pay["AMT_INSTALMENT"]]
choices = ["UNDER", "EXACT_AMT", "OVER"]
install_pay["PAYMENT_LEVEL"] = np.select(conditions, choices, default="0")

# Same three-way classification for timing, comparing entry date to due date.
conditions = [install_pay["DAYS_ENTRY_PAYMENT"] > install_pay["DAYS_INSTALMENT"],
              install_pay["DAYS_ENTRY_PAYMENT"] == install_pay["DAYS_INSTALMENT"],
              install_pay["DAYS_ENTRY_PAYMENT"] < install_pay["DAYS_INSTALMENT"]]
choices = ["LATE", "ON_TIME", "EARLY"]
install_pay["PAYMENT_TIME"] = np.select(conditions, choices, default="0")

In [6]:
# One group per applicant id; reused by the aggregation cells below.
install_pay_gr = install_pay.groupby(by="SK_ID_CURR")

In [7]:
# Per-SK_ID_CURR summary statistics of the payment features.
# "min"/"mean"/"max" are passed by name: handing the builtins min/max or np.mean
# to groupby.agg is deprecated in pandas (FutureWarning since 2.1), and the
# string names select the same groupby reductions.
# com.log_average and com.geom_mean come from v11_common; the divide-by-zero
# RuntimeWarning printed under this cell presumably originates in one of them
# (e.g. a log of a zero value) -- TODO confirm.
install_pay_sub = install_pay_gr.agg({"AMT_PAYMENT": ["min", "mean", "max", com.log_average],
                                      "AMT_INSTALMENT": ["min", "mean", "max", com.log_average],
                                      "DAYS_OFF_PAYMENT": ["min", "mean", "max"],
                                      "FRACTION_INSTALLMENT_PAID": ["min", "mean", "max", com.geom_mean]})

# Flatten the (column, statistic) MultiIndex; the names must stay in the same
# order as the aggregation spec above.
# NOTE(review): DAYS_OFF_PAYMENT is DAYS_INSTALMENT - DAYS_ENTRY_PAYMENT, so its
# minimum is the latest payment relative to the due date. The BEST_/WORST_ labels
# therefore look swapped -- confirm before renaming, since downstream code may
# rely on these column names.
install_pay_sub.columns = ["MIN_PAYMENT", "AVG_PAYMENT", "MAX_PAYMENT", "LOG_AVG_PAYMENT",
                           "MIN_INSTALLMENT", "AVG_INSTALLMENT", "MAX_INSTALLMENT", "LOG_AVG_INSTALLMENT",
                           "BEST_PAYMENT_DATE", "AVG_DAYS_OFF_PAYMENT", "WORST_PAYMENT_DATE",
                           "MIN_INSTALL_FRAC_PAID", "AVG_INSTALL_FRAC_PAID", "MAX_INSTALL_FRAC_PAID", "GMEAN_INSTALL_FRAC_PAID"]

# Number of installment rows recorded for each SK_ID_CURR.
install_pay_sub["NUM_INSTALLMENT_ENTRIES"] = install_pay_gr.size()

  log_a = np.log(a)


In [8]:
# Per-applicant features derived from the PAYMENT_LEVEL labels
# (presumably counts and fractions per label -- confirm in v11_common).
install_pay_level = com.count_frac_cols(
    install_pay_gr,
    middle_string = "PAYMENTS",
    col = "PAYMENT_LEVEL",
)

In [9]:
# Per-applicant features derived from the PAYMENT_TIME labels
# (presumably counts and fractions per label -- confirm in v11_common).
install_pay_times = com.count_frac_cols(
    install_pay_gr,
    middle_string = "PAYMENTS",
    col = "PAYMENT_TIME",
)

In [10]:
# Repeat the payment aggregations on trailing windows: for each window size,
# keep only rows whose DAYS_INSTALMENT is at least -days.
# (Presumably DAYS_INSTALMENT counts days relative to the application date, so
# this selects the most recent installments -- confirm against the data dictionary.)
install_pay_n_days = []
for days in [30,120,365,730]:
    install_pay_recent = install_pay[install_pay["DAYS_INSTALMENT"] >= -days]
    install_pay_recent_gr = install_pay_recent.groupby("SK_ID_CURR")

    # Reductions are named as strings: passing the builtins min/max or np.mean
    # to groupby.agg is deprecated in pandas (FutureWarning since 2.1), and the
    # string names select the same groupby reductions.
    install_pay_recent_sub = install_pay_recent_gr.agg({"AMT_PAYMENT": ["min", "mean", "max", com.log_average]})
    install_pay_recent_sub.columns = [f"MIN_PAYMENT_{days}_DAYS",
                                      f"AVG_PAYMENT_{days}_DAYS",
                                      f"MAX_PAYMENT_{days}_DAYS",
                                      f"LOG_AVG_PAYMENT_{days}_DAYS"]

    # Number of installment rows inside the window for each SK_ID_CURR.
    install_pay_recent_sub[f"NUM_INSTAL_PAY_{days}_DAYS"] = install_pay_recent_gr.size()

    # The window size is embedded in middle_string so each window's label
    # features get distinct column names.
    install_pay_recent_level = com.count_frac_cols(install_pay_recent_gr,
                                                   col = "PAYMENT_LEVEL",
                                                   middle_string = f"PAYMENTS_{days}_DAYS")

    install_pay_recent_times = com.count_frac_cols(install_pay_recent_gr,
                                                   col = "PAYMENT_TIME",
                                                   middle_string = f"PAYMENTS_{days}_DAYS")

    install_pay_n_days.append(pd.concat([install_pay_recent_sub, install_pay_recent_level, install_pay_recent_times], axis = 1))
    print(f"{days} days complete.")

# Left-join every window onto the full SK_ID_CURR index so applicants with no
# rows inside a window are kept, with NaN in that window's columns.
install_pay_recent_data = pd.DataFrame(index = install_pay_sub.index).join(install_pay_n_days, how = "left")

30 days complete.
120 days complete.
365 days complete.
730 days complete.


In [11]:
# Place all per-applicant feature tables side by side; a column-wise concat
# aligns the frames on their index.
install_pay_dfs = [
    install_pay_sub,
    install_pay_level,
    install_pay_times,
    install_pay_recent_data,
]
install_pay_data = pd.concat(install_pay_dfs, axis = "columns")

<h4> Logistic regression predictions. Several columns are stripped out because of multicollinearity (the threshold is a correlation greater than 0.75 between two variables). A few other columns are transformed so that they correlate more strongly with the TARGET variable, which seems to help the models along; because these transformations preserve the rank order of the values, the gradient boosting shouldn't be particularly affected. </h4>

<h4> LAST AUC VALUE: 0.6100 </h4>

High Correlation Pairs:
    
NOT CHECKED YET

In [12]:
# Inputs for the logistic regression step.
target_df = pd.read_feather("target.feather")

# Passed to com.add_polynomial_terms below
# (presumably a column -> exponent mapping -- confirm in v11_common).
install_pay_poly = {
    "MIN_PAYMENT": 0.5,
    "LOG_AVG_PAYMENT": 0.5,
    "NUM_PAYMENTS_UNDER": 0.1,
}

# No columns are currently listed here.
high_cor_columns = list()

In [13]:
# Run the logistic regression helper and report its AUC.
# `pred` from the (single) iteration is reused by the next cell.
test_aucs = []
for _ in range(1):
    pred, auc = com.log_regress_other_files(
        com.add_polynomial_terms(install_pay_data.reset_index().copy(), install_pay_poly),
        target_df,
        high_cor_columns,
    )
    print(auc)
    test_aucs.append(auc)
print("Avg AUC: " + str(np.mean(test_aucs)))

0.6097531476928157
Avg AUC: 0.6097531476928157


In [14]:
# Store the logistic regression predictions as an extra feature column.
# NOTE(review): if `pred` is a pandas Series, this assignment aligns on index
# labels (install_pay_data's index comes from the SK_ID_CURR groupby) rather
# than by position -- confirm what com.log_regress_other_files returns.
install_pay_data["INST_PAY_LR_PREDS"] = pred

In [15]:
# Write the feature table out; reset_index() turns the index into a regular
# column before the frame is serialised.
(
    install_pay_data
    .reset_index()
    .to_feather("installments_payments_sub.feather")
)