<h2>This file is for processing the data from the supplementary <TT>POS_CASH_balance</TT> data file. </h2>

In [9]:
import v10_common as com

import feather
import numpy as np
import pandas as pd
import scipy.stats as sstats

In [10]:
# Future work:
# - Some columns have NAME_CONTRACT_STATUS = "Completed" despite having non-zero CNT_INSTALMENT_FUTURE
# - operations on number of installments for each SK_ID_PREV; should probably .agg with mode() to get it

In [11]:
POS_CASH_FILE = com.DATA_FILE_FOLDER + "POS_CASH_balance.feather"

In [12]:
POS_cash = pd.read_feather(POS_CASH_FILE)

In [13]:
POS_cash_gr = POS_cash.sort_values(["SK_ID_CURR", "MONTHS_BALANCE"]).groupby("SK_ID_CURR")

In [15]:
POS_cash_sub = POS_cash_gr.agg({"SK_DPD_DEF":[lambda x: sum(x > 0),max,np.mean]})
POS_cash_sub.columns = ["NUM_LATE_POS_PAYMENTS", "MAX_POS_DPD", "AVG_POS_DPD"]
POS_cash_sub["NUM_POS_ENTRIES"] = POS_cash_gr.size()

In [16]:
POS_cash_status = POS_cash_gr["NAME_CONTRACT_STATUS"].value_counts().unstack("NAME_CONTRACT_STATUS")
POS_cash_sub["NUM_CONTRACTS_COMPLETED"] = POS_cash_status["Completed"]

In [17]:
# Figure out current statuses 
POS_last_month = POS_cash[POS_cash["MONTHS_BALANCE"] == -1]
POS_last_month_agg = POS_last_month.groupby("SK_ID_CURR").agg({"CNT_INSTALMENT_FUTURE":[sum,lambda x: sum(x != 0)]})
POS_last_month_agg.columns = ["NUM_INSTALMENTS_PENDING","NUM_ACCOUNTS_ACTIVE"]

In [18]:
POS_n_months = []
for month in [3,6,12,24]:
    POS_recent = POS_cash[POS_cash["MONTHS_BALANCE"] > -month]
    POS_recent_gr = POS_recent.groupby("SK_ID_CURR")
    POS_recent_sub = POS_recent_gr.agg({"SK_DPD_DEF":lambda x: sum(x != 0),
                                        "SK_DPD": lambda x: sum(x != 0)})
    POS_recent_sub.columns = [f"NUM_LATE_POS_PAYMENTS_{month}_MONTHS_DEF",
                              f"NUM_LATE_POS_PAYMENTS_{month}_MONTHS"]
    
    POS_recent_status = POS_recent_gr["NAME_CONTRACT_STATUS"].value_counts().unstack("NAME_CONTRACT_STATUS")
    POS_recent_sub[f"NUM_CONTRACTS_COMPLETED_{month}_MONTHS"] = POS_recent_status["Completed"]
    
    POS_n_months.append(POS_recent_sub)
    print(f"{month} months completed.")

POS_months_data = pd.DataFrame(index = POS_cash_sub.index).join(POS_n_months, how = "left")

3 months completed.
6 months completed.
12 months completed.
24 months completed.


In [19]:
POS_cash_data = POS_cash_sub.join([POS_last_month_agg, POS_months_data])
POS_cash_data.fillna(value = {"NUM_INSTALMENTS_PENDING":0,"NUM_ACCOUNTS_ACTIVE":0}, inplace = True)
POS_cash_data.fillna(value = {x:0 for x in POS_months_data.columns}, inplace = True)

<h4> Logistic regression predictions.  Several columns are stripped out due to multicollinearity issues (the threshold point for this is a correlation of greater than 0.75 between two variables).  A few other columns are transformed so that they have higher correlations with the TARGET variable (which seems to help the models along); since the ranking order of the points stays the same in these circumstances, the gradient boosting shouldn't be particularly affected. </h4>

<h4> LAST AUC VALUE: 0.5790 </h4>

High Correlation Pairs:
    
- MAX_POS_DPD & AVG_POS_DPD (0.9649)

In [20]:
# Load objects needed for logistic regression
target_df = pd.read_feather("target.feather")

POS_cash_poly = {"AVG_POS_DPD":0.1, "NUM_CONTRACTS_COMPLETED":0.2}

high_cor_columns = ["MAX_POS_DPD"]

In [21]:
# Make logistic regression predictions
test_aucs = []
for _ in range(8):
    pred, auc = com.log_regress_other_files(com.add_polynomial_terms(POS_cash_data.reset_index().copy(), POS_cash_poly),
                                            target_df,
                                            high_cor_columns)
    test_aucs.append(auc)
    print(auc)
print("Avg AUC: " + str(np.mean(test_aucs)))

0.5736604031045014
0.5851072603962046
0.5777737829410226
0.5762792800271173
0.5816914231023006
0.5802944711172141
0.5793405388935406
0.57810867349179
Avg AUC: 0.5790319791342113


In [22]:
POS_cash_data["POS_CASH_LR_PREDS"] = pred

In [23]:
POS_cash_data.reset_index().to_feather("POS_cash_sub.feather")