<h2>This notebook processes the data from the <TT>credit_card_balance</TT> supplementary data file.</h2>

In [16]:
import v10_common as com

import feather
import numpy as np
import pandas as pd
import scipy.stats as sstats

In [17]:
# Future improvements:
# - check which features are highest importance in full data, then add them to last N months aggregations
# - add monthly versions of POS/ATM/other drawing columns

In [18]:
# Location of the credit card balance feather file inside the shared data folder.
CC_FILE = f"{com.DATA_FILE_FOLDER}credit_card_balance.feather"

In [19]:
# Read the feather file at CC_FILE into the working frame used by every cell below.
cc = pd.read_feather(CC_FILE)

<h4> Correcting Apparent Errors </h4>

In [20]:
# Replace both drawing-amount columns with their absolute values in one vectorised step.
drawing_amt_cols = ["AMT_DRAWINGS_ATM_CURRENT", "AMT_DRAWINGS_CURRENT"]
cc[drawing_amt_cols] = cc[drawing_amt_cols].abs()

<h4> Feature Engineering </h4>

In [21]:
# Completely new features
# Row-level ratios of the drawing amounts to the payment total and to the overall drawing amount.
# NOTE(review): these four ratios are not guarded against a zero denominator, so they can
# contain inf/NaN that then flows into the aggregations further down — confirm this is intended.
cc["DRAWINGS_TOTAL_PAY_RATIO"] = cc["AMT_DRAWINGS_CURRENT"] / cc["AMT_PAYMENT_TOTAL_CURRENT"]
cc["PERCENT_AMT_ATM_DRAWINGS"] = cc["AMT_DRAWINGS_ATM_CURRENT"] / cc["AMT_DRAWINGS_CURRENT"]
cc["PERCENT_AMT_POS_DRAWINGS"] = cc["AMT_DRAWINGS_POS_CURRENT"] / cc["AMT_DRAWINGS_CURRENT"]
cc["PERCENT_AMT_OTHER_DRAWINGS"] = cc["AMT_DRAWINGS_OTHER_CURRENT"] / cc["AMT_DRAWINGS_CURRENT"]

# Average amount per drawing, with +/-inf (non-zero amount over a zero count) mapped to 0.
# The cleaned Series is assigned back to the frame: calling replace(..., inplace=True) on
# cc[col] only mutates an intermediate object, which pandas Copy-on-Write never propagates
# to `cc` (and which already raises a FutureWarning in pandas 2.x).
cc["AVG_PER_DRAWING"] = (
    cc["AMT_DRAWINGS_CURRENT"] / cc["CNT_DRAWINGS_CURRENT"]
).replace([np.inf, -np.inf], 0)

# Same calculation restricted to ATM drawings.
cc["AVG_PER_ATM_DRAWING"] = (
    cc["AMT_DRAWINGS_ATM_CURRENT"] / cc["CNT_DRAWINGS_ATM_CURRENT"]
).replace([np.inf, -np.inf], 0)

In [22]:
# Group the rows by SK_ID_CURR so the aggregations below produce one row per SK_ID_CURR value.
cc_gr = cc.groupby("SK_ID_CURR")

In [23]:
# possible geometric mean targets
# Summary statistics per SK_ID_CURR over every row of the credit card data.
# Aggregators are given as pandas string aliases instead of np.mean / np.std / builtin
# min, max, sum. pandas currently maps those callables onto the same groupby methods, but
# passing them is deprecated and a future release will invoke them directly — np.std would
# then silently switch from the sample (ddof=1) to the population (ddof=0) standard deviation.
cc_sub = cc_gr.agg({"SK_DPD_DEF": [lambda x: (x != 0).sum(), "max"],  # count of non-zero entries, and the maximum
                    "AMT_CREDIT_LIMIT_ACTUAL": ["mean", "max"],
                    "SK_ID_PREV": lambda x: len(x.unique()),  # number of distinct SK_ID_PREV values
                    "AMT_BALANCE": ["max", "mean", "std"],
                    "AMT_PAYMENT_TOTAL_CURRENT": ["max", "mean", "std"],
                    "CNT_DRAWINGS_CURRENT": ["max", "mean", "sum"],
                    "AMT_DRAWINGS_CURRENT": ["min", "max", "mean", "sum"],
                    "DRAWINGS_TOTAL_PAY_RATIO": ["min", "max", "mean"],
                    "AMT_DRAWINGS_ATM_CURRENT": ["mean", "sum", "std"],
                    "CNT_DRAWINGS_ATM_CURRENT": ["mean", "sum", "std"],
                    "AMT_DRAWINGS_POS_CURRENT": ["mean", "sum", "std"],
                    "CNT_DRAWINGS_POS_CURRENT": ["mean", "sum", "std"],
                    "AMT_DRAWINGS_OTHER_CURRENT": ["mean", "sum", "std"],
                    "CNT_DRAWINGS_OTHER_CURRENT": ["mean", "sum", "std"],
                    "MONTHS_BALANCE": "max",
                    "AMT_RECEIVABLE_PRINCIPAL": ["min", "sum", "max", "mean"],
                    "AMT_RECIVABLE": ["min", "sum", "max", "mean"],
                    "AMT_TOTAL_RECEIVABLE": ["min", "sum", "max", "mean"]})

# Flatten the (column, aggregator) MultiIndex into readable feature names.
# The names are positional: they must stay in exactly the same order as the dict entries
# (and the aggregator lists) above, one name per produced column.
cc_sub.columns = ["NUM_LATE_CC_PAYMENTS","MOST_OVERDUE_CC",
                  "AVG_CREDIT_LIMIT_ACTUAL","MAX_CREDIT_LIMIT",
                  "NUM_PREV_CC_LOANS",
                  "MAX_BALANCE","AVG_CC_BALANCE","AMT_BALANCE_STDEV",
                  "MAX_PAID_ON_CREDIT","AVG_PAID_ON_CREDIT","AMT_PAID_ON_CREDIT_STDEV",
                  "MAX_DRAWINGS_IN_MONTH","AVG_DRAWINGS_PER_MONTH","TOTAL_NUM_ALL_DRAWINGS",
                  "MIN_MONTHLY_DRAWING_AMT","MAX_MONTHLY_DRAWING_AMT","AVG_MONTHLY_DRAWING_AMT","TOTAL_AMT_ALL_DRAWINGS",
                  "MIN_DRAW_TO_TOTAL_PAY","MAX_DRAW_TO_TOTAL_PAY","AVG_DRAW_TO_TOTAL_PAY",
                  "AVG_AMT_ATM_DRAWINGS", "TOTAL_AMT_ATM_DRAWINGS", "STD_AMT_ATM_DRAWINGS",
                  "AVG_NUM_ATM_DRAWINGS", "TOTAL_NUM_ATM_DRAWINGS", "STD_NUM_ATM_DRAWINGS",
                  "AVG_AMT_POS_DRAWINGS", "TOTAL_AMT_POS_DRAWINGS", "STD_AMT_POS_DRAWINGS",
                  "AVG_NUM_POS_DRAWINGS", "TOTAL_NUM_POS_DRAWINGS", "STD_NUM_POS_DRAWINGS",
                  "AVG_AMT_OTHER_DRAWINGS","TOTAL_AMT_OTHER_DRAWINGS", "STD_AMT_OTHER_DRAWINGS",
                  "AVG_NUM_OTHER_DRAWINGS","TOTAL_NUM_OTHER_DRAWINGS", "STD_NUM_OTHER_DRAWINGS",
                  "MOST_RECENT_CC_MONTH",
                  "AMT_RECEIVABLE_PRINCIPAL_MIN","AMT_RECEIVABLE_PRINCIPAL_TOTAL","AMT_RECEIVABLE_PRINCIPAL_MAX","AMT_RECEIVABLE_PRINCIPAL_AVG",
                  "AMT_RECEIVABLE_MIN", "AMT_RECEIVABLE_TOTAL", "AMT_RECEIVABLE_MAX", "AMT_RECEIVABLE_AVG",
                  "AMT_TOTAL_RECEIVABLE_MIN", "AMT_TOTAL_RECEIVABLE_TOTAL", "AMT_TOTAL_RECEIVABLE_MAX", "AMT_TOTAL_RECEIVABLE_AVG"]

# Ratios built from the summed totals in cc_sub.
# Each entry maps the new column to its (numerator, denominator) pair; insertion order
# determines the order in which the columns are appended.
overall_ratio_specs = {
    "OVERALL_AVG_ALL_DRAWING_AMT": ("TOTAL_AMT_ALL_DRAWINGS", "TOTAL_NUM_ALL_DRAWINGS"),
    "OVERALL_AVG_ATM_DRAWING_AMT": ("TOTAL_AMT_ATM_DRAWINGS", "TOTAL_NUM_ATM_DRAWINGS"),
    "OVERALL_PERCENT_AMT_ATM_DRAWINGS": ("TOTAL_AMT_ATM_DRAWINGS", "TOTAL_AMT_ALL_DRAWINGS"),
    "OVERALL_PERCENT_AMT_POS_DRAWINGS": ("TOTAL_AMT_POS_DRAWINGS", "TOTAL_AMT_ALL_DRAWINGS"),
    "OVERALL_PERCENT_AMT_OTHER_DRAWINGS": ("TOTAL_AMT_OTHER_DRAWINGS", "TOTAL_AMT_ALL_DRAWINGS"),
}
for ratio_col, (numer_col, denom_col) in overall_ratio_specs.items():
    cc_sub[ratio_col] = cc_sub[numer_col] / cc_sub[denom_col]

In [24]:
# add data for last N months
# For each window, keep only rows with MONTHS_BALANCE >= -N and aggregate them per SK_ID_CURR.
# Aggregators are pandas string aliases rather than np.mean / np.std / builtin sum: passing
# those callables to agg is deprecated, and once pandas calls them directly np.std would
# change from the sample (ddof=1) to the population (ddof=0) standard deviation.
cc_n_months = []
for month in [1,6,12,24]:
    cc_recent = cc[cc["MONTHS_BALANCE"] >= -month]
    cc_recent_gr = cc_recent.groupby("SK_ID_CURR")
    cc_recent_sub = cc_recent_gr.agg({"SK_DPD_DEF": lambda x: (x != 0).sum(),  # count of non-zero entries
                                      "AMT_PAYMENT_TOTAL_CURRENT": "mean",
                                      "AMT_BALANCE": ["std", "mean"],
                                      "CNT_DRAWINGS_CURRENT": "sum",
                                      "AMT_DRAWINGS_CURRENT": "sum"})

    # Positional rename: order must match the dict entries (and aggregator lists) above.
    cc_recent_sub.columns = [f"NUM_LATE_CC_PAYMENTS_{month}_MONTHS",
                             f"AVG_PAID_ON_CREDIT_{month}_MONTHS",
                             f"AMT_BALANCE_STD_{month}_MONTHS",
                             f"AVG_BALANCE_{month}_MONTHS",
                             f"NUM_DRAWINGS_{month}_MONTHS",
                             f"AMT_DRAWINGS_{month}_MONTHS"]

    # Average amount per drawing inside the window (not guarded against a zero drawing count).
    cc_recent_sub[f"AVG_AMT_PER_DRAWING_{month}_MONTHS"] = cc_recent_sub[f"AMT_DRAWINGS_{month}_MONTHS"] / cc_recent_sub[f"NUM_DRAWINGS_{month}_MONTHS"]

    cc_n_months.append(cc_recent_sub)
    print(f"{month} months complete.")

# Left-join every window onto the full cc_sub index, so SK_ID_CURR values with no rows
# inside a window still get a row (filled with NaN for that window's columns).
cc_recent_data = pd.DataFrame(index = cc_sub.index).join(cc_n_months, how = "left")

1 months complete.
6 months complete.
12 months complete.
24 months complete.


In [25]:
# Place the full-history aggregates and the last-N-months aggregates side by side.
cc_data = pd.concat((cc_sub, cc_recent_data), axis="columns")

<h4> Logistic regression predictions.  Several columns are stripped out due to multicollinearity issues (the threshold point for this is a correlation of greater than 0.75 between two variables).  A few other columns are transformed so that they have higher correlations with the TARGET variable (which seems to help the models along); since the ranking order of the points stays the same in these circumstances, the gradient boosting shouldn't be particularly affected. </h4>

<h4> LAST AUC VALUE: 0.6676 </h4>

High Correlation Pairs:
    
NOT CHECKED YET

In [26]:
# Inputs for the logistic-regression cell that follows.
target_df = pd.read_feather("target.feather")

# Column name -> number, passed to com.add_polynomial_terms
# (presumably the exponent applied to that column — confirm in v10_common).
cc_poly = dict(
    AVG_DRAWINGS_PER_MONTH=0.5,
    TOTAL_AMT_ALL_DRAWINGS=0.1,
    TOTAL_AMT_ATM_DRAWINGS=0.1,
    TOTAL_NUM_ATM_DRAWINGS=0.1,
    AVG_MONTHLY_DRAWING_AMT=0.3,
    NUM_DRAWINGS_12_MONTHS=0.2,
    NUM_DRAWINGS_24_MONTHS=0.2,
)

# Passed to com.log_regress_other_files; currently empty.
high_cor_columns = list()

In [27]:
# Call com.log_regress_other_files eight times, printing and collecting each returned AUC.
# `pred` is rebound on every pass, so only the final run's predictions remain afterwards.
test_aucs = []
for _ in range(8):
    # Rebuilt on each pass from a fresh copy, exactly as the helper call expects its input.
    lr_features = com.add_polynomial_terms(cc_data.reset_index().copy(), cc_poly)
    pred, auc = com.log_regress_other_files(lr_features, target_df, high_cor_columns)
    test_aucs.append(auc)
    print(auc)
avg_auc = np.mean(test_aucs)
print("Avg AUC: " + str(avg_auc))

0.6633906960057725
0.6645702654659691
0.669772328068106
0.672654807992719
0.667683189620813
0.671168647775882
0.6657614754393216
0.6659611375850988
Avg AUC: 0.6676203184942103


In [28]:
# Attach the predictions left over from the logistic-regression loop as a new feature column.
# NOTE(review): `pred` comes from the last of the 8 runs only, and this assignment assumes it
# lines up with cc_data's rows (by position, or by index if it is a Series) — confirm against
# com.log_regress_other_files.
cc_data["CC_LR_PREDS"] = pred

In [29]:
# Write the finished feature table to disk; reset_index() moves the index back into an
# ordinary column first so it is stored in the feather file alongside the features.
cc_data.reset_index().to_feather("credit_card_sub.feather")