In [1]:
%reload_ext autoreload
%autoreload 1
import joblib
from ray import tune, train
import copy
import shap
import matplotlib.pyplot as plt
import polars as pl
import sys
import numpy as np
from sklearn.model_selection import train_test_split
import functions.transformers as tr
from functions.transformers import PolarsColumnTransformer as PCT
import functions.data_cleaning as dmf
import functions.plot_functions as pf
import functions.tuning as tunes
import functions.feature_engineering as feats
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from IPython.display import display,Markdown
import lightgbm as lgb
%aimport functions.tuning
%aimport functions.transformers
%aimport functions.data_cleaning
%aimport functions.plot_functions
%aimport functions.feature_engineering

In [2]:
# Load the four pre-split parquet files (single / joint applications,
# train / validation) in one pass; the unpacking order matches the path order.
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = map(
    pl.read_parquet,
    (
        "data/data_accepted_single_train.parquet",
        "data/data_accepted_single_val.parquet",
        "data/data_accepted_joint_train.parquet",
        "data/data_accepted_joint_val.parquet",
    ),
)

In [3]:
# Run the project cleaning routines over each split. Single and joint
# applications use separate cleaners from functions.data_cleaning.
data_train_single, data_val_single = map(
    dmf.clean_accepted_single, (data_train_single, data_val_single)
)
data_train_joint, data_val_joint = map(
    dmf.clean_accepted_joint, (data_train_joint, data_val_joint)
)

### Feature Engineering

In [4]:
# Derive date-based features from the loan issue date on every split.
# NOTE(review): presumably this is what produces the `month_sin` / `month_cos`
# columns seen in the preview further down — confirm in functions.feature_engineering.
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = (
    feats.date_features(frame, "issue_d")
    for frame in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [5]:
# Split every frame into a feature matrix (X_*) and a target series (y_*).
#
# `grade` is the prediction target. The other excluded columns are removed from
# the features as well.
# NOTE(review): `sub_grade` and `int_rate` are presumably dropped because they
# would leak the grade, and `issue_d` because date features were already
# derived from it — confirm.
TARGET_COLUMN = "grade"
NON_FEATURE_COLUMNS = [TARGET_COLUMN, "sub_grade", "int_rate", "issue_d"]

# Single-applicant loans.
# Columns are passed positionally: that form of `drop` is accepted by both
# older and newer polars releases.
X_train_single = data_train_single.drop(NON_FEATURE_COLUMNS)
y_train_single = data_train_single[TARGET_COLUMN]

X_val_single = data_val_single.drop(NON_FEATURE_COLUMNS)
y_val_single = data_val_single[TARGET_COLUMN]

# Joint-applicant loans.
# Fix: these used to be assigned to the *_single names, silently replacing the
# single-applicant training split with the joint one.
X_train_joint = data_train_joint.drop(NON_FEATURE_COLUMNS)
y_train_joint = data_train_joint[TARGET_COLUMN]

# Fix: the validation target must come from the validation frame, not from the
# training frame, so that it lines up row-for-row with X_val_joint.
X_val_joint = data_val_joint.drop(NON_FEATURE_COLUMNS)
y_val_joint = data_val_joint[TARGET_COLUMN]


### Imputers

In [6]:
# Start from an empty pipeline; later cells add their steps to it.
full_pipline_single = Pipeline(steps=[])

In [7]:
# Per-column imputers for the high-cardinality categorical features.
# NOTE(review): NotInImputerPolars presumably keeps the `cat_no` most frequent
# categories and replaces everything else with `fill_value` — confirm in
# functions.transformers.
emp_title_imp = tr.NotInImputerPolars(cat_no=20, fill_value="other")
emp_length_imp = tr.NotInImputerPolars(cat_no=12, fill_value=None)
zip_imp = tr.NotInImputerPolars(cat_no=12, fill_value="other")

# Each step is (step name, transformer, column it is applied to).
imputer = PCT(
    [
        PCT.Step("emp_title", emp_title_imp, "emp_title"),
        PCT.Step("emp_length", emp_length_imp, "emp_length"),
        PCT.Step("zip_code", zip_imp, "zip_code"),
    ]
)

# Register the imputer so that re-running this cell is idempotent. A plain
# `steps.append(...)` adds a second "imputer" entry on every re-run, and
# scikit-learn rejects pipelines whose step names are not unique.
imputer_step = ("imputer", imputer)
existing_step_names = [step_name for step_name, _ in full_pipline_single.steps]
if "imputer" in existing_step_names:
    # Replace in place to keep the step's position in the pipeline.
    full_pipline_single.steps[existing_step_names.index("imputer")] = imputer_step
else:
    full_pipline_single.steps.append(imputer_step)

### Encoders

In [10]:
# Encoders for the categorical columns, one transformer per column.
emp_t_enc = tr.TargetMeanOrderedLabeler()

# Explicit ordering of employment length, from missing (None) to longest.
emp_l_order = [
    None,
    "< 1 year",
    "1 year",
    "2 years",
    "3 years",
    "4 years",
    "5 years",
    "6 years",
    "7 years",
    "8 years",
    "9 years",
    "10+ years",
]
emp_l_enc = tr.PolarsOrdinalEncoder(order=emp_l_order)
home_enc = tr.TargetMeanOrderedLabeler(how="label")
verif_enc = tr.TargetMeanOrderedLabeler(how="label")
paymnt_enc = tr.PolarsOneHotEncoder(drop=True)
purpose_enc = tr.TargetMeanOrderedLabeler(how="mean")
zip_enc = tr.TargetMeanOrderedLabeler(how="mean")
state_enc = tr.TargetMeanOrderedLabeler(how="mean")
init_stat_enc = tr.PolarsOneHotEncoder(drop=True)
disbursment_enc = tr.PolarsOneHotEncoder(drop=True)

# Each step is (step name, transformer, column it is applied to).
# Fix: every step previously targeted "emp_title" (copy-paste), so only that
# column was ever encoded. Each encoder is now bound to the column it was
# built for; the names match the columns of the cleaned training frame.
encoders = PCT(
    [
        PCT.Step("emp_t_enc", emp_t_enc, "emp_title"),
        PCT.Step("emp_l_enc", emp_l_enc, "emp_length"),
        PCT.Step("home_enc", home_enc, "home_ownership"),
        PCT.Step("verif_enc", verif_enc, "verification_status"),
        PCT.Step("paymnt_enc", paymnt_enc, "pymnt_plan"),
        PCT.Step("purpose_enc", purpose_enc, "purpose"),
        PCT.Step("zip_enc", zip_enc, "zip_code"),
        PCT.Step("state_enc", state_enc, "addr_state"),
        PCT.Step("init_stat_enc", init_stat_enc, "initial_list_status"),
        PCT.Step("disbursment_enc", disbursment_enc, "disbursement_method"),
    ]
)

In [15]:
# Preview the first five rows of the single-applicant training frame.
data_train_single.head(5)

loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,pymnt_plan,purpose,zip_code,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,last_credit_pull_d,last_fico_range_high,last_fico_range_low,mths_since_last_major_derog,acc_now_delinq,tot_cur_bal,…,bc_util,chargeoff_within_12_mths,delinq_amnt,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,disbursement_method,month_sin,month_cos
f64,f64,f64,f32,f64,f64,str,str,str,str,str,f64,str,str,str,str,str,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64
3600.0,3600.0,3600.0,36.0,13.99,123.03,"""C""","""C4""","""leadman""","""10+ years""","""MORTGAGE""",55000.0,"""Not Verified""","""n""","""debt_consolida…","""190xx""","""PA""",5.91,0.0,4505,675.0,679.0,1.0,30.0,,7.0,0.0,2765.0,29.7,13.0,"""w""",-1186,564.0,560.0,30.0,0.0,144904.0,…,37.2,0.0,0.0,148.0,128.0,3.0,3.0,1.0,4.0,69.0,4.0,69.0,2.0,2.0,4.0,2.0,5.0,3.0,4.0,9.0,4.0,7.0,0.0,0.0,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,"""Cash""",-2.4493e-16,1.0
24700.0,24700.0,24700.0,36.0,11.99,820.28,"""C""","""C1""","""engineer""","""10+ years""","""MORTGAGE""",65000.0,"""Not Verified""","""n""","""small_business…","""577xx""","""SD""",16.06,1.0,5844,715.0,719.0,4.0,6.0,,22.0,0.0,21470.0,19.2,38.0,"""w""",-1186,699.0,695.0,,0.0,204396.0,…,27.1,0.0,0.0,113.0,192.0,2.0,2.0,4.0,2.0,,0.0,6.0,0.0,5.0,5.0,13.0,17.0,6.0,20.0,27.0,5.0,22.0,0.0,0.0,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,"""Cash""",-2.4493e-16,1.0
35000.0,35000.0,35000.0,60.0,14.85,829.9,"""C""","""C5""","""information_sy…","""10+ years""","""MORTGAGE""",110000.0,"""Source Verifie…","""n""","""debt_consolida…","""076xx""","""NJ""",17.06,0.0,2647,785.0,789.0,0.0,,,13.0,0.0,7802.0,11.6,17.0,"""w""",-1186,679.0,675.0,,0.0,301500.0,…,12.1,0.0,0.0,36.0,87.0,2.0,2.0,1.0,2.0,,,,0.0,4.0,5.0,8.0,10.0,2.0,10.0,13.0,5.0,13.0,0.0,0.0,0.0,1.0,100.0,0.0,0.0,0.0,381215.0,52226.0,62500.0,18000.0,"""Cash""",-2.4493e-16,1.0
10400.0,10400.0,10400.0,60.0,22.45,289.91,"""F""","""F1""","""contract_speci…","""3 years""","""MORTGAGE""",104433.0,"""Source Verifie…","""n""","""major_purchase…","""174xx""","""PA""",25.37,1.0,6392,695.0,699.0,3.0,12.0,,12.0,0.0,21929.0,64.5,35.0,"""w""",-821,704.0,700.0,,0.0,331730.0,…,77.5,0.0,0.0,128.0,210.0,4.0,4.0,6.0,4.0,12.0,1.0,12.0,0.0,4.0,6.0,5.0,9.0,10.0,7.0,19.0,6.0,12.0,0.0,0.0,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,"""Cash""",-2.4493e-16,1.0
11950.0,11950.0,11950.0,36.0,13.44,405.18,"""C""","""C3""","""veterinary_tec…","""4 years""","""RENT""",34000.0,"""Source Verifie…","""n""","""debt_consolida…","""300xx""","""GA""",10.2,0.0,10288,690.0,694.0,0.0,,,5.0,0.0,8822.0,68.4,6.0,"""w""",-517,759.0,755.0,,0.0,12798.0,…,91.0,0.0,0.0,338.0,54.0,32.0,32.0,0.0,36.0,,,,0.0,2.0,3.0,2.0,2.0,2.0,4.0,4.0,3.0,5.0,0.0,0.0,0.0,0.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,"""Cash""",-2.4493e-16,1.0


In [12]:
# Frequency of each initial listing status in the single-applicant training data.
data_train_single.get_column("initial_list_status").value_counts()

initial_list_status,counts
str,u32
"""f""",647968
"""w""",1065733
