In [12]:
%reload_ext autoreload
%autoreload 1

import copy
import sys

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import shap
from IPython.display import display,Markdown
from ray import tune, train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

import functions.data_cleaning as dmf
import functions.feature_engineering as feats
import functions.plot_functions as pf
import functions.transformers as tr
import functions.tuning as tunes
from functions.transformers import PolarsColumnTransformer as PCT

%aimport functions.tuning
%aimport functions.transformers
%aimport functions.data_cleaning
%aimport functions.plot_functions
%aimport functions.feature_engineering

In [13]:
# Load the pre-split accepted-loan parquet files: train and validation splits
# for single and for joint applications (four frames in total).
data_train_single, data_val_single, data_train_joint, data_val_joint = (
    pl.read_parquet(f"data/data_accepted_{application}_{split}.parquet")
    for application in ("single", "joint")
    for split in ("train", "val")
)

In [14]:
# Clean every split with the matching helper from functions.data_cleaning.
# Each name is rebound to the cleaned frame, so re-running this cell passes
# already-cleaned data back into the helper; re-run the load cell first.
data_train_single, data_val_single = (
    dmf.clean_accepted_single(frame)
    for frame in (data_train_single, data_val_single)
)
data_train_joint, data_val_joint = (
    dmf.clean_accepted_joint(frame)
    for frame in (data_train_joint, data_val_joint)
)

### Feature Engineering

In [15]:
# Derive date-based features from the `issue_d` column for all four splits.
# NOTE(review): presumably the source of the `month_sin` / `month_cos` columns
# seen in later output; confirm in functions.feature_engineering.
data_train_single, data_val_single, data_train_joint, data_val_joint = (
    feats.date_features(frame, "issue_d")
    for frame in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [16]:
# Apply the target-labelling helper to all four splits.
# NOTE(review): presumably this maps `grade` to the integer classes 1-7 that
# show up in the classification report; confirm in functions.data_cleaning.
data_train_single, data_val_single, data_train_joint, data_val_joint = (
    dmf.label_target_grades(frame)
    for frame in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [17]:
# Columns removed from the feature matrices: the target itself (`grade`) plus
# `sub_grade`, `int_rate` and the raw `issue_d` date.
# NOTE(review): sub_grade / int_rate are presumably dropped to avoid target
# leakage; confirm with the modelling notes.
non_feature_cols = ['grade', 'sub_grade', 'int_rate', 'issue_d']

# The list is passed positionally: newer polars releases define
# `DataFrame.drop(*columns, strict=True)` and no longer accept a `columns=`
# keyword, while the positional form works on older and newer versions alike.
X_train_single = data_train_single.drop(non_feature_cols)
y_train_single = data_train_single['grade']

X_val_single = data_val_single.drop(non_feature_cols)
y_val_single = data_val_single['grade']

X_train_joint = data_train_joint.drop(non_feature_cols)
y_train_joint = data_train_joint['grade']

X_val_joint = data_val_joint.drop(non_feature_cols)
y_val_joint = data_val_joint['grade']


#### Imputers

In [18]:
# Empty preprocessing pipeline for the single-application data; the imputer
# and encoder cells register their steps on it afterwards.
preprocess_single = Pipeline(steps=[])

In [19]:
# One NotInImputerPolars per high-cardinality categorical column.
# NOTE(review): `cat_no` looks like the number of categories that are kept,
# with every other value replaced by `fill_value`; confirm in
# functions.transformers.
emp_title_imp = tr.NotInImputerPolars(cat_no=20, fill_value="other")
emp_length_imp = tr.NotInImputerPolars(cat_no=12, fill_value=None)
zip_imp = tr.NotInImputerPolars(cat_no=12, fill_value='other')

# Each step is (step name, transformer, column it is applied to).
imputers = PCT(
    [
        PCT.Step("emp_title", emp_title_imp, "emp_title"),
        PCT.Step("emp_length", emp_length_imp, "emp_length"),
        PCT.Step("zip_code", zip_imp, 'zip_code'),
    ]
)

# Register the step idempotently. A plain `steps.append` would add a second
# "imputers" entry every time this cell is re-run, and sklearn's Pipeline
# rejects duplicate step names when it is fitted. Replacing in place also keeps
# the step's position relative to any steps registered after it.
registered_step_names = [step[0] for step in preprocess_single.steps]
if "imputers" in registered_step_names:
    preprocess_single.steps[registered_step_names.index("imputers")] = ("imputers", imputers)
else:
    preprocess_single.steps.append(("imputers", imputers))

#### Encoders

In [20]:
# Encoders for the categorical columns. Three project transformers are used:
# TargetMeanOrderedLabeler, PolarsOrdinalEncoder and PolarsOneHotEncoder.
# NOTE(review): going by its name, TargetMeanOrderedLabeler orders categories
# by the mean of the target; confirm in functions.transformers.
emp_t_enc = tr.TargetMeanOrderedLabeler(how="label")

# Explicit ordering for employment length; `None` (missing) is placed first.
emp_l_order = [
    None,
    "< 1 year",
    "1 year",
    "2 years",
    "3 years",
    "4 years",
    "5 years",
    "6 years",
    "7 years",
    "8 years",
    "9 years",
    "10+ years",
]
emp_l_enc = tr.PolarsOrdinalEncoder(order=emp_l_order)
home_enc = tr.TargetMeanOrderedLabeler(how="label")
verif_enc = tr.TargetMeanOrderedLabeler(how="label")
paymnt_enc = tr.PolarsOneHotEncoder(drop=True)
purpose_enc = tr.TargetMeanOrderedLabeler(how="label")
zip_enc = tr.TargetMeanOrderedLabeler(how="label")
state_enc = tr.TargetMeanOrderedLabeler(how="label")
init_stat_enc = tr.PolarsOneHotEncoder(drop=True)
disbursement_enc = tr.PolarsOneHotEncoder(drop=True)

# Each step is (step name, transformer, column it is applied to).
encoders = PCT(
    [
        PCT.Step("emp_t_enc", emp_t_enc, "emp_title"),
        PCT.Step("emp_l_enc", emp_l_enc, "emp_length"),
        PCT.Step("home_enc", home_enc, "home_ownership"),
        PCT.Step("verif_enc", verif_enc, "verification_status"),
        PCT.Step("paymnt_enc", paymnt_enc, "pymnt_plan"),
        PCT.Step("purpose_enc", purpose_enc, "purpose"),
        PCT.Step("zip_enc", zip_enc, "zip_code"),
        PCT.Step("state_enc", state_enc, "addr_state"),
        PCT.Step("init_stat_enc", init_stat_enc, "initial_list_status"),
        PCT.Step("disbursement_enc", disbursement_enc, "disbursement_method"),
    ]
)

# Register the step idempotently. A plain `steps.append` would add a second
# "encoders" entry every time this cell is re-run, and sklearn's Pipeline
# rejects duplicate step names when it is fitted.
encoder_step_names = [step[0] for step in preprocess_single.steps]
if "encoders" in encoder_step_names:
    preprocess_single.steps[encoder_step_names.index("encoders")] = ("encoders", encoders)
else:
    preprocess_single.steps.append(("encoders", encoders))

In [21]:
# Preview the imputers on a 10,000-row sample of the training data.
# The target is attached before sampling so that features and labels come from
# one draw and are row-aligned by construction, instead of relying on two
# independent `.sample(seed=1)` calls picking the same rows.
# Note: this fits `imputers` in place on the sample; the later
# `preprocess_single.fit_transform` call fits it again on the full data.
imputer_preview = X_train_single.with_columns(y_train_single).sample(10000, seed=1)
imputers.fit_transform(
    imputer_preview.drop(y_train_single.name),
    imputer_preview[y_train_single.name],
).head()

loan_amnt,funded_amnt,funded_amnt_inv,term,installment,home_ownership,annual_inc,verification_status,pymnt_plan,purpose,addr_state,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,initial_list_status,last_credit_pull_d,last_fico_range_high,last_fico_range_low,mths_since_last_major_derog,acc_now_delinq,tot_cur_bal,open_acc_6m,open_act_il,open_il_12m,open_il_24m,mths_since_rcnt_il,total_bal_il,…,mo_sin_old_il_acct,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_bc_dlq,mths_since_recent_inq,mths_since_recent_revol_delinq,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_il_tl,num_op_rev_tl,num_rev_accts,num_rev_tl_bal_gt_0,num_sats,num_tl_120dpd_2m,num_tl_30dpd,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,disbursement_method,month_sin,month_cos,emp_title,emp_length,zip_code
f64,f64,f64,f32,f64,str,f64,str,str,str,str,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,str,str,str
20000.0,20000.0,20000.0,36.0,758.14,"""OWN""",45548.0,"""Verified""","""n""","""debt_consolida…","""NY""",30.01,3.0,8584,660.0,664.0,0.0,9.0,,14.0,0.0,14443.0,72.2,22.0,"""w""",-516,634.0,630.0,32.0,0.0,40261.0,1.0,1.0,0.0,1.0,16.0,25818.0,…,137.0,282.0,2.0,2.0,1.0,2.0,32.0,1.0,20.0,0.0,7.0,12.0,7.0,10.0,4.0,13.0,17.0,12.0,14.0,0.0,0.0,0.0,2.0,77.3,85.7,0.0,0.0,47542.0,40261.0,8700.0,27542.0,"""Cash""",-0.866025,0.5,,,"""other"""
13250.0,13250.0,13250.0,60.0,294.68,"""MORTGAGE""",115000.0,"""Not Verified""","""n""","""home_improveme…","""MA""",4.89,0.0,10318,660.0,664.0,0.0,,41.0,9.0,1.0,7917.0,56.5,42.0,"""w""",-1064,664.0,660.0,,0.0,17309.0,0.0,1.0,0.0,0.0,63.0,9392.0,…,89.0,339.0,17.0,17.0,6.0,17.0,,17.0,,0.0,3.0,5.0,3.0,15.0,4.0,8.0,32.0,5.0,8.0,0.0,0.0,0.0,0.0,100.0,66.7,1.0,0.0,30120.0,17309.0,9500.0,16120.0,"""Cash""",0.866025,-0.5,"""other""","""< 1 year""","""other"""
13300.0,13300.0,13300.0,60.0,302.55,"""MORTGAGE""",220000.0,"""Source Verifie…","""n""","""medical""","""NJ""",19.09,2.0,12145,705.0,709.0,0.0,10.0,,24.0,0.0,7422.0,60.8,45.0,"""w""",-1706,704.0,700.0,,0.0,826111.0,,,,,,,…,139.0,397.0,44.0,10.0,5.0,44.0,10.0,9.0,10.0,0.0,2.0,4.0,3.0,8.0,18.0,7.0,20.0,4.0,24.0,0.0,0.0,0.0,1.0,70.5,66.7,0.0,0.0,870190.0,267207.0,4800.0,267089.0,"""Cash""",1.2246e-16,-1.0,"""other""","""6 years""","""other"""
3900.0,3900.0,3900.0,36.0,145.34,"""RENT""",72000.0,"""Not Verified""","""n""","""debt_consolida…","""TN""",5.0,1.0,5632,680.0,684.0,1.0,9.0,,14.0,0.0,5117.0,68.2,34.0,"""f""",-882,659.0,655.0,,0.0,275744.0,,,,,,,…,143.0,185.0,27.0,26.0,2.0,27.0,9.0,2.0,9.0,0.0,2.0,4.0,2.0,4.0,24.0,5.0,8.0,4.0,14.0,0.0,0.0,0.0,0.0,92.3,100.0,0.0,0.0,256919.0,275744.0,4500.0,249419.0,"""Cash""",-0.866025,0.5,"""nurse""","""3 years""","""other"""
30000.0,30000.0,30000.0,36.0,988.86,"""RENT""",150000.0,"""Source Verifie…","""n""","""debt_consolida…","""TX""",5.65,0.0,2192,730.0,734.0,0.0,,,6.0,0.0,11382.0,40.9,8.0,"""w""",-396,679.0,675.0,,0.0,22070.0,0.0,1.0,0.0,1.0,16.0,10688.0,…,42.0,72.0,13.0,13.0,1.0,13.0,,13.0,,0.0,3.0,4.0,4.0,4.0,2.0,5.0,5.0,4.0,6.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,50536.0,22070.0,25300.0,22736.0,"""Cash""",1.0,6.1232e-17,"""other""","""7 years""","""750xx"""


In [22]:
# Multiclass LightGBM classifier with balanced class weights.
model = lgb.LGBMClassifier(
    class_weight='balanced',
    objective='multiclass',
)

In [23]:
# Boruta feature selector (project wrapper) built around the LightGBM model.
boruta = tr.BorutaFeatureSelectorPolars(
    model,
    perc=100,
    random_state=1,
)

In [None]:
# Fit the preprocessing pipeline on the training data, then run Boruta on the
# result. The selected frame is only displayed: it is not bound to a name.
X_train_single_prepared = preprocess_single.fit_transform(X_train_single, y_train_single)
boruta.fit_transform(X_train_single_prepared, y_train_single)

In [13]:
# (Re)fit the preprocessing pipeline on the full training set and train the
# classifier on its output.
X_train_single_prepared = preprocess_single.fit_transform(X_train_single, y_train_single)
model.fit(X_train_single_prepared, y_train_single)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9664
[LightGBM] [Info] Number of data points in the train set: 1199590, number of used features: 89
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910


In [14]:
# Transform the validation features with the already-fitted pipeline
# (transform only, no refit) and predict their grades.
X_val_single_prepared = preprocess_single.transform(X_val_single)
preds = model.predict(X_val_single_prepared)

In [15]:
# Per-class precision / recall / F1 on the validation split.
# classification_report returns a string here, so it is printed.
val_report = classification_report(y_val_single, preds)
print(val_report)

              precision    recall  f1-score   support

           1       0.81      0.90      0.86     87225
           2       0.81      0.78      0.79    152406
           3       0.83      0.77      0.80    151476
           4       0.72      0.73      0.72     74272
           5       0.63      0.62      0.63     34120
           6       0.48      0.61      0.54     11301
           7       0.40      0.84      0.54      3311

    accuracy                           0.78    514111
   macro avg       0.67      0.75      0.70    514111
weighted avg       0.78      0.78      0.78    514111



In [17]:
# Unaveraged F1: `average=None` yields one score per grade class.
# `f1_score` is imported in the notebook's top import cell.
print(f1_score(y_val_single, preds, average=None))

[0.85519554 0.79180167 0.79754131 0.72293842 0.62717295 0.53962027
 0.54191148]
