In [1]:
%reload_ext autoreload
%autoreload 1
import copy
import sys

import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import polars as pl
import seaborn as sns
import shap
from IPython.display import display,Markdown
from ray import tune, train
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

import functions.transformers as tr
from functions.transformers import PolarsColumnTransformer as PCT
import functions.data_cleaning as dmf
import functions.plot_functions as pf
import functions.tuning as tunes
import functions.feature_engineering as feats
%aimport functions.tuning
%aimport functions.transformers
%aimport functions.data_cleaning
%aimport functions.plot_functions
%aimport functions.feature_engineering

In [2]:
# Read the four pre-split parquet files: train/validation for single
# applications, then train/validation for joint applications.
parquet_paths = (
    'data/data_accepted_single_train.parquet',
    'data/data_accepted_single_val.parquet',
    'data/data_accepted_joint_train.parquet',
    'data/data_accepted_joint_val.parquet',
)
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = (pl.read_parquet(path) for path in parquet_paths)

In [3]:
# Run the project cleaning routines on every split. Single and joint
# applications each have their own cleaner in functions.data_cleaning.
data_train_single, data_val_single = (
    dmf.clean_accepted_single(split)
    for split in (data_train_single, data_val_single)
)
data_train_joint, data_val_joint = (
    dmf.clean_accepted_joint(split)
    for split in (data_train_joint, data_val_joint)
)

### Feature Engineering

In [4]:
# Apply feats.date_features to the 'issue_d' column of each split
# (presumably derives calendar features from the issue date; see
# functions.feature_engineering to confirm).
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = (
    feats.date_features(split, 'issue_d')
    for split in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [5]:
# Pass every split through dmf.label_target_grades (presumably encodes the
# 'grade' target as integer labels; the classification report further down
# shows classes 1-7; confirm in functions.data_cleaning).
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = (
    dmf.label_target_grades(split)
    for split in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [6]:
def _split_features_target(frame):
    """Split a prepared loan frame into (features, target).

    The features are the frame without 'grade', 'sub_grade', 'int_rate' and
    'issue_d'; the target is the 'grade' column.
    """
    # NOTE(review): sub_grade and int_rate are presumably dropped because they
    # leak the grade; confirm.
    features = frame.drop(columns=['grade', 'sub_grade', 'int_rate', 'issue_d'])
    target = frame['grade']
    return features, target


X_train_single, y_train_single = _split_features_target(data_train_single)
X_val_single, y_val_single = _split_features_target(data_val_single)

X_train_joint, y_train_joint = _split_features_target(data_train_joint)
X_val_joint, y_val_joint = _split_features_target(data_val_joint)


#### Imputers

In [7]:
# Start with an empty preprocessing pipeline; the following cells append
# their steps to `preprocess_single.steps` in order.
preprocess_single = Pipeline(steps=[])

In [8]:
# One NotInImputerPolars per high-cardinality column.
# NOTE(review): presumably `cat_no` is the number of categories kept and
# `fill_value` replaces everything else; confirm in functions.transformers.
emp_title_imp = tr.NotInImputerPolars(fill_value="other", cat_no=20)
emp_length_imp = tr.NotInImputerPolars(fill_value=None, cat_no=12)
zip_imp = tr.NotInImputerPolars(fill_value='other', cat_no=12)

# Each step is (step name, transformer, column it is applied to).
imputer_steps = [
    PCT.Step("emp_title", emp_title_imp, "emp_title"),
    PCT.Step("emp_length", emp_length_imp, "emp_length"),
    PCT.Step("zip_code", zip_imp, 'zip_code'),
]
imputers = PCT(imputer_steps)

preprocess_single.steps.append(("imputers", imputers))

#### Encoders

In [9]:
# Explicit category order for employment length; missing values (None) come first.
emp_l_order = [
    None,
    "< 1 year",
    "1 year",
    "2 years",
    "3 years",
    "4 years",
    "5 years",
    "6 years",
    "7 years",
    "8 years",
    "9 years",
    "10+ years",
]
emp_l_enc = tr.PolarsOrdinalEncoder(order=emp_l_order)

# Separate TargetMeanOrderedLabeler instance for each of these columns.
emp_t_enc, home_enc, verif_enc, purpose_enc, zip_enc, state_enc = (
    tr.TargetMeanOrderedLabeler(how="label") for _ in range(6)
)

# Separate one-hot encoder instance for each of these columns.
paymnt_enc, init_stat_enc, disbursement_enc = (
    tr.PolarsOneHotEncoder(drop=True) for _ in range(3)
)

# (step name, encoder, column) triples, in the order they are applied.
encoder_steps = [
    PCT.Step(step_name, encoder, column)
    for step_name, encoder, column in (
        ("emp_t_enc", emp_t_enc, "emp_title"),
        ("emp_l_enc", emp_l_enc, "emp_length"),
        ("home_enc", home_enc, "home_ownership"),
        ("verif_enc", verif_enc, "verification_status"),
        ("paymnt_enc", paymnt_enc, "pymnt_plan"),
        ("purpose_enc", purpose_enc, "purpose"),
        ("zip_enc", zip_enc, "zip_code"),
        ("state_enc", state_enc, "addr_state"),
        ("init_stat_enc", init_stat_enc, "initial_list_status"),
        ("disbursement_enc", disbursement_enc, "disbursement_method"),
    )
]
encoders = PCT(encoder_steps)
preprocess_single.steps.append(("encoders", encoders))

# Final preprocessing step: PolarsNullImputer with the sentinel fill value -9.
null_imp = tr.PolarsNullImputer(fill_value=-9)
preprocess_single.steps.append(("null_imputer", null_imp))

In [10]:
# Multiclass LightGBM classifier with balanced class weights.
# verbose=True keeps LightGBM's info-level training log in the cell output.
model = lgb.LGBMClassifier(
    objective='multiclass',
    class_weight='balanced',
    verbose=True,
)

In [11]:
# Boruta feature selector wrapped around the LightGBM model.
boruta = tr.BorutaFeatureSelectorPolars(model, perc=100, random_state=1)

# Column names produced by the preprocessing pipeline, taken from a
# 10,000-row sample of the training data.
# NOTE(review): X and y are sampled independently; row alignment relies on
# both calls drawing the same rows for the same seed. Confirm.
boruta_X_sample = X_train_single.sample(10000, seed=1)
boruta_y_sample = y_train_single.sample(10000, seed=1)
original_columns = preprocess_single.fit_transform(
    boruta_X_sample, boruta_y_sample
).columns

In [12]:
# Fit the Boruta selector on a preprocessed 10,000-row sample of the
# training data (same seed for X and y).
# NOTE(review): X and y are sampled independently; row alignment relies on
# both calls drawing the same rows for the same seed. Confirm.
boruta_X_sample = X_train_single.sample(10000, seed=1)
boruta_y_sample = y_train_single.sample(10000, seed=1)
boruta.fit(
    preprocess_single.fit_transform(boruta_X_sample, boruta_y_sample),
    boruta_y_sample,
)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16864
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 176
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 16864
[LightGBM] [Info] Number of data points in the train set: 10000, number of used features: 176
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Star

In [17]:
# NOTE(review): this cell fails with AttributeError. `boruta.estimator` is the
# wrapped LGBMClassifier, which has no `ranking_` attribute. The feature
# ranking presumably lives on the fitted Boruta selector itself. TODO: confirm
# which attribute of tr.BorutaFeatureSelectorPolars exposes it and point this
# cell there, or delete the cell so the notebook survives Restart & Run All.
boruta.estimator.ranking_

AttributeError: 'LGBMClassifier' object has no attribute 'ranking_'

In [13]:
# Re-fit the preprocessing pipeline on the same 10,000-row sample and pass
# the result through the fitted Boruta selector to preview its output.
boruta_X_sample = X_train_single.sample(10000, seed=1)
boruta_y_sample = y_train_single.sample(10000, seed=1)
boruta_preview_input = preprocess_single.fit_transform(
    boruta_X_sample, boruta_y_sample
)
boruta.transform(boruta_preview_input)

loan_amnt,funded_amnt_inv,term,installment,annual_inc,purpose,dti,fico_range_low,last_credit_pull_d,last_fico_range_high,bc_open_to_buy
f64,f64,f32,f64,f64,i64,f64,f64,i64,f64,f64
20000.0,20000.0,36.0,758.14,45548.0,8,30.01,660.0,-516,634.0,1941.0
13250.0,13250.0,60.0,294.68,115000.0,5,4.89,660.0,-1064,664.0,2577.0
13300.0,13300.0,60.0,302.55,220000.0,7,19.09,705.0,-1706,704.0,803.0
3900.0,3900.0,36.0,145.34,72000.0,8,5.0,680.0,-882,659.0,226.0
30000.0,30000.0,36.0,988.86,150000.0,8,5.65,730.0,-396,679.0,15231.0
5200.0,5200.0,36.0,183.57,84000.0,8,24.36,700.0,-1704,689.0,2404.0
20000.0,20000.0,36.0,616.54,115000.0,5,17.44,740.0,-1186,804.0,35890.0
19125.0,19125.0,36.0,582.52,60000.0,8,20.7,785.0,-485,814.0,66354.0
24000.0,24000.0,36.0,867.54,150000.0,8,24.66,670.0,-702,729.0,251.0
10000.0,10000.0,36.0,322.63,31000.0,1,28.84,665.0,-1249,699.0,2209.0


In [13]:
# Fit the preprocessing pipeline and the classifier on the full
# single-application training set.
# NOTE(review): the Boruta selection is not applied here; the model is
# trained on every preprocessed column.
X_train_single_prepared = preprocess_single.fit_transform(
    X_train_single, y_train_single
)
model.fit(X_train_single_prepared, y_train_single)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9664
[LightGBM] [Info] Number of data points in the train set: 1199590, number of used features: 89
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910
[LightGBM] [Info] Start training from score -1.945910


In [14]:
# Transform the validation features with the already-fitted pipeline
# (transform only, no refit), then predict grades.
X_val_single_prepared = preprocess_single.transform(X_val_single)
preds = model.predict(X_val_single_prepared)

In [15]:
# Per-class precision/recall/F1 on the single-application validation set.
# print() is needed because the report is a multi-line string.
val_report = classification_report(y_val_single, preds)
print(val_report)

              precision    recall  f1-score   support

           1       0.81      0.90      0.86     87225
           2       0.81      0.78      0.79    152406
           3       0.83      0.77      0.80    151476
           4       0.72      0.73      0.72     74272
           5       0.63      0.62      0.63     34120
           6       0.48      0.61      0.54     11301
           7       0.40      0.84      0.54      3311

    accuracy                           0.78    514111
   macro avg       0.67      0.75      0.70    514111
weighted avg       0.78      0.78      0.78    514111



In [17]:
# Unaveraged F1 score for each grade class on the validation set
# (average=None returns one score per class).
# f1_score is imported with the other sklearn.metrics names in the top import cell.
print(f1_score(y_val_single, preds, average=None))

[0.85519554 0.79180167 0.79754131 0.72293842 0.62717295 0.53962027
 0.54191148]
