In [2]:
%reload_ext autoreload
%autoreload 1
import joblib
from ray import tune, train
import copy
import shap
import matplotlib.pyplot as plt
import polars as pl
import sys
import numpy as np
from sklearn.model_selection import train_test_split
import functions.transformers as tr
from functions.transformers import PolarsColumnTransformer as PCT
import functions.data_cleaning as dmf
import functions.plot_functions as pf
import functions.tuning as tunes
import functions.feature_engineering as feats
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from IPython.display import display,Markdown
import lightgbm as lgb
%aimport functions.tuning
%aimport functions.transformers
%aimport functions.data_cleaning
%aimport functions.plot_functions
%aimport functions.feature_engineering

In [3]:
# Read the four accepted-loan parquet files (single/joint x train/val) in one
# tuple assignment; files are read in the same order as they are listed.
(
    data_train_single,
    data_val_single,
    data_train_joint,
    data_val_joint,
) = (
    pl.read_parquet('data/data_accepted_single_train.parquet'),
    pl.read_parquet('data/data_accepted_single_val.parquet'),
    pl.read_parquet('data/data_accepted_joint_train.parquet'),
    pl.read_parquet('data/data_accepted_joint_val.parquet'),
)

In [4]:
# Apply the project cleaning routines from functions.data_cleaning.
# NOTE: each name is rebound to its cleaned frame, so re-running this cell
# passes already-cleaned frames through the cleaners again.
data_train_single, data_val_single = (
    dmf.clean_accepted_single(frame)
    for frame in (data_train_single, data_val_single)
)
data_train_joint, data_val_joint = (
    dmf.clean_accepted_joint(frame)
    for frame in (data_train_joint, data_val_joint)
)

### Feature Engineering

In [5]:
# Derive date features from the 'issue_d' column for every split.
# NOTE: the frames are rebound in place; re-running feeds the already
# augmented frames back into feats.date_features.
data_train_single, data_val_single, data_train_joint, data_val_joint = (
    feats.date_features(frame, 'issue_d')
    for frame in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [6]:
# Run dmf.label_target_grades on every split (the classification reports
# further down show the 'grade' target as integer classes 1-7).
# NOTE: names are rebound in place, so this cell is not safe to re-run blindly.
data_train_single, data_val_single, data_train_joint, data_val_joint = (
    dmf.label_target_grades(frame)
    for frame in (
        data_train_single,
        data_val_single,
        data_train_joint,
        data_val_joint,
    )
)

In [7]:
# Target column predicted by the models in this notebook.
TARGET_COL = 'grade'
# Columns removed from every feature matrix: the target itself plus
# 'sub_grade', 'int_rate' and 'issue_d'.
# NOTE(review): presumably sub_grade / int_rate are excluded because they would
# leak the grade — confirm with the data dictionary.
NON_FEATURE_COLS = ['grade', 'sub_grade', 'int_rate', 'issue_d']

# The column list is passed positionally: this works both with polars versions
# whose first parameter is named `columns` and with newer ones that take
# `*columns`, whereas the `columns=` keyword is rejected by the latter.
X_train_single = data_train_single.drop(NON_FEATURE_COLS)
y_train_single = data_train_single[TARGET_COL]

X_val_single = data_val_single.drop(NON_FEATURE_COLS)
y_val_single = data_val_single[TARGET_COL]

X_train_joint = data_train_joint.drop(NON_FEATURE_COLS)
y_train_joint = data_train_joint[TARGET_COL]

X_val_joint = data_val_joint.drop(NON_FEATURE_COLS)
y_val_joint = data_val_joint[TARGET_COL]


#### Imputers

In [8]:
# Start from an empty sklearn Pipeline; later cells append their steps to
# `preprocess_single.steps` one at a time.
preprocess_single = Pipeline(steps=[])

In [9]:
# Per-column imputers built on the project's NotInImputerPolars transformer.
# The meaning of cat_no / fill_value is defined in functions.transformers.
emp_title_imp = tr.NotInImputerPolars(fill_value="other", cat_no=20)
emp_length_imp = tr.NotInImputerPolars(fill_value=None, cat_no=12)
zip_imp = tr.NotInImputerPolars(fill_value='other', cat_no=12)

# Each step is named after the column it is applied to.
imputers = PCT(
    [
        PCT.Step(column, imputer, column)
        for column, imputer in (
            ("emp_title", emp_title_imp),
            ("emp_length", emp_length_imp),
            ("zip_code", zip_imp),
        )
    ]
)

# NOTE: append mutates the shared pipeline; re-running this cell adds a second
# "imputers" step.
preprocess_single.steps.append(("imputers", imputers))

### Encoders

In [10]:
# Category order handed to the ordinal encoder for emp_length; None is listed
# first, followed by the tenure buckets in increasing order.
emp_l_order = [
    None,
    "< 1 year",
    "1 year",
    "2 years",
    "3 years",
    "4 years",
    "5 years",
    "6 years",
    "7 years",
    "8 years",
    "9 years",
    "10+ years",
]
emp_l_enc = tr.PolarsOrdinalEncoder(order=emp_l_order)

# One independent TargetMeanOrderedLabeler instance per column.
emp_t_enc, home_enc, verif_enc, purpose_enc, zip_enc, state_enc = (
    tr.TargetMeanOrderedLabeler(how="label") for _ in range(6)
)

# One independent one-hot encoder (drop=True) per column.
paymnt_enc, init_stat_enc, disbursement_enc = (
    tr.PolarsOneHotEncoder(drop=True) for _ in range(3)
)

# (step name, transformer, column) triples, in the order the steps are applied.
encoders = PCT(
    [
        PCT.Step(step_name, encoder, column)
        for step_name, encoder, column in (
            ("emp_t_enc", emp_t_enc, "emp_title"),
            ("emp_l_enc", emp_l_enc, "emp_length"),
            ("home_enc", home_enc, "home_ownership"),
            ("verif_enc", verif_enc, "verification_status"),
            ("paymnt_enc", paymnt_enc, "pymnt_plan"),
            ("purpose_enc", purpose_enc, "purpose"),
            ("zip_enc", zip_enc, "zip_code"),
            ("state_enc", state_enc, "addr_state"),
            ("init_stat_enc", init_stat_enc, "initial_list_status"),
            ("disbursement_enc", disbursement_enc, "disbursement_method"),
        )
    ]
)

# NOTE: both appends mutate the shared pipeline; re-running this cell
# duplicates the "encoders" and "null_imputer" steps.
preprocess_single.steps.append(("encoders", encoders))

# Null imputer configured with the sentinel fill value -9.
null_imp = tr.PolarsNullImputer(fill_value=-9)
preprocess_single.steps.append(("null_imputer", null_imp))

In [11]:
# LightGBM multiclass classifier used both for Boruta feature selection and as
# the final estimator of the single-application pipeline.
model_single = lgb.LGBMClassifier(
    objective='multiclass',
    class_weight='balanced',
    verbose=-1,
    force_col_wise=True,
)

##### Building the search space of column sets to drop:

In [12]:
# Boruta is run on one fixed, seeded subsample of the training data.
BORUTA_SAMPLE_SIZE = 50_000
BORUTA_SEED = 1

# X and y are sampled with the same size and seed so their rows line up.
# NOTE(review): this relies on DataFrame.sample and Series.sample selecting the
# same rows for identical n/seed — confirm, or sample the joined frame instead.
transformed_sample_y = y_train_single.sample(BORUTA_SAMPLE_SIZE, seed=BORUTA_SEED)
transformed_sample_X = preprocess_single.fit_transform(
    X_train_single.sample(BORUTA_SAMPLE_SIZE, seed=BORUTA_SEED),
    transformed_sample_y,
)

# All columns produced by the preprocessing pipeline. Taken from the same
# transformed frame Boruta is fitted on, so the "selected" and "original"
# column sets are always directly comparable.
original_columns = transformed_sample_X.columns

# Maps each Boruta `perc` setting (10, 20, ..., 90) to the columns it keeps.
boruta_selected_cols = {}
for perc in np.arange(stop=100, start=10, step=10):
    boruta = tr.BorutaFeatureSelectorPolars(model_single, random_state=1, perc=perc)
    boruta.fit(transformed_sample_X, transformed_sample_y)
    # Reuse the already transformed sample instead of re-fitting the whole
    # preprocessing pipeline on every iteration just to read column names.
    boruta_selected_cols[perc] = boruta.transform(transformed_sample_X).columns

In [23]:
# Turn each Boruta selection into its complement: the columns to drop.
cols_to_drop_space = []
for selected in boruta_selected_cols.values():
    print(len(selected))
    kept = set(selected)
    cols_to_drop_space.append(
        [column for column in original_columns if column not in kept]
    )

# Also offer the "drop nothing" option to the tuner.
cols_to_drop_space.append([])

# Persist the candidate drop lists; the 'temp/' directory must already exist.
joblib.dump(cols_to_drop_space, 'temp/cols_to_drop.joblib')

75
67
57
50
45
40
33
27
26


['temp/cols_to_drop.joblib']

##### Dropper

In [14]:
# Quick look at the category frequencies of home_ownership in the training set.
data_train_single.get_column('home_ownership').value_counts()

home_ownership,counts
str,u32
"""OWN""",139847
"""OTHER""",345
"""MORTGAGE""",648733
"""RENT""",523622


In [15]:
# The remover starts with an empty list; the tuning search space sets the list
# through the `preprocess__feature_dropper__feats_to_drop` parameter.
feature_dropper = tr.FeatureRemover([])

# NOTE: append mutates the shared pipeline; re-running this cell adds a second
# "feature_dropper" step.
preprocess_single.steps.append(('feature_dropper', feature_dropper))

# Preprocessing followed by the LightGBM classifier.
full_pipeline_single = Pipeline(
    steps=[
        ("preprocess", preprocess_single),
        ("model", model_single),
    ]
)

In [21]:
# Baseline: fit the untuned pipeline on the training split before scoring it.
# No other cell in the notebook fits `full_pipeline_single`, so without this
# line the predict call below only works when a fit is left over in kernel
# state and fails (or silently uses stale state) on Restart & Run All.
full_pipeline_single.fit(X_train_single, y_train_single)

preds = full_pipeline_single.predict(X_val_single)

# Per-class precision/recall/F1 on the validation split, followed by the
# macro-averaged F1 (the same averaging is passed to the tuner later on).
print(classification_report(y_val_single, preds))
print(f1_score(y_val_single, preds, average='macro'))

              precision    recall  f1-score   support

           1       0.57      0.89      0.69     72529
           2       0.74      0.64      0.69    121365
           3       0.83      0.68      0.75    130148
           4       0.58      0.55      0.57     50567
           5       0.28      0.17      0.21     17982
           6       0.03      0.04      0.03      5509
           7       0.25      0.78      0.37      3054

    accuracy                           0.66    401154
   macro avg       0.47      0.53      0.47    401154
weighted avg       0.68      0.66      0.66    401154

0.4727289392724257


In [16]:
# Ray Tune search space. Keys use sklearn's "<step>__<param>" addressing into
# full_pipeline_single ("preprocess" / "model" steps).
search_space_lgbm = {
    # Which Boruta-derived set of columns to drop (including "drop nothing").
    "preprocess__feature_dropper__feats_to_drop": tune.choice(cols_to_drop_space),
    # LightGBM hyperparameters.
    "model__num_leaves": tune.randint(10, 250),
    "model__n_estimators": tune.randint(10, 250),
    "model__learning_rate": tune.loguniform(0.001, 0.1),
    "model__bagging_freq": tune.randint(0, 10),
    "model__colsample_bytree": tune.uniform(0.2, 1.0),
    "model__subsample": tune.uniform(0.2, 1.0),
    "model__reg_alpha": tune.loguniform(0.001, 100),
    "model__reg_lambda": tune.loguniform(0.001, 100),
    "model__boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "model__class_weight": tune.choice(["balanced", None]),
    "model__max_bin": tune.randint(5, 200),
}

# Register the pipeline and its search space with the project's tuning helper;
# the meaning of metric_threshold is defined in functions.tuning.
models = tunes.Models()
models.add_model(
    "lgbm_grade_single",
    full_pipeline_single,
    search_space_lgbm,
    metric_threshold=0.4,
)

In [24]:
# Tune every registered model, passing the single-application train/validation
# splits and the run settings (n, sample_size, average) to functions.tuning.
models.tune_all(
    X_train_single,
    y_train_single,
    X_val_single,
    y_val_single,
    n=20,
    sample_size=10000,
    average='macro',
)

0,1
Current time:,2023-10-12 12:21:31
Running for:,00:04:00.65
Memory:,12.0/31.2 GiB

Trial name,status,loc,model__bagging_freq,model__boosting_type,model__class_weight,model__colsample_byt ree,model__learning_rate,model__max_bin,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__subsample,...ocess__feature_dr opper__feats_to_drop,iter,total time (s),score
Trainable_e9183ce3,TERMINATED,192.168.0.103:19500,7,gbdt,balanced,0.645035,0.0111429,192,134,28,82.2006,0.00909809,0.336229,['home_ownershi_7380,3,0.684761,0.162838
Trainable_48ce93b1,TERMINATED,192.168.0.103:19545,0,rf,balanced,0.54535,0.0381468,185,90,170,0.884491,0.00106831,0.763687,"['delinq_2yrs',_9e40",3,5.71969,0.354988
Trainable_1cac6658,TERMINATED,192.168.0.103:19601,1,rf,balanced,0.928115,0.0538549,98,230,194,0.00193928,35.7895,0.430891,['home_ownershi_0380,3,3.53596,0.275327
Trainable_0c52134e,TERMINATED,192.168.0.103:19655,7,gbdt,balanced,0.618703,0.0556487,129,218,21,4.5849,0.0170802,0.78594,['home_ownershi_11c0,3,3.94585,0.406647
Trainable_1dc2cb7d,TERMINATED,192.168.0.103:19707,7,gbdt,balanced,0.983048,0.00261553,122,235,105,0.252589,17.8163,0.786424,['home_ownershi_a800,3,7.13656,0.307921
Trainable_84e5af11,TERMINATED,192.168.0.103:19802,3,rf,,0.220323,0.052315,126,11,84,19.1544,0.00249373,0.853591,['home_ownershi_2c80,3,0.314125,0.206669
Trainable_d3c11957,TERMINATED,192.168.0.103:19862,4,dart,balanced,0.267598,0.00815006,94,76,62,0.00692338,1.10084,0.403924,['home_ownershi_7600,3,2.04868,0.296038
Trainable_3c3b0133,TERMINATED,192.168.0.103:19915,1,gbdt,,0.649104,0.00108699,169,109,19,47.9802,0.00464048,0.550626,['home_ownershi_bc00,3,1.06332,0.118032
Trainable_eb8f406a,TERMINATED,192.168.0.103:19979,1,dart,,0.378812,0.0245077,87,195,141,12.1218,0.520988,0.38492,['home_ownershi_3480,3,4.12599,0.235042
Trainable_09c70bda,TERMINATED,192.168.0.103:20031,4,dart,balanced,0.795956,0.0230489,106,10,245,0.048353,0.0168488,0.514953,['home_ownershi_4b40,3,0.701465,0.327085


[2m[36m(Trainable pid=19500)[0m Step 0 F-1 Score: 0.16160797819441686
[2m[36m(Trainable pid=19500)[0m Step 1 F-1 Score: 0.19385723441207228
[2m[36m(Trainable pid=19500)[0m Step 2 F-1 Score: 0.16283795408468626
[2m[36m(Trainable pid=19545)[0m Step 0 F-1 Score: 0.3643678607135766
[2m[36m(Trainable pid=19545)[0m Step 1 F-1 Score: 0.371014149583794
[2m[36m(Trainable pid=19545)[0m Step 2 F-1 Score: 0.35498780413858205


[2m[36m(Trainable pid=19601)[0m Trainable.setup took 10.115 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=19601)[0m Step 0 F-1 Score: 0.27825031316164067
[2m[36m(Trainable pid=19601)[0m Step 1 F-1 Score: 0.28565060095901484
[2m[36m(Trainable pid=19601)[0m Step 2 F-1 Score: 0.2753270631114353
[2m[36m(Trainable pid=19655)[0m Step 0 F-1 Score: 0.40289077678216284
[2m[36m(Trainable pid=19655)[0m Step 1 F-1 Score: 0.39200295696535786
[2m[36m(Trainable pid=19655)[0m Step 2 F-1 Score: 0.4066474262070133


[2m[36m(Trainable pid=19707)[0m Trainable.setup took 10.635 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=19707)[0m Step 0 F-1 Score: 0.32004979024299784
[2m[36m(Trainable pid=19707)[0m Step 1 F-1 Score: 0.32192491716036137
[2m[36m(Trainable pid=19707)[0m Step 2 F-1 Score: 0.3079205639655122
[2m[36m(Trainable pid=19802)[0m Step 0 F-1 Score: 0.21464033102564642
[2m[36m(Trainable pid=19802)[0m Step 1 F-1 Score: 0.21954858344984549
[2m[36m(Trainable pid=19802)[0m Step 2 F-1 Score: 0.20666883714118997


[2m[36m(Trainable pid=19862)[0m Trainable.setup took 10.027 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=19862)[0m Step 0 F-1 Score: 0.30560920130656094
[2m[36m(Trainable pid=19862)[0m Step 1 F-1 Score: 0.3163896928666006
[2m[36m(Trainable pid=19862)[0m Step 2 F-1 Score: 0.2960376942800328


[2m[36m(Trainable pid=19915)[0m Trainable.setup took 10.152 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=19915)[0m Step 0 F-1 Score: 0.11142397835144734
[2m[36m(Trainable pid=19915)[0m Step 1 F-1 Score: 0.114646168201794
[2m[36m(Trainable pid=19915)[0m Step 2 F-1 Score: 0.11803170935744732


[2m[36m(Trainable pid=19979)[0m Trainable.setup took 10.224 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=19979)[0m Step 0 F-1 Score: 0.24313949166091114
[2m[36m(Trainable pid=19979)[0m Step 1 F-1 Score: 0.2356367648572089
[2m[36m(Trainable pid=19979)[0m Step 2 F-1 Score: 0.2350420646266366


[2m[36m(Trainable pid=20031)[0m Trainable.setup took 10.108 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20031)[0m Step 0 F-1 Score: 0.3040643322441361
[2m[36m(Trainable pid=20031)[0m Step 1 F-1 Score: 0.3259734014111011
[2m[36m(Trainable pid=20031)[0m Step 2 F-1 Score: 0.32708511257039957


[2m[36m(Trainable pid=20092)[0m Trainable.setup took 10.048 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20092)[0m Step 0 F-1 Score: 0.32159221695701995
[2m[36m(Trainable pid=20092)[0m Step 1 F-1 Score: 0.3180190058675227
[2m[36m(Trainable pid=20092)[0m Step 2 F-1 Score: 0.3466106775937181


[2m[36m(Trainable pid=20152)[0m Trainable.setup took 10.303 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20152)[0m Step 0 F-1 Score: 0.3099598880031735
[2m[36m(Trainable pid=20152)[0m Step 1 F-1 Score: 0.33118986038088544
[2m[36m(Trainable pid=20152)[0m Step 2 F-1 Score: 0.3502403099304881
[2m[36m(Trainable pid=20205)[0m Step 0 F-1 Score: 0.33848063262183753
[2m[36m(Trainable pid=20205)[0m Step 1 F-1 Score: 0.3469062347210686
[2m[36m(Trainable pid=20205)[0m Step 2 F-1 Score: 0.3460589601973972


[2m[36m(Trainable pid=20319)[0m Trainable.setup took 10.067 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20319)[0m Step 0 F-1 Score: 0.33634850359082497
[2m[36m(Trainable pid=20319)[0m Step 1 F-1 Score: 0.338726439891441
[2m[36m(Trainable pid=20319)[0m Step 2 F-1 Score: 0.3323433397189505


[2m[36m(Trainable pid=20372)[0m Trainable.setup took 10.189 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20372)[0m Step 0 F-1 Score: 0.3465238090306429
[2m[36m(Trainable pid=20372)[0m Step 1 F-1 Score: 0.3567269882965193
[2m[36m(Trainable pid=20372)[0m Step 2 F-1 Score: 0.34856368726365716


[2m[36m(Trainable pid=20425)[0m Trainable.setup took 10.254 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20425)[0m Step 0 F-1 Score: 0.29008576265287267
[2m[36m(Trainable pid=20425)[0m Step 1 F-1 Score: 0.310666421914931
[2m[36m(Trainable pid=20425)[0m Step 2 F-1 Score: 0.3144851836434609
[2m[36m(Trainable pid=20483)[0m Step 0 F-1 Score: 0.3234068229334325
[2m[36m(Trainable pid=20483)[0m Step 1 F-1 Score: 0.3122260467253711
[2m[36m(Trainable pid=20483)[0m Step 2 F-1 Score: 0.31707902567012525
[2m[36m(Trainable pid=20542)[0m Step 0 F-1 Score: 0.39537119032890533
[2m[36m(Trainable pid=20542)[0m Step 1 F-1 Score: 0.39012866679544794
[2m[36m(Trainable pid=20542)[0m Step 2 F-1 Score: 0.3892699212134033
[2m[36m(Trainable pid=20594)[0m Step 0 F-1 Score: 0.3053622763210117
[2m[36m(Trainable pid=20594)[0m Step 1 F-1 Score: 0.30794718239788116
[2m[36m(Trainable pid=20594)[0m Step 2 F-1 Score: 0.31036457212766233


[2m[36m(Trainable pid=20648)[0m Trainable.setup took 10.040 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


[2m[36m(Trainable pid=20648)[0m Step 0 F-1 Score: 0.3182956886944798
[2m[36m(Trainable pid=20648)[0m Step 1 F-1 Score: 0.3312540667873666


2023-10-12 12:21:31,727	INFO tune.py:1143 -- Total run time: 240.66 seconds (240.60 seconds for the tuning loop).


[2m[36m(Trainable pid=20648)[0m Step 2 F-1 Score: 0.3503459583137092
lgbm_grade_single tuned.
