In [58]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from lightgbm import LGBMClassifier
from ray import tune
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

import auxiliary.transformers as tr
import auxiliary.tuning as tunes
from auxiliary.transformers import PolarsColumnTransformer as PCT
%aimport auxiliary.transformers
%aimport auxiliary.tuning

In [59]:
# Load the filtered application table, then split it into the feature frame
# and the label column. The identifier and the label are both excluded from
# the features.
id_and_target = ['SK_ID_CURR', 'TARGET']
train_data = pl.read_parquet('temp/application_train_filtered.parquet')
y_train = train_data.get_column('TARGET')
X_train = train_data.drop(id_and_target)

In [60]:
# A column counts as "boolean" when it holds exactly two distinct values once
# nulls are ignored, regardless of its dtype.
bool_features = [
    column
    for column in X_train.columns
    if train_data[column].drop_nulls().n_unique() == 2
]

# Every remaining string-typed column is treated as a categorical feature.
string_columns = X_train.select(pl.col(pl.Utf8)).columns
cat_features = [
    column for column in string_columns if column not in bool_features
]

In [61]:
# for feature in cat_features:
#     display(X_train[feature].value_counts())

In [62]:
# Build the per-column encoder first, then wrap it in the preprocessing
# pipeline as its single 'encoders' step.
encoders = tr.PolarsColumnTransformer([])

# Two-valued columns go through the one-hot encoder.
# NOTE(review): drop=True presumably removes one redundant indicator column —
# confirm against PolarsOneHotEncoder.
for feature in bool_features:
    one_hot = tr.PolarsOneHotEncoder(drop=True)
    encoders.steps[feature] = PCT.Step(feature, one_hot, feature)

# Other string columns go through TargetMeanOrderedLabeler in 'label' mode.
for feature in cat_features:
    labeler = tr.TargetMeanOrderedLabeler(how='label')
    encoders.steps[feature] = PCT.Step(feature, labeler, feature)

preprocessing = Pipeline([('encoders', encoders)])

In [63]:
# LightGBM classifier restricted to one thread with logging silenced.
# NOTE(review): n_jobs=1 is presumably chosen so parallel tuning trials do not
# oversubscribe the CPU — confirm.
model_lgb = LGBMClassifier(
    n_jobs=1,
    verbosity=-1,
    force_col_wise=True,
)

# End-to-end estimator: preprocessing followed by the classifier.
full_pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('model', model_lgb),
    ]
)

In [64]:
# NOTE(review): `scores` is not assigned in any earlier cell shown in this
# notebook; it is first built by the cross-validation cell (In [69]) further
# down. On a fresh kernel this cell would therefore raise NameError, and the
# values displayed below come from an earlier execution — TODO confirm which
# pipeline produced them.
scores

[0.772704068028358,
 0.7740375742673732,
 0.7676323685114121,
 0.7766943413310012,
 0.7756533490237596]

In [65]:
# Container from the project's tuning module that models are registered on.
models = tunes.Models()

# Ray Tune sampling distributions for the LightGBM step. Keys follow the
# scikit-learn `<step>__<param>` naming for the pipeline step called 'model'.
# tune.randint draws integers with an exclusive upper bound.
search_space_lgbm = {
    # --- tree shape / ensemble size ---
    "model__max_depth": tune.randint(5, 50),  # 5..49
    "model__num_leaves": tune.randint(10, 3051),  # 10..3050
    "model__n_estimators": tune.randint(10, 251),  # 10..250
    "model__learning_rate": tune.loguniform(0.001, 0.1),
    # --- row / column subsampling ---
    "model__bagging_freq": tune.randint(0, 11),  # 0..10
    "model__colsample_bytree": tune.uniform(0.2, 1.0),
    "model__subsample": tune.uniform(0.2, 1.0),
    # --- regularisation ---
    "model__reg_alpha": tune.loguniform(0.001, 100),
    "model__reg_lambda": tune.loguniform(0.001, 100),
    # --- categorical choices ---
    "model__boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "model__class_weight": tune.choice(["balanced", None]),
    # --- histogram resolution ---
    "model__max_bin": tune.randint(5, 201),  # 5..200
}

# Register the full pipeline and its search space under one name.
models.add_model("lgbm_grade_single", full_pipeline, search_space_lgbm)


In [66]:
# Launch tuning for the registered model(s), optimising ROC AUC.
# NOTE(review): sample_size presumably caps the number of rows used per trial —
# confirm against auxiliary.tuning.
models.tune_all(
    X_train,
    y_train,
    sample_size=10000,
    metric='roc_auc',
)

0,1
Current time:,2023-10-30 14:21:01
Running for:,00:09:20.39
Memory:,21.4/31.2 GiB

Trial name,status,loc,model__bagging_freq,model__boosting_type,model__class_weight,model__colsample_byt ree,model__learning_rate,model__max_bin,model__max_depth,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__subsample,iter,total time (s),score
TrainableCV_d612d46a,TERMINATED,192.168.0.103:19742,0,gbdt,,0.409936,0.00622936,88,37,240,2853,44.184,0.00700383,0.393534,5,7.09788,0.724731
TrainableCV_08d1821d,TERMINATED,192.168.0.103:19791,9,gbdt,,0.45725,0.00882252,48,37,129,1169,0.963016,0.620286,0.97611,5,8.46433,0.738559
TrainableCV_e3ae7aa6,TERMINATED,192.168.0.103:19840,4,dart,balanced,0.780011,0.00411284,189,44,148,2206,0.0118418,0.0076308,0.86375,5,40.8356,0.726146
TrainableCV_cb2426d7,TERMINATED,192.168.0.103:19896,7,dart,,0.913526,0.00132964,168,36,206,2476,0.0190122,0.00578964,0.538238,5,37.6488,0.733727
TrainableCV_a2141e4a,TERMINATED,192.168.0.103:19945,5,dart,,0.84542,0.0271416,167,41,95,2755,0.00113475,0.11134,0.822965,5,22.7619,0.734129
TrainableCV_c29f2104,TERMINATED,192.168.0.103:19998,8,gbdt,balanced,0.280002,0.00530379,32,45,163,2454,0.902017,3.90159,0.38044,5,6.5841,0.730407
TrainableCV_53184209,TERMINATED,192.168.0.103:20059,0,gbdt,,0.54576,0.0752809,165,23,90,1898,1.53221,0.264107,0.747688,5,12.6117,0.738074
TrainableCV_e7f77a7f,TERMINATED,192.168.0.103:20113,9,rf,,0.735656,0.0151603,64,15,59,2747,0.105575,0.00183413,0.213358,5,3.96684,0.717276
TrainableCV_bfd6127c,TERMINATED,192.168.0.103:20166,0,rf,,0.48476,0.00362237,69,37,88,930,0.00312732,0.0154423,0.205759,5,9.73534,0.729464
TrainableCV_c969d1b1,TERMINATED,192.168.0.103:20265,5,rf,balanced,0.858857,0.00693284,102,31,105,1415,0.0361215,0.609291,0.752364,5,14.1812,0.731214


[2m[36m(TrainableCV pid=19742)[0m Step 0 F-1 Score: 0.7558275712404214
[2m[36m(TrainableCV pid=19742)[0m Step 1 F-1 Score: 0.6895991847826087
[2m[36m(TrainableCV pid=19742)[0m Step 2 F-1 Score: 0.7443478260869565
[2m[36m(TrainableCV pid=19742)[0m Step 3 F-1 Score: 0.7064970714223272
[2m[36m(TrainableCV pid=19791)[0m Step 0 F-1 Score: 0.772737817289272
[2m[36m(TrainableCV pid=19742)[0m Step 4 F-1 Score: 0.7273815979754157
[2m[36m(TrainableCV pid=19791)[0m Step 4 F-1 Score: 0.7493707937037656[32m [repeated 4x across cluster][0m
[2m[36m(TrainableCV pid=19896)[0m Step 0 F-1 Score: 0.7563944264846744[32m [repeated 2x across cluster][0m
[2m[36m(TrainableCV pid=19998)[0m Step 1 F-1 Score: 0.7061073369565217[32m [repeated 4x across cluster][0m
[2m[36m(TrainableCV pid=19945)[0m Step 2 F-1 Score: 0.7636005434782609[32m [repeated 7x across cluster][0m
[2m[36m(TrainableCV pid=20113)[0m Step 1 F-1 Score: 0.6769497282608696[32m [repeated 6x across cluster][

2023-10-30 14:21:01,334	INFO tune.py:1143 -- Total run time: 560.41 seconds (559.52 seconds for the tuning loop).


lgbm_grade_single tuned.


In [69]:
# 5-fold stratified cross-validation of the pipeline stored under
# "lgbm_grade_single": refit on each training split and record the ROC AUC of
# the positive-class probability on the held-out split.
tuned_pipeline = models.models["lgbm_grade_single"].pipeline
fold_splitter = StratifiedKFold(5)

scores = []
for train_index, test_index in fold_splitter.split(X_train, y_train):
    tuned_pipeline.fit(X_train[train_index], y_train[train_index])
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = tuned_pipeline.predict_proba(X_train[test_index])[:, 1]
    scores.append(roc_auc_score(y_train[test_index], positive_proba))

In [70]:
scores

[0.7613917144800472,
 0.7621113184868856,
 0.7550835899762174,
 0.7663219521634728,
 0.7639852457126839]