In [23]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from rgf.sklearn import RGFClassifier
import auxiliary.transformers as tr
from auxiliary.transformers import PolarsColumnTransformer as PCT
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
import auxiliary.eda_functions as eda
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
%aimport auxiliary.transformers
%aimport auxiliary.tuning
%aimport auxiliary.eda_functions

In [2]:
train_data=pl.read_parquet('temp/application_train_filtered.parquet')
id_and_target=['SK_ID_CURR','TARGET']
X_train=train_data.drop(columns=id_and_target)
y_train=train_data['TARGET']

In [3]:
bool_features = []
for feature in X_train.select(pl.col(pl.Utf8)).columns:
    if X_train[feature].n_unique() == 2:
        bool_features.append(feature)

cat_features = [
    feature
    for feature in X_train.select(pl.col(pl.Utf8)).columns
    if feature not in bool_features
]

numeric_features_with_nulls = (
    pl.Series(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES)).columns
    )
    .filter(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES))
        .select(pl.all().is_null().any())
        .transpose()
        .to_series()
    )
    .to_list()
)

In [4]:
preprocessing = Pipeline([])

num_imputer = tr.PolarsColumnTransformer([])
for feature in numeric_features_with_nulls:
    num_imputer.steps[feature] = PCT.Step(
        feature, tr.NumDiffFromRestImputer(), feature
    )
preprocessing.steps.insert(0,('num_imputer', num_imputer))

cat_imputers = tr.PolarsColumnTransformer([])
for feature in cat_features:
    cat_imputers.steps[feature] = PCT.Step(
        feature, tr.NotInImputerPolars(min_values=100, fill_value="other"), feature
    )
preprocessing.steps.append(("cat_imputers", cat_imputers))

encoders = tr.PolarsColumnTransformer([])
for feature in bool_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.PolarsOneHotEncoder(drop=True), feature
    )
for feature in cat_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.TargetMeanOrderedLabeler(how="label"), feature
    )
preprocessing.steps.append(("encoders", encoders))
feature_remover = tr.FeatureRemover([])
preprocessing.steps.append(("feature_removal", feature_remover))

In [5]:
model_lgb=LGBMClassifier(n_jobs=1,verbosity=-1,force_col_wise=True)
sampler_model=tr.SamplingModelWrapper(model_lgb)
full_pipeline=Pipeline([('preprocess', preprocessing),('model',sampler_model)])

**Fitting Boruta Shap algorithm on LGBM models with different alpha regularization values**

In [6]:
model_for_selection_reg=LGBMClassifier(verbose=-1,random_state=1,reg_alpha=1)
selector_with_alpha=BorutaShap(importance_measure='shap',model=model_for_selection_reg)
selector_with_alpha.fit(
    full_pipeline["preprocess"]
    .fit_transform(X_train, y_train)
    .to_pandas(),
    y_train.to_pandas(),
)
joblib.dump(selector_with_alpha,'temp/model_1_selector_alpha1.joblib')


  0%|          | 0/20 [00:00<?, ?it/s]

54 attributes confirmed important: ['bureau_DAYS_CREDIT_ENDDATE_mode', 'NAME_CONTRACT_TYPE_Cash loans', 'bureau_AMT_CREDIT_MAX_OVERDUE_mean', 'bureau_AMT_CREDIT_SUM_sum', 'CODE_GENDER', 'bureau_DAYS_CREDIT_ENDDATE_max', 'DAYS_ID_PUBLISH', 'bureau_AMT_CREDIT_SUM_min_Active', 'bureau_DAYS_ENDDATE_FACT_max', 'APARTMENTS_MEDI', 'FLAG_DOCUMENT_3', 'prev_payment_left', 'prev_AMT_CREDIT_sum_Approved', 'REGION_RATING_CLIENT_W_CITY', 'AMT_CREDIT', 'prev_AMT_CREDIT_mean_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_mean', 'ORGANIZATION_TYPE', 'DAYS_REGISTRATION', 'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE', 'bureau_DAYS_CREDIT_mean', 'bureau_DAYS_CREDIT_ENDDATE_max_Active', 'DAYS_EMPLOYED', 'prev_AMT_CREDIT_mean_Refused', 'AMT_GOODS_PRICE', 'prev_AMT_BALANCE_CURR_sum', 'bureau_AMT_CREDIT_SUM_max', 'prev_AMT_CREDIT_sum_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_min_Active', 'OCCUPATION_TYPE', 'DAYS_BIRTH', 'bureau_DAYS_ENDDATE_FACT_std', 'AMT_REQ_CREDIT_BUREAU_QRT', 'bureau_DAYS_CREDIT_ENDDATE_me

['temp/model_1_selector_alpha1.joblib']

In [7]:
model_for_selection_high_reg=LGBMClassifier(verbose=-1,random_state=1,reg_alpha=10)
selector_high_alpha=BorutaShap(importance_measure='shap',model=model_for_selection_high_reg)
selector_high_alpha.fit(
    full_pipeline["preprocess"]
    .fit_transform(X_train, y_train)
    .to_pandas(),
    y_train.to_pandas(),
)
joblib.dump(selector_high_alpha,'temp/model_1_selector_alpha10.joblib')

  0%|          | 0/20 [00:00<?, ?it/s]

55 attributes confirmed important: ['bureau_DAYS_CREDIT_ENDDATE_mode', 'NAME_CONTRACT_TYPE_Cash loans', 'bureau_AMT_CREDIT_MAX_OVERDUE_mean', 'bureau_AMT_CREDIT_SUM_sum', 'CODE_GENDER', 'bureau_DAYS_CREDIT_ENDDATE_max', 'DAYS_ID_PUBLISH', 'bureau_AMT_CREDIT_SUM_min_Active', 'bureau_DAYS_ENDDATE_FACT_max', 'APARTMENTS_MEDI', 'FLAG_DOCUMENT_3', 'prev_payment_left', 'prev_AMT_CREDIT_sum_Approved', 'REGION_RATING_CLIENT_W_CITY', 'AMT_CREDIT', 'prev_AMT_CREDIT_mean_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_mean', 'ORGANIZATION_TYPE', 'DAYS_REGISTRATION', 'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE', 'bureau_DAYS_CREDIT_mean', 'bureau_DAYS_CREDIT_ENDDATE_max_Active', 'DAYS_EMPLOYED', 'prev_AMT_CREDIT_mean_Refused', 'AMT_GOODS_PRICE', 'prev_AMT_BALANCE_CURR_sum', 'bureau_AMT_CREDIT_SUM_max', 'prev_AMT_CREDIT_sum_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_min_Active', 'OCCUPATION_TYPE', 'DAYS_BIRTH', 'bureau_AMT_CREDIT_MAX_OVERDUE_sum', 'bureau_DAYS_ENDDATE_FACT_std', 'AMT_REQ_CREDIT_BUREAU

['temp/model_1_selector_alpha10.joblib']

In [8]:
model_for_selection_noreg=LGBMClassifier(verbose=-1,random_state=1,reg_alpha=0)
selector_no_alpha=BorutaShap(importance_measure='shap',model=model_for_selection_noreg)
selector_no_alpha.fit(
    full_pipeline["preprocess"]
    .fit_transform(X_train, y_train)
    .to_pandas(),
    y_train.to_pandas(),
)
joblib.dump(selector_no_alpha,'temp/model_1_selector_alpha0.joblib')

  0%|          | 0/20 [00:00<?, ?it/s]

54 attributes confirmed important: ['bureau_DAYS_CREDIT_ENDDATE_mode', 'NAME_CONTRACT_TYPE_Cash loans', 'bureau_AMT_CREDIT_MAX_OVERDUE_mean', 'bureau_AMT_CREDIT_SUM_sum', 'CODE_GENDER', 'bureau_DAYS_CREDIT_ENDDATE_max', 'DAYS_ID_PUBLISH', 'bureau_AMT_CREDIT_SUM_min_Active', 'bureau_DAYS_ENDDATE_FACT_max', 'APARTMENTS_MEDI', 'FLAG_DOCUMENT_3', 'prev_payment_left', 'prev_AMT_CREDIT_sum_Approved', 'REGION_RATING_CLIENT_W_CITY', 'AMT_CREDIT', 'prev_AMT_CREDIT_mean_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_mean', 'ORGANIZATION_TYPE', 'DAYS_REGISTRATION', 'EXT_SOURCE_3', 'DEF_30_CNT_SOCIAL_CIRCLE', 'bureau_DAYS_CREDIT_mean', 'bureau_DAYS_CREDIT_ENDDATE_max_Active', 'DAYS_EMPLOYED', 'prev_AMT_CREDIT_mean_Refused', 'AMT_GOODS_PRICE', 'prev_AMT_BALANCE_CURR_sum', 'bureau_AMT_CREDIT_SUM_max', 'prev_AMT_CREDIT_sum_Consumer_loans', 'bureau_AMT_CREDIT_SUM_DEBT_min_Active', 'OCCUPATION_TYPE', 'DAYS_BIRTH', 'bureau_DAYS_ENDDATE_FACT_std', 'AMT_REQ_CREDIT_BUREAU_QRT', 'bureau_DAYS_CREDIT_ENDDATE_me

['temp/model_1_selector_alpha0.joblib']

In [10]:
selector_with_alpha=joblib.load('temp/model_1_selector_alpha1.joblib')
selector_high_alpha=joblib.load('temp/model_1_selector_alpha10.joblib')
selector_no_alpha=joblib.load('temp/model_1_selector_alpha0.joblib')
bad_features_with_alpha=selector_with_alpha.features_to_remove.tolist()
bad_and_tentative_features_alpha=bad_features_with_alpha.copy()
bad_and_tentative_features_alpha.extend(selector_with_alpha.tentative.copy())


bad_features_high_alpha=selector_high_alpha.features_to_remove.tolist()
bad_and_tentative_features_high_alpha=bad_features_high_alpha.copy()
bad_and_tentative_features_high_alpha.extend(selector_high_alpha.tentative.copy())


bad_feature_no_alpha=selector_no_alpha.features_to_remove.tolist()
bad_and_tentative_features_no_alpha=bad_feature_no_alpha.copy()
bad_and_tentative_features_no_alpha.extend(selector_no_alpha.tentative.copy())
# feature_removal_list=[bad_features,bad_and_tentative_features_no_alpha,[]]

In [14]:
[feature for feature in bad_and_tentative_features_no_alpha if feature not in bad_and_tentative_features_high_alpha]

['bureau_AMT_CREDIT_MAX_OVERDUE_sum']

In [17]:
feature_quality = {"feature": preprocessing.fit_transform(X_train, y_train).columns}
feature_quality["high_alpha"] = []
feature_quality["alpha"] = []
feature_quality["no_alpha"] = []

for feature in feature_quality["feature"]:
    if feature in bad_features_high_alpha:
        feature_quality["high_alpha"].append("bad")
    elif feature in selector_high_alpha.tentative:
        feature_quality["high_alpha"].append("tentative")
    else:
        feature_quality["high_alpha"].append("good")

for feature in feature_quality["feature"]:
    if feature in bad_features_with_alpha:
        feature_quality["alpha"].append("bad")
    elif feature in selector_with_alpha.tentative:
        feature_quality["alpha"].append("tentative")
    else:
        feature_quality["alpha"].append("good")

for feature in feature_quality["feature"]:
    if feature in bad_features_high_alpha:
        feature_quality["no_alpha"].append("bad")
    elif feature in selector_no_alpha.tentative:
        feature_quality["no_alpha"].append("tentative")
    else:
        feature_quality["no_alpha"].append("good")

**Features with no consensus**

In [24]:
eda.table_display(pl.DataFrame(feature_quality).filter(
    (pl.col("high_alpha") != pl.col("alpha"))
    | (pl.col("high_alpha") != pl.col("no_alpha"))
    | (pl.col("alpha") != pl.col("no_alpha"))
))


| feature                                | high_alpha   | alpha     | no_alpha   |
|:---------------------------------------|:-------------|:----------|:-----------|
| bureau_AMT_CREDIT_MAX_OVERDUE_sum      | good         | tentative | tentative  |
| bureau_AMT_CREDIT_MAX_OVERDUE_std      | tentative    | bad       | good       |
| bureau_AMT_CREDIT_SUM_LIMIT_sum_Active | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_mean_Active         | bad          | tentative | bad        |
| bureau_count_Closed                    | bad          | tentative | bad        |

In [8]:
models = tunes.Models()
model__model_params = {
    "max_depth": tune.randint(5, 50),
    "num_leaves": tune.randint(10, 3051),
    "n_estimators": tune.randint(10, 251),
    "learning_rate": tune.loguniform(0.001, 0.1),
    "bagging_freq": tune.randint(0, 11),
    "colsample_bytree": tune.uniform(0.2, 1.0),
    "subsample": tune.uniform(0.2, 1.0),
    "reg_alpha": tune.loguniform(0.001, 100),
    "reg_lambda": tune.loguniform(0.001, 100),
    "boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "class_weight": tune.choice(["balanced", None]),
    "max_bin": tune.randint(5, 201),
}

search_space_lgbm = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__model_params": model__model_params,
    "model__sampler": tune.choice(['smote','adasyn','random',None])
}

models.add_model(
    "lgbm", full_pipeline, search_space_lgbm, metric_threshold=0.77
)


In [9]:
preprocessing_etrees=copy.deepcopy(preprocessing)
# Create an Extra Trees classifier instance with hyperparameters
model_extra_trees = ExtraTreesClassifier(random_state=1)     # Random seed for reproducibility
full_pipeline_etrees=Pipeline([('preprocess',preprocessing_etrees),('model',model_extra_trees)])


In [10]:
preprocessing_rgf = copy.deepcopy(preprocessing)
model_rgf = RGFClassifier()
full_pipeline_rgf = Pipeline([("preprocess", preprocessing_rgf), ("model", model_rgf)])

In [11]:
models.tune_all(X_train,y_train,metric='roc_auc')

0,1
Current time:,2023-11-08 13:25:52
Running for:,00:02:18.10
Memory:,25.2/31.2 GiB

Trial name,status,loc,model__model_params/ bagging_freq,model__model_params/ boosting_type,model__model_params/ class_weight,model__model_params/ colsample_bytree,model__model_params/ learning_rate,model__model_params/ max_bin,model__model_params/ max_depth,model__model_params/ n_estimators,model__model_params/ num_leaves,model__model_params/ reg_alpha,model__model_params/ reg_lambda,model__model_params/ subsample,model__sampler,...ocess__feature_re moval__feats_to_drop,iter,total time (s),score
TrainableCV_8620eb49,RUNNING,192.168.0.103:21611,4,dart,balanced,0.956688,0.00292668,103,34,136,1422,3.74521,0.828468,0.553318,random,[],2.0,93.2451,0.773658
TrainableCV_dcbd6895,RUNNING,192.168.0.103:22238,5,dart,,0.759625,0.00188067,31,36,60,2862,39.264,0.922452,0.461732,,[],3.0,82.537,0.771011
TrainableCV_803e15b8,RUNNING,192.168.0.103:22338,9,gbdt,,0.461003,0.00160387,5,44,183,512,69.2313,0.0303841,0.66189,,['CNT_CHILDREN'_9300,4.0,54.8934,0.771111
TrainableCV_1a62ad97,RUNNING,192.168.0.103:22426,8,gbdt,balanced,0.863882,0.0514676,16,13,115,2555,15.1544,3.55327,0.983853,random,[],1.0,48.1363,0.772443
TrainableCV_ff7f0de3,RUNNING,192.168.0.103:22517,4,dart,,0.895504,0.0011411,158,9,194,59,0.00370398,1.88456,0.616041,,['CNT_CHILDREN'_a500,1.0,14.9833,0.772883
TrainableCV_0c2b470d,RUNNING,192.168.0.103:22592,4,dart,balanced,0.903328,0.0109824,197,46,218,1605,0.0303968,0.0136955,0.352853,smote,['CNT_CHILDREN'_23c0,,,
TrainableCV_d81b9865,PENDING,,5,rf,balanced,0.446643,0.0695402,62,44,248,2970,0.0347169,0.18361,0.329285,,[],,,
TrainableCV_72e46b97,TERMINATED,192.168.0.103:21540,5,dart,,0.739618,0.0560553,116,10,178,2131,7.44568,0.00154289,0.789695,adasyn,['CNT_CHILDREN'_d680,1.0,32.0372,0.765282
TrainableCV_aa6a2b20,TERMINATED,192.168.0.103:21725,2,dart,balanced,0.860437,0.00833566,198,18,150,2760,7.12336,0.0124325,0.307178,smote,['CNT_CHILDREN'_e9c0,1.0,22.9379,0.764082
TrainableCV_a18456f7,TERMINATED,192.168.0.103:21780,8,rf,,0.619685,0.0243159,86,17,112,2812,89.91,0.809482,0.283771,smote,[],1.0,55.5601,0.766259


[2m[36m(TrainableCV pid=21540)[0m Step 0 F-1 Score: 0.7652824360355878
[2m[36m(TrainableCV pid=21901)[0m Step 0 F-1 Score: 0.771457619588134[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(TrainableCV pid=21845)[0m Step 0 F-1 Score: 0.7645489917626007[32m [repeated 2x across cluster][0m
[2m[36m(TrainableCV pid=21780)[0m Step 0 F-1 Score: 0.7662593797033441[32m [repeated 2x across cluster][0m
[2m[36m(TrainableCV pid=21901)[0m Step 3 F-1 Score: 0.7733208961305292[32m [repeated 4x across cluster][0m
[2m[36m(TrainableCV pid=22149)[0m Step 0 F-1 Score: 0.7662668963262133[32m [repeated 2x across cluster][0m
[2m[36m(TrainableCV pid=22238)[0m Step 1 F-1 Score: 0.7740474671730182[32m [repeated 3x across cluster][0m
[2m[36m(TrainableCV pid=22338)[0m Step 3 F-1 Score: 0.773



[2m[36m(TrainableCV pid=22517)[0m Step 1 F-1 Score: 0.7738898789216292[32m [repeated 3x across cluster][0m


2023-11-08 13:26:03,228	INFO tune.py:1143 -- Total run time: 148.46 seconds (137.89 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/tmp/tune_results/lgbm", trainable=...)
- TrainableCV_d81b9865: FileNotFoundError('Could not fetch metrics for TrainableCV_d81b9865: both result.json and progress.csv were not found at /tmp/tune_results/lgbm/TrainableCV_d81b9865_13_bagging_freq=5,boosting_type=rf,class_weight=balanced,colsample_bytree=0.4466,learning_rate=0.0695,max_bin_2023-11-08_13-25-32')


lgbm tuned.


In [None]:
models.models['lgbm'].pipeline.fit(X_train,y_train)

In [None]:
models.models['lgbm'].pipeline['model'].model.feature_importances_

array([ 42, 128,  96,  73, 123,  63,  46,  66,  49,  10,  27,  10, 137,
       143, 133,  16,   6,   6,  13,   4,   5,   8,   4,  11,  12,  20,
        10,  52,  10,  14,  27,  27,  26,  22,  18,  23,  10,  35,  32,
        34,  45,  22,  10,  15,  27,  27,  22,  17,  16,  12,  37,  40,
        18,  36,   9,  37,  12,  13,  32,  19,  15,   5,  28,  64,   3,
         3,  43,  95,  33,  28,   1,  27,  68,  13,  25,  31, 122,  52,
        55,  33,  10,  30,  23,  38,  11,  67,   5,  15], dtype=int32)

In [None]:
feats=pl.DataFrame({'imp':models.models['lgbm'].pipeline['model'].model.feature_importances_,'feat':cols})
feats.sort('imp')[-6,1]

'prev_payment_left'

In [None]:
cols=models.models['lgbm'].pipeline['preprocess'].fit_transform(X_train,y_train).columns

In [None]:
models.models['lgbm_grade_single'].best_params

182

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    full_pipeline_etrees.fit(
        X_train[train_index], y_train[train_index]
    )
    try:
        scores.append(
            roc_auc_score(
                y_train[test_index],
                full_pipeline_etrees.predict_proba(
                    X_train[test_index]
                )[:,1],
            )
        )
    except:
        bad_df=full_pipeline_etrees['preprocess'].transform(X_train[test_index])

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    models.models["lgbm_grade_single"].pipeline.fit(
        X_train[train_index], y_train[train_index]
    )
    scores.append(
        roc_auc_score(
            y_train[test_index],
            models.models["lgbm_grade_single"].pipeline.predict_proba(
                X_train[test_index]
            )[:,1],
        )
    )