In [1]:
%reload_ext autoreload
%autoreload 1
import random
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from rgf.sklearn import RGFClassifier
import auxiliary.transformers as tr
from auxiliary.transformers import PolarsColumnTransformer as PCT
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
import auxiliary.eda_functions as eda
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
import statistics
from sklearn.utils.validation import check_random_state
%aimport auxiliary.transformers
%aimport auxiliary.tuning
%aimport auxiliary.eda_functions

In [2]:
random.seed(1)

In [3]:
train_data=pl.read_parquet('temp/application_train_filtered.parquet')
id_and_target=['SK_ID_CURR','TARGET']
X_train=train_data.drop(columns=id_and_target)
y_train=train_data['TARGET']

In [4]:
bool_features = []
for feature in X_train.select(pl.col(pl.Utf8)).columns:
    if X_train[feature].n_unique() == 2:
        bool_features.append(feature)

cat_features = [
    feature
    for feature in X_train.select(pl.col(pl.Utf8)).columns
    if feature not in bool_features
]

numeric_features_with_nulls = (
    pl.Series(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES)).columns
    )
    .filter(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES))
        .select(pl.all().is_null().any())
        .transpose()
        .to_series()
    )
    .to_list()
)

In [5]:
preprocessing = Pipeline([])

num_imputer = tr.PolarsColumnTransformer([])
for feature in numeric_features_with_nulls:
    num_imputer.steps[feature] = PCT.Step(
        feature, tr.NumDiffFromRestImputer(), feature
    )
preprocessing.steps.insert(0,('num_imputer', num_imputer))

cat_imputers = tr.PolarsColumnTransformer([])
for feature in cat_features:
    cat_imputers.steps[feature] = PCT.Step(
        feature, tr.NotInImputerPolars(min_values=100, fill_value="other"), feature
    )
preprocessing.steps.append(("cat_imputers", cat_imputers))

encoders = tr.PolarsColumnTransformer([])
for feature in bool_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.PolarsOneHotEncoder(drop=True), feature
    )
for feature in cat_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.TargetMeanOrderedLabeler(how="label"), feature
    )
preprocessing.steps.append(("encoders", encoders))
feature_remover = tr.FeatureRemover([])
preprocessing.steps.append(("feature_removal", feature_remover))

In [6]:
model_lgb=LGBMClassifier(n_jobs=1,verbosity=-1,force_col_wise=True)
sampler_model=tr.SamplingModelWrapper(model_lgb)
full_pipeline=Pipeline([('preprocess', preprocessing),('model',sampler_model)])

In [7]:
preprocessing_etrees=copy.deepcopy(preprocessing)
model_extra_trees = ExtraTreesClassifier(random_state=1)
sampler_model_etrees=tr.SamplingModelWrapper(model_extra_trees)
full_pipeline_etrees=Pipeline([('preprocess',preprocessing_etrees),('model',sampler_model_etrees)])


In [8]:
preprocessing_rgf = copy.deepcopy(preprocessing)
model_rgf = RGFClassifier()
sampler_model_rgf=tr.SamplingModelWrapper(model_rgf)
full_pipeline_rgf = Pipeline([("preprocess", preprocessing_rgf), ("model", sampler_model_rgf)])

**Fitting Boruta Shap algorithm on LGBM models with different alpha regularization values**

In [9]:
selector_with_alpha=joblib.load('temp/model_1_selector_alpha1.joblib')
selector_high_alpha=joblib.load('temp/model_1_selector_alpha10.joblib')
selector_no_alpha=joblib.load('temp/model_1_selector_alpha0.joblib')
bad_features_with_alpha=selector_with_alpha.features_to_remove.tolist()
bad_and_tentative_features_alpha=bad_features_with_alpha.copy()
bad_and_tentative_features_alpha.extend(selector_with_alpha.tentative.copy())


bad_features_high_alpha=selector_high_alpha.features_to_remove.tolist()
bad_and_tentative_features_high_alpha=bad_features_high_alpha.copy()
bad_and_tentative_features_high_alpha.extend(selector_high_alpha.tentative.copy())


bad_feature_no_alpha=selector_no_alpha.features_to_remove.tolist()
bad_and_tentative_features_no_alpha=bad_feature_no_alpha.copy()
bad_and_tentative_features_no_alpha.extend(selector_no_alpha.tentative.copy())


In [10]:
feature_quality = {"feature": preprocessing.fit_transform(X_train, y_train).columns}
feature_quality["high_alpha"] = []
feature_quality["alpha"] = []
feature_quality["no_alpha"] = []

for feature in feature_quality["feature"]:
    if feature in bad_features_high_alpha:
        feature_quality["high_alpha"].append("bad")
    elif feature in selector_high_alpha.tentative:
        feature_quality["high_alpha"].append("tentative")
    else:
        feature_quality["high_alpha"].append("good")

for feature in feature_quality["feature"]:
    if feature in bad_features_with_alpha:
        feature_quality["alpha"].append("bad")
    elif feature in selector_with_alpha.tentative:
        feature_quality["alpha"].append("tentative")
    else:
        feature_quality["alpha"].append("good")

for feature in feature_quality["feature"]:
    if feature in bad_features_high_alpha:
        feature_quality["no_alpha"].append("bad")
    elif feature in selector_no_alpha.tentative:
        feature_quality["no_alpha"].append("tentative")
    else:
        feature_quality["no_alpha"].append("good")

feature_quality=pl.DataFrame(feature_quality)


**Features with no consensus**

In [11]:
eda.table_display(feature_quality.filter(
    (pl.col("high_alpha") != pl.col("alpha"))
    | (pl.col("high_alpha") != pl.col("no_alpha"))
    | (pl.col("alpha") != pl.col("no_alpha"))
))


| feature                                   | high_alpha   | alpha     | no_alpha   |
|:------------------------------------------|:-------------|:----------|:-----------|
| AMT_INCOME_TOTAL                          | good         | good      | tentative  |
| DAYS_REGISTRATION                         | tentative    | good      | good       |
| FLAG_WORK_PHONE                           | tentative    | good      | good       |
| LIVINGAREA_MEDI                           | bad          | tentative | bad        |
| DEF_60_CNT_SOCIAL_CIRCLE                  | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_ENDDATE_max            | tentative    | tentative | good       |
| bureau_AMT_CREDIT_MAX_OVERDUE_max         | tentative    | tentative | good       |
| bureau_AMT_CREDIT_SUM_mean                | tentative    | good      | good       |
| bureau_AMT_CREDIT_SUM_max                 | tentative    | tentative | good       |
| bureau_AMT_CREDIT_SUM_LIMIT_mean          | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_mean                   | tentative    | good      | good       |
| bureau_DAYS_CREDIT_sum                    | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_UPDATE_std             | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_ENDDATE_mean_Active    | tentative    | good      | good       |
| bureau_DAYS_CREDIT_ENDDATE_mode_Active    | bad          | tentative | bad        |
| bureau_AMT_CREDIT_MAX_OVERDUE_mean_Active | bad          | tentative | bad        |
| bureau_AMT_CREDIT_SUM_mean_Active         | bad          | tentative | bad        |
| bureau_AMT_CREDIT_SUM_DEBT_sum_Active     | bad          | tentative | bad        |
| bureau_DAYS_CREDIT_mean_Active            | bad          | tentative | bad        |
| prev_AMT_CREDIT_mean_Approved             | bad          | tentative | bad        |
| prev_AMT_CREDIT_sum_Revolving_loans       | tentative    | tentative | good       |
| NAME_INCOME_TYPE                          | tentative    | good      | good       |
| NAME_HOUSING_TYPE                         | bad          | tentative | bad        |
| WALLSMATERIAL_MODE                        | bad          | tentative | bad        |
| NAME_CONTRACT_TYPE_Cash loans             | tentative    | good      | good       |

In [12]:
feature_quality = feature_quality.with_columns(
    feature_quality.map_rows(lambda x: statistics.mode(x[1:]))
    .to_series()
    .alias("consensus")
)

In [13]:
consensus_bad_tentative_features=feature_quality.filter(pl.col("consensus")!="good")['feature'].to_list()
consensus_bad_features=feature_quality.filter(pl.col("consensus")=="bad")['feature'].to_list()
feature_removal_list=[consensus_bad_features,consensus_bad_tentative_features]

In [14]:
models = tunes.Models()
model_params_lgb = {
    "max_depth": tune.randint(5, 30),
    "num_leaves": tune.randint(10, 1000),
    "n_estimators": tune.randint(10, 251),
    "learning_rate": tune.loguniform(0.001, 0.1),
    "bagging_freq": tune.randint(0, 11),
    "colsample_bytree": tune.uniform(0.2, 1.0),
    "subsample": tune.uniform(0.2, 1.0),
    "reg_alpha": tune.loguniform(0.001, 100),
    "reg_lambda": tune.loguniform(0.001, 100),
    "boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "class_weight": tune.choice(["balanced", None]),
    "max_bin": tune.randint(5, 201),
}

search_space_lgbm = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__model_params": model_params_lgb,
    "model__sampler": tune.choice(['smote','adasyn','random',None])
}

models.add_model(
    "lgbm", full_pipeline, search_space_lgbm, metric_threshold=0.77
)


In [15]:
model_params_extra_trees = {
    "n_estimators": tune.randint(10, 251),
    "max_depth": tune.randint(5, 30),
    "max_leaf_nodes": tune.randint(30,1000),
    "min_samples_split": tune.randint(2, 21),
    "min_samples_leaf": tune.randint(1, 21),
    "max_features": tune.uniform(0.1,1.0),
    "class_weight": tune.choice(["balanced","balanced_subsample" , None]),
    "max_samples": tune.uniform(0.1, 1.0),
}

search_space_extra_trees = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__model_params": model_params_extra_trees,
    "model__sampler": tune.choice(['smote', 'adasyn', 'random', None])
}

models.add_model(
    "extra_trees", full_pipeline_etrees, search_space_extra_trees, metric_threshold=0.76
)


In [16]:
model_params_rgf = {
    "algorithm": tune.choice(["RGF", "RGF Opt", "RGF Sib"]),
    "loss": tune.choice(["LS", "Expo", "Log", "Abs"]),
    "l2": tune.loguniform(1e-6, 1.0),
    "max_leaf": tune.randint(10, 1000),
    "test_interval": tune.quniform(100,1000,100),
    "reg_depth": tune.randint(1, 21),
    "learning_rate": tune.loguniform(1e-6, 1.0),
}

search_space_rgf = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__model_params": model_params_rgf,
    "model__sampler": tune.choice(['smote', 'adasyn', 'random', None])
}

models.add_model(
    "rgf", full_pipeline_rgf, search_space_rgf, metric_threshold=0.76
)


In [17]:
models.models['lgbm'].tune_model(X_train,y_train, n=200,metric='roc_auc')
joblib.dump(models,"temp/model_1_tuned_models.joblib")

# models.tune_all(X_train,y_train,metric='roc_auc',sample_size=10000)

0,1
Current time:,2023-11-09 15:18:02
Running for:,00:01:02.95
Memory:,21.8/31.2 GiB

Trial name,status,loc,model__model_params/ bagging_freq,model__model_params/ boosting_type,model__model_params/ class_weight,model__model_params/ colsample_bytree,model__model_params/ learning_rate,model__model_params/ max_bin,model__model_params/ max_depth,model__model_params/ n_estimators,model__model_params/ num_leaves,model__model_params/ reg_alpha,model__model_params/ reg_lambda,model__model_params/ subsample,model__sampler,...ocess__feature_re moval__feats_to_drop,iter,total time (s),score
TrainableCV_ea5189b1,RUNNING,192.168.0.103:26217,1,gbdt,,0.397327,0.0238248,109,21,242,193,15.3468,26.2006,0.769356,random,['CNT_CHILDREN'_5340,2.0,36.5835,0.77342
TrainableCV_b706b2b0,RUNNING,192.168.0.103:26448,3,rf,,0.850676,0.00674048,35,5,104,131,88.5617,0.00674419,0.791271,,['CNT_CHILDREN'_1340,2.0,27.0144,0.772484
TrainableCV_d8d13bfc,RUNNING,192.168.0.103:26590,9,gbdt,balanced,0.651771,0.0267957,148,12,202,46,0.119196,0.00688171,0.696967,random,['CNT_CHILDREN'_f180,1.0,20.9788,0.772106
TrainableCV_54e070fe,RUNNING,192.168.0.103:26676,3,gbdt,,0.712771,0.0200429,131,23,177,122,0.0246296,0.02065,0.79101,,['CNT_CHILDREN'_c5c0,,,
TrainableCV_f2f8dd55,RUNNING,192.168.0.103:26766,9,rf,balanced,0.604498,0.0384677,109,17,136,857,0.00304225,31.8991,0.31509,random,['CNT_CHILDREN'_ae80,,,
TrainableCV_a9d67e5d,PENDING,,2,dart,,0.669694,0.0158136,156,16,179,553,0.0136239,94.3423,0.981056,,['CNT_CHILDREN'_bd40,,,
TrainableCV_e9912157,TERMINATED,192.168.0.103:26155,3,rf,,0.424213,0.00246181,12,18,221,910,0.00543091,1.86241,0.726843,adasyn,['CNT_CHILDREN'_9e80,1.0,27.2485,0.763463
TrainableCV_5b6b374e,TERMINATED,192.168.0.103:26330,10,dart,,0.528867,0.0363016,148,25,193,389,19.2358,0.00515137,0.88752,smote,['CNT_CHILDREN'_cd40,1.0,20.4833,0.764033
TrainableCV_5facb45e,TERMINATED,192.168.0.103:26385,7,dart,,0.631754,0.00101147,57,22,102,782,29.8233,0.112821,0.870298,adasyn,['CNT_CHILDREN'_6600,1.0,31.2188,0.763568
TrainableCV_38cc22b2,TERMINATED,192.168.0.103:26508,6,rf,balanced,0.351185,0.0220387,144,13,118,96,0.00159164,61.8599,0.687625,smote,['CNT_CHILDREN'_5ec0,1.0,24.7512,0.764121


[2m[36m(TrainableCV pid=26217)[0m Step 0 F-1 Score: 0.772106432387425
[2m[36m(TrainableCV pid=26155)[0m Step 0 F-1 Score: 0.7634633491784456
[2m[36m(TrainableCV pid=26330)[0m Step 0 F-1 Score: 0.7640332089385684
[2m[36m(TrainableCV pid=26217)[0m Step 1 F-1 Score: 0.7747325891851945[32m [repeated 2x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[2m[36m(TrainableCV pid=26385)[0m Step 0 F-1 Score: 0.7635681152267649
[2m[36m(TrainableCV pid=26448)[0m Step 1 F-1 Score: 0.773430484017979




[2m[36m(TrainableCV pid=26590)[0m Step 0 F-1 Score: 0.772106432387425[32m [repeated 2x across cluster][0m
[2m[36m(TrainableCV pid=26448)[0m Step 2 F-1 Score: 0.7668438893254961[32m [repeated 2x across cluster][0m


2023-11-09 15:18:13,055	INFO tune.py:1143 -- Total run time: 73.42 seconds (62.73 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/tmp/tune_results/lgbm", trainable=...)
- TrainableCV_a9d67e5d: FileNotFoundError('Could not fetch metrics for TrainableCV_a9d67e5d: both result.json and progress.csv were not found at /tmp/tune_results/lgbm/TrainableCV_a9d67e5d_10_bagging_freq=2,boosting_type=dart,class_weight=None,colsample_bytree=0.6697,learning_rate=0.0158,max_bin=1_2023-11-09_15-18-00')


['temp/model_1_tuned_models.joblib']

In [None]:
models.models['lgbm'].pipeline.fit(X_train,y_train)

In [None]:
models.models['lgbm'].pipeline['model'].model.feature_importances_

array([ 42, 128,  96,  73, 123,  63,  46,  66,  49,  10,  27,  10, 137,
       143, 133,  16,   6,   6,  13,   4,   5,   8,   4,  11,  12,  20,
        10,  52,  10,  14,  27,  27,  26,  22,  18,  23,  10,  35,  32,
        34,  45,  22,  10,  15,  27,  27,  22,  17,  16,  12,  37,  40,
        18,  36,   9,  37,  12,  13,  32,  19,  15,   5,  28,  64,   3,
         3,  43,  95,  33,  28,   1,  27,  68,  13,  25,  31, 122,  52,
        55,  33,  10,  30,  23,  38,  11,  67,   5,  15], dtype=int32)

In [None]:
feats=pl.DataFrame({'imp':models.models['lgbm'].pipeline['model'].model.feature_importances_,'feat':cols})
feats.sort('imp')[-6,1]

'prev_payment_left'

In [None]:
cols=models.models['lgbm'].pipeline['preprocess'].fit_transform(X_train,y_train).columns

In [None]:
models.models['lgbm_grade_single'].best_params

182

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    full_pipeline_etrees.fit(
        X_train[train_index], y_train[train_index]
    )
    try:
        scores.append(
            roc_auc_score(
                y_train[test_index],
                full_pipeline_etrees.predict_proba(
                    X_train[test_index]
                )[:,1],
            )
        )
    except:
        bad_df=full_pipeline_etrees['preprocess'].transform(X_train[test_index])

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    models.models["lgbm_grade_single"].pipeline.fit(
        X_train[train_index], y_train[train_index]
    )
    scores.append(
        roc_auc_score(
            y_train[test_index],
            models.models["lgbm_grade_single"].pipeline.predict_proba(
                X_train[test_index]
            )[:,1],
        )
    )