In [8]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from sklearn.pipeline import Pipeline
import auxiliary.transformers as tr
from auxiliary.transformers import PolarsColumnTransformer as PCT
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
%aimport auxiliary.transformers
%aimport auxiliary.tuning

In [9]:
train_data=pl.read_parquet('temp/application_train_filtered.parquet')
id_and_target=['SK_ID_CURR','TARGET']
X_train=train_data.drop(columns=id_and_target)
y_train=train_data['TARGET']

In [10]:
bool_features = []
for feature in X_train.select(pl.col(pl.Utf8)).columns:
    if train_data[feature].n_unique() == 2:
        bool_features.append(feature)

cat_features = [
    feature
    for feature in X_train.select(pl.col(pl.Utf8)).columns
    if feature not in bool_features
]

In [11]:
preprocessing=Pipeline([])
encoders=tr.PolarsColumnTransformer([])
for feature in bool_features:
    encoders.steps[feature]=PCT.Step(feature,tr.PolarsOneHotEncoder(drop=True),feature)
for feature in cat_features:
    encoders.steps[feature]=PCT.Step(feature,tr.TargetMeanOrderedLabeler(how='label'),feature)
preprocessing.steps.append(('encoders',encoders))
feature_remover=tr.FeatureRemover([])
preprocessing.steps.append(('feature_removal',feature_remover))

In [5]:
model_lgb=LGBMClassifier(n_jobs=1,verbosity=-1,force_col_wise=True)
full_pipeline=Pipeline([('preprocess', preprocessing),('model',model_lgb)])

In [6]:
model_for_selection=LGBMClassifier(verbose=-1,random_state=1,reg_alpha=1)
selector=BorutaShap(importance_measure='shap',model=model_for_selection)

In [17]:
selector=joblib.load('temp/model_1_selector.joblib')
bad_features=selector.features_to_remove.tolist()
bad_and_tentative_features=bad_features.copy()
bad_and_tentative_features.extend(selector.tentative.copy())
feature_removal_list=[bad_features,bad_and_tentative_features,[]]

In [21]:
models = tunes.Models()
search_space_lgbm = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__max_depth": tune.randint(5, 50),
    "model__num_leaves": tune.randint(10, 3051),
    "model__n_estimators": tune.randint(10, 251),
    "model__learning_rate": tune.loguniform(0.001, 0.1),
    "model__bagging_freq": tune.randint(0, 11),
    "model__colsample_bytree": tune.uniform(0.2, 1.0),
    "model__subsample": tune.uniform(0.2, 1.0),
    "model__reg_alpha": tune.loguniform(0.001, 100),
    "model__reg_lambda": tune.loguniform(0.001, 100),
    "model__boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "model__class_weight": tune.choice(["balanced", None]),
    "model__max_bin": tune.randint(5, 201),
}

models.add_model(
    "lgbm", full_pipeline, search_space_lgbm, metric_threshold=0.75
)


In [22]:
models.tune_all(X_train,y_train,metric='roc_auc')

0,1
Current time:,2023-10-31 14:39:29
Running for:,00:00:37.65
Memory:,17.1/31.2 GiB

Trial name,status,loc,model__bagging_freq,model__boosting_type,model__class_weight,model__colsample_byt ree,model__learning_rate,model__max_bin,model__max_depth,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__subsample,...ocess__feature_re moval__feats_to_drop,iter,total time (s),score
TrainableCV_2b13eaf0,RUNNING,192.168.0.103:60347,9,dart,,0.951897,0.00842985,200,22,135,1344,0.0197627,0.00392994,0.249749,['CNT_CHILDREN'_7440,,,
TrainableCV_7e06b277,RUNNING,192.168.0.103:60561,3,rf,balanced,0.673844,0.053261,110,12,98,198,0.0553883,3.43321,0.57669,['CNT_CHILDREN'_f580,1.0,8.87453,0.751851
TrainableCV_d089b988,RUNNING,192.168.0.103:60616,9,rf,balanced,0.882426,0.00137669,148,35,69,93,0.926264,12.1264,0.823256,[],,,
TrainableCV_08aba948,PENDING,,2,gbdt,,0.200765,0.00325256,166,40,64,1396,6.66413,4.30869,0.382349,['CNT_CHILDREN'_6a40,,,
TrainableCV_9238d9fc,TERMINATED,192.168.0.103:60392,6,gbdt,balanced,0.708002,0.0042012,51,5,168,588,0.0373266,0.0127622,0.884473,['CNT_CHILDREN'_ffc0,1.0,9.38939,0.736847
TrainableCV_c4ab68ca,TERMINATED,192.168.0.103:60442,6,rf,,0.526529,0.0143452,63,33,23,1550,0.507787,38.2468,0.759426,[],1.0,9.92924,0.746506
TrainableCV_841ea4d3,TERMINATED,192.168.0.103:60510,5,rf,balanced,0.307918,0.0591202,40,49,119,1706,0.0919788,0.0090812,0.209859,['CNT_CHILDREN'_ae40,1.0,10.8527,0.749596


[2m[36m(TrainableCV pid=60392)[0m Step 0 F-1 Score: 0.7368467863248905
[2m[36m(TrainableCV pid=60442)[0m Step 0 F-1 Score: 0.7465060617288583
[2m[36m(TrainableCV pid=60510)[0m Step 0 F-1 Score: 0.7495961418279152




[2m[36m(TrainableCV pid=60561)[0m Step 1 F-1 Score: 0.7529663496394745[32m [repeated 2x across cluster][0m


2023-10-31 14:39:39,687	INFO tune.py:1143 -- Total run time: 47.80 seconds (37.55 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/tmp/tune_results/lgbm", trainable=...)
- TrainableCV_08aba948: FileNotFoundError('Could not fetch metrics for TrainableCV_08aba948: both result.json and progress.csv were not found at /tmp/tune_results/lgbm/TrainableCV_08aba948_7_model__bagging_freq=2,model__boosting_type=gbdt,model__class_weight=None,model__colsample_bytree=0.2008,mod_2023-10-31_14-39-24')


lgbm tuned.


In [None]:
models.models['lgbm_grade_single'].best_params

182

In [14]:
from sklearn.ensemble import ExtraTreesClassifier

# Create an Extra Trees classifier instance with hyperparameters
extra_trees = ExtraTreesClassifier(random_state=42)     # Random seed for reproducibility

# Train the Extra Trees classifier on the training data
extra_trees.fit(preprocessing.fit_transform(X_train,y_train), y_train)

# # Now you can use the trained model for predictions on new data, e.g., X_test
# y_pred = extra_trees.predict(X_test)

# # You can also assess the model's accuracy, precision, recall, etc., using appropriate evaluation metrics
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(y_test, y_pred)
# print(f'Accuracy: {accuracy * 100:.2f}%')



ValueError: Input X contains NaN.
ExtraTreesClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    models.models["lgbm_grade_single"].pipeline.fit(
        X_train[train_index], y_train[train_index]
    )
    scores.append(
        roc_auc_score(
            y_train[test_index],
            models.models["lgbm_grade_single"].pipeline.predict_proba(
                X_train[test_index]
            )[:,1],
        )
    )