In [4]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from rgf.sklearn import RGFClassifier
import auxiliary.transformers as tr
from auxiliary.transformers import PolarsColumnTransformer as PCT
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
%aimport auxiliary.transformers
%aimport auxiliary.tuning

In [5]:
train_data=pl.read_parquet('temp/application_train_filtered.parquet')
id_and_target=['SK_ID_CURR','TARGET']
X_train=train_data.drop(columns=id_and_target)
y_train=train_data['TARGET']

In [6]:
pl.DataFrame([X_train["AMT_INCOME_TOTAL"],y_train]).select(pl.corr(X_train["AMT_INCOME_TOTAL"].name,y_train.name))

AMT_INCOME_TOTAL
f64
-0.003982


In [7]:
imp=tr.NumDiffFromRestImputer()
imp.fit_transform(X_train['bureau_DAYS_CREDIT_UPDATE_mode_Active'],y_train)

bureau_DAYS_CREDIT_UPDATE_mode_Active
i64
-7
-43
-83892
-83892
-83892
-16
-23
-18
-83892
-83892


In [8]:
bool_features = []
for feature in X_train.select(pl.col(pl.Utf8)).columns:
    if train_data[feature].n_unique() == 2:
        bool_features.append(feature)

cat_features = [
    feature
    for feature in X_train.select(pl.col(pl.Utf8)).columns
    if feature not in bool_features
]

In [9]:
preprocessing = Pipeline([])
cat_imputers = tr.PolarsColumnTransformer([])
for feature in cat_features:
    cat_imputers.steps[feature] = PCT.Step(
        feature, tr.NotInImputerPolars(min_values=100, fill_value="other"), feature
    )
preprocessing.steps.append(("cat_imputers", cat_imputers))

encoders = tr.PolarsColumnTransformer([])
for feature in bool_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.PolarsOneHotEncoder(drop=True), feature
    )
for feature in cat_features:
    encoders.steps[feature] = PCT.Step(
        feature, tr.TargetMeanOrderedLabeler(how="label"), feature
    )
preprocessing.steps.append(("encoders", encoders))
feature_remover = tr.FeatureRemover([])
preprocessing.steps.append(("feature_removal", feature_remover))

In [10]:
model_lgb=LGBMClassifier(n_jobs=1,verbosity=-1,force_col_wise=True)
full_pipeline=Pipeline([('preprocess', preprocessing),('model',model_lgb)])

In [11]:
model_for_selection=LGBMClassifier(verbose=-1,random_state=1,reg_alpha=1)
selector=BorutaShap(importance_measure='shap',model=model_for_selection)

In [12]:
selector=joblib.load('temp/model_1_selector.joblib')
bad_features=selector.features_to_remove.tolist()
bad_and_tentative_features=bad_features.copy()
bad_and_tentative_features.extend(selector.tentative.copy())
feature_removal_list=[bad_features,bad_and_tentative_features,[]]

In [13]:
models = tunes.Models()
search_space_lgbm = {
    "preprocess__feature_removal__feats_to_drop": tune.choice(feature_removal_list),
    "model__max_depth": tune.randint(5, 50),
    "model__num_leaves": tune.randint(10, 3051),
    "model__n_estimators": tune.randint(10, 251),
    "model__learning_rate": tune.loguniform(0.001, 0.1),
    "model__bagging_freq": tune.randint(0, 11),
    "model__colsample_bytree": tune.uniform(0.2, 1.0),
    "model__subsample": tune.uniform(0.2, 1.0),
    "model__reg_alpha": tune.loguniform(0.001, 100),
    "model__reg_lambda": tune.loguniform(0.001, 100),
    "model__boosting_type": tune.choice(["gbdt", "dart", "rf"]),
    "model__class_weight": tune.choice(["balanced", None]),
    "model__max_bin": tune.randint(5, 201),
}

models.add_model(
    "lgbm", full_pipeline, search_space_lgbm, metric_threshold=0.75
)


In [14]:
numeric_features_with_nulls = (
    pl.Series(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES)).columns
    )
    .filter(
        X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES))
        .select(pl.all().is_null().any())
        .transpose()
        .to_series()
    )
    .to_list()
)

In [15]:
preprocessing_etrees=copy.deepcopy(preprocessing)
num_imputer = tr.PolarsColumnTransformer([])
for feature in numeric_features_with_nulls:
    num_imputer.steps[feature] = PCT.Step(
        feature, tr.NumDiffFromRestImputer(), feature
    )
preprocessing_etrees.steps.insert(0,('num_imputer', num_imputer))
# Create an Extra Trees classifier instance with hyperparameters
model_extra_trees = ExtraTreesClassifier(random_state=1)     # Random seed for reproducibility
full_pipeline_etrees=Pipeline([('preprocess',preprocessing_etrees),('model',model_extra_trees)])



In [None]:
preprocessing_rgf = copy.deepcopy(preprocessing_etrees)
model_rgf = RGFClassifier()
full_pipeline_rgf = Pipeline([("preprocess", preprocessing_rgf), ("model", model_rgf)])

In [49]:
models.tune_all(X_train,y_train,metric='roc_auc')

0,1
Current time:,2023-11-01 13:10:00
Running for:,00:00:04.42
Memory:,12.5/31.2 GiB

Trial name,status,loc,model__bagging_freq,model__boosting_type,model__class_weight,model__colsample_byt ree,model__learning_rate,model__max_bin,model__max_depth,model__n_estimators,model__num_leaves,model__reg_alpha,model__reg_lambda,model__subsample,...ocess__feature_re moval__feats_to_drop
TrainableCV_8ba9c83e,PENDING,,7,dart,,0.276289,0.0423578,136,42,55,912,0.394023,0.323181,0.33664,['CNT_CHILDREN'_1180


2023-11-01 13:10:01,097	INFO tune.py:1143 -- Total run time: 5.28 seconds (4.33 seconds for the tuning loop).
Resume experiment with: Tuner.restore(path="/tmp/tune_results/lgbm", trainable=...)
- TrainableCV_8ba9c83e: FileNotFoundError('Could not fetch metrics for TrainableCV_8ba9c83e: both result.json and progress.csv were not found at /tmp/tune_results/lgbm/TrainableCV_8ba9c83e_1_model__bagging_freq=7,model__boosting_type=dart,model__class_weight=None,model__colsample_bytree=0.2763,mod_2023-11-01_13-09-55')


TypeError: sklearn.pipeline.Pipeline.set_params() argument after ** must be a mapping, not NoneType

In [None]:
models.models['lgbm_grade_single'].best_params

182

In [18]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    full_pipeline_etrees.fit(
        X_train[train_index], y_train[train_index]
    )
    try:
        scores.append(
            roc_auc_score(
                y_train[test_index],
                full_pipeline_etrees.predict_proba(
                    X_train[test_index]
                )[:,1],
            )
        )
    except:
        bad_df=full_pipeline_etrees['preprocess'].transform(X_train[test_index])

In [None]:
scores = []
for train_index, test_index in StratifiedKFold(5).split(X_train, y_train):
    models.models["lgbm_grade_single"].pipeline.fit(
        X_train[train_index], y_train[train_index]
    )
    scores.append(
        roc_auc_score(
            y_train[test_index],
            models.models["lgbm_grade_single"].pipeline.predict_proba(
                X_train[test_index]
            )[:,1],
        )
    )