# Model 2: Credit Card Over Limit

**Imports:**

In [9]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from imblearn.pipeline import Pipeline,make_pipeline
import auxiliary.transformers as tr
import auxiliary.eda_functions as eda
from auxiliary.transformers import PolarsColumnTransformer as PCT
from sklearn.preprocessing import MinMaxScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
%aimport auxiliary.eda_functions
%aimport auxiliary.transformers
%aimport auxiliary.tuning

**Data Imports and Splitting:**

In [2]:
# Load the prepared credit-card table and hold out 30% of the rows as a test
# set. The split is stratified on the target and seeded so it is reproducible.
data = pl.read_parquet("temp/active_credit_cards.parquet")

target_name = "IS_OVER_LIMIT"
predictors = data.drop(columns=target_name)
target = data[target_name]

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    stratify=target,
    random_state=1,
)

## Pipeline Construction

**Specifying feature types:**

In [3]:
# Names of all string-typed (Utf8) columns in the training frame.
string_columns = X_train.select(pl.col(pl.Utf8)).columns

# String columns with exactly two distinct values are treated as boolean.
# NOTE(review): n_unique() may count null as a distinct value, so a column
# holding one level plus nulls could land here — confirm this is intended.
bool_features = [name for name in string_columns if X_train[name].n_unique() == 2]

# Every remaining string column is treated as a multi-level categorical.
cat_features = [name for name in string_columns if name not in bool_features]

# Float and integer columns that contain at least one null in the training set.
numeric_frame = X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES))
# One boolean per numeric column (True when the column has any null), turned
# into a Series so it can be used as a filter mask over the column names.
has_null_mask = (
    numeric_frame.select(pl.all().is_null().any()).transpose().to_series()
)
numeric_features_with_nulls = (
    pl.Series(numeric_frame.columns).filter(has_null_mask).to_list()
)

**Preprocessing pipeline:**

In [38]:
# Stage 1: one NotInImputerPolars step per categorical column.
# Presumably collapses levels seen fewer than `min_values` times into "other"
# — confirm against auxiliary.transformers.
cat_imputers = tr.PolarsColumnTransformer([])
for cat_name in cat_features:
    rare_level_imputer = tr.NotInImputerPolars(min_values=100, fill_value="other")
    cat_imputers.steps[cat_name] = PCT.Step(cat_name, rare_level_imputer, cat_name)

# Stage 2: encode string columns. Two-level columns get a one-hot encoder
# (drop=True); multi-level categoricals get a TargetMeanOrderedLabeler.
encoders = tr.PolarsColumnTransformer([])
for bool_name in bool_features:
    encoders.steps[bool_name] = PCT.Step(
        bool_name, tr.PolarsOneHotEncoder(drop=True), bool_name
    )
for cat_name in cat_features:
    encoders.steps[cat_name] = PCT.Step(
        cat_name, tr.TargetMeanOrderedLabeler(how="label"), cat_name
    )

preprocessing = Pipeline([])
preprocessing.steps.extend(
    [("cat_imputers", cat_imputers), ("encoders", encoders)]
)

# The scaler must cover every column that exists *after* encoding, so the
# two-step pipeline is fitted once here just to read the resulting column names.
encoded_columns = preprocessing.fit_transform(X_train, y_train).columns
scaler = tr.PolarsColumnTransformer(
    [PCT.Step("scaler", MinMaxScaler(), encoded_columns)]
)

# Stage 3-5: scale, impute nulls (PolarsNullImputer(-1) — presumably fills with
# -1, confirm), then a feature remover that starts with an empty removal list.
feature_remover = tr.FeatureRemover([])
preprocessing.steps.extend(
    [
        ("scaler", scaler),
        ("null_imputer", tr.PolarsNullImputer(-1)),
        ("feature_removal", feature_remover),
    ]
)

**Model and Oversampling:**

In [43]:
# Single-threaded, silent LightGBM classifier wrapped in the project's
# SamplingModelWrapper, then chained after the preprocessing pipeline.
model_lgb = LGBMClassifier(verbosity=-1, n_jobs=1)
sampler_model = tr.SamplingModelWrapper(model_lgb)
full_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", sampler_model),
    ]
)

## Feature Selection

**Running Boruta SHAP at several percentile thresholds (20–100) and saving the list of features rejected at each threshold:**

In [51]:
# Percentile of the shadow-feature importances used as the acceptance
# threshold in each Boruta-SHAP run (per the BorutaShap docs, lower values
# make the selection more lenient).
boruta_percentiles = [20, 40, 60, 80, 100]

# The preprocessed training frame and the pandas target do not depend on the
# percentile, so they are built once here instead of re-fitting the whole
# preprocessing pipeline on every loop iteration.
X_train_preprocessed = (
    full_pipeline["preprocess"].fit_transform(X_train, y_train).to_pandas()
)
y_train_pandas = y_train.to_pandas()

# One list of rejected feature names per percentile, in the same order as
# `boruta_percentiles`.
feature_removal_list = []
for percentile in boruta_percentiles:
    # A fresh estimator per run so no fitted state carries over between runs.
    model_for_selection_high_reg = LGBMClassifier(
        verbose=-1, random_state=1, num_leaves=100
    )
    selector = BorutaShap(
        importance_measure="shap",
        model=model_for_selection_high_reg,
        percentile=percentile,
    )
    selector.fit(X_train_preprocessed, y_train_pandas)
    # .tolist() already returns a new plain-Python list, so no extra copy is needed.
    feature_removal_list.append(selector.features_to_remove.tolist())

# Persist the results so this expensive cell does not have to be re-run.
joblib.dump(feature_removal_list, "temp/model_2_feature_removal.joblib")

['temp/model_2_feature_removal.joblib']

**Loading the lists of features to remove:**

In [None]:
# Reload the per-percentile lists of rejected features written by the
# Boruta-SHAP cell, so the expensive selection step can be skipped on re-runs.
# NOTE: joblib.load unpickles the file — only load artifacts produced locally.
feature_removal_list=joblib.load("temp/model_2_feature_removal.joblib")

## Hyperparameter Tuning