In [23]:
%reload_ext autoreload
%autoreload 1
import polars as pl
from imblearn.pipeline import Pipeline,make_pipeline
from sklearn.ensemble import ExtraTreesClassifier
from rgf.sklearn import RGFClassifier
import auxiliary.transformers as tr
from auxiliary.transformers import PolarsColumnTransformer as PCT
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import roc_auc_score
import copy
import auxiliary.tuning as tunes
from ray import tune
import joblib
import numpy as np
from BorutaShap import BorutaShap
%aimport auxiliary.transformers
%aimport auxiliary.tuning

In [2]:
# Load the prepared dataset and hold out 30% of the rows for testing.
# The split is stratified on the target and seeded (random_state=1).
data = pl.read_parquet("temp/active_credit_cards.parquet")
target_column = "IS_OVER_LIMIT"
target = data[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=target_column),
    target,
    test_size=0.3,
    stratify=target,
    random_state=1,
)

In [3]:
# Split the string (Utf8) columns of the training frame into two groups:
#   * bool_features - columns with exactly two distinct values
#   * cat_features  - every other string column
# NOTE(review): `n_unique()` may count null as a distinct value, so a column with a
# single category plus nulls could land in `bool_features` - confirm this is intended.
string_columns = X_train.select(pl.col(pl.Utf8)).columns
bool_features = [
    feature for feature in string_columns if X_train[feature].n_unique() == 2
]
cat_features = [feature for feature in string_columns if feature not in bool_features]

# Numeric columns (floats first, then integers) that contain at least one null.
# The numeric selection is computed once and each column is checked directly,
# instead of selecting twice and transposing a one-row mask frame.
numeric_frame = X_train.select(pl.col(pl.FLOAT_DTYPES), pl.col(pl.INTEGER_DTYPES))
numeric_features_with_nulls = [
    feature
    for feature in numeric_frame.columns
    if numeric_frame[feature].is_null().any()
]

In [4]:
# Per-column imputers for the multi-valued categorical features.
# NOTE(review): presumably NotInImputerPolars replaces values seen fewer than
# `min_values` times with `fill_value` - confirm against auxiliary.transformers.
cat_imputers = tr.PolarsColumnTransformer([])
for column in cat_features:
    cat_imputers.steps[column] = PCT.Step(
        column,
        tr.NotInImputerPolars(min_values=100, fill_value="other"),
        column,
    )

# Per-column encoders: one-hot for two-valued string columns,
# TargetMeanOrderedLabeler for the remaining categorical columns.
encoders = tr.PolarsColumnTransformer([])
for column in bool_features:
    encoders.steps[column] = PCT.Step(
        column, tr.PolarsOneHotEncoder(drop=True), column
    )
for column in cat_features:
    encoders.steps[column] = PCT.Step(
        column, tr.TargetMeanOrderedLabeler(how="label"), column
    )

# Feature remover starts with an empty list of features.
feature_remover = tr.FeatureRemover([])

# Assemble the preprocessing pipeline in its execution order.
preprocessing = Pipeline(
    [
        ("cat_imputers", cat_imputers),
        ("encoders", encoders),
        ("feature_removal", feature_remover),
    ]
)

In [5]:
# Build one imputer step per numeric column that contains nulls.
num_imputer = tr.PolarsColumnTransformer([])
for feature in numeric_features_with_nulls:
    num_imputer.steps[feature] = PCT.Step(
        feature, tr.NumDiffFromRestImputer(), feature
    )

# Put the numeric imputer at the front of the preprocessing pipeline.
# Any "num_imputer" step left over from a previous run of this cell is removed
# first: without that, re-running the cell stacks duplicate steps and the
# pipeline fails at fit time because step names must be unique.
preprocessing.steps[:] = [
    step for step in preprocessing.steps if step[0] != "num_imputer"
]
preprocessing.steps.insert(0, ("num_imputer", num_imputer))

In [6]:
# LightGBM classifier (single job, silenced logging, force_col_wise enabled)
# chained after the preprocessing pipeline.
model_lgb = LGBMClassifier(n_jobs=1, verbosity=-1, force_col_wise=True)
full_pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("model", model_lgb),
    ]
)

In [7]:
from imblearn.over_sampling import SMOTE

In [8]:
# Seeded SMOTE sampler so that resampling is reproducible across kernel restarts;
# uses the same seed as the SMOTE instance passed to `make_pipeline` in this notebook.
sm = SMOTE(random_state=42)

In [9]:
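# Disabled experiment: adding SMOTE as a step of the preprocessing pipeline.
# Resampling is presumably handled on the model side instead (see the
# SamplingModelWrapper / make_pipeline cells below) - confirm before re-enabling.
# preprocessing.steps.append(("smote",sm))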

In [31]:
# Display the assembled preprocessing pipeline to inspect its steps.
preprocessing

In [39]:
# Wrap the LightGBM classifier in the project's sampling wrapper, selecting "smote".
# NOTE(review): presumably the wrapper resamples the training data before fitting
# the model - confirm against auxiliary.transformers.SamplingModelWrapper.
model_pipe = tr.SamplingModelWrapper(
    model_lgb,
    "smote",
)

In [40]:
# Fit the preprocessing pipeline on the training split and train the wrapped
# model on the transformed features.
model_pipe.fit(
    preprocessing.fit_transform(X_train, y_train),
    y_train,
)

In [24]:
# Alternative resampling setup: an imblearn pipeline that applies seeded SMOTE
# before the classifier.
# NOTE(review): `pipe` is not used in the cells shown here, and it holds the same
# `model_lgb` object that is also passed to `tr.SamplingModelWrapper` - fitting
# one may affect the other unless the wrapper clones its estimator; confirm.
pipe=make_pipeline(SMOTE(random_state=42), model_lgb)


In [41]:
# Sum of the predicted labels on the training split (the number of predicted
# positives, assuming 0/1 labels - TODO confirm).
# The preprocessing pipeline was already fitted when the model was trained, so
# only `transform` is applied here; calling `fit_transform` again would refit
# every preprocessing step at prediction time, which is wasteful and would leak
# target information if the same pattern were reused on held-out data.
model_pipe.predict(preprocessing.transform(X_train)).sum()

5256