In [None]:
import os,warnings,gc
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedGroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb
%xmode Minimal

I suppressed the outputs because I ran into an OOM (out-of-memory) exception after submitting. You can uncomment the output lines if you need them.

In [None]:
# Polars display options: 2 digits of float precision, "full" float formatting,
# and 4 rows per printed table (keeps the rendered cell outputs small).
pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")
pl.Config.set_tbl_rows(4)

In [None]:
# Input locations under /kaggle/input: the train and test parquet directories and
# the feature definitions CSV.
path_to_train = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train"
path_to_test = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test"
path_to_features = "/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv"

In [None]:
# Load the feature definitions CSV; kept for reference/inspection (uncomment the
# line below to preview it).
feat_df = pl.read_csv(path_to_features)
# feat_df.head()

In [None]:
def _index_parquet_paths(paths: list) -> pl.DataFrame:
    """Build a lookup table for a list of parquet file paths.

    Returns a frame with three columns, sorted by ``filename``:
      * ``index``    - position of the path in ``paths`` (i.e. glob order),
      * ``path``     - the full path,
      * ``filename`` - the last "/"-separated component of the path.

    The schema is given explicitly so that an empty ``paths`` list still yields
    a string-typed ``path`` column that ``.str.split`` can operate on.
    """
    return (
        pl.DataFrame(
            {"index": range(len(paths)), "path": paths},
            schema={"index": pl.Int64, "path": pl.String},
        )
        .with_columns(
            pl.col("path").str.split("/").list.get(-1).alias("filename")
        )
        .sort(by="filename")
    )


all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
# Same indexing logic for both splits; `preprocess` filters these tables by filename.
train_files_df = _index_parquet_paths(all_train_files)
test_files_df = _index_parquet_paths(all_test_files)
display(train_files_df.head())
display(test_files_df.head())

In [None]:
# Expressions shared by the train and test base tables.
base_case_id = pl.col("case_id").cast(pl.UInt32).alias("case_id_base")
# Every column whose name contains "date" is parsed to a Date and exposed as "Date".
base_date = cs.contains("date").str.to_date().alias("Date")
# Calendar features derived from "Date"; the year is stored as an offset from 2019.
calendar_features = [
    pl.col("Date").dt.month().alias("month"),
    pl.col("Date").dt.week().alias("week"),
    pl.col("Date").dt.weekday().alias("weekday"),
    (pl.col("Date").dt.year() - 2019).alias("year"),
]

train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(base_case_id, base_date, pl.col("target").cast(pl.UInt8))
    .with_columns(*calendar_features)
    # Move the target column(s) to the end so the layout matches test_base.
    .select(~cs.contains("target"),cs.contains("target"))
)
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(base_case_id, base_date)
    .with_columns(
        *calendar_features,
        # Test rows carry a placeholder target of 0 so both frames can be stacked.
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)

# Remember which case ids belong to which split before the frames are merged.
train_case_ids = train_base["case_id_base"].to_numpy()
test_case_ids = test_base["case_id_base"].to_numpy()

total_base = pl.concat([train_base, test_base], how="vertical")

del train_base,test_base
gc.collect()
total_base

In [None]:
def reduce_dtypes(df:pl.DataFrame):
    """Cast columns of `df` to compact dtypes chosen from their names.

    * ``case_id``                          -> UInt32
    * names containing ``num_group``       -> UInt16
    * names ending in ``D``                -> Date
    * names ending in ``T`` or ``M``       -> String
    * names ending in ``P`` or ``A``       -> Float32
    * numeric columns ending in ``L``      -> Float32

    Columns matching none of the rules are left unchanged.
    """
    casts = [
        cs.by_name("case_id").cast(pl.UInt32),
        cs.contains("num_group").cast(pl.UInt16),
        cs.ends_with("D").cast(pl.Date),
        cs.ends_with("T","M").cast(pl.String),
        cs.ends_with("P","A").cast(pl.Float32),
        (cs.ends_with("L") & cs.numeric()).cast(pl.Float32),
    ]
    return df.with_columns(*casts)

def grouping(df):
    """Aggregate `df` per ``case_id``.

    Numeric columns are reduced with ``max``; every other column is reduced to
    the first entry of the mode of its non-null values.
    """
    numeric_max = cs.numeric().max()
    other_mode = (~cs.numeric()).drop_nulls().mode().first()
    per_case = df.group_by("case_id")
    return per_case.agg(numeric_max, other_mode)

def preprocess(filter_string:str,prefix_string:str):
    """Load, aggregate and stack the train and test parquet files of one table.

    Args:
        filter_string: pattern matched (via ``str.contains``) against the
            ``filename`` column of ``train_files_df`` / ``test_files_df``.
        prefix_string: prefix prepended (with ``_``) to every output column
            except ``case_id``.

    Returns:
        One frame holding the per-``case_id`` aggregates of the matching train
        files followed by those of the matching test files.
    """
    def matching_paths(files_df):
        # Paths of the files whose filename matches `filter_string`.
        is_match = pl.col("filename").str.contains(filter_string)
        return files_df.filter(is_match)["path"].to_list()

    train_paths = matching_paths(train_files_df)
    test_paths = matching_paths(test_files_df)

    with pl.StringCache():
        train_df = pl.concat(
            [
                pl.read_parquet(path).pipe(reduce_dtypes).pipe(grouping)
                for path in train_paths
            ]
        )
        # Test files are forced onto the aggregated train columns/schema before
        # being aggregated themselves.
        test_df = pl.concat(
            [
                pl.read_parquet(path)
                .select(train_df.columns)
                .cast(train_df.schema)
                .pipe(grouping)
                for path in test_paths
            ]
        )

    stacked = pl.concat([train_df, test_df], how="vertical_relaxed")
    renamed = {
        name: f"{prefix_string}_{name}"
        for name in stacked.columns
        if name != 'case_id'
    }
    return stacked.rename(renamed)

def select_low_catcols(df:pl.DataFrame,thresh=200):
    """Drop categorical columns whose ``value_counts`` has more than `thresh` rows."""
    too_many_levels = [
        name
        for name in df.select(cs.categorical()).columns
        if df.select(pl.col(name).value_counts()).shape[0] > thresh
    ]
    return df.select(~cs.by_name(too_many_levels))
    
def select_low_cat_cols(df:pl.DataFrame,thresh=200):
    """Drop Categorical columns that have `thresh` or more unique entries."""
    categorical_names = df.select(pl.col(pl.Categorical)).columns
    high_cardinality = [
        name
        for name in categorical_names
        if df.select(pl.col(name).unique()).shape[0] >= thresh
    ]
    return df.drop(high_cardinality)

In [None]:
# def select_impuatable(df:pl.DataFrame,thresh=0.95):
#     cols =  (
#         df
#         .select(pl.all().is_null().mean())
#         .transpose(include_header=True)
#         .filter(pl.col("column_0") < thresh)
#         ["column"].to_list()
#     )
#     return df.select(cols)

In [None]:
def _risk_range_midpoint(parts):
    """Collapse a split risk-assessment string into one number.

    `parts` holds the first two "%"-separated pieces of the original string.
    The result is (int(parts[0]) + int(text after "-" in parts[1])) / 200.
    NOTE(review): presumably the raw values look like "3% - 4%", which would
    make this the midpoint of the range as a fraction - confirm against the data.
    """
    return (int(parts[0]) + int(parts[1].split("-")[1])) / 200


# Each call loads every train/test file whose name matches the first argument,
# aggregates it per case_id and prefixes its columns with the second argument.
total_past_shallow = preprocess("applprev_1", "past_shallow")
# display(total_past_shallow)

total_past_depth = preprocess("applprev_2", "past_depth")
# display(total_past_depth)

total_static_base = preprocess("static_0","static_base")
# display(total_static_base)

total_static_external = (
    preprocess("static_cb", "static_external")
    .with_columns(
        # Replaces the string column in place with its numeric encoding.
        # `map_elements` is the current name of the deprecated `Expr.apply`;
        # the return dtype is declared so polars does not have to infer it.
        pl.col("static_external_riskassesment_302T")
        .str.split("%")
        .list.gather([0, 1])
        .map_elements(_risk_range_midpoint, return_dtype=pl.Float64)
    )
)
# display(total_static_external)

total_person_shallow = preprocess("person_1", "person_shallow")
# display(total_person_shallow)

total_person_depth = preprocess("person_2", "person_depth")
# display(total_person_depth)

total_other_shallow = preprocess("other_1", "other_shallow")
# display(total_other_shallow)

total_deposit_shallow = preprocess("deposit_1", "deposit_shallow")
# display(total_deposit_shallow)

total_debitcard_shallow = preprocess("debitcard", "card_shallow")
# display(total_debitcard_shallow)

total_credit_internal_shallow = preprocess("bureau_a_1", "int_shallow")
# display(total_credit_internal_shallow)

total_credit_internal_depth = preprocess("bureau_a_2", "int_depth")
# display(total_credit_internal_depth)

total_credit_external_shallow = preprocess("bureau_b_1", "ext_shallow")
# display(total_credit_external_shallow)

total_credit_external_depth = preprocess("bureau_b_2", "ext_depth")
# display(total_credit_external_depth)

total_registry_a = preprocess("registry_a", "reg_a")
# display(total_registry_a)

total_registry_b = preprocess("registry_b", "reg_b")
# display(total_registry_b)

total_registry_c = preprocess("registry_c", "reg_c")
# display(total_registry_c)


In [None]:
# Optional gain-based feature pruning (needs the external "gain-files" dataset).
# `drop_list` is passed to `.drop(drop_list)` when `total_df` is assembled, so it
# must always exist: by default no columns are dropped. Uncomment the lines below
# to load the gain tables and overwrite the default with the low-gain columns.
drop_list = []
# gain_df = pl.read_csv("/kaggle/input/gain-files/gains.csv")
# new_filled_gains_df = pl.read_csv("/kaggle/input/gain-files/new_filled_gains.csv")
# new_gains_df = pl.read_csv("/kaggle/input/gain-files/new_gains.csv")
# drop_list = new_gains_df.filter(pl.col("gain") < 5000)["column_name"].to_list()
# print("Length of drop lsit: ",len(drop_list))

In [None]:
# Every feature table is left-joined onto the base table: the base key is
# "case_id_base", each feature table is keyed by "case_id".
join_on = dict(left_on="case_id_base", right_on="case_id", how="left")

total_df = (
    total_base
    .join(total_past_shallow, **join_on)
    .join(total_past_depth, **join_on)
    .join(total_static_base, **join_on)
    .join(total_static_external, **join_on)
    .join(total_person_shallow, **join_on)
    .join(total_person_depth, **join_on)
    .join(total_other_shallow, **join_on)
    .join(total_deposit_shallow, **join_on)
    .join(total_debitcard_shallow, **join_on)
    .join(total_credit_internal_shallow, **join_on)
    .join(total_credit_internal_depth, **join_on)
    .join(total_credit_external_shallow, **join_on)
    .join(total_credit_external_depth, **join_on)
    .join(total_registry_a, **join_on)
    .join(total_registry_b, **join_on)
    .join(total_registry_c, **join_on)
    .drop(drop_list)
    .with_columns(
        # Date columns become a day offset relative to the base "Date" column.
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.String).cast(pl.Categorical)
    )
    .drop("Date")
#     .pipe(select_low_cat_cols)
)

# Column name lists, excluding the id and the target: categorical/boolean/integer
# columns versus everything else.
id_and_target = ["case_id_base","target"]
cat_cols = (
    total_df
    .select((cs.categorical() | cs.boolean() | cs.integer()))
    .drop(id_and_target)
    .columns
)
cont_cols = total_df.drop(cat_cols).drop(id_and_target).columns

# Swap categorical values for their physical integer codes.
total_df = total_df.with_columns(pl.col(pl.Categorical).to_physical())
total_df

In [None]:
#     .with_columns(
#         pl.col(pl.INTEGER_DTYPES).fill_null(strategy="mean"),
#         pl.col(pl.FLOAT_DTYPES).fill_null(strategy="mean"),
#         pl.col(pl.Boolean).fill_null(value=pl.col(pl.Boolean).drop_nulls().mode().first()),
#         pl.col(pl.Categorical).fill_null(value=pl.col(pl.Categorical).drop_nulls().mode().first())
#     )
#     .with_columns(
#         pl.col(pl.Categorical).to_physical()
#     )
# total_df = (
#     total_df
#     .with_columns(
#         (pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]) - pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min())/(pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).max()-pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min()).cast(pl.Float32),
#         (pl.col(pl.FLOAT_DTYPES) - pl.col(pl.FLOAT_DTYPES).mean())/(pl.col(pl.FLOAT_DTYPES).std()).cast(pl.Float32)
#     )
# )

In [None]:
# Split the combined frame back into its train and test parts using the case
# ids recorded before the base tables were concatenated.
in_train = pl.col("case_id_base").is_in(train_case_ids)
in_test = pl.col("case_id_base").is_in(test_case_ids)

# The id is not used for training, so it is dropped from the train part only.
train_total = total_df.filter(in_train).drop("case_id_base")
# The submission part keeps the id; it is needed when the submission is written.
submission_df = total_df.filter(in_test)

# Null count per training column, smallest first.
(
    train_total
    .null_count()
    .transpose(include_header=True,header_name="col name",column_names=["Nan count"])
    .sort(by="Nan count")
)

In [None]:
# Drop the names of all intermediate tables and run a garbage-collection pass
# before training starts.
del (
    total_df,
    train_files_df,
    test_files_df,
    total_base,
    total_past_shallow,
    total_past_depth,
    total_static_base,
    total_static_external,
    total_person_depth,
    total_person_shallow,
    total_other_shallow,
    total_deposit_shallow,
    total_debitcard_shallow,
    total_credit_external_depth,
    total_credit_external_shallow,
    total_credit_internal_depth,
    total_credit_internal_shallow,
    total_registry_a,
    total_registry_b,
    total_registry_c,
)
gc.collect()

In [None]:
# Keyword arguments for the lgb.LGBMClassifier fold models.
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.01,
    "n_estimators": 8000,
    "max_bin": 255,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.15,
    "reg_lambda": 15,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

# Keyword arguments for the lgb.LGBMRegressor fold models. Note the objective is
# still "binary": the regressor wrapper is trained on the same 0/1 target.
reg_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 64,
    "learning_rate": 0.02,
    "n_estimators": 6000,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 128,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

# Keyword arguments for the xgb.XGBClassifier fold models.
# NOTE(review): sample_type / normalize_type / rate_drop / skip_drop look like
# DART-booster options while "booster" is "gbtree" - confirm they have any effect.
# NOTE(review): "updater" is set explicitly alongside "device": "cuda" - confirm
# this combination is valid for the installed XGBoost version.
xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 5000,
    "eval_metric": "auc",
    "seed": 420,
    "booster": "gbtree",
    "device": "cuda",
    "eta": 0.02,
    "gamma": 5,
    "max_depth": 64,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "lambda": 10,
    "alpha": 2,
    "updater": "grow_gpu_hist",
    "grow_policy": "depthwise",
    "max_leaves": 256,
    "num_parallel_tree": 1,    
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 0.15,
    "skip_drop": 0.9,
    "enable_categorical": True
}

# Keyword arguments for the cgb.CatBoostClassifier fold models
# ("od_type"/"od_wait" configure CatBoost's own overfitting detector).
cat_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": 8000,
    "learning_rate": 0.005,
    "bootstrap_type": "Poisson",
    "random_seed": 420,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 16,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 300,
    "verbose": 500
}

In [None]:
# train_total,valid_df = train_test_split(
#     train_total,
#     test_size=0.1,
#     random_state=533,
#     shuffle=True,
#     stratify=train_total.select("target")
# )
# gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    """Return the rows of `df` whose 0-based row position appears in `indexes`.

    A temporary "index" column is added for the lookup and removed again, so
    the result has the same columns as `df`; rows keep their original order.
    """
    numbered = df.with_row_index()
    wanted = pl.col("index").is_in(indexes)
    return numbered.filter(wanted).drop("index")

In [None]:
num_splits = 5
# Fixed random_state: with shuffle=True the fold assignment would otherwise
# change on every run, making validation scores impossible to compare.
cv = StratifiedGroupKFold(n_splits=num_splits,shuffle=True,random_state=420)

# One untrained model of each family per fold.
lgb_reg_models = [
    lgb.LGBMRegressor(**reg_params) for _ in range(num_splits)
]
lgb_clf_models = [
    lgb.LGBMClassifier(**class_params) for _ in range(num_splits)
]
xgb_clf_models = [
    xgb.XGBClassifier(**xgb_params) for _ in range(num_splits)
]
cat_models = [
    cgb.CatBoostClassifier(**cat_params) for _ in range(num_splits)
]

# Folds are stratified on the target and grouped by calendar week, so all rows
# of one week land in the same fold.
for i,(train_ind,valid_ind) in enumerate(
    cv.split(
        train_total.drop("target"),
        train_total["target"].to_numpy(),
        groups=train_total["week"].to_numpy(),
    )
):
    # Materialise the fold's four subsets once instead of re-filtering
    # `train_total` for every model.
    X_train = train_total.drop("target").pipe(filter_ind,train_ind)
    y_train = train_total.select("target").pipe(filter_ind,train_ind)
    X_valid = train_total.drop("target").pipe(filter_ind,valid_ind)
    y_valid = train_total.select("target").pipe(filter_ind,valid_ind)

    print(f"Training start for LGBRegressor: {i+1}")
    lgb_reg_models[i].fit(
        X_train,
        y_train.to_numpy(),
        eval_set=[(X_valid, y_valid.to_numpy())],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(500)]
    )
    print(gc.collect())

    print(f"Training start for LGBClassifier: {i+1}")
    lgb_clf_models[i].fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(1000)]
    )
    print(gc.collect())

    print(f"Training start for XGBClassifier: {i+1}")
    # Fresh callback objects per fold.
    # NOTE(review): newer XGBoost releases may expect callbacks on the estimator
    # constructor rather than on fit() - confirm against the installed version.
    early_stop = xgb.callback.EarlyStopping(rounds=100)
    log_eval = xgb.callback.EvaluationMonitor(period=500)
    xgb_clf_models[i].fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[early_stop,log_eval],
        verbose=False
    )
    print(gc.collect())

    print(f"Training start for CatBoostClassifier: {i+1}")
    # CatBoost is fed pandas objects; release the polars copies before fitting
    # so both representations are not held in memory during training.
    X_train_pd, y_train_pd = X_train.to_pandas(), y_train.to_pandas()
    X_valid_pd, y_valid_pd = X_valid.to_pandas(), y_valid.to_pandas()
    del X_train, y_train, X_valid, y_valid
    cat_models[i].fit(
        X_train_pd,
        y_train_pd,
        eval_set=[(X_valid_pd, y_valid_pd)]
    )
    del X_train_pd, y_train_pd, X_valid_pd, y_valid_pd
    print(gc.collect())

In [None]:
class Model:
    """Weighted-average ensemble over the per-fold models of four families.

    Every model contributes one vector of positive-class scores. The final
    score is the weighted mean of all vectors, where each model's weight is
    the weight of its family (see the ``*_WEIGHT`` constants).
    """

    # Relative weight of a single model of each family in the blend.
    LGB_CLF_WEIGHT = 3.5
    LGB_REG_WEIGHT = 1.0
    XGB_CLF_WEIGHT = 1.5
    CAT_WEIGHT = 1.5

    def __init__(
            self,
            lgb_reg_models:list,
            lgb_clf_models:list,
            xgb_clf_models:list,
            cat_models:list
            ) -> None:
        self.lgb_clf_models = lgb_clf_models
        self.lgb_reg_models = lgb_reg_models
        self.xgb_clf_models = xgb_clf_models
        self.cat_models =  cat_models
    
    def predict_proba(self,X):
        """Return the blended positive-class score for every row of `X`.

        Args:
            X: feature table accepted by the LightGBM/XGBoost models; it must
                offer ``to_pandas()`` when CatBoost models are present, since
                those are fed the pandas conversion.

        Returns:
            1-D numpy array with one score per row.

        Raises:
            ValueError: if the ensemble contains no models at all.

        The raw per-model predictions are kept on the instance
        (``lgb_clf_pred``, ``lgb_reg_pred``, ``xgb_clf_pred``, ``cat_pred``).
        """
        self.lgb_clf_pred = [m.predict_proba(X)[:,1] for m in self.lgb_clf_models]
        self.lgb_reg_pred = [m.predict(X) for m in self.lgb_reg_models]
        self.xgb_clf_pred = [m.predict_proba(X)[:,1] for m in self.xgb_clf_models]
        # Convert once for all CatBoost models instead of once per model.
        X_pandas = X.to_pandas() if self.cat_models else None
        self.cat_pred = [m.predict_proba(X_pandas)[:,1] for m in self.cat_models]

        # A Python list cannot be multiplied by a float, so the family weights
        # are applied through a proper weighted average instead.
        weighted_families = (
            (self.lgb_clf_pred, self.LGB_CLF_WEIGHT),
            (self.lgb_reg_pred, self.LGB_REG_WEIGHT),
            (self.xgb_clf_pred, self.XGB_CLF_WEIGHT),
            (self.cat_pred, self.CAT_WEIGHT),
        )
        all_preds, all_weights = [], []
        for family_preds, family_weight in weighted_families:
            all_preds.extend(family_preds)
            all_weights.extend([family_weight] * len(family_preds))

        if not all_preds:
            raise ValueError("Model contains no sub-models to predict with")

        return np.average(
            np.asarray(all_preds, dtype=float),
            axis=0,
            weights=all_weights
            )

In [None]:
# model = Model(
#     lgb_reg_models,
#     lgb_clf_models,
#     xgb_clf_models,
#     cat_models
#     )
# n_chunks = 5
# l = len(valid_df)
# chunk_size = l//n_chunks
# predictions = []
# for _ in range(n_chunks):
#     if _ < n_chunks-1:
#         predictions.append(model.predict_proba(valid_df.drop("target").slice(_*chunk_size,chunk_size)))
#     else:
#         predictions.append(model.predict_proba(valid_df.drop("target").slice(_*chunk_size,l-(_ * chunk_size))))    gc.collect()

# y_pred = np.concatenate(predictions)
# roc_auc_score(valid_df.select('target'),y_pred)

In [None]:
model = Model(
    lgb_reg_models,
    lgb_clf_models,
    xgb_clf_models,
    cat_models
    )

# Predict the submission rows in at most `n_chunks` pieces.
n_chunks = 5
l = len(submission_df)
# Ceil division with a floor of 1: a test set with fewer rows than `n_chunks`
# would otherwise get a chunk size of 0 and send empty frames to the models.
chunk_size = max(1, -(-l // n_chunks))
# Drop the non-feature columns once rather than once per chunk.
submission_features = submission_df.drop(["case_id_base","target"])
predictions = []
for start in range(0, l, chunk_size):
    # `slice` clips at the end of the frame, so the last chunk may be shorter.
    predictions.append(model.predict_proba(submission_features.slice(start,chunk_size)))
    gc.collect()

# An empty submission frame yields an empty score vector instead of a crash.
y_pred = np.concatenate(predictions) if predictions else np.empty(0)

In [None]:
# def gini_stability(base, model,w_fallingrate=88.0, w_resstd=-0.5):
#     base = base.to_pandas()
#     base["score"] = model.predict_proba(valid_df.drop('target'))
#     gini_in_time = base.loc[:, ["weekday", "target", "score"]]\
#         .sort_values("weekday")\
#         .groupby("weekday")[["target", "score"]]\
#         .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
#     x = np.arange(len(gini_in_time))
#     y = gini_in_time
#     a, b = np.polyfit(x, y, 1)
#     y_hat = a*x + b
#     residuals = y - y_hat
#     res_std = np.std(residuals)
#     avg_gini = np.mean(gini_in_time)
#     return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

# gini_stability(valid_df,model)

In [None]:
# Assemble the submission table (case id -> blended score), indexed by case id,
# and write it as CSV to the working directory.
submission_case_ids = submission_df["case_id_base"].to_list()
sub_df = pd.DataFrame({"case_id": submission_case_ids, "score": y_pred})
sub_df = sub_df.set_index("case_id")
sub_df.to_csv("./submission.csv")
sub_df