In [1]:
import os,warnings,gc
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.model_selection import StratifiedGroupKFold,train_test_split
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb

In [2]:
# Notebook-wide polars display options, applied as one chain (each setter
# returns the Config class): float precision 2, "full" float formatting,
# 10 table rows, and string cells up to 200 characters long.
(
    pl.Config
    .set_float_precision(2)
    .set_fmt_float("full")
    .set_tbl_rows(10)
    .set_fmt_str_lengths(200)
)

polars.config.Config

In [3]:
# Directories holding the train / test parquet files. The defaults keep the
# original local layout; set CREDIT_RISK_TRAIN_DIR / CREDIT_RISK_TEST_DIR to
# run the notebook on another machine without editing this cell.
path_to_train = os.environ.get("CREDIT_RISK_TRAIN_DIR", "/home/sohail/Downloads/credit_risk/train/")
path_to_test = os.environ.get("CREDIT_RISK_TEST_DIR", "/home/sohail/Downloads/credit_risk/test/")

In [4]:
def _files_frame(paths):
    """Build a listing frame for a list of parquet paths.

    The frame has an ``index`` column (position of the path in ``paths``), the
    ``path`` itself and a ``filename`` column holding the last "/"-separated
    component of the path. Rows are sorted by ``filename``.
    """
    listing = pl.DataFrame({"index":range(len(paths)),"path":paths})
    filename = pl.col("path").str.split("/").list.get(-1).alias("filename")
    return listing.with_columns(filename).sort(by="filename")


all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = _files_frame(all_train_files)
test_files_df = _files_frame(all_test_files)
display(train_files_df.head())
display(test_files_df.head())

index,path,filename
i64,str,str
31,"""/home/sohail/Downloads/credit_risk/train/train_applprev_1_0.parquet""","""train_applprev_1_0.parquet"""
13,"""/home/sohail/Downloads/credit_risk/train/train_applprev_1_1.parquet""","""train_applprev_1_1.parquet"""
0,"""/home/sohail/Downloads/credit_risk/train/train_applprev_2.parquet""","""train_applprev_2.parquet"""
7,"""/home/sohail/Downloads/credit_risk/train/train_base.parquet""","""train_base.parquet"""
11,"""/home/sohail/Downloads/credit_risk/train/train_credit_bureau_a_1_0.parquet""","""train_credit_bureau_a_1_0.parquet"""


index,path,filename
i64,str,str
31,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_0.parquet""","""test_applprev_1_0.parquet"""
35,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_1.parquet""","""test_applprev_1_1.parquet"""
9,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_2.parquet""","""test_applprev_1_2.parquet"""
1,"""/home/sohail/Downloads/credit_risk/test/test_applprev_2.parquet""","""test_applprev_2.parquet"""
33,"""/home/sohail/Downloads/credit_risk/test/test_base.parquet""","""test_base.parquet"""


In [5]:
# Column expressions shared by the train and test base tables.
_base_columns = [
    pl.col("case_id").cast(pl.UInt64),
    cs.contains("date").str.to_date().alias("Date"),
    pl.col("WEEK_NUM").cast(pl.UInt8).alias("weeknum"),
]
# Calendar parts derived from "Date"; "year" is stored as an offset from 2018.
_calendar_parts = [
    pl.col("Date").dt.month().alias("month"),
    pl.col("Date").dt.weekday().alias("weekday"),
    pl.col("Date").dt.week().alias("week"),
    (pl.col("Date").dt.year() - 2018).alias("year"),
]
# Final column order: everything else first, "target" last.
_target_last = [~cs.contains("target"),cs.contains("target")]

train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(*_base_columns,pl.col("target").cast(pl.UInt8))
    .with_columns(*_calendar_parts)
    .select(*_target_last)
)
# The test base table is given a constant placeholder target of 0 so that it
# has the same columns as the train table and can be stacked under it.
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(*_base_columns)
    .with_columns(*_calendar_parts,pl.lit(0).cast(pl.UInt8).alias("target"))
    .select(*_target_last)
)
train_case_ids = train_base.get_column("case_id")
test_case_ids = test_base.get_column("case_id")

# Train rows first, test rows after them.
total_df = pl.concat([train_base,test_base],how="vertical")
del train_base,test_base
gc.collect()
total_df

case_id,Date,weeknum,month,weekday,week,year,target
u64,date,u8,i8,i8,i8,i32,u8
0,2019-01-03,0,1,4,1,1,0
1,2019-01-03,0,1,4,1,1,0
2,2019-01-04,0,1,5,1,1,0
3,2019-01-03,0,1,4,1,1,0
4,2019-01-04,0,1,5,1,1,1
…,…,…,…,…,…,…,…
57630,2020-10-06,92,10,2,41,2,0
57631,2020-10-06,92,10,2,41,2,0
57632,2020-10-06,92,10,2,41,2,0
57633,2020-10-06,92,10,2,41,2,0


In [6]:
# NOTE: lgb_imp.parquet / cat_imp.parquet are the files written by the
# training cell at the end of this notebook, so they must already exist from
# an earlier run before this cell can execute.
def _mean_importance(parquet_path,alias):
    """Read an importance file and average its numeric columns row-wise."""
    return (
        pl.read_parquet(parquet_path)
        .select(pl.col("index"),pl.mean_horizontal(pl.col(pl.NUMERIC_DTYPES)).alias(alias))
    )


# "_"-separated tokens of the feature name, e.g.
# "month_intshallow_min_lastupdate_388D" ->
#   date_type="month", prefix_string="intshallow", type="min",
#   col_name="lastupdate_388D".
# Names without a leading year/week/month token start directly at the prefix.
_name_parts = pl.col("index").str.split("_")
_has_date_prefix = _name_parts.list.get(0).str.contains_any(["year","week","month"])

imp_df = (
    _mean_importance("/home/sohail/Downloads/lgb_imp.parquet","mean_lgb")
    .join(
        _mean_importance("/home/sohail/Downloads/cat_imp.parquet","mean_cat"),
        on="index"
    )
    # Keep features that clear both importance thresholds and whose name has
    # more than two "_"-separated tokens.
    .filter(
        (pl.col("mean_lgb") > 5000) & (pl.col("mean_cat") > 0.01) & (_name_parts.list.lengths() > 2)
    )
    .with_columns(
        pl.when(_has_date_prefix)
        .then(_name_parts.list.get(1))
        .otherwise(_name_parts.list.get(0))
        .alias("prefix_string"),
        pl.when(_has_date_prefix)
        .then(_name_parts.list.get(2))
        .otherwise(_name_parts.list.get(1))
        .alias("type"),
        pl.when(_has_date_prefix)
        .then(_name_parts.list.slice(3).list.join("_"))
        .otherwise(_name_parts.list.slice(2).list.join("_"))
        .alias("col_name"),
        pl.when(_has_date_prefix)
        .then(_name_parts.list.get(0))
        .otherwise(pl.lit("none"))
        .alias("date_type"),
    )
)

In [7]:
# Per-table renames: columns whose name ends in "T" are given a "D" suffix
# instead, keyed by the table prefix they apply to.
_DATECOL_RENAMES = {
    "intshallow": {
        "dpdmaxdatemonth_442T":"dpdmaxdatemonth_442D",
        "dpdmaxdatemonth_89T":"dpdmaxdatemonth_89D",
        "dpdmaxdateyear_596T":"dpdmaxdateyear_596D",
        "dpdmaxdateyear_896T":"dpdmaxdateyear_896D",
        "overdueamountmaxdatemonth_284T":"overdueamountmaxdatemonth_284D",
        "overdueamountmaxdatemonth_365T":"overdueamountmaxdatemonth_365D",
        "overdueamountmaxdateyear_2T":"overdueamountmaxdateyear_2D",
        "overdueamountmaxdateyear_994T":"overdueamountmaxdateyear_994D",
    },
    "intdepth": {
        "pmts_month_158T":"pmts_month_158D",
        "pmts_month_706T":"pmts_month_706D",
        "pmts_year_1139T":"pmts_year_1139D",
        "pmts_year_507T":"pmts_year_507D",
    },
    "extshallow": {
        "dpdmaxdatemonth_804T":"dpdmaxdatemonth_804D",
        "dpdmaxdateyear_742T":"dpdmaxdateyear_742D",
        "overdueamountmaxdatemonth_494T":"overdueamountmaxdatemonth_494D",
        "overdueamountmaxdateyear_432T":"overdueamountmaxdateyear_432D",
    },
}


def rename_datecols(df:pl.DataFrame,name:str):
    """Rename the "T"-suffixed date-part columns of one table to a "D" suffix.

    Args:
        df: Frame read from one of the source parquet files.
        name: Table prefix ("intshallow", "intdepth" or "extshallow" have
            renames; any other value leaves the frame untouched).

    Returns:
        The renamed frame, or ``df`` itself when ``name`` has no renames.
    """
    mapping = _DATECOL_RENAMES.get(name)
    if mapping is None:
        return df
    return df.rename(mapping)
    
def staticasssement_compute(df:pl.DataFrame,name:str):
    """Turn the "302T" string column(s) of the staticexternal table into floats.

    Each value is split on "%"; the first piece and the second space-separated
    token of the (stripped) second piece are parsed as integers, summed and
    divided by 200.
    NOTE(review): presumably the raw values look like "2% - 4%" and the result
    is the range midpoint as a fraction — confirm against the data.

    Args:
        df: Aggregated frame for one table.
        name: Table prefix; only "staticexternal" is transformed.

    Returns:
        ``df`` with the matching columns replaced by Float32 values, or ``df``
        unchanged for any other table.
    """
    if name != "staticexternal":
        return df

    def _parse_pair(parts):
        # parts[0] is the text before the first "%", parts[1] the text between
        # the first and second "%".
        first = int(parts[0])
        second = int(parts[1].strip().split(" ")[1])
        return (first + second)/200

    pieces = cs.contains("302T").str.split("%").list.gather([0,1])
    return df.with_columns(
        pieces.map_elements(_parse_pair,return_dtype=pl.Float32)
    )

def convert_dtype(df:pl.DataFrame):
    """Select the recognised columns of a raw table and cast them by suffix.

    Columns are picked by name pattern and cast as follows:
      * ``case_id`` -> UInt64, anything containing ``num_group`` -> UInt16
      * suffix ``D`` -> Date
      * suffix ``T`` / ``M`` and string ``L`` columns -> String
      * integer / float ``L`` columns -> Int32 / Float32
      * ``P`` / ``A`` columns -> UInt32, Int32 or Float32 depending on whether
        they are unsigned, signed or float
      * Boolean columns are passed through.
    Columns matched by none of these selectors are not part of the result.
    """
    flag_cols = cs.ends_with("L")
    amount_cols = cs.ends_with("P","A")
    text_cols = cs.ends_with("T","M") | (flag_cols & cs.string())

    casts = [
        cs.by_name("case_id").cast(pl.UInt64),
        cs.contains("num_group").cast(pl.UInt16),
        cs.ends_with("D").cast(pl.Date),
        text_cols.cast(pl.String),
        (flag_cols & cs.integer()).cast(pl.Int32),
        (flag_cols & cs.float()).cast(pl.Float32),
        (amount_cols & cs.unsigned_integer()).cast(pl.UInt32),
        (amount_cols & cs.signed_integer()).cast(pl.Int32),
        (amount_cols & cs.float()).cast(pl.Float32),
        pl.col(pl.Boolean),
    ]
    return df.select(*casts)


def _important_columns(prefix_string:str,agg_type:str):
    """Sorted, de-duplicated source column names that imp_df lists for one table and one aggregation type."""
    return (
        imp_df
        .filter(
            (pl.col("prefix_string") == prefix_string) & (pl.col("type") == agg_type)
        )
        .unique("col_name")
        .sort(by="col_name")
        ["col_name"].to_list()
    )


def grouping(df:pl.DataFrame,prefix_string:str):
    """Aggregate one table to a single row per ``case_id``.

    Only the columns that ``imp_df`` lists for ``prefix_string`` are
    aggregated, each with the aggregation named in its ``type`` field, and the
    result columns are named ``<type>_<column>``.

    Args:
        df: Table with a ``case_id`` column and the source columns.
        prefix_string: Table prefix used to look the columns up in ``imp_df``.

    Returns:
        Frame with ``case_id`` plus the ``mean_``, ``max_``, ``min_`` and
        ``mode_`` columns.
    """
    return (
        df
        .group_by("case_id")
        .agg(
            cs.by_name(_important_columns(prefix_string,"mean")).mean().prefix("mean_"),
            # The max_/min_ features previously called .mean() as well, so they
            # were just copies of the mean under a different name.
            cs.by_name(_important_columns(prefix_string,"max")).max().prefix("max_"),
            cs.by_name(_important_columns(prefix_string,"min")).min().prefix("min_"),
            # Mode over the non-null values; .first() picks one value when
            # several are tied.
            cs.by_name(_important_columns(prefix_string,"mode")).drop_nulls().mode().first().prefix("mode_"),
        )
    )


In [8]:
def preprocess(filter_string:str,prefix_string:str):
    """Load, aggregate and stack the train and test files of one table.

    Args:
        filter_string: Pattern matched against the parquet file names to pick
            the files belonging to the table.
        prefix_string: Table prefix; selects the renames / aggregations to
            apply and is prepended to every output column except ``case_id``.

    Returns:
        One frame with the aggregated train rows followed by the aggregated
        test rows, string columns replaced by their dense rank, dtypes shrunk
        and columns named ``<prefix_string>_<aggregation>_<column>``.
    """
    name_matches = pl.col("filename").str.contains(filter_string)
    train_paths = train_files_df.filter(name_matches)["path"].to_list()
    test_paths = test_files_df.filter(name_matches)["path"].to_list()

    def _aggregate_train(parquet_path):
        raw = pl.read_parquet(parquet_path)
        return grouping(convert_dtype(rename_datecols(raw,prefix_string)),prefix_string)

    with pl.StringCache():
        train_df = pl.concat(
            [_aggregate_train(train_path) for train_path in train_paths],
            rechunk=False,
            parallel=False
        )

        # Map every aggregated train column back to its source column name by
        # dropping the leading "<aggregation>_" token; the test files are cast
        # to these dtypes so that both halves end up with the same schema.
        test_schema = {}
        for agg_name,dtype in train_df.schema.items():
            if agg_name == "case_id":
                source_name = agg_name
            else:
                source_name = "_".join(agg_name.split("_")[1:])
            test_schema[source_name] = dtype
        test_columns = set(test_schema)

        def _aggregate_test(parquet_path):
            raw = pl.read_parquet(parquet_path)
            typed = rename_datecols(raw,prefix_string).select(test_columns).cast(test_schema)
            return grouping(typed,prefix_string)

        test_df = pl.concat(
            [_aggregate_test(test_path) for test_path in test_paths],
            rechunk=False,
            parallel=False
        )

    stacked = pl.concat([train_df,test_df],how="vertical")
    encoded = (
        stacked
        .pipe(staticasssement_compute,prefix_string)
        # Replace string columns by the dense rank of their categorical encoding.
        .with_columns(pl.col(pl.String).cast(pl.Categorical).rank("dense"))
    )
    return encoded.select(
        pl.col("case_id"),
        pl.all().exclude("case_id").shrink_dtype().prefix(f"{prefix_string}_")
    )

In [9]:
# (file-name pattern, feature prefix) for every source table. Keeping the two
# values side by side makes it impossible for the lists below to drift apart.
# The last pattern used to be a second "registry_b", which made the "regc"
# features a copy of the registry_b table instead of reading registry_c.
_TABLE_PATTERNS = [
    ("applprev_1","pastshallow"),
    ("applprev_2","pastdepth"),
    ("static_0","staticbase"),
    ("static_cb","staticexternal"),
    ("person_1","personshallow"),
    ("person_2","persondepth"),
    ("other_1","othershallow"),
    ("deposit_1","depositshallow"),
    ("debitcard","cardshallow"),
    ("bureau_a_1","intshallow"),
    ("bureau_a_2","intdepth"),
    ("bureau_b_1","extshallow"),
    ("bureau_b_2","extdepth"),
    ("registry_a","rega"),
    ("registry_b","regb"),
    ("registry_c","regc"),
]
string_list = [pattern for pattern,_ in _TABLE_PATTERNS]
prefix_string_list = [prefix for _,prefix in _TABLE_PATTERNS]

In [10]:
def _date_feature_sources(date_type):
    """Columns of total_df for which imp_df lists a ``date_type`` feature.

    The importance index of such a feature is "<date_type>_<column>", so the
    leading token is dropped to recover the column name.
    """
    return (
        imp_df
        .filter(
            pl.col("date_type") == date_type
        )
        .select(pl.col("index").str.split("_").list.slice(1).list.join("_"))
        ["index"].to_list()
    )


# Attach the aggregated features of every source table to the base frame.
for count,(string_in,pre_string) in enumerate(zip(string_list,prefix_string_list),start=1):
    total_df = total_df.join(preprocess(string_in,pre_string),on="case_id",how="left")
    print(f"done {count}")

# The result is assigned back to total_df: previously it was only displayed,
# so later cells still saw the raw Date columns and none of the week_ /
# month_ / year_ features. All expressions below read the frame as it is
# before this step, so the calendar parts are taken from the original dates.
# NOTE: this cell rebinds total_df and is therefore not safe to run twice.
total_df = (
    total_df
    .with_columns(
        # Every Date column (including "Date" itself) becomes a day offset
        # from the decision date.
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        cs.by_name(_date_feature_sources("week")).dt.week().prefix("week_"),
        cs.by_name(_date_feature_sources("month")).dt.month().prefix("month_"),
        (cs.by_name(_date_feature_sources("year")).dt.year() - 2018).prefix("year_"),
    )
    .select(pl.all().shrink_dtype())
)
total_df

done 1
done 2
done 3
done 4
done 5
done 6
done 7
done 8
done 9
done 10
done 11
done 12
done 13
done 14
done 15
done 16


case_id,Date,weeknum,month,weekday,week,year,target,pastshallow_mean_annuity_853A,pastshallow_mean_approvaldate_319D,pastshallow_mean_byoccupationinc_3656910L,pastshallow_mean_childnum_21L,pastshallow_mean_creationdate_885D,pastshallow_mean_credacc_actualbalance_314A,pastshallow_mean_credamount_590A,pastshallow_mean_currdebt_94A,pastshallow_mean_dateactivated_425D,pastshallow_mean_downpmt_134A,pastshallow_mean_dtlastpmt_581D,pastshallow_mean_dtlastpmtallstes_3545839D,pastshallow_mean_employedfrom_700D,pastshallow_mean_firstnonzeroinstldate_307D,pastshallow_mean_mainoccupationinc_437A,pastshallow_mean_maxdpdtolerance_577P,pastshallow_mean_outstandingdebt_522A,pastshallow_mean_pmtnum_8L,pastshallow_mean_tenor_203L,pastshallow_max_annuity_853A,pastshallow_max_byoccupationinc_3656910L,pastshallow_max_childnum_21L,pastshallow_max_creationdate_885D,pastshallow_max_credacc_actualbalance_314A,pastshallow_max_currdebt_94A,pastshallow_max_downpmt_134A,pastshallow_max_dtlastpmt_581D,pastshallow_max_dtlastpmtallstes_3545839D,pastshallow_max_employedfrom_700D,…,month_intshallow_min_lastupdate_388D,month_intshallow_min_numberofoverdueinstlmaxdat_148D,month_intshallow_min_numberofoverdueinstlmaxdat_641D,month_intshallow_min_overdueamountmax2date_1002D,month_intshallow_min_overdueamountmax2date_1142D,year_pastshallow_mean_employedfrom_700D,year_pastshallow_max_employedfrom_700D,year_pastshallow_min_employedfrom_700D,year_staticbase_mean_datefirstoffer_1144D,year_staticbase_mean_lastdelinqdate_224D,year_staticbase_mean_lastrejectdate_50D,year_staticbase_mean_validfrom_1069D,year_staticbase_max_lastrejectdate_50D,year_staticbase_max_validfrom_1069D,year_staticbase_min_validfrom_1069D,year_staticexternal_mean_dateofbirth_337D,year_staticexternal_max_birthdate_574D,year_staticexternal_max_dateofbirth_337D,year_staticexternal_min_dateofbirth_337D,year_personshallow_mean_birth_259D,year_personshallow_mean_empl_employedfrom_271D,year_personshallow_max_birth_259D,year_personshallow_max_emp
l_employedfrom_271D,year_personshallow_min_birth_259D,year_personshallow_min_empl_employedfrom_271D,year_intshallow_mean_dateofcredstart_739D,year_intshallow_mean_numberofoverdueinstlmaxdat_148D,year_intshallow_mean_numberofoverdueinstlmaxdat_641D,year_intshallow_mean_overdueamountmax2date_1002D,year_intshallow_max_numberofoverdueinstlmaxdat_148D,year_intshallow_max_overdueamountmax2date_1002D,year_intshallow_min_dateofcredstart_181D,year_intshallow_min_dateofcredstart_739D,year_intshallow_min_dateofrealrepmt_138D,year_intshallow_min_lastupdate_388D,year_intshallow_min_numberofoverdueinstlmaxdat_148D,year_intshallow_min_overdueamountmax2date_1002D
u64,i64,u8,i8,i8,i8,i32,u8,f32,i64,f32,f32,i64,f32,f32,f32,i64,f32,i64,i64,i64,i64,f32,f32,f32,f32,f32,f32,f32,f32,i64,f32,f32,f32,i64,i64,i64,…,i8,i8,i8,i8,i8,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
0,0,0,1,4,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,-32,-1,-32,-1,-32,-1,,,,,,,,,,,,
1,0,0,1,4,1,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,-61,-10,-61,-10,-61,-10,,,,,,,,,,,,
2,0,0,1,5,1,1,0,1161.30,,,0.00,-2102,,13000.00,,,0.00,,,-3245,-2071,8200.00,,,18.00,18.00,1161.30,,0.00,-2102,,,0.00,,,-3245,…,,,,,,-8,-8,-8,,,-5,,-5,,,,,,,-44,-8,-44,-8,-44,-8,,,,,,,,,,,,
3,0,0,1,4,1,1,0,6140.00,,,,4,,59999.80,,,0.00,,,-233,35,11000.00,,,12.00,12.00,6140.00,,,4,,,0.00,,,-233,…,,,,,,0,0,0,,,1,,1,,,,,,,-25,0,-25,0,-25,0,,,,,,,,,,,,
4,0,0,1,5,1,1,1,2556.60,,,,4,,40000.00,,,0.00,,,,35,16000.00,,,24.00,24.00,2556.60,,,4,,,0.00,,,,…,,,,,,,,,,,,,,,,,,,,-24,-4,-24,-4,-24,-4,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57630,0,92,10,2,41,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,-58,,-58,-58,,,,,,,,,,,,,,,,,,
57631,0,92,10,2,41,2,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,-63,,-63,-63,,,,,,,,,,,,,,,,,,
57632,0,92,10,2,41,2,0,6390.20,,,,14,,100000.00,,,0.00,,,,45,100000.00,,,24.00,24.00,6390.20,,,14,,,0.00,,,,…,,,,,,,,,,,,,,,,-65,,-65,-65,,,,,,,,,,,,,,,,,,
57633,0,92,10,2,41,2,0,4287.95,,,,-813,,87500.00,,,0.00,,,-2091,-959,105450.00,,,19.00,19.00,4287.95,,,-813,,,0.00,,,-2091,…,,,,,,-3,-3,-3,,,1,,1,,,-28,,-28,-28,,,,,,,,,,,,,,,,,,


In [11]:
# Settings shared by the three boosters.
_SEED = 420
_LEARNING_RATE = 0.01
_MAX_ROUNDS = 10000

# LightGBM (LGBMClassifier) parameters.
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": _LEARNING_RATE,
    "n_estimators": _MAX_ROUNDS,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": _SEED,
    "reg_alpha": 0.15,
    "reg_lambda": 15,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

# XGBoost (XGBClassifier) parameters.
# NOTE(review): sample_type / normalize_type / rate_drop / skip_drop look like
# DART-booster options while booster is "gbtree" — confirm they are intended.
xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": _MAX_ROUNDS,
    "eval_metric": "auc",
    "seed": _SEED,
    "booster": "gbtree",
    "device": "cuda",
    "eta": _LEARNING_RATE,
    "gamma": 5,
    "max_depth": 128,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "lambda": 10,
    "alpha": 2,
    "updater": "grow_gpu_hist",
    "grow_policy": "depthwise",
    "max_leaves": 256,
    "num_parallel_tree": 1,
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 0.15,
    "skip_drop": 0.9,
    "enable_categorical": True
}

# CatBoost (CatBoostClassifier) parameters.
# NOTE(review): verify that the installed CatBoost accepts depth=32.
cat_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": _MAX_ROUNDS,
    "learning_rate": _LEARNING_RATE,
    "bootstrap_type": "Poisson",
    "random_seed": _SEED,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 32,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 1000,
    "verbose": 500
}

In [12]:
# Week number of every row, kept as its own Series (it is used as the group
# key for the cross-validation split), plus a copy of the frame without it.
# NOTE(review): train_total is not referenced by the training cell below,
# which works on total_df directly — confirm which frame is intended.
week_num = total_df.get_column("weeknum")
train_total = total_df.select(pl.exclude("weeknum"))
gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    """Return the rows of ``df`` whose position is listed in ``indexes``.

    A temporary ``index`` row-number column is added for the membership test
    and removed again, so the result has the same columns as ``df`` and keeps
    the rows in their original order.

    Args:
        df: Frame to take rows from.
        indexes: Row positions to keep.
    """
    is_wanted_row = pl.col("index").is_in(indexes)
    numbered = df.with_row_index()
    return numbered.filter(is_wanted_row).drop("index")

In [13]:
N_SPLITS = 5
cv = StratifiedGroupKFold(n_splits=N_SPLITS,shuffle=False)

# total_df holds the training rows followed by the test rows, and the test
# rows only carry a placeholder target of 0. Folds are therefore drawn from
# the rows whose case_id comes from the training base table, so fabricated
# labels are never used for fitting or validation.
# NOTE(review): assumes train and test case_ids do not overlap — confirm.
labelled_rows = np.flatnonzero(
    total_df["case_id"].cast(pl.UInt64)
    .is_in(train_case_ids.cast(pl.UInt64))
    .to_numpy()
)
labelled_target = total_df["target"].to_numpy()[labelled_rows]
labelled_weeks = total_df["weeknum"].to_numpy()[labelled_rows]

# NOTE(review): as before, the feature frame is everything except "target",
# which still includes case_id and weeknum (the CV group key) — confirm that
# these are intended model inputs.
feature_names = total_df.drop("target").columns


def _fold_frames(rows):
    """Feature frame and target frame for the given row positions of total_df."""
    return (
        total_df.drop("target").pipe(filter_ind,rows),
        total_df.select("target").pipe(filter_ind,rows),
    )


def _fit_lgb(x_train,y_train,x_valid,y_valid):
    """Fit one LightGBM fold with early stopping on the validation AUC."""
    model = lgb.LGBMClassifier(**class_params)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid,y_valid)],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(800)]
    )
    return model


def _fit_xgb(x_train,y_train,x_valid,y_valid):
    """Fit one XGBoost fold; fresh callbacks are created for every fold."""
    early_stop = xgb.callback.EarlyStopping(rounds=800)
    log_eval = xgb.callback.EvaluationMonitor(period=500)
    model = xgb.XGBClassifier(**xgb_params)
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_valid,y_valid)],
        callbacks=[early_stop,log_eval],
        verbose=False
    )
    return model


def _fit_cat(x_train,y_train,x_valid,y_valid):
    """Fit one CatBoost fold; the frames are converted to pandas first."""
    model = cgb.CatBoostClassifier(**cat_params)
    model.fit(
        x_train.to_pandas(),
        y_train.to_pandas(),
        eval_set=[(x_valid.to_pandas(),y_valid.to_pandas())]
    )
    return model


def _cv_importances(model_label,fit_fold,output_path):
    """Run the grouped CV for one model family and save per-fold importances.

    Args:
        model_label: Name printed in the per-fold progress message.
        fit_fold: Callable ``(x_train, y_train, x_valid, y_valid) -> model``
            returning a fitted model exposing ``feature_importances_``.
        output_path: Parquet file the importance table is written to.

    Returns:
        pandas DataFrame with an ``index`` column holding the feature names
        and one ``gain_<fold>`` column per fold.
    """
    gains = pd.DataFrame(
        index=feature_names,
        columns=[f"gain_{fold}" for fold in range(1,N_SPLITS+1)]
    )
    folds = cv.split(labelled_rows,labelled_target,groups=labelled_weeks)
    for i,(train_pos,valid_pos) in enumerate(folds):
        print(f"Training start for {model_label}: {i+1}")
        # Fold positions refer to the labelled subset; map them back to row
        # positions in total_df.
        x_train,y_train = _fold_frames(labelled_rows[train_pos])
        x_valid,y_valid = _fold_frames(labelled_rows[valid_pos])
        model = fit_fold(x_train,y_train,x_valid,y_valid)
        # Importances come back in feature-column order, which is the order
        # of the table's index.
        gains[f"gain_{i+1}"] = model.feature_importances_
        del model,x_train,y_train,x_valid,y_valid
        gc.collect()
    gains = gains.reset_index()
    gains.to_parquet(output_path)
    return gains


lgb_imp = _cv_importances("LGBClassifier",_fit_lgb,"/home/sohail/Downloads/lgb_imp.parquet")
gc.collect()

xgb_imp = _cv_importances("XGBClassifier",_fit_xgb,"/home/sohail/Downloads/xgb_imp.parquet")
gc.collect()

cat_imp = _cv_importances("CatBoostClassifier",_fit_cat,"/home/sohail/Downloads/cat_imp.parquet")

del total_df

Training start for LGBClassifier: 1
Training until validation scores don't improve for 800 rounds
