In [1]:
import os,warnings,gc
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedGroupKFold,train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from typing import Literal
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb

In [2]:
pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")
pl.Config.set_tbl_rows(4)

polars.config.Config

In [3]:
path_to_train = "/home/sohail/Downloads/credit_risk/train/"
path_to_test = "/home/sohail/Downloads/credit_risk/test/"

In [4]:
all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = pl.DataFrame({"index":range(len(all_train_files)),"path":all_train_files})
test_files_df = pl.DataFrame({"index":range(len(all_test_files)),"path":all_test_files})
train_files_df = (
    train_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
test_files_df = (
    test_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
display(train_files_df.head())
display(test_files_df.head())

index,path,filename
i64,str,str
31,"""/home/sohail/D…","""train_applprev…"
13,"""/home/sohail/D…","""train_applprev…"
…,…,…
7,"""/home/sohail/D…","""train_base.par…"
11,"""/home/sohail/D…","""train_credit_b…"


index,path,filename
i64,str,str
31,"""/home/sohail/D…","""test_applprev_…"
35,"""/home/sohail/D…","""test_applprev_…"
…,…,…
1,"""/home/sohail/D…","""test_applprev_…"
33,"""/home/sohail/D…","""test_base.parq…"


In [5]:
train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("WEEK_NUM").cast(pl.UInt8).alias("week_num"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.weekday().alias("weekday"),
        pl.col("Date").dt.week().alias("week"),
        (pl.col("Date").dt.year() - 2018).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("WEEK_NUM").cast(pl.UInt8).alias("week_num")
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.weekday().alias("weekday"),
        pl.col("Date").dt.week().alias("week"),
        (pl.col("Date").dt.year() - 2018).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)
train_case_ids = train_base.select("case_id_base").to_series().to_numpy()
test_case_ids = test_base.select("case_id_base").to_series().to_numpy()
sub_length = len(test_case_ids)
total_base = (
    pl.concat(
        [
            train_base,
            test_base
        ],
        how="vertical"
    )
)
del train_base,test_base

In [6]:
col_df = (
    pl.DataFrame({
        "col_name":list(
            set(
                pl.read_parquet("/home/sohail/Downloads/lgb_imp.parquet")
                .filter(pl.col("index").str.contains("Date").not_())
                .select(pl.col("index"),pl.mean_horizontal(cs.numeric()).alias("mean"))
                .sort(by="mean")
                .filter(pl.col("mean") > 4000)
                ["index"].to_list()
            ).intersection(set(
                pl.read_parquet("/home/sohail/Downloads/xgb_imp.parquet")
                .filter(pl.col("index").str.contains("Date").not_())
                .select(pl.col("index"),pl.mean_horizontal(cs.numeric()).alias("mean"))
                .sort(by="mean")
                .filter(pl.col("mean") > 0.0005)
                ["index"].to_list()
            )).intersection(set(
                pl.read_parquet("/home/sohail/Downloads/cat_imp.parquet")
                .filter(pl.col("index").str.contains("Date").not_())
                .select(pl.col("index"),pl.mean_horizontal(cs.numeric()).alias("mean"))
                .sort(by="mean")
                .filter(pl.col("mean") > 0.02)
                ["index"].to_list()
            )
            )
        )
    })
    .with_columns(
        pl.col("col_name").str.split_exact("_",2).alias("name")
    )
    .unnest("name")
    .filter(
        pl.col("field_0").is_not_null() & pl.col("field_1").is_not_null()
    )
)

def get_columns(type:str,name:str,date:Literal["year","month","week",None]=None):
    if date:
        return (
            col_df
            .select(
                pl.when(
                    pl.col("field_0").str.contains(date)
                    &
                    pl.col("field_2").str.contains(name)
                )
                .then(
                    pl.col("col_name").str.split("_").list.slice(1).list.join("_")
                )
            )
            .drop_nulls()
            .unique()
            .sort(by="col_name")
            ["col_name"].to_list()
        )
    else:
        return (
            col_df
            .select(
                pl.when(
                    pl.col("field_0").str.contains(type)
                    &
                    pl.col("field_1").str.contains(name)
                )
                .then(
                    pl.col("col_name").str.split("_").list.slice(1).list.join("_")
                )
                .when(
                    pl.col("field_1").str.contains(type)
                    &
                    pl.col("field_2").str.contains(name)
                )
                .then(
                    pl.col("col_name").str.split("_").list.slice(2).list.join("_")
                )
            )
            .drop_nulls()
            .unique()
            .sort(by="col_name")
            ["col_name"].to_list()
        )

In [7]:
def reduce_dtypes(df:pl.DataFrame,name:str):
    if name == "intshallow":
        df = df.rename(
            {
                "dpdmaxdatemonth_442T":"dpdmaxdatemonth_442D",
                "dpdmaxdatemonth_89T":"dpdmaxdatemonth_89D",
                "dpdmaxdateyear_596T":"dpdmaxdateyear_596D",
                "dpdmaxdateyear_896T":"dpdmaxdateyear_896D",
                "overdueamountmaxdatemonth_284T":"overdueamountmaxdatemonth_284D",
                "overdueamountmaxdatemonth_365T":"overdueamountmaxdatemonth_365D",
                "overdueamountmaxdateyear_2T":"overdueamountmaxdateyear_2D",
                "overdueamountmaxdateyear_994T":"overdueamountmaxdateyear_994D",
            }
        )
    elif name == "intdepth":
        df = df.rename(
            {
                "pmts_month_158T":"pmts_month_158D",
                "pmts_month_706T":"pmts_month_706D",
                "pmts_year_1139T":"pmts_year_1139D",
                "pmts_year_507T":"pmts_year_507D",
            }
        )
    elif name == "extshallow":
        df = df.rename(
            {
                "dpdmaxdatemonth_804T":"dpdmaxdatemonth_804D",
                "dpdmaxdateyear_742T":"dpdmaxdateyear_742D",
                "overdueamountmaxdatemonth_494T":"overdueamountmaxdatemonth_494D",
                "overdueamountmaxdateyear_432T":"overdueamountmaxdateyear_432D",
            }
        )
    return (
        df
        .select(
            cs.by_name("case_id").cast(pl.UInt32),
            cs.contains("num_group").cast(pl.UInt16).prefix(f"{name}_"),
            cs.ends_with("D").cast(pl.Date).prefix(f"{name}_"),
            cs.ends_with("T","M").cast(pl.String).prefix(f"{name}_"),
            cs.ends_with("P","A").cast(pl.Float32).prefix(f"{name}_"),
            (cs.ends_with("L") & cs.numeric()).cast(pl.Float32).prefix(f"{name}_")
        )
    )

def grouping(df:pl.DataFrame,name:str):
    return (
        df
        .group_by("case_id")
        .agg(
            cs.by_name(get_columns("mean",name)).mean().prefix("mean_"),
            cs.by_name(get_columns("max",name)).max().prefix("max_"),
            cs.by_name(get_columns("var",name)).var().prefix("var_"),
            (cs.string() | cs.boolean()).drop_nulls().mode().first().prefix("mode_")
        )
    )

def date_features(df:pl.DataFrame,name:str):
    return (
        df
        .join(
            total_base.select(["case_id_base","Date"]),
            left_on="case_id",
            right_on="case_id_base",
            how="left"
        )
        .with_columns(
            (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
            (cs.by_name(get_columns("mean",name,"year")).dt.year() - 2018).cast(pl.Int8).prefix("year_"),
            cs.by_name(get_columns("mean",name,"month")).dt.month().cast(pl.Int8).prefix("month_"),
            cs.by_name(get_columns("mean",name,"week")).dt.week().cast(pl.Int8).prefix("week_")
        )
        .drop("Date")
    )

def select_low_cat_cols(df:pl.DataFrame,thresh=200):
    cols = []
    for col_name in df.select(pl.col(pl.Categorical)).columns:
        if df.select(pl.col(col_name).unique()).shape[0] >= thresh:
            cols.append(col_name)
        if df.select(pl.col(col_name).unique()).shape[0] <= 1:
            cols.append(col_name)
    return df.drop(cols)

def select_impuatable(df:pl.DataFrame,thresh=0.95):
    cols =  (
        df
        .select(pl.all().is_null().mean())
        .transpose(include_header=True)
        .filter(pl.col("column_0") < thresh)
        ["column"].to_list()
    )
    return df.select(cols)

def preprocess(filter_string:str,prefix_string:str):
    train_files_list = train_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    test_files_list = test_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    with pl.StringCache():
        train_df = (
            pl.concat(
                [
                    pl.read_parquet(_).pipe(reduce_dtypes,prefix_string).pipe(grouping,prefix_string) for _ in train_files_list
                ],
                how="vertical_relaxed"
            )
        )
        gc.collect()
        test_df = (
            pl.concat(
                [
                    pl.read_parquet(_).pipe(reduce_dtypes,prefix_string).pipe(grouping,prefix_string) for _ in test_files_list
                ],
                how="vertical_relaxed"
            )
        )
        gc.collect()
    return (
        pl.concat(
            [
                train_df,
                test_df
            ],
            how="vertical_relaxed"
        )
        .pipe(select_impuatable)
        .pipe(select_low_cat_cols)
        .with_columns(
            pl.col(pl.String).cast(pl.Categorical).rank("dense")
        )
        .pipe(date_features,prefix_string)
        .select(pl.all().shrink_dtype())
    )

In [8]:
total_past_shallow = preprocess("applprev_1", "pastshallow")
# display(total_past_shallow)
print("done 1")

total_past_depth = preprocess("applprev_2", "pastdepth")
# display(total_past_depth)
print("done 2")

total_static_base = preprocess("static_0","staticbase")
# display(total_static_base)
print("done 3")


total_static_external = (
    preprocess("static_cb", "staticexternal")
    .with_columns(
        cs.contains("riskassesment_302T").str.split("%").list.gather([0, 1]).apply(
            lambda x: (int(x[0]) + int(x[1].split("-")[1])) / 200
        )
    )
)
# display(total_static_external)
print("done 4")


total_person_shallow = preprocess("person_1", "personshallow")
# display(total_person_shallow)
print("done 5")


total_person_depth = preprocess("person_2", "persondepth")
# display(total_person_depth)
print("done 6")


total_other_shallow = preprocess("other_1", "othershallow")
# display(total_other_shallow)
print("done 7")


total_deposit_shallow = preprocess("deposit_1", "depositshallow")
# display(total_deposit_shallow)
print("done 8")


total_debitcard_shallow = preprocess("debitcard", "cardshallow")
# display(total_debitcard_shallow)
print("done 9")


total_credit_internal_shallow = preprocess("bureau_a_1", "intshallow")
# display(total_credit_internal_shallow)
print("done 10")


total_credit_internal_depth = preprocess("bureau_a_2", "intdepth")
# display(total_credit_internal_depth)
print("done 11")


total_credit_external_shallow = preprocess("bureau_b_1", "extshallow")
# display(total_credit_external_shallow)
print("done 12")


total_credit_external_depth = preprocess("bureau_b_2", "extdepth")
# display(total_credit_external_depth)
print("done 13")


total_registry_a = preprocess("registry_a", "rega")
# display(total_registry_a)
print("done 14")


total_registry_b = preprocess("registry_b", "regb")
# display(total_registry_b)
print("done 15")


total_registry_c = preprocess("registry_c", "regc")
# display(total_registry_c)
print("done 16")

done 1
done 2


In [None]:
total_df = (
    total_base
    .join(
        total_past_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_past_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_base,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_external,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_other_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_deposit_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_debitcard_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_a,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_b,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_c,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .drop("Date")
)
week_num = total_df.filter(pl.col("case_id_base").is_in(train_case_ids)).select("week_num")
train_df = total_df.filter(pl.col("case_id_base").is_in(train_case_ids)).drop("case_id_base")
sub_df = total_df.filter(pl.col("case_id_base").is_in(test_case_ids)).drop("target")

del train_case_ids,test_case_ids
del total_df
del train_files_df
del test_files_df
del total_base
del total_past_shallow
del total_past_depth
del total_static_base
del total_static_external
del total_person_depth
del total_person_shallow
del total_other_shallow
del total_deposit_shallow
del total_debitcard_shallow
del total_credit_external_depth
del total_credit_external_shallow
del total_credit_internal_depth
del total_credit_internal_shallow
del total_registry_a
del total_registry_b
del total_registry_c
gc.collect()

0

In [None]:
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 64,
    "learning_rate": 0.005,
    "n_estimators": 8000,
    "max_data_per_bin": 258,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.15,
    "reg_lambda": 15,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 8000,
    "eval_metric": "auc",
    "seed": 420,
    "booster": "gbtree",
    "device": "cuda",
    "eta": 0.005,
    "gamma": 5,
    "max_depth": 64,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "lambda": 10,
    "alpha": 2,
    "updater": "grow_gpu_hist",
    "grow_policy": "depthwise",
    "max_leaves": 256,
    "num_parallel_tree": 1,    
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 0.15,
    "skip_drop": 0.9,
    "enable_categorical": True
}

cat_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": 8000,
    "learning_rate": 0.005,
    "bootstrap_type": "Poisson",
    "random_seed": 420,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 32,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 500,
    "verbose": 500
}

In [None]:
def filter_ind(df:pl.DataFrame,indexes:np.array):
    return (
        df
        .with_row_index()
        .filter(
            pl.col("index").is_in(indexes)
        )
        .drop("index")
    )

gc.collect()

0

In [None]:
if sub_length <= 10:
    train_df = train_df.with_row_index().filter(pl.col("index").hash(512)%30 == 1).drop("index")
    week_num = week_num.with_row_index().filter(pl.col("index").hash(512)%30 == 1).drop("index")
    class_params["n_estimators"] = 2000
    class_params["learning_rate"] = 0.01
    xgb_params["n_estimators"] = 2000
    xgb_params["eta"] = 0.01
    cat_params["iterations"] = 2000
    cat_params["learning_rate"] = 0.01

In [None]:
class Model:

    def __init__(self,
        sub_df:pl.DataFrame
        ):
        self.df = sub_df.drop("case_id_base")
        self.lgb_pred = []
        self.xgb_pred = []
        self.cat_pred = []

    def _predict_lgb(self,model):
        self.lgb_pred.append(model.predict_proba(self.df)[:,1])
        gc.collect()

    def _predict_xgb(self,model):
        self.xgb_pred.append(model.predict_proba(self.df)[:,1])
        gc.collect()

    def _predict_cat(self,model):
        self.cat_pred.append(model.predict_proba(self.df.to_pandas())[:,1])
        gc.collect()

    def predict_proba(self):
        return np.mean(
            self.lgb_pred+\
            self.xgb_pred+\
            self.cat_pred,
            axis=0
        )
model = Model(sub_df)

In [None]:
cv = StratifiedGroupKFold(n_splits=5,shuffle=False)

for i,(train_ind,valid_ind) in enumerate(
    cv.split(train_df,train_df.select("target"),
    groups=week_num)):
    print(f"Training start for LGBClassifier: {i+1}")
    if i%2 != 0:
        class_params["reg_alpha"] = 0.2
        class_params["reg_lambda"] = 2
        class_params["random_state"] = 42
    lgb_model = lgb.LGBMClassifier(**class_params)
    lgb_model.fit(
        (
            train_df
            .drop("target")
            .pipe(filter_ind,train_ind)
        ),
        (
            train_df
            .select('target')
            .pipe(filter_ind,train_ind)
        ),
        eval_set=[(
            (
                train_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                train_df
                .select("target")
                .pipe(filter_ind,valid_ind)
            )
        )],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(800)]
    )
    gc.collect()
    model._predict_lgb(lgb_model)

del train_ind,valid_ind,lgb_model
gc.collect()

for i,(train_ind,valid_ind) in enumerate(
    cv.split(train_df,train_df.select("target"),
    groups=week_num)):
    print(f"Training start for XGBClassifier: {i+1}")
    if i%2 != 0:
        xgb_params["lambda"] = 15
        xgb_params["alpha"] = 5
        xgb_params["gamma"] = 8
        xgb_params["seed"] = 42
    early_stop = xgb.callback.EarlyStopping(rounds=800)
    log_eval = xgb.callback.EvaluationMonitor(period=500)
    xgb_model = xgb.XGBClassifier(**xgb_params)
    xgb_model.fit(
        (
            train_df
            .drop("target")
            .pipe(filter_ind,train_ind)
        ),
        (
            train_df
            .select("target")
            .pipe(filter_ind,train_ind)
        ),
        eval_set=[(
            (
                train_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                train_df
                .select("target")
                .pipe(filter_ind,valid_ind)
            )
        )],
        callbacks=[early_stop,log_eval],
        verbose=False
    )
    gc.collect()
    model._predict_xgb(xgb_model)

del xgb_model,train_ind,valid_ind,log_eval,early_stop
gc.collect()

for i,(train_ind,valid_ind) in enumerate(
    cv.split(train_df,train_df.select("target"),
    groups=week_num)):
    print(f"Training start for CatBoostClassifier: {i+1}")
    if i%2 != 0:
        cat_params["l2_leaf_reg"] = 20
        cat_params["random_seed"] = 42
    cat_model = cgb.CatBoostClassifier(**cat_params)
    cat_model.fit(
        (
            train_df
            .drop("target")
            .pipe(filter_ind,train_ind)
            .to_pandas()
        ),
        (
            train_df
            .select("target")
            .pipe(filter_ind,train_ind)
            .to_pandas()
        ),
        eval_set=[(
            (
                train_df
                .drop("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            ),
            (
                train_df
                .select("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            )
        )]
    )
    gc.collect()
    model._predict_cat(cat_model)

del train_df,cat_model,train_ind,valid_ind

Training start for LGBClassifier: 1
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.806296
[1000]	valid_0's auc: 0.807089
[1500]	valid_0's auc: 0.803433
Early stopping, best iteration is:
[909]	valid_0's auc: 0.807739
Training start for LGBClassifier: 2
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.8248
[1000]	valid_0's auc: 0.824267
Early stopping, best iteration is:
[643]	valid_0's auc: 0.825251
Training start for LGBClassifier: 3
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.830739
[1000]	valid_0's auc: 0.829862
Early stopping, best iteration is:
[646]	valid_0's auc: 0.831599
Training start for LGBClassifier: 4
Training until validation scores don't improve for 800 rounds
[500]	valid_0's auc: 0.818799
[1000]	valid_0's auc: 0.815034
Early stopping, best iteration is:
[468]	valid_0's auc: 0.819632
Training start for LGBClassifier: 5
Training until validation scores don't 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5904708	best: 0.5904708 (0)	total: 80.7ms	remaining: 2m 41s
500:	test: 0.8079814	best: 0.8080245 (495)	total: 9.44s	remaining: 28.2s
1000:	test: 0.8094037	best: 0.8100091 (862)	total: 19.1s	remaining: 19.1s
bestTest = 0.8100091219
bestIteration = 862
Shrink model to first 863 iterations.
Training start for CatBoostClassifier: 2


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5561447	best: 0.5561447 (0)	total: 19.2ms	remaining: 38.4s
500:	test: 0.8236493	best: 0.8236493 (500)	total: 9.43s	remaining: 28.2s
1000:	test: 0.8285884	best: 0.8289168 (965)	total: 19s	remaining: 19s
1500:	test: 0.8276383	best: 0.8292733 (1357)	total: 28.6s	remaining: 9.52s
bestTest = 0.8292733431
bestIteration = 1357
Shrink model to first 1358 iterations.
Training start for CatBoostClassifier: 3


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5957471	best: 0.5957471 (0)	total: 18ms	remaining: 36s
500:	test: 0.8189771	best: 0.8189771 (500)	total: 9.18s	remaining: 27.5s
1000:	test: 0.8234265	best: 0.8240911 (787)	total: 18.7s	remaining: 18.7s
bestTest = 0.8240910769
bestIteration = 787
Shrink model to first 788 iterations.
Training start for CatBoostClassifier: 4


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5650345	best: 0.5650345 (0)	total: 17.8ms	remaining: 35.7s
500:	test: 0.8161178	best: 0.8161525 (498)	total: 9.3s	remaining: 27.8s
1000:	test: 0.8154657	best: 0.8178689 (623)	total: 18.8s	remaining: 18.8s
bestTest = 0.817868948
bestIteration = 623
Shrink model to first 624 iterations.
Training start for CatBoostClassifier: 5


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6304987	best: 0.6304987 (0)	total: 19.2ms	remaining: 38.5s
500:	test: 0.8201178	best: 0.8201178 (500)	total: 9.15s	remaining: 27.4s
1000:	test: 0.8254408	best: 0.8258091 (979)	total: 18.7s	remaining: 18.7s
1500:	test: 0.8259228	best: 0.8261347 (1486)	total: 28.5s	remaining: 9.46s
1999:	test: 0.8256923	best: 0.8265126 (1884)	total: 38.1s	remaining: 0us
bestTest = 0.8265126348
bestIteration = 1884
Shrink model to first 1885 iterations.


In [None]:
sub_df = pd.DataFrame({
    "case_id": sub_df.select("case_id_base").collect(streaming=True).to_series().to_list(),
    "score": model.predict_proba()
}).set_index("case_id")
# sub_df.to_csv("./submission.csv")
sub_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.241406
57549,0.05536
57551,0.067224
57552,0.02568
57569,0.018475
57630,0.019885
57631,0.022177
57632,0.018504
57633,0.059204
57634,0.043389
