In [1]:
import os,warnings,gc
warnings.filterwarnings('ignore')
import numpy as np
import json
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
train_base = (
    pl.read_parquet("/home/sohail/Downloads/credit_risk/train/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)
# train_case_ids = train_base["case_id_base"]
train_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
0,2019-01-03,1,1,4,0,0
1,2019-01-03,1,1,4,0,0
2,2019-01-04,1,1,5,0,0
3,2019-01-03,1,1,4,0,0
4,2019-01-04,1,1,5,0,1
…,…,…,…,…,…,…
2703450,2020-10-05,10,41,1,1,0
2703451,2020-10-05,10,41,1,1,0
2703452,2020-10-05,10,41,1,1,0
2703453,2020-10-05,10,41,1,1,0


In [3]:
test_base = (
    pl.read_parquet("/home/sohail/Downloads/credit_risk/test/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)
# test_case_ids = test_base["case_id_base"]
test_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
57543,2020-10-06,10,41,2,1,0
57549,2020-10-06,10,41,2,1,0
57551,2020-10-06,10,41,2,1,0
57552,2020-10-07,10,41,3,1,0
57569,2020-10-06,10,41,2,1,0
57630,2020-10-06,10,41,2,1,0
57631,2020-10-06,10,41,2,1,0
57632,2020-10-06,10,41,2,1,0
57633,2020-10-06,10,41,2,1,0
57634,2020-10-06,10,41,2,1,0


In [4]:
gain_df = pl.read_csv("/home/sohail/Downloads/gains.csv")
new_gains_df = pl.read_csv("/home/sohail/Downloads/new_gains.csv")
new_filled_gains_df = pl.read_csv("/home/sohail/Downloads/new_filled_gains.csv")

In [5]:
drop_list = new_gains_df.filter(pl.col("gain") < 1000)["column_name"].to_list()
len(drop_list)

19

In [6]:
total_df = (
    pl.read_parquet("/home/sohail/Downloads/total_imputed.parquet")
    .rename({"case_id":"case_id_base"})
    .drop(drop_list)
)
total_df

month,week,weekday,year,target,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,ext_shallow_pmtnumpending_403L,ext_shallow_residualamount_1093A,ext_shallow_residualamount_127A,ext_shallow_residualamount_3940956A,ext_shallow_totalamount_503A,ext_shallow_totalamount_881A,ext_shallow_classificationofcontr_1114M,ext_shallow_contractdate_551D,ext_shallow_contractmaturitydate_151D,ext_shallow_contractst_516M,ext_shallow_contracttype_653M,ext_shallow_credor_3940957M,ext_shallow_dpdmaxdatemonth_804T,ext_shallow_dpdmaxdateyear_742T,ext_shallow_lastupdate_260D,ext_shallow_overdueamountmaxdatemonth_494T,ext_shallow_overdueamountmaxdateyear_432T,ext_shallow_periodicityofpmts_997M,ext_shallow_pmtmethod_731M,ext_shallow_purposeofcred_722M,ext_shallow_subjectrole_326M,ext_shallow_subjectrole_43M,ext_depth_num_group1,ext_depth_num_group2,ext_depth_pmts_dpdvalue_108P,ext_depth_pmts_pmtsoverdue_635A,ext_depth_pmts_date_1107D,reg_a_amount_4527230A,reg_a_num_group1,reg_a_recorddate_4527225D,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,case_id_base
i8,i8,i8,i32,u8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f32,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f32,f64,f32,f64,f64,f32,f64,f64,f64,f32,f64,i64
1,1,4,0,0,0.0,4349.0,1.0,2.0,2.0,0.0,0.0,0.0,0.0,64824.0,0.0,0.0,52000.0,0.0,0.0,0.0,36.0,7.41000704e8,36.0,-843.0,1.0,-461.0,3.0,2.0,-239.0,-338.0,-258.0,4.0,-1824.0,2.0,-313.0,3.0,…,36.0,0.0,33313.511719,21158.150391,400000.0,108565.398438,2.0,-174.0,1325.0,8.0,3.0,11.0,10.0,3.0,7.0,12.0,4.0,2.0,1.0,2.0,2.0,2.0,1.0,9.0,0.0,33.0,-28.0,3780.0,13.0,14.0,19440.0,12.0,-130.0,15.0,3088.199951,-10.0,0
1,1,4,0,0,0.0,4204.200195,1.0,0.0,0.0,0.0,0.0,0.0,0.0,45760.0,0.0,0.0,36400.0,1.0,1.0,0.0,12.0,7.29854848e8,12.0,-927.0,1.0,-3618.0,6.0,1.0,-550.0,-787.0,-583.0,4.0,-8842.0,1.0,-3683.0,1.0,…,0.0,0.0,14620.0,21506.330078,127200.0,153773.0,3.0,-1102.0,160.0,2.0,1.0,12.0,6.0,2.0,6.0,8.0,6.0,3.0,3.0,1.0,1.0,1.0,2.0,36.0,255114.0,253.800003,-83.0,2793.600098,11.0,14.0,6885.0,1.0,-167.0,11.0,3915.055908,-4.0,1
1,1,5,0,0,0.0,1682.400024,35000.0,0.0,0.0,0.0,0.0,0.0,0.0,16000.0,0.0,0.0,8200.0,0.0,1.0,0.0,24.0,7.60218176e8,24.0,-1925.0,1.0,-2102.0,1.0,1.0,-766.0,-1258.0,-1187.0,2.0,-3245.0,1.0,-2071.0,1.0,…,17.0,0.0,0.0,0.0,120241.601562,78000.0,2.0,-256.0,457.0,2.0,3.0,10.0,8.0,2.0,-4.0,8.0,2.0,2.0,4.0,3.0,1.0,2.0,2.0,35.0,48854.0,0.2,-225.0,2466.0,2.0,14.0,14523.400391,12.0,-24.0,0.0,20.200001,-156.0,2
1,1,4,0,0,0.0,6140.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,59999.800781,0.0,0.0,11000.0,0.0,0.0,0.0,12.0,7.4118048e8,12.0,-152.0,2.0,4.0,1.0,1.0,-778.0,-1504.0,-1614.0,2.0,-233.0,2.0,35.0,1.0,…,16.0,0.0,0.0,0.0,208000.0,168009.203125,2.0,-227.0,181.0,2.0,3.0,11.0,1.0,6.0,3.0,4.0,6.0,2.0,3.0,2.0,1.0,1.0,1.0,14.0,0.0,0.0,-142.0,5195.399902,4.0,14.0,24933.400391,4.0,-146.0,5.0,1611.0,-96.0,3
1,1,5,0,1,0.0,2556.600098,1.0,1.0,0.0,0.0,2.0,0.0,0.0,40000.0,0.0,0.0,16000.0,0.0,0.0,0.0,24.0,7.80407744e8,24.0,14.0,13.0,4.0,6.0,1.0,-411.0,-40.0,-451.0,1.0,-1115.0,2.0,35.0,1.0,…,13.0,0.0,4283.0,4039.67627,43381.800781,39841.0,1.0,-29.0,521.0,2.0,1.0,1.0,4.0,6.0,2.0,5.0,6.0,2.0,3.0,3.0,2.0,2.0,1.0,19.0,1.460478e6,10.6,-20.0,2153.400146,6.0,14.0,24300.0,18.0,-52.0,5.0,2704.083984,-139.0,4
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
10,41,1,1,0,0.0,30875.0,1.0,2.0,0.092,0.0,0.092,0.092,0.0,150000.0,0.0,0.0,50000.0,6.0,12.0,0.0,24.0,7.60714944e8,24.0,-637.0,1.0,-1040.0,2.0,1.0,-622.0,-637.0,-637.0,1.0,-8087.0,2.0,-1010.0,1.0,…,5.0,0.0,20724.521484,17126.191406,30095.400391,375577.59375,2.0,-998.0,52.0,2.0,4.0,8.0,5.0,6.0,-2.0,9.0,7.0,1.0,3.0,2.0,2.0,2.0,1.0,36.0,4606.0,1.6,-84.0,8838.0,5.0,14.0,1288.400024,1.0,-123.0,5.0,4640.800293,14.0,2703450
10,41,1,1,0,0.0,12809.200195,10340.0,0.0,90.0,0.0,90.0,90.0,0.0,114000.0,59773.714844,0.0,50000.0,1.0,5.0,68098.398438,24.0,7.80594496e8,24.0,-371.0,1.0,-371.0,2.0,1.0,-368.0,-1388.0,8.0,1.0,-6861.0,2.0,-585.0,1.0,…,4.0,0.0,14571.97168,29998.800781,107242.203125,219331.40625,2.0,-11.0,353.0,2.0,3.0,2.0,9.0,11.0,6.0,11.0,11.0,2.0,2.0,3.0,2.0,2.0,0.0,5.0,20074.0,0.2,6.0,1823.400024,5.0,14.0,12035.0,6.0,-36.0,5.0,1845.400024,14.0,2703451
10,41,1,1,0,0.0,9048.0,1.0,3.0,3600.0,0.0,0.0,0.116,0.0,80000.0,0.0,0.0,34000.0,1.0,2.0,0.0,14.0,7.80417984e8,14.0,-750.0,1.0,-45.0,4.0,2.0,-409.0,-566.0,-566.0,1.0,-977.0,2.0,-382.0,3.0,…,12.0,0.0,35543.199219,25192.380859,123980.0,29787.599609,2.0,-264.0,426.0,2.0,3.0,5.0,5.0,8.0,8.0,4.0,11.0,2.0,4.0,3.0,1.0,2.0,1.0,35.0,26520.0,47.200001,-73.0,2750.600098,10.0,14.0,15552.0,5.0,-116.0,12.0,2072.400146,14.0,2703452
10,41,1,1,0,0.0,5981.399902,33059.0,0.0,179.423996,0.0,398.024017,198.024002,14.0,123800.0,34550.855469,0.0,76000.0,33.0,8.0,46806.601562,48.0,7.8082592e8,48.0,-292.0,1.0,-292.0,2.0,1.0,-287.0,-1060.0,4.0,1.0,-2125.0,2.0,-2625.0,3.0,…,21.0,0.0,12397.600586,10350.045898,101969.203125,510430.40625,1.0,-293.0,623.0,1.0,4.0,1.0,8.0,11.0,2.0,9.0,11.0,2.0,3.0,2.0,2.0,2.0,0.0,10.0,47.0,5.4,-54.0,850.0,5.0,14.0,9499.600586,7.0,-128.0,5.0,772.799988,14.0,2703453


In [7]:
train_total = (
    total_df
    .filter(
        pl.col("case_id_base").is_in(train_base.select("case_id_base"))
    )
    .drop("case_id_base")
)
submission_df = (
    total_df
    .filter(
        pl.col("case_id_base").is_in(test_base.select("case_id_base"))
    )
)
train_total.null_count().transpose(include_header=True,header_name="col name",column_names=["Nan count"]).sort(by="Nan count")

col name,Nan count
str,u32
"""month""",0
"""week""",0
"""weekday""",0
"""year""",0
"""target""",0
…,…
"""reg_b_num_grou…",0
"""reg_b_deductio…",0
"""reg_c_num_grou…",0
"""reg_c_pmtamoun…",0


In [8]:
train_total,valid_df = train_test_split(
    train_total,
    test_size=0.1,
    random_state=533,
    shuffle=True,
    stratify=train_total.select("target")
)
gc.collect()

0

In [9]:
def filter_ind(df:pl.DataFrame,indexes:np.array):
    return (
        df
        .with_row_index()
        .filter(
            pl.col("index").is_in(indexes)
        )
        .drop("index")
    )

In [10]:
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.01,
    "n_estimators": 8000,
    "max_bin": 255,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.2,
    "reg_lambda": 20,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

xgb_params = {
    "objective": "binary:logistic",
    "n_estimators": 5000,
    "eval_metric": "auc",
    "seed": 420,
    "booster": "gbtree",
    "device": "cuda",
    "eta": 0.02,
    "gamma": 5,
    "max_depth": 64,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "lambda": 10,
    "alpha": 2,
    "updater": "grow_gpu_hist",
    "grow_policy": "depthwise",
    "max_leaves": 256,
    "num_parallel_tree": 1,    
    "sample_type": "uniform",
    "normalize_type": "tree",
    "rate_drop": 0.15,
    "skip_drop": 0.9,
    "enable_categorical": True
}


cat_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": 8000,
    "learning_rate": 0.005,
    "bootstrap_type": "Poisson",
    "random_seed": 420,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 16,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 700,
    "verbose": 500
}

cat_clf_params = {
    "eval_metric": "AUC",
    "task_type": "GPU",
    "iterations": 6000,
    "learning_rate": 0.01,
    "bootstrap_type": "Poisson",
    "random_seed": 420,
    "l2_leaf_reg": 15,
    "subsample": 0.8,
    "depth": 16,
    "max_leaves": 64,
    "grow_policy": "Lossguide",
    "od_type": "Iter",
    "od_wait": 700,
    "verbose": 500
}

In [11]:
num_splits = 5
cv = StratifiedGroupKFold(n_splits=num_splits,shuffle=True)
lgb_clf_models = [
    lgb.LGBMClassifier(**class_params) for _ in range(num_splits)
]
xgb_clf_models = [
    xgb.XGBClassifier(**xgb_params) for _ in range(num_splits)
]
cat_models = [
    cgb.CatBoostClassifier(**cat_params) for _ in range(num_splits)
]
cat_clf_models = [
    cgb.CatBoostClassifier(**cat_clf_params) for _ in range(num_splits)
]
for i,(train_ind,valid_ind) in enumerate(
    cv.split(train_total.drop("target"),
    train_total.select("target"),
    groups=train_total["week"])
    ):
    print(f"Training start for LGBClassifier: {i+1}")
    lgb_clf_models[i].fit(
        (
            train_total
            .pipe(filter_ind,train_ind)
            .drop("target")
        ),
        (
            train_total
            .select("target")
            .pipe(filter_ind,train_ind)
        ),
        eval_set=[(
            (
                train_total
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                train_total
                .select("target")
                .pipe(filter_ind,valid_ind)
            )
        )],
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(2000)]
    )
    print(f'''Roc score for validation df of Lgb total classification model {i+1}: {roc_auc_score(
        valid_df.select("target"),
        lgb_clf_models[i].predict_proba(valid_df.drop("target"))[:,1]
    )}''')
    print(gc.collect())
    print(f"Training start for XGBClassifier: {i+1}")
    early_stop = xgb.callback.EarlyStopping(rounds=300)
    log_eval = xgb.callback.EvaluationMonitor(period=500)
    xgb_clf_models[i].fit(
        (
            train_total
            .pipe(filter_ind,train_ind)
            .drop("target")
        ),
        (
            train_total
            .select("target")
            .pipe(filter_ind,train_ind)
        ),
        eval_set=[(
            (
                train_total
                .drop("target")
                .pipe(filter_ind,valid_ind)
            ),
            (
                train_total
                .select("target")
                .pipe(filter_ind,valid_ind)
            )
        )],
        callbacks=[early_stop,log_eval],
        verbose=False
    )
    print(f'''Roc score for validation df of xgb total classification model {i+1}: {roc_auc_score(
        valid_df.select("target"),
        xgb_clf_models[i].predict_proba(valid_df.drop("target"))[:,1]
    )}''')
    print(gc.collect())
    print(f"Training start for CatBoostClassifier: {i+1}")
    cat_models[i].fit(
        (
            train_total
            .drop("target")
            .pipe(filter_ind,train_ind)
            .to_pandas()
        ),
        (
            train_total
            .pipe(filter_ind,train_ind)
            .select("target")
            .to_pandas()
        ),
        eval_set=[(
            (
                train_total
                .drop("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            ),
            (
                train_total
                .select("target")
                .pipe(filter_ind,valid_ind)
                .to_pandas()
            )
        )]
    )
    print(f'''Roc score for validation df of cat boost classification model {i+1}: {roc_auc_score(
        valid_df.select("target").to_pandas(),
        cat_models[i].predict_proba(valid_df.drop("target").to_pandas())[:,1]
    )}''')
    print(gc.collect())
    # cat_clf_models[i].fit(
    #     (
    #         train_total
    #         .select(cat_cols)
    #     )
    # )

Training start for LGBClassifier: 1
Training until validation scores don't improve for 2000 rounds
[500]	valid_0's auc: 0.827567
[1000]	valid_0's auc: 0.836732
[1500]	valid_0's auc: 0.840326
[2000]	valid_0's auc: 0.842194
[2500]	valid_0's auc: 0.843371
[3000]	valid_0's auc: 0.844078
[3500]	valid_0's auc: 0.844492
[4000]	valid_0's auc: 0.844663
[4500]	valid_0's auc: 0.844841
[5000]	valid_0's auc: 0.844768
[5500]	valid_0's auc: 0.84476
[6000]	valid_0's auc: 0.844665
Early stopping, best iteration is:
[4402]	valid_0's auc: 0.84486
Roc score for validation df of Lgb total classification model 1: 0.8427298714637474
31
Training start for XGBClassifier: 1
[0]	validation_0-auc:0.74400
[500]	validation_0-auc:0.84290
[986]	validation_0-auc:0.84551
Roc score for validation df of xgb total classification model 1: 0.8434238609922731
132
Training start for CatBoostClassifier: 1


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7114078	best: 0.7114078 (0)	total: 104ms	remaining: 13m 55s
500:	test: 0.8162660	best: 0.8162660 (500)	total: 22.3s	remaining: 5m 33s
1000:	test: 0.8309364	best: 0.8309364 (1000)	total: 46.5s	remaining: 5m 25s
1500:	test: 0.8368471	best: 0.8368471 (1500)	total: 1m 10s	remaining: 5m 3s
2000:	test: 0.8398386	best: 0.8398386 (2000)	total: 1m 32s	remaining: 4m 37s
2500:	test: 0.8415634	best: 0.8415634 (2500)	total: 1m 54s	remaining: 4m 11s
3000:	test: 0.8428519	best: 0.8428519 (3000)	total: 2m 15s	remaining: 3m 46s
3500:	test: 0.8437029	best: 0.8437029 (3500)	total: 2m 36s	remaining: 3m 20s
4000:	test: 0.8443268	best: 0.8443268 (4000)	total: 2m 56s	remaining: 2m 56s
4500:	test: 0.8448238	best: 0.8448238 (4499)	total: 3m 15s	remaining: 2m 32s
5000:	test: 0.8452303	best: 0.8452303 (5000)	total: 3m 34s	remaining: 2m 8s
5500:	test: 0.8456023	best: 0.8456030 (5499)	total: 3m 53s	remaining: 1m 46s
6000:	test: 0.8458722	best: 0.8458741 (5990)	total: 4m 12s	remaining: 1m 24s
6500:	test:

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7118705	best: 0.7118705 (0)	total: 44.2ms	remaining: 5m 53s
500:	test: 0.8098778	best: 0.8098778 (500)	total: 22.3s	remaining: 5m 33s
1000:	test: 0.8253887	best: 0.8253887 (1000)	total: 46.4s	remaining: 5m 24s
1500:	test: 0.8314674	best: 0.8314674 (1500)	total: 1m 9s	remaining: 5m 1s
2000:	test: 0.8345669	best: 0.8345669 (2000)	total: 1m 32s	remaining: 4m 36s
2500:	test: 0.8364197	best: 0.8364197 (2500)	total: 1m 54s	remaining: 4m 10s
3000:	test: 0.8376944	best: 0.8376944 (3000)	total: 2m 15s	remaining: 3m 45s
3500:	test: 0.8386053	best: 0.8386053 (3500)	total: 2m 35s	remaining: 3m 20s
4000:	test: 0.8393211	best: 0.8393219 (3999)	total: 2m 55s	remaining: 2m 55s
4500:	test: 0.8398273	best: 0.8398273 (4500)	total: 3m 15s	remaining: 2m 31s
5000:	test: 0.8402302	best: 0.8402302 (5000)	total: 3m 34s	remaining: 2m 8s
5500:	test: 0.8406116	best: 0.8406116 (5500)	total: 3m 53s	remaining: 1m 45s
6000:	test: 0.8409480	best: 0.8409483 (5994)	total: 4m 11s	remaining: 1m 23s
6500:	test: 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7171739	best: 0.7171739 (0)	total: 45.2ms	remaining: 6m 1s
500:	test: 0.8132825	best: 0.8132825 (500)	total: 22.2s	remaining: 5m 32s
1000:	test: 0.8285686	best: 0.8285686 (1000)	total: 46.4s	remaining: 5m 24s
1500:	test: 0.8345651	best: 0.8345651 (1500)	total: 1m 9s	remaining: 5m 1s
2000:	test: 0.8374826	best: 0.8374826 (2000)	total: 1m 32s	remaining: 4m 35s
2500:	test: 0.8392929	best: 0.8392929 (2500)	total: 1m 53s	remaining: 4m 9s
3000:	test: 0.8404800	best: 0.8404800 (3000)	total: 2m 14s	remaining: 3m 44s
3500:	test: 0.8413322	best: 0.8413322 (3500)	total: 2m 35s	remaining: 3m 19s
4000:	test: 0.8419310	best: 0.8419310 (4000)	total: 2m 54s	remaining: 2m 54s
4500:	test: 0.8424051	best: 0.8424051 (4500)	total: 3m 13s	remaining: 2m 30s
5000:	test: 0.8428006	best: 0.8428006 (5000)	total: 3m 32s	remaining: 2m 7s
5500:	test: 0.8431196	best: 0.8431196 (5500)	total: 3m 51s	remaining: 1m 45s
6000:	test: 0.8434144	best: 0.8434148 (5999)	total: 4m 9s	remaining: 1m 23s
6500:	test: 0.8

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7143151	best: 0.7143151 (0)	total: 45.8ms	remaining: 6m 6s
500:	test: 0.8111688	best: 0.8111688 (500)	total: 22.3s	remaining: 5m 34s
1000:	test: 0.8256657	best: 0.8256657 (1000)	total: 46.7s	remaining: 5m 26s
1500:	test: 0.8308766	best: 0.8308766 (1500)	total: 1m 10s	remaining: 5m 3s
2000:	test: 0.8335465	best: 0.8335465 (2000)	total: 1m 32s	remaining: 4m 38s
2500:	test: 0.8351467	best: 0.8351467 (2500)	total: 1m 55s	remaining: 4m 12s
3000:	test: 0.8362572	best: 0.8362572 (3000)	total: 2m 16s	remaining: 3m 47s
3500:	test: 0.8370941	best: 0.8370941 (3500)	total: 2m 37s	remaining: 3m 22s
4000:	test: 0.8377193	best: 0.8377193 (4000)	total: 2m 57s	remaining: 2m 57s
4500:	test: 0.8382559	best: 0.8382559 (4500)	total: 3m 16s	remaining: 2m 33s
5000:	test: 0.8387191	best: 0.8387191 (5000)	total: 3m 36s	remaining: 2m 9s
5500:	test: 0.8390516	best: 0.8390516 (5500)	total: 3m 55s	remaining: 1m 46s
6000:	test: 0.8393283	best: 0.8393283 (6000)	total: 4m 13s	remaining: 1m 24s
6500:	test: 

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7094741	best: 0.7094741 (0)	total: 43.7ms	remaining: 5m 49s
500:	test: 0.8091254	best: 0.8091254 (500)	total: 22.1s	remaining: 5m 30s
1000:	test: 0.8240677	best: 0.8240677 (1000)	total: 46s	remaining: 5m 21s
1500:	test: 0.8299726	best: 0.8299726 (1500)	total: 1m 9s	remaining: 4m 59s
2000:	test: 0.8330230	best: 0.8330230 (2000)	total: 1m 31s	remaining: 4m 34s
2500:	test: 0.8348547	best: 0.8348547 (2500)	total: 1m 52s	remaining: 4m 8s
3000:	test: 0.8360656	best: 0.8360656 (3000)	total: 2m 13s	remaining: 3m 43s
3500:	test: 0.8369388	best: 0.8369388 (3500)	total: 2m 33s	remaining: 3m 17s
4000:	test: 0.8375648	best: 0.8375668 (3997)	total: 2m 53s	remaining: 2m 53s
4500:	test: 0.8380795	best: 0.8380795 (4500)	total: 3m 12s	remaining: 2m 29s
5000:	test: 0.8384875	best: 0.8384875 (5000)	total: 3m 31s	remaining: 2m 6s
5500:	test: 0.8388227	best: 0.8388227 (5500)	total: 3m 49s	remaining: 1m 44s
6000:	test: 0.8391681	best: 0.8391681 (6000)	total: 4m 8s	remaining: 1m 22s
6500:	test: 0.8

In [12]:
class Model:

    def __init__(
            self,
            lgb_clf_models:list,
            xgb_clf_models:list,
            cat_models:list
            ) -> None:
        self.lgb_clf_models = lgb_clf_models
        self.xgb_clf_models = xgb_clf_models
        self.cat_models =  cat_models
    
    def predict_proba(self,X):

        self.lgb_clf_pred = [_.predict_proba(X)[:,1] for _ in self.lgb_clf_models] * 2
        self.xgb_clf_pred = [_.predict_proba(X)[:,1] for _ in self.xgb_clf_models]
        self.cat_pred = [_.predict_proba(X.to_pandas())[:,1] for _ in self.cat_models]

        return np.mean(
            self.lgb_clf_pred+\
            self.xgb_clf_pred+\
            self.cat_pred,
            axis=0
            )

In [13]:
model = Model(
    lgb_clf_models,
    xgb_clf_models,
    cat_models
    )

y_pred = model.predict_proba(valid_df.drop("target"))
auc_score = roc_auc_score(valid_df.select('target'),y_pred)
auc_score

0.8474002954505955

In [14]:
def gini_stability(base,y,w_fallingrate=88.0, w_resstd=-0.5):
    base = base.to_pandas()
    base["score"] = y
    gini_in_time = base.loc[:, ["weekday", "target", "score"]]\
        .sort_values("weekday")\
        .groupby("weekday")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

gini_score = gini_stability(valid_df,y_pred)
gini_score

0.6878907026151981

In [15]:
print(roc_auc_score(valid_df.select("target"),np.mean(model.lgb_clf_pred,axis=0)))
print(gini_stability(valid_df,np.mean(model.lgb_clf_pred,axis=0)))

0.8448479646320486
0.6830680771807403


In [16]:
model.lgb_clf_pred

[array([0.00621409, 0.03489557, 0.00558151, ..., 0.01204837, 0.00205422,
        0.03026072]),
 array([0.00996328, 0.02990734, 0.00429911, ..., 0.01514234, 0.00202418,
        0.03549717]),
 array([0.00666438, 0.02828708, 0.00578279, ..., 0.01170573, 0.00331275,
        0.03118495]),
 array([0.00604   , 0.01814798, 0.00657663, ..., 0.01012939, 0.00215094,
        0.03281149]),
 array([0.00777309, 0.02708617, 0.00286457, ..., 0.01174099, 0.00208598,
        0.04165706]),
 array([0.00621409, 0.03489557, 0.00558151, ..., 0.01204837, 0.00205422,
        0.03026072]),
 array([0.00996328, 0.02990734, 0.00429911, ..., 0.01514234, 0.00202418,
        0.03549717]),
 array([0.00666438, 0.02828708, 0.00578279, ..., 0.01170573, 0.00331275,
        0.03118495]),
 array([0.00604   , 0.01814798, 0.00657663, ..., 0.01012939, 0.00215094,
        0.03281149]),
 array([0.00777309, 0.02708617, 0.00286457, ..., 0.01174099, 0.00208598,
        0.04165706])]

In [17]:
print(roc_auc_score(valid_df.select("target"),np.mean(model.xgb_clf_pred,axis=0)))
print(gini_stability(valid_df,np.mean(model.xgb_clf_pred,axis=0)))

0.8455786297472909


0.6840164672161458


In [18]:
model.xgb_clf_pred

[array([0.00604226, 0.03829956, 0.00823687, ..., 0.01325773, 0.00612623,
        0.01764909], dtype=float32),
 array([0.00625285, 0.02765135, 0.0059294 , ..., 0.01567665, 0.00602752,
        0.01919378], dtype=float32),
 array([0.00575946, 0.02694468, 0.01035446, ..., 0.01500583, 0.00734547,
        0.02026871], dtype=float32),
 array([0.00565003, 0.02405031, 0.00899982, ..., 0.01346271, 0.00906722,
        0.01613377], dtype=float32),
 array([0.00743908, 0.03265947, 0.00789238, ..., 0.01901342, 0.00543336,
        0.02447376], dtype=float32)]

In [19]:
print(roc_auc_score(valid_df.select("target"),np.mean(model.cat_pred,axis=0)))
print(gini_stability(valid_df,np.mean(model.cat_pred,axis=0)))

0.8459847831045288
0.684467377735524


In [20]:
model.cat_pred

[array([0.00553317, 0.02525808, 0.00740729, ..., 0.01400124, 0.00766944,
        0.02083976]),
 array([0.00576882, 0.02587494, 0.00655486, ..., 0.0185197 , 0.00722916,
        0.02145336]),
 array([0.00549824, 0.02521695, 0.00740571, ..., 0.01208073, 0.00700351,
        0.02139665]),
 array([0.00514528, 0.0227229 , 0.00812103, ..., 0.01130249, 0.00641884,
        0.01870658]),
 array([0.00647916, 0.02738061, 0.00572591, ..., 0.01207945, 0.006707  ,
        0.01917659])]

In [None]:
import numpy as np

In [12]:
ones = [np.full(shape=10,fill_value=1)] * 5
twos = [np.full(shape=10,fill_value=2)] * 5
threes = [np.full(shape=10,fill_value=3)] * 5

In [13]:
[_*3.5 for _ in ones]

[array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5]),
 array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5]),
 array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5]),
 array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5]),
 array([3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5, 3.5])]

In [8]:
np.mean(ones+twos+threes,axis=0)

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])