In [1]:
import os,sys,warnings,re,math,gc,time
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["KERAS_BACKEND"] = "tensorflow"
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
import lightgbm as lgb
tf.get_logger().setLevel("ERROR")
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
# Notebook-wide polars display options: string cells shown up to 100
# characters, "," as thousands separator, floats rendered with 2 decimals
# using the "full" float format, and at most 10 rows per printed table.
# Every setter returns the Config class, so the calls can be chained and the
# cell still evaluates to `polars.config.Config`.
(
    pl.Config
    .set_fmt_str_lengths(100)
    .set_thousands_separator(",")
    .set_float_precision(2)
    .set_fmt_float("full")
    .set_tbl_rows(10)
)

polars.config.Config

In [3]:
# Training base table: the case id (as UInt32), the column whose name contains
# "date" parsed into a Date, and the target label (as UInt8), plus calendar
# features derived from that date.
# NOTE(review): the parquet path is an absolute local path; consider a
# configurable data directory.
train_base = (
    pl.read_parquet("/home/sohail/Downloads/credit_risk/train/train_base.parquet")
    .select(
        case_id_base=pl.col("case_id").cast(pl.UInt32),
        Date=cs.contains("date").str.to_date(),
        target=pl.col("target").cast(pl.UInt8),
    )
    .with_columns(
        month=pl.col("Date").dt.month(),
        week=pl.col("Date").dt.week(),
        weekday=pl.col("Date").dt.weekday(),
        # Year expressed as an offset from 2019.
        year=pl.col("Date").dt.year() - 2019,
    )
    # Reorder columns so that everything except the target comes first and
    # the target ends up as the last column.
    .select(~cs.contains("target"), cs.contains("target"))
)
train_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
0,2019-01-03,1,1,4,0,0
1,2019-01-03,1,1,4,0,0
2,2019-01-04,1,1,5,0,0
3,2019-01-03,1,1,4,0,0
4,2019-01-04,1,1,5,0,1
…,…,…,…,…,…,…
2703450,2020-10-05,10,41,1,1,0
2703451,2020-10-05,10,41,1,1,0
2703452,2020-10-05,10,41,1,1,0
2703453,2020-10-05,10,41,1,1,0


In [4]:
# Test base table, shaped like the training one: case id (as UInt32), the
# column whose name contains "date" parsed into a Date, and the same calendar
# features. The test file is read without a target, so a constant 0 (UInt8)
# placeholder column named "target" is appended.
# NOTE(review): the parquet path is an absolute local path; consider a
# configurable data directory.
test_base = (
    pl.read_parquet("/home/sohail/Downloads/credit_risk/test/test_base.parquet")
    .select(
        case_id_base=pl.col("case_id").cast(pl.UInt32),
        Date=cs.contains("date").str.to_date(),
    )
    .with_columns(
        month=pl.col("Date").dt.month(),
        week=pl.col("Date").dt.week(),
        weekday=pl.col("Date").dt.weekday(),
        # Year expressed as an offset from 2019.
        year=pl.col("Date").dt.year() - 2019,
        target=pl.lit(0).cast(pl.UInt8),
    )
)
test_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
57543,2020-10-06,10,41,2,1,0
57549,2020-10-06,10,41,2,1,0
57551,2020-10-06,10,41,2,1,0
57552,2020-10-07,10,41,3,1,0
57569,2020-10-06,10,41,2,1,0
57630,2020-10-06,10,41,2,1,0
57631,2020-10-06,10,41,2,1,0
57632,2020-10-06,10,41,2,1,0
57633,2020-10-06,10,41,2,1,0
57634,2020-10-06,10,41,2,1,0


In [5]:
# Feature columns removed from the combined feature table before modelling
# (this list is passed to `.drop(...)` when total_df is loaded).
# Each column name is listed exactly once, in the order it was added.
drop_list = [
    "int_shallow_subjectrole_182M",
    "int_shallow_subjectrole_93M",
    "person_depth_empls_employer_name_740M",
    "person_shallow_contaddr_matchlist_1032L",
    "person_shallow_remitter_829L",
    "static_base_applicationcnt_361L",
    "static_base_bankacctype_710L",
    "static_base_commnoinclast6m_3546845L",
    "static_base_deferredmnthsnum_166L",
    "static_base_isdebitcard_729L",
    "static_base_mastercontrelectronic_519L",
    "static_base_mastercontrexist_109L",
    "static_base_paytype1st_925L",
    "static_base_paytype_783L",
    "static_base_typesuite_864L",
    "past_depth_cacccardblochreas_147M",
    "int_shallow_financialinstitution_591M",
    "person_depth_empls_economicalst_849M",
    "static_external_assignmentdate_4527235D",
    "int_shallow_purposeofcred_426M",
    "deposit_shallow_num_group1",
    "static_base_cardtype_51L",
    "reg_a_recorddate_4527225D",
    "static_external_responsedate_4527233D",
    "int_shallow_numberofoverdueinstls_834L",
    "int_shallow_numberofoutstandinstls_520L",
    "person_shallow_housetype_905L",
    "person_depth_conts_role_79M",
    "static_base_clientscnt_493L",
    "card_shallow_num_group1",
    "static_external_responsedate_4917613D",
    "static_base_lastrejectcommodtypec_5251769M",
    "static_base_clientscnt_257L"
]

1424.28 static_base_clientscnt_1130L
1580.23 static_base_clientscnt3m_3712950L
1613.62 int_shallow_description_351M
1723.46 int_shallow_classificationofcontr_13M
2064.03 person_depth_num_group2
2150.15 static_base_equalitydataagreement_891L
2677.09 static_base_numpmtchanneldd_318L
2766.21 static_base_clientscnt_360L
2849.83 static_base_applicationscnt_629L
2902.95 int_shallow_outstandingamount_354A
2908.08 static_base_numinstpaidearly5d_1087L
3120.68 past_shallow_actualdpd_943P
3232.40 past_shallow_credacc_transactions_402L
3279.17 int_shallow_totaloutstanddebtvalue_668A
3315.26 past_shallow_cancelreason_3545846M
3419.50 static_base_clientscnt_946L
3465.42 static_external_responsedate_1012D
3488.22 int_shallow_classificationofcontr_400M
3562.02 static_base_actualdpdtolerance_344P
3571.77 static_base_clientscnt_100L
3585.06 static_base_clientscnt_304L
3682.52 past_shallow_isbidproduct_390L
3720.91 static_external_education_88M
3757.75 static_external_pmtcount_693L
3840.75 person_shallow_contaddr_smempladdr_334L
3952.76 static_base_clientscnt_157L
3955.59 int_shallow_contractst_545M
3994.07 static_external_pmtcount_4527229L
4019.92 static_base_applications30d_658L
4183.73 static_base_clientscnt_1022L
4365.62 static_base_applicationscnt_464L
4915.10 int_shallow_overdueamount_31A

In [6]:
# Load the combined feature table from parquet and discard every column
# named in drop_list.
# NOTE(review): the parquet path is an absolute local path; consider a
# configurable data directory.
total_df = pl.read_parquet("/home/sohail/Downloads/total_df.parquet").drop(drop_list)
total_df

case_id_base,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeof
cred_874M,int_shallow_refreshdate_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
u32,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64,u8
0,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
1,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
2,0.00,0.00,0.67,0.00,-0.01,-1.08,0.00,-1.12,0.00,-0.45,0.00,0.00,0.00,-1.06,0.00,-0.35,-1.58,0.00,0.05,0.00,0.08,0.00,0.08,0.78,0.00,0.61,0.20,0.00,0.78,0.88,0.91,0.00,0.85,0.00,0.61,0.00,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
3,0.00,0.00,0.50,0.00,-0.01,0.13,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,-0.22,0.00,-0.35,-1.50,0.00,0.00,0.00,-0.97,0.00,-0.97,0.78,0.02,1.00,0.20,0.00,0.78,0.88,0.91,0.00,0.99,0.25,1.00,0.00,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
4,0.00,0.00,0.67,0.00,-0.01,-0.84,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,-0.60,0.00,-0.35,-1.36,0.00,0.00,0.00,0.08,0.00,0.08,0.78,0.03,1.00,0.20,0.00,0.78,0.88,0.91,0.20,0.83,0.25,1.00,0.00,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57630,0.82,0.78,0.17,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
57631,0.82,0.78,0.17,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
57632,0.82,0.78,0.17,1.00,-0.01,0.19,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,0.54,0.00,-0.35,0.99,0.00,0.00,0.00,0.08,0.00,0.08,0.78,0.20,1.00,0.20,0.00,0.78,0.88,0.91,0.20,0.83,0.25,1.00,0.00,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
57633,0.82,0.78,0.17,1.00,-0.01,0.50,0.00,0.00,0.00,8.13,0.00,0.00,0.00,2.45,0.00,-0.35,3.77,0.00,0.16,0.00,1.14,0.00,1.14,0.78,0.02,0.86,0.20,0.00,0.78,0.88,0.91,0.20,0.90,0.25,0.85,0.00,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0


In [7]:
# Training rows: every row of total_df whose case id is NOT one of the test
# case ids. The id column is dropped afterwards so it is not used as a feature.
# The test ids are passed to `is_in` as an explicit Series (rather than a
# one-column DataFrame) so the membership test does not rely on implicit
# DataFrame-to-Series coercion.
train_total = (
    total_df
    .filter(~pl.col("case_id_base").is_in(test_base.get_column("case_id_base")))
    .drop("case_id_base")
)
train_total

month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,past_shallow_isbidproduct_390L,…,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int
_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,…,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64,u8
0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,false,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,false,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.00,0.00,0.67,0.00,-0.01,-1.08,0.00,-1.12,0.00,-0.45,0.00,0.00,0.00,-1.06,0.00,-0.35,-1.58,0.00,0.05,0.00,0.08,0.00,0.08,0.78,0.00,0.61,0.20,0.00,0.78,0.88,0.91,0.00,0.85,0.00,0.61,0.00,false,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.00,0.00,0.50,0.00,-0.01,0.13,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,-0.22,0.00,-0.35,-1.50,0.00,0.00,0.00,-0.97,0.00,-0.97,0.78,0.02,1.00,0.20,0.00,0.78,0.88,0.91,0.00,0.99,0.25,1.00,0.00,false,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.00,0.00,0.67,0.00,-0.01,-0.84,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,-0.60,0.00,-0.35,-1.36,0.00,0.00,0.00,0.08,0.00,0.08,0.78,0.03,1.00,0.20,0.00,0.78,0.88,0.91,0.20,0.83,0.25,1.00,0.00,false,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.77,0.87,0.59,0.00,0.00,0.18,0.40,0.06,0.00,0.98,0.79,0.71,0.88,0.70,0.88,0.64,0.36,0.40,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
0.82,0.78,0.00,1.00,-0.01,6.80,-1.19,0.56,-1.77,-0.45,0.18,0.84,-0.46,1.49,-0.63,-0.35,-0.41,-0.17,0.63,-0.55,0.08,0.36,0.08,0.88,0.00,0.80,0.20,0.00,0.88,0.85,0.85,0.20,0.62,0.25,0.80,0.00,false,…,-0.00,0.00,0.01,0.00,0.04,0.17,0.60,0.87,0.87,0.60,0.00,0.09,0.73,0.40,0.19,0.49,0.99,0.89,0.65,0.88,0.64,0.88,0.18,0.73,0.40,0.19,0.18,1.00,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.82,0.78,0.00,1.00,-0.01,1.93,-0.74,-1.12,-1.76,-0.45,0.19,0.86,-0.46,0.81,1.40,-0.35,-0.41,-0.20,0.26,0.94,0.08,1.32,0.08,0.93,0.00,0.93,0.20,0.00,0.64,1.00,1.00,0.20,0.83,0.25,0.64,0.00,false,…,-0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.89,0.91,0.59,0.00,0.00,0.18,0.40,0.38,0.00,0.99,1.00,0.71,0.88,0.70,0.88,0.09,0.36,0.40,0.62,0.00,1.00,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0
0.82,0.78,0.00,1.00,-0.01,0.91,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,0.16,-0.63,-0.35,-0.86,-0.20,0.11,-0.55,-0.79,0.00,-0.79,0.86,0.00,0.86,0.20,0.50,0.92,0.91,0.86,0.20,0.95,0.25,0.86,0.50,false,…,-0.00,0.00,0.00,0.00,0.00,0.17,0.58,0.60,0.68,0.57,0.00,0.91,0.09,0.80,0.31,0.00,0.99,0.90,0.71,0.97,0.70,0.97,0.55,0.09,0.60,0.25,0.00,1.00,0.00,0.06,-1.61,0.05,0.34,0.04,0.00,0.62,0
0.82,0.78,0.00,1.00,-0.01,0.08,0.25,-1.12,-1.75,-0.45,0.21,0.87,10.12,0.99,0.54,-0.35,0.32,-0.04,0.42,0.48,2.19,1.33,2.19,0.94,0.00,0.50,0.20,0.00,0.94,0.81,1.00,0.20,0.83,0.25,0.50,0.00,false,…,-0.00,0.00,0.01,0.00,0.04,0.18,0.57,0.91,0.92,0.58,0.00,0.18,0.73,0.80,0.06,0.49,0.99,0.82,0.69,0.88,0.66,0.88,0.64,0.73,0.60,0.06,0.18,1.00,0.00,0.06,0.00,0.06,0.55,0.04,0.00,0.62,0


In [8]:
# Submission rows: the rows of total_df whose case id appears in test_base.
# The target column is dropped; case_id_base is kept because it is needed to
# label the predictions when the submission file is written.
# The test ids are passed to `is_in` as an explicit Series (rather than a
# one-column DataFrame) so the membership test does not rely on implicit
# DataFrame-to-Series coercion.
submission_df = (
    total_df
    .filter(pl.col("case_id_base").is_in(test_base.get_column("case_id_base")))
    .drop(["target"])
)
submission_df

case_id_base,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,int_shallow_totaloutstanddebtvalue_39A,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamoun
tmaxdateyear_994T,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D
u32,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64
57543,0.82,0.78,0.17,1.0,-0.01,-1.12,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.64,0.0,-0.35,-0.97,0.0,0.0,0.0,0.7,0.0,0.7,0.78,0.02,0.85,0.2,0.5,0.78,0.88,0.91,0.2,0.83,0.25,0.84,0.5,…,-0.11,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.82,0.59,0.0,0.0,0.18,0.8,0.06,0.0,0.99,0.79,0.71,1.0,0.7,0.95,0.64,0.09,0.0,0.06,0.0,0.99,0.0,0.06,-0.39,0.04,0.84,0.04,0.0,0.62
57549,0.82,0.78,0.17,1.0,-0.01,-0.98,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.98,0.0,-0.35,-1.47,0.0,0.0,0.0,-0.97,0.0,-0.97,0.78,0.02,0.9,0.2,0.0,0.78,0.88,0.91,0.0,0.94,0.0,0.9,0.0,…,0.0,-0.0,0.0,0.01,0.0,0.04,0.18,0.61,0.66,0.87,0.6,0.0,0.55,0.18,0.4,1.0,0.03,0.98,0.98,0.98,0.88,0.92,0.88,0.73,0.36,0.4,0.62,0.18,0.99,0.0,0.06,-2.14,0.11,0.64,0.04,0.0,0.62
57551,0.82,0.78,0.17,1.0,-0.01,-0.07,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.22,0.0,-0.35,-0.41,0.0,0.05,0.0,-0.44,0.0,-0.44,0.78,0.02,0.94,0.2,0.0,0.78,0.88,0.91,0.0,0.95,0.25,0.94,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57552,0.82,0.78,0.33,1.0,-0.01,-0.23,0.0,-0.28,0.0,-0.45,0.0,0.0,0.0,-0.41,0.0,-0.35,-1.36,0.0,0.0,0.0,-0.44,0.0,-0.44,0.78,0.02,0.75,0.2,0.0,0.78,0.88,0.91,0.0,0.83,0.25,0.75,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57569,0.82,0.78,0.17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0.0,0.78,0.2,0.5,0.78,0.88,0.91,0.2,0.83,0.25,0.76,0.5,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57630,0.82,0.78,0.17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0.0,0.78,0.2,0.5,0.78,0.88,0.91,0.2,0.83,0.25,0.76,0.5,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57631,0.82,0.78,0.17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0.0,0.78,0.2,0.5,0.78,0.88,0.91,0.2,0.83,0.25,0.76,0.5,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57632,0.82,0.78,0.17,1.0,-0.01,0.19,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,0.54,0.0,-0.35,0.99,0.0,0.0,0.0,0.08,0.0,0.08,0.78,0.2,1.0,0.2,0.0,0.78,0.88,0.91,0.2,0.83,0.25,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57633,0.82,0.78,0.17,1.0,-0.01,0.5,0.0,0.0,0.0,8.13,0.0,0.0,0.0,2.45,0.0,-0.35,3.77,0.0,0.16,0.0,1.14,0.0,1.14,0.78,0.02,0.86,0.2,0.0,0.78,0.88,0.91,0.2,0.9,0.25,0.85,0.0,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62
57634,0.82,0.78,0.17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0.0,0.78,0.2,0.5,0.78,0.88,0.91,0.2,0.83,0.25,0.76,0.5,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.77,0.87,0.59,0.0,0.0,0.18,0.4,0.06,0.0,0.98,0.79,0.71,0.88,0.7,0.88,0.64,0.36,0.4,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.55,0.04,0.0,0.62


In [9]:
# Split the training rows into a 90% training part and a 10% validation part.
# The split is shuffled, stratified on the target column, and seeded so it is
# reproducible.
train_df, valid_df = train_test_split(
    train_total,
    test_size=0.1,
    shuffle=True,
    random_state=420,
    stratify=train_total.select("target"),
)

In [10]:
# Show the (rows, columns) shape of the training and validation splits,
# one per line.
print(train_df.shape, valid_df.shape, sep="\n")

(1373993, 318)
(152666, 318)


In [11]:
# Separate features from labels for both splits. Features are every column
# except "target"; features and labels are all cast to Float32.
X_train = train_df.select((~(cs.by_name("target"))).cast(pl.Float32))
y_train = train_df.select(pl.col("target").cast(pl.Float32))

X_valid = valid_df.select((~(cs.by_name("target"))).cast(pl.Float32))
y_valid = valid_df.select(pl.col("target").cast(pl.Float32))

# The split frames are no longer needed once X/y have been extracted.
del train_df, valid_df

In [12]:
# Keyword arguments for lgb.LGBMClassifier: a binary-objective GBDT evaluated
# with AUC, using a small learning rate with a large cap on the number of
# trees, column subsampling, L1/L2 regularisation, and GPU training.
params = dict(
    objective="binary",
    boosting_type="gbdt",
    metric="auc",
    max_depth=128,
    learning_rate=0.005,
    n_estimators=20000,
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    random_state=42,
    reg_alpha=0.15,
    reg_lambda=20,
    extra_trees=True,
    num_leaves=512,
    device="gpu",
    importance_type="gain",
    verbose=-1,
)

In [13]:
# Train the LightGBM classifier on the training split, evaluating on the
# validation split. Progress is logged every 100 iterations and training
# stops early once the validation score has not improved for 2000 rounds.
fit_callbacks = [lgb.log_evaluation(100), lgb.early_stopping(2000)]

model = lgb.LGBMClassifier(**params)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=fit_callbacks,
)

Training until validation scores don't improve for 2000 rounds
[100]	valid_0's auc: 0.810998
[200]	valid_0's auc: 0.818268
[300]	valid_0's auc: 0.824718
[400]	valid_0's auc: 0.830101
[500]	valid_0's auc: 0.834517
[600]	valid_0's auc: 0.83826
[700]	valid_0's auc: 0.841174
[800]	valid_0's auc: 0.843504
[900]	valid_0's auc: 0.845493
[1000]	valid_0's auc: 0.847105
[1100]	valid_0's auc: 0.848473
[1200]	valid_0's auc: 0.849679
[1300]	valid_0's auc: 0.850707
[1400]	valid_0's auc: 0.851662
[1500]	valid_0's auc: 0.852517
[1600]	valid_0's auc: 0.853303
[1700]	valid_0's auc: 0.853966
[1800]	valid_0's auc: 0.85458
[1900]	valid_0's auc: 0.855138
[2000]	valid_0's auc: 0.855628
[2100]	valid_0's auc: 0.856108
[2200]	valid_0's auc: 0.85655
[2300]	valid_0's auc: 0.85698
[2400]	valid_0's auc: 0.857365
[2500]	valid_0's auc: 0.857712
[2600]	valid_0's auc: 0.858049
[2700]	valid_0's auc: 0.858347
[2800]	valid_0's auc: 0.858659
[2900]	valid_0's auc: 0.858901
[3000]	valid_0's auc: 0.859155
[3100]	valid_0's auc

- 863425 normal
- 863725 drop gain 256 num leaves
- 863586 normal
- 863408 drop split
- 863139 512 num leaves
- 863088 512 256
- 863195 0.8,0.8
- 86323 0.75, 0.75

In [None]:
# Print each feature's importance next to its column name, in ascending
# order of importance (ties are ordered by column name, since the pairs are
# sorted as tuples).
importance_pairs = zip(model.feature_importances_, X_train.columns)
for importance, feature in sorted(importance_pairs):
    print(f"{importance:.2f} {feature:>10}")

1071.31 static_base_clientscnt_1130L
1416.14 person_depth_num_group2
1419.94 int_shallow_description_351M
1543.99 static_base_clientscnt3m_3712950L
1925.95 static_base_numinstpaidearly5d_1087L
2341.40 static_base_clientscnt_360L
2371.87 past_shallow_credacc_transactions_402L
2458.85 int_shallow_classificationofcontr_400M
2470.53 static_base_applicationscnt_629L
2474.84 static_base_numpmtchanneldd_318L
2650.44 int_shallow_totaloutstanddebtvalue_668A
2903.52 static_base_applications30d_658L
2904.02 past_shallow_actualdpd_943P
2933.90 int_shallow_outstandingamount_354A
2945.05 static_base_actualdpdtolerance_344P
3002.40 int_shallow_classificationofcontr_13M
3070.16 static_external_pmtcount_693L
3088.79 static_base_clientscnt_946L
3102.34 static_base_clientscnt_1022L
3111.35 static_base_clientscnt_304L
3151.88 static_external_responsedate_1012D
3241.94 static_base_clientscnt_100L
3334.88 static_base_clientscnt_157L
3883.20 static_external_pmtcount_4527229L
3938.57 static_external_education

In [None]:
# Score the submission rows and write them to ./submission.csv, indexed by
# case_id with a single "score" column holding the second column of
# predict_proba (the probability of the second class).
#
# The features are every column except case_id_base, cast to Float32 in the
# same way X_train / X_valid were cast before fitting, so the model receives
# inputs at the same numeric precision at inference time as during training.
submission_features = submission_df.select(
    (~cs.by_name("case_id_base")).cast(pl.Float32)
)
sub_df = pd.DataFrame({
    "case_id": submission_df["case_id_base"].to_list(),
    "score": model.predict_proba(submission_features)[:, 1],
}).set_index("case_id")
sub_df.to_csv("./submission.csv")