In [1]:
import os,sys,warnings,re,math,gc,time
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold,StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_thousands_separator(",")
pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")
pl.Config.set_tbl_rows(6)

polars.config.Config

In [3]:
path_to_train = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train"
path_to_test = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test"
path_to_features = "/kaggle/input/home-credit-credit-risk-model-stability/feature_definitions.csv"

In [4]:
feat_df = pl.read_csv(path_to_features)
# feat_df.head()

In [5]:
all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = pl.DataFrame({"index":range(len(all_train_files)),"path":all_train_files})
test_files_df = pl.DataFrame({"index":range(len(all_test_files)),"path":all_test_files})
train_files_df = (
    train_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
test_files_df = (
    test_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
# display(train_files_df.head())
# display(test_files_df.head())

In [6]:
train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)
# train_base

In [7]:
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)
# test_base

In [8]:
total_base = (
    pl.concat(
        [
            train_base,
            test_base
        ],
        how="vertical"
    )
)

In [9]:
def reduce_dtypes(df:pl.DataFrame):
    return (
        df
        .with_columns(
            cs.by_name("case_id").cast(pl.UInt32),
            cs.contains("num_group").cast(pl.UInt16),
            cs.ends_with("D").cast(pl.Date),
            cs.ends_with("T","M").cast(pl.String),
            cs.ends_with("P","A").cast(pl.Float32),
            (cs.ends_with("L") & cs.numeric()).cast(pl.Float32),
        )
    )

def grouping(df):
    return (
        df
        .group_by("case_id")
        .agg(
            cs.numeric().max(),
            (~cs.numeric()).drop_nulls().mode().first()
        )
    )

def preprocess(filter_string:str,prefix_string:str):
    train_files_list = train_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    test_files_list = test_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    with pl.StringCache():
        train_df = (
            pl.concat(
                [
                    pl.read_parquet(_).pipe(reduce_dtypes).pipe(grouping) for _ in train_files_list
                ]
            )
        )
        test_df = (
            pl.concat(
                [
                    pl.read_parquet(_).select(train_df.columns).cast(train_df.schema).pipe(grouping) for _ in test_files_list
                ]
            )
        )
    return (
        pl.concat(
            [
                train_df,
                test_df
            ],
            how="vertical_relaxed"
        )
        .pipe(lambda df: df.rename({_:f"{prefix_string}_{_}" for _ in df.columns if not _ == 'case_id'}))
    )

def select_low_catcols(df:pl.DataFrame,thresh=200):
    col_names = []
    for col_name in df.select(cs.categorical()).columns:
        if df.select(pl.col(col_name).value_counts()).shape[0] > thresh:
            col_names.append(col_name)
    return df.select(~cs.by_name(col_names))
    

def select_impuatable(df:pl.DataFrame,thresh=0.95):
    cols =  (
        df
        .select(pl.all().is_null().mean())
        .transpose(include_header=True)
        .filter(pl.col("column_0") < thresh)
        ["column"].to_list()
    )
    return df.select(cols)

In [10]:
total_past_shallow = preprocess("applprev_1","past_shallow")
total_past_depth = preprocess("applprev_2","past_depth")
total_static_base = preprocess("static_0","static_base")
total_static_external = (
    preprocess("static_cb","static_external")
    .with_columns(
        pl.col("static_external_riskassesment_302T").str.split("%").list.gather([0,1]).apply(lambda x: (int(x[0])+int(x[1].split("-")[1]))/200)
    )
)
total_person_shallow = preprocess("person_1","person_shallow")
total_person_depth = preprocess("person_2","person_depth")
total_other_shallow = preprocess("other_1","other_shallow")
total_deposit_shallow = preprocess("deposit_1","deposit_shallow")
total_debitcard_shallow = preprocess("debitcard","card_shallow")
total_credit_internal_shallow = preprocess("bureau_a_1","int_shallow")
total_credit_internal_depth = preprocess("bureau_a_2","int_depth")
total_credit_external_shallow = preprocess("bureau_b_1","ext_shallow")
total_credit_external_depth = preprocess("bureau_b_2","ext_depth")
total_registry_a = preprocess("registry_a","reg_a")
total_registry_b = preprocess("registry_b","reg_b")
total_registry_c = preprocess("registry_c","reg_c")

In [11]:
select_columns = [
    'case_id_base',
    'static_base_numinstpaidearly5dest_4493211L',
    'reg_c_num_group1',
    'static_base_applicationscnt_1086L',
    'static_base_clientscnt6m_3712949L',
    'int_shallow_refreshdate_3813885D',
    'past_shallow_credacc_status_367L',
    'int_shallow_contractst_545M',
    'static_base_numinstpaidearlyest_4493214L',
    'int_shallow_periodicityofpmts_1102L',
    'person_shallow_contaddr_smempladdr_334L',
    'static_base_clientscnt_887L',
    'past_shallow_isbidproduct_390L',
    'static_base_numinstpaidearly5dobd_4499205L',
    'static_base_numactiverelcontr_750L',
    'static_base_avgdbdtollast24m_4525197P',
    'int_shallow_lastupdate_1112D',
    'static_base_lastrejectcommoditycat_161M',
    'static_base_numcontrs3months_479L',
    'static_base_numinstregularpaidest_4493210L',
    'past_shallow_cancelreason_3545846M',
    'int_shallow_contractst_964M',
    'static_base_clientscnt12m_3712952L',
    'static_base_numinstpaid_4499208L',
    'int_shallow_purposeofcred_874M',
    'int_shallow_numberofcontrsvalue_258L',
    'static_base_numinstpaidearly_338L',
    'static_base_numinstlswithdpd5_4187116L',
    'int_shallow_totaldebtoverduevalue_718A',
    'static_base_maxdbddpdtollast6m_4187119P',
    'static_external_maritalst_893M',
    'static_base_numnotactivated_1143L',
    'past_depth_credacc_cards_status_52L',
    'past_depth_num_group2',
    'static_base_clientscnt_1071L',
    'reg_b_num_group1',
    'static_external_secondquarter_766L',
    'static_base_numinstpaidearly3dest_4493216L',
    'past_shallow_childnum_21L',
    'static_base_numinstpaidlastcontr_4325080L',
    'int_shallow_periodicityofpmts_837L',
    'static_base_numinstregularpaid_973L',
    'deposit_shallow_openingdate_313D',
    'card_shallow_openingdate_857D',
    'static_base_maxdpdlast3m_392P',
    'static_base_numinstls_657L',
    'static_base_numactivecreds_622L',
    'int_shallow_prolongationcount_1120L',
    'static_base_sellerplacecnt_915L',
    'person_shallow_personindex_1023L',
    'static_external_thirdquarter_1082L',
    'static_external_fourthquarter_440L',
    'static_external_firstquarter_103L',
    'static_base_mindbdtollast24m_4525191P',
    'static_base_lastcancelreason_561M',
    'static_base_avgdbddpdlast3m_4187120P',
    'static_base_maxdpdinstlnum_3546846P',
    'person_shallow_type_25L',
    'static_base_validfrom_1069D',
    'static_base_numinstpaidlate1d_3546852L',
    'static_base_maxdbddpdlast1m_3658939P',
    'reg_b_deductiondate_4917603D',
    'past_shallow_credacc_minhisbal_90A',
    'person_shallow_num_group1',
    'static_base_numinstmatpaidtearly2d_4499204L',
    'int_shallow_contractsum_5085717L',
    'static_base_avgdbddpdlast24m_3658932P',
    'static_external_contractssum_5085716L',
    'static_base_applicationscnt_867L',
    'static_base_lastapprcommoditycat_1041M',
    'person_shallow_empl_industry_691L',
    'static_base_numinstpaidearly3d_3546850L',
    'static_base_cntincpaycont9m_3716944L',
    'static_base_sellerplacescnt_216L',
    'past_shallow_credacc_maxhisbal_375A',
    'static_base_numinstunpaidmaxest_4493212L',
    'int_shallow_num_group1',
    'static_base_numinstlallpaidearly3d_817L',
    'static_base_avglnamtstart24m_4525187A',
    'person_depth_num_group1',
    'static_base_numinsttopaygrest_4493213L',
    'past_depth_conts_type_509L',
    'static_external_assignmentdate_238D',
    'int_shallow_numberofinstls_229L',
    'past_shallow_revolvingaccount_394A',
    'int_shallow_numberofoverdueinstls_725L',
    'static_base_numinstlswithoutdpd_562L',
    'static_base_numincomingpmts_3546848L',
    'int_shallow_financialinstitution_382M',
    'static_base_homephncnt_628L',
    'static_base_lastst_736L',
    'static_base_clientscnt_533L',
    'static_base_numinstlsallpaid_934L',
    'static_base_mindbddpdlast24m_3658935P',
    'static_external_pmtscount_423L',
    'static_base_numinstunpaidmax_3546851L',
    'static_base_numactivecredschannel_414L',
    'int_shallow_numberofoutstandinstls_59L',
    'static_base_opencred_647L',
    'static_external_pmtaverage_4527227A',
    'static_base_monthsannuity_845L',
    'deposit_shallow_amount_416A',
    'static_base_maxdpdlast6m_474P',
    'static_base_maxlnamtstart6m_4525199A',
    'static_base_totinstallast1m_4525188A',
    'static_base_maxpmtlast3m_4525190A',
    'reg_b_amount_4917619A',
    'static_base_avgpmtlast12m_4525200A',
    'person_shallow_education_927M',
    'past_shallow_rejectreason_755M',
    'past_shallow_isdebitcard_527L',
    'past_shallow_credacc_actualbalance_314A',
    'static_base_dtlastpmtallstes_4499206D',
    'static_base_numinsttopaygr_769L',
    'static_base_lastrejectreasonclient_4145040M',
    'person_shallow_safeguarantyflag_411L',
    'static_external_pmtaverage_3A',
    'static_base_numrejects9m_859L',
    'static_base_maxdbddpdtollast12m_3658940P',
    'past_shallow_postype_4733339M',
    'static_external_numberofqueries_373L',
    'static_base_avgmaxdpdlast9m_3716943P',
    'static_base_posfpd10lastmonth_333P',
    'int_shallow_numberofinstls_320L',
    'static_external_days360_512L',
    'static_base_lastactivateddate_801D',
    'static_base_maxoutstandbalancel12m_4187113A',
    'static_base_datefirstoffer_1144D',
    'static_base_lastapprdate_640D',
    'static_base_numinstlswithdpd10_728L',
    'static_base_maxdpdtolerance_374P',
    'int_shallow_numberofcontrsvalue_358L',
    'reg_c_processingdate_168D',
    'static_base_maxdpdfrom6mto36m_3546853P',
    'static_base_twobodfilling_608L',
    'static_base_annuitynextmonth_57A',
    'static_base_avginstallast24m_3658937A',
    'static_base_sumoutstandtotalest_4493215A',
    'static_base_downpmt_116A',
    'static_base_maininc_215A',
    'static_base_daysoverduetolerancedd_3976961L',
    'past_shallow_dateactivated_425D',
    'past_shallow_approvaldate_319D',
    'static_base_currdebtcredtyperange_828A',
    'static_base_avgdpdtolclosure24_3658938P',
    'past_shallow_credamount_590A',
    'person_shallow_persontype_792L',
    'person_shallow_persontype_1072L',
    'static_base_avgoutstandbalancel6m_4187114A',
    'int_shallow_nominalrate_498L',
    'static_base_maxdpdlast9m_1059P',
    'year',
    'past_shallow_num_group1',
    'past_depth_num_group1',
    'past_shallow_dtlastpmt_581D',
    'static_base_maxinstallast24m_3658928A',
    'static_external_birthdate_574D',
    'int_shallow_overdueamount_659A',
    'int_shallow_nominalrate_281L',
    'reg_a_num_group1',
    'int_shallow_totaldebtoverduevalue_178A',
    'static_external_requesttype_4525192L',
    'static_external_days30_165L',
    'static_base_inittransactioncode_186L',
    'int_shallow_dateofcredend_353D',
    'static_base_posfpd30lastmonth_3976960P',
    'static_base_lastapprcredamount_781A',
    'past_shallow_rejectreasonclient_4145042M',
    'past_shallow_dtlastpmtallstes_3545839D',
    'int_shallow_dpdmaxdateyear_896T',
    'static_base_posfstqpd30lastmonth_3976962P',
    'past_shallow_mainoccupationinc_437A',
    'int_shallow_overdueamountmaxdatemonth_284T',
    'past_shallow_creationdate_885D',
    'static_base_inittransactionamount_650A',
    'static_base_lastrejectreason_759M',
    'past_shallow_currdebt_94A',
    'int_shallow_instlamount_852A',
    'int_shallow_outstandingamount_362A',
    'static_base_sumoutstandtotal_3546847A',
    'int_shallow_overdueamountmaxdateyear_994T',
    'static_base_lastrejectcredamount_222A',
    'person_shallow_language1_981M',
    'past_shallow_firstnonzeroinstldate_307D',
    'reg_c_pmtamount_36A',
    'int_shallow_annualeffectiverate_63L',
    'person_shallow_mainoccupationinc_384A',
    'int_shallow_lastupdate_388D',
    'past_shallow_byoccupationinc_3656910L',
    'static_base_lastapplicationdate_877D',
    'past_shallow_outstandingdebt_522A',
    'person_shallow_relationshiptoclient_415T',
    'int_shallow_monthlyinstlamount_674A',
    'int_shallow_credlmt_230A',
    'static_base_maxdebt4_972A',
    'past_shallow_downpmt_134A',
    'int_shallow_dpdmaxdatemonth_442T',
    'int_shallow_dateofcredstart_181D',
    'static_base_credtype_322L',
    'past_shallow_maxdpdtolerance_577P',
    'int_shallow_dateofcredend_289D',
    'int_shallow_annualeffectiverate_199L',
    'int_shallow_dateofrealrepmt_138D',
    'static_base_firstclxcampaign_1125D',
    'past_shallow_credtype_587L',
    'static_external_maritalst_385M',
    'static_base_currdebt_22A',
    'static_base_firstdatedue_489D',
    'static_base_totaldebt_9A',
    'static_external_days180_256L',
    'person_shallow_role_1084L',
    'int_shallow_overdueamountmaxdatemonth_365T',
    'int_shallow_dpdmaxdatemonth_89T',
    'past_shallow_tenor_203L',
    'past_shallow_credacc_credlmt_575A',
    'past_shallow_pmtnum_8L',
    'int_shallow_totaloutstanddebtvalue_39A',
    'static_base_maxannuity_159A',
    'past_shallow_annuity_853A',
    'person_shallow_relationshiptoclient_642T',
    'int_shallow_overdueamountmax2_398A',
    'int_shallow_monthlyinstlamount_332A',
    'static_base_maxdpdlast12m_727P',
    'int_shallow_overdueamountmax_35A',
    'int_shallow_debtoverdue_47A',
    'int_shallow_credlmt_935A',
    'static_base_maxdpdinstldate_3546855D',
    'static_external_days90_310L',
    'static_base_disbursementtype_67L',
    'static_external_days120_123L',
    'person_shallow_empl_employedtotal_800L',
    'static_base_amtinstpaidbefduel24m_4187115A',
    'week',
    'past_shallow_inittransactioncode_279L',
    'int_shallow_instlamount_768A',
    'int_shallow_debtoutstand_525A',
    'static_base_datelastinstal40dpd_247D',
    'static_base_totalsettled_863A',
    'static_base_maxdpdlast24m_143P',
    'static_base_pctinstlsallpaidearl3d_427L',
    'int_shallow_totalamount_996A',
    'month',
    'past_shallow_familystate_726L',
    'static_base_datelastunpaid_3546854D',
    'past_shallow_education_1138M',
    'person_shallow_familystate_447L',
    'int_shallow_residualamount_488A',
    'int_shallow_overdueamountmax2date_1002D',
    'static_external_education_1103M',
    'static_base_mobilephncnt_593L',
    'static_base_cntpmts24_3658933L',
    'static_external_description_5085714M',
    'static_base_lastdelinqdate_224D',
    'weekday',
    'static_base_eir_270L',
    'static_base_interestrate_311L',
    'int_shallow_overdueamountmaxdateyear_2T',
    'static_base_disbursedcredamount_1113A',
    'static_external_dateofbirth_337D',
    'static_base_pctinstlsallpaidlat10d_839L',
    'int_shallow_overdueamountmax2date_1142D',
    'static_base_credamount_770A',
    'person_shallow_empl_employedfrom_271D',
    'static_base_annuity_780A',
    'int_shallow_overdueamountmax2_14A',
    'int_shallow_dateofcredstart_739D',
    'person_shallow_incometype_1044T',
    'int_shallow_totalamount_6A',
    'past_shallow_employedfrom_700D',
    'int_shallow_numberofoverdueinstlmaxdat_148D',
    'int_shallow_numberofoverdueinstlmaxdat_641D',
    'reg_a_amount_4527230A',
    'int_shallow_residualamount_856A',
    'static_external_pmtssum_45A',
    'static_base_price_1097A',
    'int_shallow_dpdmaxdateyear_596T',
    'static_base_isbidproduct_1095L',
    'int_shallow_dpdmax_757P',
    'static_base_pmtnum_254L',
    'int_shallow_numberofoverdueinstlmax_1039L',
    'static_base_pctinstlsallpaidlate6d_3546844L',
    'int_shallow_numberofoverdueinstlmax_1151L',
    'int_shallow_dpdmax_139P',
    'person_shallow_birth_259D',
    'static_base_pctinstlsallpaidlate4d_3546849L',
    'static_base_lastrejectdate_50D',
    'past_shallow_status_219L',
    'int_shallow_overdueamountmax_155A',
    'static_base_pctinstlsallpaidlate1d_3546856L',
    'person_shallow_sex_738L',
    'target'
]

In [12]:
total_df = (
    total_base
    .join(
        total_past_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_past_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_base,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_external,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_other_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_deposit_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_debitcard_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_a,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_b,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_c,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .with_columns(
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.String).cast(pl.Categorical)
    )
    .select(select_columns)
    .with_columns(
        pl.col(pl.INTEGER_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.FLOAT_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.Boolean).fill_null(value=pl.col(pl.Boolean).drop_nulls().mode().first()),
        pl.col(pl.Categorical).fill_null(value=pl.col(pl.Categorical).drop_nulls().mode().first())
    )
    .with_columns(
        pl.col(pl.Categorical).to_physical()
    )
)
cat_cols = total_df.select((cs.integer() | cs.boolean()).exclude(["case_id_base","target"])).columns
cont_cols = total_df.select(cs.float()).columns
total_df = (
    total_df
    .with_columns(
        (pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]) - pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min())/(pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).max()-pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min()).cast(pl.Float32),
        (pl.col(pl.FLOAT_DTYPES) - pl.col(pl.FLOAT_DTYPES).mean())/(pl.col(pl.FLOAT_DTYPES).std()).cast(pl.Float32)
    )
)
total_df

case_id_base,static_base_numinstpaidearly5dest_4493211L,reg_c_num_group1,static_base_applicationscnt_1086L,static_base_clientscnt6m_3712949L,int_shallow_refreshdate_3813885D,past_shallow_credacc_status_367L,int_shallow_contractst_545M,static_base_numinstpaidearlyest_4493214L,int_shallow_periodicityofpmts_1102L,person_shallow_contaddr_smempladdr_334L,static_base_clientscnt_887L,past_shallow_isbidproduct_390L,static_base_numinstpaidearly5dobd_4499205L,static_base_numactiverelcontr_750L,static_base_avgdbdtollast24m_4525197P,int_shallow_lastupdate_1112D,static_base_lastrejectcommoditycat_161M,static_base_numcontrs3months_479L,static_base_numinstregularpaidest_4493210L,past_shallow_cancelreason_3545846M,int_shallow_contractst_964M,static_base_clientscnt12m_3712952L,static_base_numinstpaid_4499208L,int_shallow_purposeofcred_874M,int_shallow_numberofcontrsvalue_258L,static_base_numinstpaidearly_338L,static_base_numinstlswithdpd5_4187116L,int_shallow_totaldebtoverduevalue_718A,static_base_maxdbddpdtollast6m_4187119P,static_external_maritalst_893M,static_base_numnotactivated_1143L,past_depth_credacc_cards_status_52L,past_depth_num_group2,static_base_clientscnt_1071L,reg_b_num_group1,static_external_secondquarter_766L,…,static_base_eir_270L,static_base_interestrate_311L,int_shallow_overdueamountmaxdateyear_2T,static_base_disbursedcredamount_1113A,static_external_dateofbirth_337D,static_base_pctinstlsallpaidlat10d_839L,int_shallow_overdueamountmax2date_1142D,static_base_credamount_770A,person_shallow_empl_employedfrom_271D,static_base_annuity_780A,int_shallow_overdueamountmax2_14A,int_shallow_dateofcredstart_739D,person_shallow_incometype_1044T,int_shallow_totalamount_6A,past_shallow_employedfrom_700D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,reg_a_amount_4527230A,int_shallow_residualamount_856A,static_external_pmtssum_45A,static_base_price_1097A,int_shallow_dpdmaxdateyear_596T,static_base_isbidproduct_1095L,int_shallow_dpdmax_757P,static_base_pmtnum_254L,int_shallow_numberofoverdueinstlmax_1039L,static_base_pctinstlsallpaidlate6d_3546844L,int_shallow_numberofoverdueinstlmax_1151L,int_shallow_dpdmax_139P,person_shallow_birth_259D,static_base_pctinstlsallpaidlate4d_3546849L,static_base_lastrejectdate_50D,past_shallow_status_219L,int_shallow_overdueamountmax_155A,static_base_pctinstlsallpaidlate1d_3546856L,person_shallow_sex_738L,target
u32,f32,f32,f32,f32,f64,f64,f64,f32,f32,bool,f32,bool,f32,f32,f32,f64,f64,f32,f32,f64,f64,f32,f32,f64,f32,f32,f32,f32,f32,f64,f32,f64,f32,f32,f32,f32,…,f32,f32,f64,f32,f64,f32,f64,f32,f64,f32,f32,f64,f64,f32,f64,f64,f64,f32,f32,f32,f32,f64,bool,f32,f32,f32,f32,f32,f32,f64,f32,f64,f64,f32,f32,f64,u8
0,0.00,0.04,-0.14,-0.03,0.99,0.20,0.00,0.00,0.00,false,-0.06,false,0.00,-0.52,0.00,0.99,0.00,-0.34,0.00,0.00,0.00,-0.05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.14,0.00,0.09,-0.18,0.06,0.00,…,0.94,0.94,0.29,-0.38,0.63,0.00,0.88,-0.45,0.98,-0.71,0.00,0.90,0.00,0.00,0.83,0.70,0.88,0.00,0.00,0.00,0.00,0.33,false,0.00,0.72,0.00,0.00,0.00,0.00,0.79,0.00,0.84,0.20,0.00,0.00,0.00,0
1,0.00,0.04,-0.14,-0.03,0.99,0.20,0.00,0.00,0.00,false,-0.06,false,0.00,-0.52,0.00,0.99,0.00,-0.34,0.00,0.00,0.00,-0.05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.14,0.00,0.09,-0.18,0.06,0.00,…,0.12,0.12,0.29,-0.61,0.63,0.00,0.88,-0.68,0.82,-0.30,0.00,0.90,0.00,0.00,0.83,0.70,0.88,0.00,0.00,0.00,0.00,0.33,false,0.00,0.10,0.00,0.00,0.00,0.00,0.27,0.00,0.84,0.20,0.00,0.00,1.00,0
2,0.00,0.04,-0.14,-0.03,0.99,0.20,0.00,0.00,0.00,false,-0.06,false,0.00,-0.52,0.00,0.99,0.00,-0.34,0.00,0.00,0.00,-0.05,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-0.14,0.00,0.09,-0.18,0.06,0.00,…,0.94,0.94,0.29,0.75,0.63,0.00,0.88,0.64,0.84,0.30,0.00,0.90,0.12,0.00,0.85,0.70,0.88,0.00,0.00,0.00,0.00,0.33,false,0.00,1.96,0.00,0.00,0.00,0.00,0.58,0.00,0.60,0.00,0.00,0.00,0.00,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57632,4.29,0.04,-0.14,-0.03,0.99,0.20,0.00,-0.03,0.00,false,-0.06,false,-0.03,-0.52,-0.19,0.99,0.00,-0.34,-0.68,0.00,0.00,-0.05,-0.68,0.00,0.00,-0.01,-0.59,0.00,-0.19,0.00,-0.14,0.00,0.09,-0.18,0.06,-0.53,…,-1.50,-1.50,0.29,-0.47,0.48,-0.59,0.88,-0.54,0.89,0.23,0.00,0.90,0.25,0.00,0.83,0.70,0.88,0.00,0.00,0.00,-0.27,0.33,false,0.00,-1.15,0.00,-0.66,0.00,0.00,0.57,-0.72,0.84,0.20,0.00,-0.93,0.00,0
57633,0.00,0.04,-0.14,-0.03,1.00,0.20,0.11,0.00,-0.07,false,-0.06,false,0.00,-0.52,0.00,0.99,0.07,-0.34,0.00,0.00,0.04,-0.05,0.00,0.22,3.85,0.00,0.00,-0.02,0.00,0.00,-0.14,0.00,0.09,-0.18,0.06,0.10,…,0.61,0.61,1.00,3.62,0.76,0.00,0.88,3.40,0.89,1.41,-0.01,0.88,0.25,0.35,0.83,0.70,0.88,0.00,0.58,0.00,-1.08,1.00,false,-0.27,3.21,-0.15,0.00,-0.26,-0.14,0.57,0.00,0.85,0.20,-0.01,0.00,0.00,0
57634,-1.09,0.04,-0.14,-0.03,0.99,0.20,0.00,-0.97,0.00,false,-0.06,false,-0.97,-0.52,-0.18,0.99,0.00,-0.34,-1.03,0.00,0.00,-0.05,-1.04,0.00,0.00,-0.77,-0.59,0.00,-0.17,0.00,-0.14,0.00,0.09,-0.18,0.06,-0.85,…,-0.00,-0.00,0.29,-0.80,0.63,-0.59,0.88,-0.85,0.89,-0.96,0.00,0.90,0.25,0.00,0.83,0.70,0.88,0.00,0.00,0.00,-0.64,0.33,false,0.00,-0.53,0.00,-0.66,0.00,0.00,0.57,-0.72,0.84,0.20,0.00,-0.93,0.00,0


In [13]:
train_total = total_df.filter(pl.col("case_id_base").is_in(test_base.select("case_id_base")).not_()).drop("case_id_base")
submission_df = total_df.filter(pl.col("case_id_base").is_in(test_base.select("case_id_base"))).drop(["target"])

In [14]:
del total_df
del train_files_df
del test_files_df
del test_base
del train_base
del total_base
del total_past_shallow
del total_past_depth
del total_static_base
del total_static_external
del total_person_depth
del total_person_shallow
del total_other_shallow
del total_deposit_shallow
del total_debitcard_shallow
del total_credit_external_depth
del total_credit_external_shallow
del total_credit_internal_depth
del total_credit_internal_shallow
del total_registry_a
del total_registry_b
del total_registry_c
gc.collect()

0

In [15]:
X_total,y_total = train_total.select((~(cs.by_name("target"))).cast(pl.Float32)).with_row_index(),train_total.select(pl.col("target").cast(pl.Float32)).with_row_index()
gc.collect()

0

In [16]:
del train_total
gc.collect()

0

In [19]:
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.025,
    "n_estimators": 5000,
    "colsample_bynode": 0.82,
    "colsample_bytree": 0.82,
    "random_state": 421,
    "reg_alpha": 0.25,
    "reg_lambda": 25,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain"
}

reg_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.025,
    "n_estimators": 5000,
    "colsample_bynode": 0.81,
    "colsample_bytree": 0.81,
    "random_state": 41,
    "reg_alpha": 0.3,
    "reg_lambda": 25,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain"
}

In [20]:
splits = 6
lgb_models = [
    lgb.LGBMClassifier(**class_params) for _ in range(splits)
]
lgb_reg = [
    lgb.LGBMRegressor(**reg_params) for _ in range(splits)
]
cv = StratifiedGroupKFold(n_splits=splits,random_state=420,shuffle=True)
for i,(train_ind,valid_ind) in enumerate(cv.split(X_total,y_total.drop("index"),groups=X_total["weekday"])):
    X_train = X_total.filter(pl.col('index').is_in(train_ind)).drop("index")
    y_train = y_total.filter(pl.col('index').is_in(train_ind)).drop("index")["target"].to_numpy()
    X_valid = X_total.filter(pl.col('index').is_in(valid_ind)).drop("index")
    y_valid = y_total.filter(pl.col('index').is_in(valid_ind)).drop("index")["target"].to_numpy()
    X_train_cat,X_train_reg = X_train.select(cat_cols),X_train.select(cont_cols)
    X_valid_cat,X_valid_reg = X_valid.select(cat_cols),X_valid.select(cont_cols)
    del X_train,X_valid
    gc.collect()
    lgb_reg[i].fit(X_train_reg,y_train,eval_set=[(X_valid_reg,y_valid)],callbacks=[lgb.log_evaluation(100),lgb.early_stopping(100)])
    lgb_models[i].fit(X_train_cat,y_train,eval_set=[(X_valid_cat,y_valid)],callbacks=[lgb.log_evaluation(100),lgb.early_stopping(100)])

Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.783975
[200]	valid_0's auc: 0.794296
[300]	valid_0's auc: 0.802098
[400]	valid_0's auc: 0.807665
[500]	valid_0's auc: 0.811575
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.811575
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.7676
[200]	valid_0's auc: 0.777233
[300]	valid_0's auc: 0.784813
[400]	valid_0's auc: 0.789703
[500]	valid_0's auc: 0.793002
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.793002
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.789586
[200]	valid_0's auc: 0.800083
[300]	valid_0's auc: 0.807452
[400]	valid_0's auc: 0.812423
[500]	valid_0's auc: 0.816094
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.816094
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.767304
[200]	valid_0's auc: 0.776726
[300]	vali

In [21]:
del X_train_reg,y_train,X_train_cat,X_valid_reg,X_valid_cat,y_valid
gc.collect()

109

In [22]:
class Model:

    def __init__(self,clf_models:list,reg_models:list) -> None:
        self.clf_models = clf_models
        self.reg_models = reg_models
    
    def predict_proba(self,X):
        X_reg = X.select(cont_cols)
        X_cat = X.select(cat_cols)
        clf_pred = [_.predict_proba(X_cat)[:,1] for _ in self.clf_models]
        reg_pred = [_.predict(X_reg) for _ in self.reg_models]
        return np.mean(reg_pred+clf_pred,axis=0)

In [23]:
model = Model(lgb_models,lgb_reg)
n_chunks = 5
chunk_size = len(submission_df)//n_chunks
slices  = [slice(i*chunk_size,(i+1)*chunk_size) if i != 4 else slice(i*chunk_size,None) for i in range(n_chunks)]
predictions = []
for _slice in slices:
    chunk = submission_df[_slice]
    chunk_pred = model.predict_proba(chunk.drop("case_id_base"))
    predictions.append(chunk_pred)
    del chunk
    gc.collect()
all_predictions = np.concatenate(predictions)

In [26]:
sub_df = pd.DataFrame({
    "case_id": submission_df["case_id_base"].to_list(),
    "score": all_predictions
}).set_index("case_id")
sub_df.to_csv("./submission.csv")
sub_df

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.019799
57549,0.039023
57551,0.00786
57552,0.018574
57569,0.115637
57630,0.032945
57631,0.034625
57632,0.007506
57633,0.014365
57634,0.015707
