In [1]:
import os,sys,warnings,re,math,gc,time
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold,StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
import lightgbm as lgb
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
# Table layout: show at most 6 rows per frame and up to 100 characters per string cell.
pl.Config.set_tbl_rows(6)
pl.Config.set_fmt_str_lengths(100)
# Number formatting: comma thousands separator, 2 decimals, non-scientific floats.
pl.Config.set_thousands_separator(",")
pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")

polars.config.Config

In [3]:
# All competition inputs live under one Kaggle dataset directory.
_competition_root = "/kaggle/input/home-credit-credit-risk-model-stability"
path_to_train = f"{_competition_root}/parquet_files/train"
path_to_test = f"{_competition_root}/parquet_files/test"
path_to_features = f"{_competition_root}/feature_definitions.csv"

In [4]:
# Feature dictionary: one row per variable with its textual description.
feat_df = pl.read_csv(source=path_to_features)
feat_df.head(5)

Variable,Description
str,str
"""actualdpd_943P""","""Days Past Due (DPD) of previous contract (actual)."""
"""actualdpdtolerance_344P""","""DPD of client with tolerance."""
"""addres_district_368M""","""District of the person's address."""
"""addres_role_871L""","""Role of person's address."""
"""addres_zip_823M""","""Zip code of the address."""


In [5]:
def _index_parquet_files(paths):
    """Build a frame of (index, path, filename) for the given paths, sorted by filename.

    `index` is the position of each path in `paths`; `filename` is the last
    "/"-separated component of the path.
    """
    listing = pl.DataFrame({"index":range(len(paths)),"path":paths})
    filename = pl.col("path").str.split("/").list.get(-1)
    return listing.with_columns(filename.alias("filename")).sort(by="filename")


# Discover every parquet file in the train and test directories.
all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")

train_files_df = _index_parquet_files(all_train_files)
test_files_df = _index_parquet_files(all_test_files)

display(train_files_df.head())
display(test_files_df.head())

index,path,filename
i64,str,str
31,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_1_0.parque…","""train_applprev_1_0.parquet"""
4,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_1_1.parque…","""train_applprev_1_1.parquet"""
24,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_applprev_2.parquet""","""train_applprev_2.parquet"""
21,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_base.parquet""","""train_base.parquet"""
29,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train/train_credit_bureau_a_1_0…","""train_credit_bureau_a_1_0.parquet"""


index,path,filename
i64,str,str
18,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_1_0.parquet""","""test_applprev_1_0.parquet"""
26,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_1_1.parquet""","""test_applprev_1_1.parquet"""
14,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_1_2.parquet""","""test_applprev_1_2.parquet"""
1,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_applprev_2.parquet""","""test_applprev_2.parquet"""
27,"""/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test/test_base.parquet""","""test_base.parquet"""


In [6]:
# Base training table: case id, decision date, calendar features derived from
# that date, and the target moved to the last column.
train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        # The column whose name contains "date" is parsed from string to Date.
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        month=pl.col("Date").dt.month(),
        week=pl.col("Date").dt.week(),
        weekday=pl.col("Date").dt.weekday(),
        # Years elapsed since 2019.
        year=pl.col("Date").dt.year() - 2019,
    )
    # Reorder so that "target" is the final column.
    .select(pl.exclude("target"),pl.col("target"))
)
train_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
0,2019-01-03,1,1,4,0,0
1,2019-01-03,1,1,4,0,0
2,2019-01-04,1,1,5,0,0
…,…,…,…,…,…,…
2703452,2020-10-05,10,41,1,1,0
2703453,2020-10-05,10,41,1,1,0
2703454,2020-10-05,10,41,1,1,0


In [7]:
# Base test table with the same columns as train_base; "target" is a constant
# 0 placeholder so both frames can be stacked.
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        # The column whose name contains "date" is parsed from string to Date.
        cs.contains("date").str.to_date().alias("Date"),
    )
    .with_columns(
        month=pl.col("Date").dt.month(),
        week=pl.col("Date").dt.week(),
        weekday=pl.col("Date").dt.weekday(),
        # Years elapsed since 2019.
        year=pl.col("Date").dt.year() - 2019,
        target=pl.lit(0).cast(pl.UInt8),
    )
)
test_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
57543,2021-05-14,5,19,5,2,0
57549,2022-01-17,1,3,1,3,0
57551,2020-11-27,11,48,5,1,0
…,…,…,…,…,…,…
57632,2022-02-05,2,5,6,3,0
57633,2022-01-25,1,4,2,3,0
57634,2021-01-27,1,4,3,2,0


In [8]:
# Stack train rows on top of test rows (strict vertical concat).
total_base = pl.concat([train_base, test_base], how="vertical")

In [9]:
def reduce_dtypes(df:pl.DataFrame) -> pl.DataFrame:
    """Cast columns to compact dtypes chosen from their names.

    - ``case_id``                      -> UInt32
    - names containing ``num_group``   -> UInt16
    - names ending in ``D``            -> Date
    - names ending in ``T`` or ``M``   -> String
    - names ending in ``P`` or ``A``   -> Float32
    - numeric columns ending in ``L``  -> Float32 (non-numeric ``L`` columns are left as-is)

    Returns a new frame; columns not matched by any rule are unchanged.
    """
    casts = [
        cs.by_name("case_id").cast(pl.UInt32),
        cs.contains("num_group").cast(pl.UInt16),
        cs.ends_with("D").cast(pl.Date),
        cs.ends_with("T","M").cast(pl.String),
        cs.ends_with("P","A").cast(pl.Float32),
        (cs.ends_with("L") & cs.numeric()).cast(pl.Float32),
    ]
    return df.with_columns(casts)

def grouping(df):
    """Collapse `df` to one row per ``case_id``.

    Numeric columns are aggregated with their maximum; every other column
    takes the first reported mode of its non-null values.
    """
    numeric_cols = cs.numeric()
    other_cols = ~numeric_cols
    per_case = df.group_by("case_id")
    return per_case.agg(
        numeric_cols.max(),
        other_cols.drop_nulls().mode().first(),
    )

def preprocess(filter_string:str,prefix_string:str):
    """Load, aggregate and stack all train/test parquet files of one table family.

    Args:
        filter_string: pattern matched (via ``str.contains``) against the
            ``filename`` column of the module-level ``train_files_df`` and
            ``test_files_df`` to pick the files to read.
        prefix_string: prefix added (as ``f"{prefix_string}_{column}"``) to
            every output column except ``case_id``.

    Returns:
        One frame with the per-``case_id`` aggregates of the train files
        followed by those of the test files.
    """
    def matching_paths(files_df):
        # Paths of the files whose filename matches the requested pattern.
        return files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()

    train_files_list = matching_paths(train_files_df)
    test_files_list = matching_paths(test_files_df)

    with pl.StringCache():
        # Train files: shrink dtypes, then aggregate each file on its own.
        train_parts = []
        for parquet_path in train_files_list:
            train_parts.append(grouping(reduce_dtypes(pl.read_parquet(parquet_path))))
        train_df = pl.concat(train_parts)

        # Test files: force the same columns and dtypes as the aggregated
        # train frame before aggregating.
        test_parts = []
        for parquet_path in test_files_list:
            raw_test = pl.read_parquet(parquet_path)
            aligned = raw_test.select(train_df.columns).cast(train_df.schema)
            test_parts.append(grouping(aligned))
        test_df = pl.concat(test_parts)

    combined = pl.concat([train_df, test_df], how="vertical_relaxed")
    renames = {
        name: f"{prefix_string}_{name}"
        for name in combined.columns
        if name != 'case_id'
    }
    return combined.rename(renames)

def select_low_catcols(df:pl.DataFrame,thresh=200):
    """Drop Categorical columns that have more than `thresh` distinct values.

    Only columns of Categorical dtype are inspected; the number of distinct
    values is the row count of the column's ``value_counts()``. All other
    columns are returned untouched.
    """
    categorical_names = df.select(cs.categorical()).columns
    too_many_levels = [
        name
        for name in categorical_names
        if df.select(pl.col(name).value_counts()).shape[0] > thresh
    ]
    return df.select(~cs.by_name(too_many_levels))
    

def select_impuatable(df:pl.DataFrame,thresh=0.95):
    """Keep only the columns whose fraction of nulls is strictly below `thresh`."""
    # One-row frame holding the null fraction of every column.
    null_share = df.select(pl.all().is_null().mean())
    # Turn it into (column, column_0) pairs: column name and its null fraction.
    per_column = null_share.transpose(include_header=True)
    kept = per_column.filter(pl.col("column_0") < thresh)
    return df.select(kept["column"].to_list())

In [10]:
# Aggregate the files matching "applprev_1"; feature columns get the "past_shallow_" prefix.
total_past_shallow = preprocess(filter_string="applprev_1", prefix_string="past_shallow")
total_past_shallow

case_id,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_district_544M,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,past_shallow_isbidproduct_390L,past_shallow_isdebitcard_527L,past_shallow_postype_4733339M,past_shallow_profession_152M,past_shallow_rejectreason_755M,past_shallow_rejectreasonclient_4145042M,past_shallow_status_219L
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,date,str,date,str,str,date,str,date,date,str,date,str,date,str,bool,bool,str,str,str,str,str
836630,0.00,4136.80,,,,0.00,,,,11000.00,,0.00,30000.00,,1,,3.00,,3.00,,"""P94_109_143""",2017-03-23,,"""COL""",,"""P50_139_56""",,,"""a55475b1""",,,2017-11-02,"""POS""",false,,"""a55475b1""","""a55475b1""","""P94_109_143""","""P94_109_143""","""D"""
1536271,0.00,1421.00,,0.00,19404.00,19404.00,0.00,0.00,0.00,19404.00,0.00,0.00,20000.00,0.00,1,0.00,12.00,760031232.00,12.00,2018-02-03,"""a55475b1""",2018-02-03,"""CL""","""REL""",2011-01-19,"""P152_138_35""",2018-07-25,2018-07-25,"""P97_36_170""",2014-01-15,"""SINGLE""",2011-02-08,"""POS""",false,false,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
1615771,0.00,2504.60,,,,0.00,,,,23840.00,0.00,0.00,29000.00,0.00,0,0.00,12.00,,12.00,2018-07-12,"""a55475b1""",2018-07-12,,"""COL""",2018-07-17,"""P119_55_167""",2019-06-11,2019-06-11,"""P33_146_175""",2014-01-15,"""SINGLE""",2018-08-11,"""POS""",false,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57549,0.00,3819.80,50000.00,0.00,34066.00,34066.00,0.00,0.00,0.00,60000.00,10256.63,0.00,60000.00,1.00,13,10638.20,36.00,840049792.00,36.00,2021-10-12,"""a55475b1""",2012-05-03,"""CL""","""REL""",2012-12-14,"""P158_150_171""",2017-01-19,2022-01-08,"""a55475b1""",1998-07-12,"""DIVORCED""",2013-01-12,"""POS""",false,false,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""D"""
57760,0.00,14092.60,,1.00,100000.00,100000.00,0.00,0.00,0.00,200000.00,0.00,0.00,112000.00,18.00,7,0.00,24.00,820377856.00,24.00,2020-06-15,"""a55475b1""",2020-06-15,"""AC""","""REL""",2020-08-07,"""P117_87_136""",2019-07-30,2020-06-15,"""a55475b1""",2009-02-12,"""MARRIED""",2020-07-15,"""NDF""",false,true,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
57970,0.00,768.00,18000.00,2.00,,30000.00,,,,30000.00,0.00,0.00,43000.00,0.00,7,0.00,24.00,,24.00,,"""P94_109_143""",2012-09-10,,"""CAL""",,"""a55475b1""",,,"""P97_36_170""",2006-12-30,"""MARRIED""",2012-10-11,"""CASH""",false,false,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""D"""


In [11]:
# Aggregate the files matching "applprev_2"; feature columns get the "past_depth_" prefix.
total_past_depth = preprocess(filter_string="applprev_2", prefix_string="past_depth")
total_past_depth

case_id,past_depth_num_group1,past_depth_num_group2,past_depth_cacccardblochreas_147M,past_depth_conts_type_509L,past_depth_credacc_cards_status_52L
u32,u16,u16,str,str,str
1916042,15,2,"""a55475b1""","""PRIMARY_MOBILE""","""INACTIVE"""
951199,0,1,"""a55475b1""","""PHONE""",
1271927,1,1,"""a55475b1""","""HOME_PHONE""","""CANCELLED"""
…,…,…,…,…,…
1279447,18,3,"""a55475b1""","""PRIMARY_MOBILE""",
41065,0,0,,"""PRIMARY_MOBILE""",
57543,4,1,"""a55475b1""","""PRIMARY_MOBILE""",


In [12]:
# Aggregate the files matching "static_0"; feature columns get the "static_base_" prefix.
total_static_base = preprocess(filter_string="static_0", prefix_string="static_base")
total_static_base

case_id,static_base_actualdpdtolerance_344P,static_base_amtinstpaidbefduel24m_4187115A,static_base_annuity_780A,static_base_annuitynextmonth_57A,static_base_applicationcnt_361L,static_base_applications30d_658L,static_base_applicationscnt_1086L,static_base_applicationscnt_464L,static_base_applicationscnt_629L,static_base_applicationscnt_867L,static_base_avgdbddpdlast24m_3658932P,static_base_avgdbddpdlast3m_4187120P,static_base_avgdbdtollast24m_4525197P,static_base_avgdpdtolclosure24_3658938P,static_base_avginstallast24m_3658937A,static_base_avglnamtstart24m_4525187A,static_base_avgmaxdpdlast9m_3716943P,static_base_avgoutstandbalancel6m_4187114A,static_base_avgpmtlast12m_4525200A,static_base_clientscnt12m_3712952L,static_base_clientscnt3m_3712950L,static_base_clientscnt6m_3712949L,static_base_clientscnt_100L,static_base_clientscnt_1022L,static_base_clientscnt_1071L,static_base_clientscnt_1130L,static_base_clientscnt_136L,static_base_clientscnt_157L,static_base_clientscnt_257L,static_base_clientscnt_304L,static_base_clientscnt_360L,static_base_clientscnt_493L,static_base_clientscnt_533L,static_base_clientscnt_887L,static_base_clientscnt_946L,static_base_cntincpaycont9m_3716944L,…,static_base_credtype_322L,static_base_datefirstoffer_1144D,static_base_datelastinstal40dpd_247D,static_base_datelastunpaid_3546854D,static_base_disbursementtype_67L,static_base_dtlastpmtallstes_4499206D,static_base_equalitydataagreement_891L,static_base_equalityempfrom_62L,static_base_firstclxcampaign_1125D,static_base_firstdatedue_489D,static_base_inittransactioncode_186L,static_base_isbidproduct_1095L,static_base_isbidproductrequest_292L,static_base_isdebitcard_729L,static_base_lastactivateddate_801D,static_base_lastapplicationdate_877D,static_base_lastapprcommoditycat_1041M,static_base_lastapprcommoditytypec_5251766M,static_base_lastapprdate_640D,static_base_lastcancelreason_561M,static_base_lastdelinqdate_224D,static_base_lastrejectcommoditycat_161M,static_base_lastrejectcommodtypec_525176
9M,static_base_lastrejectdate_50D,static_base_lastrejectreason_759M,static_base_lastrejectreasonclient_4145040M,static_base_lastrepayingdate_696D,static_base_lastst_736L,static_base_maxdpdinstldate_3546855D,static_base_opencred_647L,static_base_paytype1st_925L,static_base_paytype_783L,static_base_payvacationpostpone_4187118D,static_base_previouscontdistrict_112M,static_base_twobodfilling_608L,static_base_typesuite_864L,static_base_validfrom_1069D
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,str,date,date,date,str,date,bool,bool,date,date,str,bool,bool,bool,date,date,str,str,date,str,date,str,str,date,str,str,date,str,date,bool,str,str,date,str,str,str,date
723627,,,5319.40,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,1.00,,0.00,0.00,0.00,0.00,0.00,0.00,3.00,0.00,,…,"""COL""",,,,"""SBA""",,,,,,"""POS""",false,,,,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,,,,"""OTHER""","""OTHER""",,"""a55475b1""","""FO""",,
1450951,0.00,83112.21,2165.00,6339.00,0.00,0.00,0.00,0.00,0.00,3.00,-5.00,-1.00,,0.00,5096.20,,0.00,124702.27,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9.00,…,"""COL""",2008-04-03,,2019-04-06,"""SBA""",,,,2016-01-31,2012-05-09,"""POS""",false,,,2019-05-13,2019-05-09,"""a55475b1""","""a55475b1""",2019-05-09,"""a55475b1""",2019-04-06,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,"""A""",2018-09-06,false,"""OTHER""","""OTHER""",,"""P133_12_111""","""FO""",,
1244783,0.00,,2973.40,5870.60,0.00,0.00,0.00,0.00,0.00,3.00,-122.00,,,0.00,2685.60,,0.00,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,4.00,…,"""COL""",,,2017-09-16,"""SBA""",,,,,2006-09-08,"""POS""",false,,,2018-09-25,2018-09-21,"""P148_110_5""","""a55475b1""",2018-09-21,"""a55475b1""",2017-09-16,"""P12_6_178""","""a55475b1""",2016-01-24,"""a55475b1""","""a55475b1""",,"""A""",2007-02-05,false,"""OTHER""","""OTHER""",,"""P109_162_152""","""FO""",,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57760,0.00,278286.97,8008.00,0.00,0.00,0.00,0.00,0.00,0.00,8.00,-31.00,,-31.00,0.00,55657.40,,,0.00,83367.20,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,…,"""CAL""",2016-07-20,,2018-06-14,"""GBA""",,,,2016-07-19,2016-03-12,"""CASH""",true,,false,2020-08-07,2020-06-15,"""a55475b1""","""a55475b1""",2020-06-15,"""a55475b1""",2018-06-14,"""P159_130_59""","""P75_90_70""",2017-01-11,"""P99_56_166""","""P94_109_143""",,"""A""",2018-01-14,false,"""OTHER""","""OTHER""",,"""P117_87_136""","""BO""",,
58033,0.00,137465.80,19195.20,0.00,0.00,0.00,0.00,0.00,0.00,17.00,-4.00,-1.00,-3.00,0.00,12167.60,14842.00,0.00,22678.43,11094.20,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,9.00,…,"""CAL""",2008-02-21,,2022-05-15,,,,,2015-12-20,2012-10-25,"""CASH""",true,,false,2022-07-23,2022-04-06,"""a55475b1""","""a55475b1""",2022-04-06,"""a55475b1""",2022-05-15,"""a55475b1""","""a55475b1""",2020-07-06,"""a55475b1""","""P94_109_143""",,"""A""",2022-05-15,false,,,,"""a55475b1""","""FO""",,
57970,9.00,117938.84,1965.20,9907.20,0.00,0.00,0.00,0.00,0.00,7.00,-3.00,5.00,-2.00,2.00,7345.00,,2.00,55877.42,8655.60,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,7.00,…,"""COL""",,,2022-01-01,"""SBA""",2021-12-13,,,,2018-08-22,"""POS""",false,,false,2021-04-03,2021-04-02,"""P109_133_183""","""P49_111_165""",2021-04-02,"""a55475b1""",2022-01-01,"""a55475b1""","""a55475b1""",2021-04-02,"""a55475b1""","""P94_109_143""",,"""D""",2021-12-01,false,,,,"""a55475b1""","""FO""",,


In [13]:
# Aggregate the files matching "static_cb" and convert the textual risk
# assessment column into a single float.
total_static_external = (
    preprocess("static_cb","static_external")
    .with_columns(
        # Split on "%" and keep the first two pieces; the result is
        # (first number + number after the "-" in the second piece) / 200.
        # NOTE(review): presumably the raw values look like "a% - b%", making
        # this the midpoint of the range as a fraction -- confirm on raw data.
        # `map_elements` replaces the deprecated `Expr.apply` (removed in
        # polars 1.0); nulls are skipped and stay null.
        pl.col("static_external_riskassesment_302T")
        .str.split("%")
        .list.gather([0,1])
        .map_elements(
            lambda x: (int(x[0])+int(x[1].split("-")[1]))/200,
            return_dtype=pl.Float64,
        )
    )
)
total_static_external

case_id,static_external_contractssum_5085716L,static_external_days120_123L,static_external_days180_256L,static_external_days30_165L,static_external_days360_512L,static_external_days90_310L,static_external_firstquarter_103L,static_external_for3years_128L,static_external_for3years_504L,static_external_for3years_584L,static_external_formonth_118L,static_external_formonth_206L,static_external_formonth_535L,static_external_forquarter_1017L,static_external_forquarter_462L,static_external_forquarter_634L,static_external_fortoday_1092L,static_external_forweek_1077L,static_external_forweek_528L,static_external_forweek_601L,static_external_foryear_618L,static_external_foryear_818L,static_external_foryear_850L,static_external_fourthquarter_440L,static_external_numberofqueries_373L,static_external_pmtaverage_3A,static_external_pmtaverage_4527227A,static_external_pmtaverage_4955615A,static_external_pmtcount_4527229L,static_external_pmtcount_4955617L,static_external_pmtcount_693L,static_external_pmtscount_423L,static_external_pmtssum_45A,static_external_secondquarter_766L,static_external_thirdquarter_1082L,static_external_assignmentdate_238D,static_external_assignmentdate_4527235D,static_external_assignmentdate_4955616D,static_external_birthdate_574D,static_external_dateofbirth_337D,static_external_dateofbirth_342D,static_external_description_5085714M,static_external_education_1103M,static_external_education_88M,static_external_maritalst_385M,static_external_maritalst_893M,static_external_requesttype_4525192L,static_external_responsedate_1012D,static_external_responsedate_4527233D,static_external_responsedate_4917613D,static_external_riskassesment_302T,static_external_riskassesment_940T
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,date,date,date,date,date,date,str,str,str,str,str,str,date,date,date,f64,str
610368,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.00,12697.45,,,,,,1992-09-01,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,2019-01-27,,,,
1394,,0.00,0.00,0.00,3.00,0.00,2.00,,,,,,,,,,,,,,,,,1.00,3.00,,,,,,,5.00,3474.80,5.00,1.00,,,,1966-01-01,1966-01-01,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,2019-01-30,,,,
1504941,,0.00,0.00,0.00,3.00,0.00,1.00,,,,,,,,,,,,,,,,,2.00,3.00,,,,,,,6.00,6050.60,0.00,0.00,,,,1984-09-01,1984-09-01,,"""a55475b1""","""39a0853f""","""a55475b1""","""3439d993""","""a55475b1""",,2019-09-08,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57630,499975.00,1.00,2.00,1.00,5.00,1.00,1.00,,,,,,,,,,,,,,,,,3.00,5.00,,,,,,,,,4.00,1.00,,,,,1967-02-01,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,2021-03-30,,
57632,17677.00,1.00,2.00,0.00,4.00,0.00,1.00,,,,,,,,,,,,,,,,,1.00,4.00,,,15841.20,,14.00,,,,1.00,1.00,,,2015-11-26,,1958-11-01,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2022-02-19,,
57551,2926195.25,1.00,3.00,1.00,4.00,1.00,3.00,,,,,,,,,,,,,,,,,2.00,4.00,,,,,,,,,5.00,5.00,,,,,1982-05-01,,"""2fc785b2""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,,,2020-12-11,,


In [14]:
# Aggregate the files matching "person_1"; feature columns get the "person_shallow_" prefix.
total_person_shallow = preprocess(filter_string="person_1", prefix_string="person_shallow")
total_person_shallow

case_id,person_shallow_childnum_185L,person_shallow_mainoccupationinc_384A,person_shallow_num_group1,person_shallow_personindex_1023L,person_shallow_persontype_1072L,person_shallow_persontype_792L,person_shallow_birth_259D,person_shallow_birthdate_87D,person_shallow_contaddr_district_15M,person_shallow_contaddr_matchlist_1032L,person_shallow_contaddr_smempladdr_334L,person_shallow_contaddr_zipcode_807M,person_shallow_education_927M,person_shallow_empl_employedfrom_271D,person_shallow_empl_employedtotal_800L,person_shallow_empl_industry_691L,person_shallow_empladdr_district_926M,person_shallow_empladdr_zipcode_114M,person_shallow_familystate_447L,person_shallow_gender_992L,person_shallow_housetype_905L,person_shallow_housingtype_772L,person_shallow_incometype_1044T,person_shallow_isreference_387L,person_shallow_language1_981M,person_shallow_maritalst_703L,person_shallow_registaddr_district_1083M,person_shallow_registaddr_zipcode_184M,person_shallow_relationshiptoclient_415T,person_shallow_relationshiptoclient_642T,person_shallow_remitter_829L,person_shallow_role_1084L,person_shallow_role_993L,person_shallow_safeguarantyflag_411L,person_shallow_sex_738L,person_shallow_type_25L
u32,f32,f32,u16,f32,f32,f32,date,date,str,bool,bool,str,str,date,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,bool,str,str,bool,str,str
747582,,24000.00,1,0.00,1.00,1.00,1990-12-01,,"""a55475b1""",false,false,"""a55475b1""","""P33_146_175""",2017-01-15,"""MORE_ONE""","""GOVERNMENT""","""a55475b1""","""P91_47_168""","""SINGLE""",,,,"""SALARIED_GOVT""",,"""P10_39_147""",,"""a55475b1""","""P91_47_168""",,,,"""CL""",,true,"""M""","""PRIMARY_MOBILE"""
156808,,73080.00,2,1.00,5.00,5.00,1994-03-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2017-01-15,"""MORE_ONE""","""GOVERNMENT""","""a55475b1""","""a55475b1""","""SINGLE""",,,,"""SALARIED_GOVT""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""SIBLING""","""SIBLING""",false,"""PE""",,true,"""M""","""PRIMARY_MOBILE"""
1795036,,64000.00,0,0.00,1.00,1.00,1975-01-01,,"""P133_12_111""",false,false,"""P124_111_73""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,,,"""EMPLOYED""",,"""P10_39_147""",,"""P133_12_111""","""P124_111_73""",,,,"""CL""",,true,"""F""","""PRIMARY_MOBILE"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57551,,59600.00,0,0.00,1.00,1.00,1982-05-01,,"""P11_36_178""",false,false,"""P97_107_128""","""a55475b1""",2002-01-08,,,"""a55475b1""","""a55475b1""",,,,,"""SALARIED_GOVT""",,"""P10_39_147""",,"""P11_36_178""","""P97_107_128""",,,,"""CL""",,false,"""F""","""PRIMARY_MOBILE"""
57549,,49800.00,1,1.00,5.00,5.00,1959-11-01,,"""a55475b1""",,,"""a55475b1""","""P106_81_188""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""RETIRED_PENSIONER""",,"""a55475b1""",,"""a55475b1""","""P111_112_180""","""COLLEAGUE""","""COLLEAGUE""",false,"""PE""",,true,"""F""","""PRIMARY_MOBILE"""
57552,,112000.00,0,0.00,1.00,1.00,1955-11-01,,"""P21_84_40""",false,false,"""P91_47_168""","""a55475b1""",,,,"""a55475b1""","""a55475b1""",,,"""OWNED""",,"""RETIRED_PENSIONER""",,"""P10_39_147""",,"""P21_87_50""","""P91_47_168""",,,,"""CL""",,true,"""M""","""PRIMARY_MOBILE"""


In [15]:
# Aggregate the files matching "person_2"; feature columns get the "person_depth_" prefix.
total_person_depth = preprocess(filter_string="person_2", prefix_string="person_depth")
total_person_depth

case_id,person_depth_num_group1,person_depth_num_group2,person_depth_addres_district_368M,person_depth_addres_role_871L,person_depth_addres_zip_823M,person_depth_conts_role_79M,person_depth_empls_economicalst_849M,person_depth_empls_employedfrom_796D,person_depth_empls_employer_name_740M,person_depth_relatedpersons_role_762T
u32,u16,u16,str,str,str,str,str,date,str,str
925600,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
255913,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
2536023,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
…,…,…,…,…,…,…,…,…,…,…
57552,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57551,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57569,1,5,"""P121_131_159""","""PERMANENT""","""P47_66_61""","""P38_92_157""","""a55475b1""",,"""a55475b1""","""PARENT"""


In [16]:
# Aggregate the files matching "other_1"; feature columns get the "other_shallow_" prefix.
total_other_shallow = preprocess(filter_string="other_1", prefix_string="other_shallow")
total_other_shallow

case_id,other_shallow_amtdebitincoming_4809443A,other_shallow_amtdebitoutgoing_4809440A,other_shallow_amtdepositbalance_4809441A,other_shallow_amtdepositincoming_4809444A,other_shallow_amtdepositoutgoing_4809442A,other_shallow_num_group1
u32,f32,f32,f32,f32,f32,u16
248181,0.00,0.20,0.00,0.00,0.00,0
1843274,0.00,0.00,0.00,0.00,6.00,0
1766174,13333.40,13333.40,0.00,0.00,0.00,0
…,…,…,…,…,…,…
57657,17171.40,19970.00,228.40,0.00,1.80,0
57667,9930.80,9930.80,0.00,0.00,0.00,0
57648,15233.40,15233.40,0.00,0.00,0.00,0


In [17]:
# Aggregate the files matching "deposit_1"; feature columns get the "deposit_shallow_" prefix.
total_deposit_shallow = preprocess(filter_string="deposit_1", prefix_string="deposit_shallow")
total_deposit_shallow

case_id,deposit_shallow_amount_416A,deposit_shallow_num_group1,deposit_shallow_contractenddate_991D,deposit_shallow_openingdate_313D
u32,f32,u16,date,date
2638868,404.85,0,2018-11-27,2014-05-28
197353,0.00,0,2017-04-02,2014-04-03
168110,229.00,3,2017-10-04,2015-09-09
…,…,…,…,…
57644,0.00,2,2016-11-29,2013-08-08
57708,262.56,0,2019-04-21,2016-04-21
57715,1800.00,4,2016-03-16,2014-08-13


In [18]:
# Aggregate the files matching "debitcard"; feature columns get the "card_shallow_" prefix.
total_debitcard_shallow = preprocess(filter_string="debitcard", prefix_string="card_shallow")
total_debitcard_shallow

case_id,card_shallow_last180dayaveragebalance_704A,card_shallow_last180dayturnover_1134A,card_shallow_last30dayturnover_651A,card_shallow_num_group1,card_shallow_openingdate_857D
u32,f32,f32,f32,u16,date
2682247,,,,0,2014-02-06
2691095,,,,0,2013-10-07
2628444,,,,2,2015-07-31
…,…,…,…,…,…
57644,,,,2,2013-11-29
57701,,,,0,2014-09-06
57709,,,,0,2015-09-14


In [19]:
# Aggregate the files matching "bureau_a_1"; feature columns get the "int_shallow_" prefix.
total_credit_internal_shallow = preprocess(filter_string="bureau_a_1", prefix_string="int_shallow")
total_credit_internal_shallow

case_id,int_shallow_annualeffectiverate_199L,int_shallow_annualeffectiverate_63L,int_shallow_contractsum_5085717L,int_shallow_credlmt_230A,int_shallow_credlmt_935A,int_shallow_debtoutstand_525A,int_shallow_debtoverdue_47A,int_shallow_dpdmax_139P,int_shallow_dpdmax_757P,int_shallow_instlamount_768A,int_shallow_instlamount_852A,int_shallow_interestrate_508L,int_shallow_monthlyinstlamount_332A,int_shallow_monthlyinstlamount_674A,int_shallow_nominalrate_281L,int_shallow_nominalrate_498L,int_shallow_num_group1,int_shallow_numberofcontrsvalue_258L,int_shallow_numberofcontrsvalue_358L,int_shallow_numberofinstls_229L,int_shallow_numberofinstls_320L,int_shallow_numberofoutstandinstls_520L,int_shallow_numberofoutstandinstls_59L,int_shallow_numberofoverdueinstlmax_1039L,int_shallow_numberofoverdueinstlmax_1151L,int_shallow_numberofoverdueinstls_725L,int_shallow_numberofoverdueinstls_834L,int_shallow_outstandingamount_354A,int_shallow_outstandingamount_362A,int_shallow_overdueamount_31A,int_shallow_overdueamount_659A,int_shallow_overdueamountmax2_14A,int_shallow_overdueamountmax2_398A,int_shallow_overdueamountmax_155A,int_shallow_overdueamountmax_35A,int_shallow_periodicityofpmts_1102L,…,int_shallow_totalamount_6A,int_shallow_totalamount_996A,int_shallow_totaldebtoverduevalue_178A,int_shallow_totaldebtoverduevalue_718A,int_shallow_totaloutstanddebtvalue_39A,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_sha
llow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,str,str,str,str,date,date,date,date,date,str,str,str,str,str,str,str,date,date,date,date,date,date,str,str,str,str,str,str,date,str,str
1361335,,,,,0.00,33196.35,0.00,0.00,,0.00,,,5350.20,,45.00,,10,2.00,8.00,,12.00,,7.00,0.00,,0.00,,,33196.35,,0.00,0.00,,0.00,,,…,,50984.20,0.00,0.00,33196.35,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2019-11-11,,,2015-11-11,,"""a55475b1""",,"""12.0""","""2017.0""",,"""a55475b1""","""a55475b1""",2019-05-07,,,,,,,"""12.0""","""2017.0""",,"""a55475b1""","""a55475b1""",2019-05-21,"""a55475b1""","""a55475b1"""
1344716,,,,,,17677.44,0.00,9.00,,,,,2503.00,,43.30,,10,1.00,,,16.00,,8.00,10.00,,0.00,,,17677.44,,0.00,14.00,,14.00,,,…,,29998.00,0.00,,17677.44,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2019-11-28,,,2018-07-28,,"""a55475b1""",,"""2.0""","""2019.0""",,"""a55475b1""","""a55475b1""",2019-04-24,,,2019-02-07,,2019-02-07,,"""2.0""","""2019.0""",,"""a55475b1""","""a55475b1""",2019-05-03,"""a55475b1""","""a55475b1"""
2577452,,,,,10000.00,140111.11,0.00,0.00,,2095.20,,,9340.40,,42.00,,10,2.00,2.00,,24.00,,19.00,0.00,,0.00,,,129958.98,,0.00,0.00,,0.00,,,…,,150000.00,0.00,0.00,140111.11,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2020-12-15,,,2018-12-14,,"""a55475b1""",,"""1.0""","""2019.0""",,"""a55475b1""","""a55475b1""",2019-05-28,,,,,,,"""7.0""","""2019.0""",,"""a55475b1""","""a55475b1""",2019-06-10,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57633,34.50,56.00,4800829.00,,1280000.00,2579517.75,0.00,0.00,0.00,10397.20,,,70888.40,56508.40,33.00,29.88,9,6.00,9.00,12.00,84.00,0.00,84.00,0.00,0.00,0.00,0.00,0.00,1304916.12,0.00,0.00,0.00,0.00,0.00,0.00,30.00,…,580000.00,1301400.00,0.00,0.00,2579517.75,0.00,"""ea6782cc""","""ea6782cc""","""7241344e""","""7241344e""",2023-10-02,2022-12-29,2021-04-05,2021-01-15,2021-11-22,"""a55475b1""","""10.0""","""1.0""","""2022.0""","""2021.0""","""b619fa46""","""b619fa46""",2022-01-28,2021-11-30,,,,,"""10.0""","""1.0""","""2022.0""","""2021.0""","""5d1b0cdd""","""5d1b0cdd""",2022-02-08,"""a55475b1""","""a55475b1"""
57549,657.00,56.00,911176.56,17800.00,257978.00,312615.59,0.00,0.00,108.00,13542.40,99.80,,13542.40,45490.40,27.10,657.00,10,3.00,11.00,1.00,48.00,0.00,48.00,0.00,120.00,0.00,0.00,0.00,120024.73,0.00,0.00,0.00,48690.40,0.00,48690.40,30.00,…,330000.00,118964.80,0.00,0.00,312615.59,0.00,"""a55475b1""","""00135d9c""","""a55475b1""","""7241344e""",2023-02-25,2024-08-10,2018-12-17,2021-10-12,2020-09-19,"""a55475b1""","""12.0""","""6.0""","""2022.0""","""2019.0""","""a0fa072e""","""a55475b1""",2022-01-25,2019-11-18,2017-02-02,,2017-12-18,,"""1.0""","""6.0""","""2022.0""","""2019.0""","""a55475b1""","""96a8fdfe""",2022-01-31,"""a55475b1""","""a55475b1"""
57760,42.58,56.00,1421529.38,100000.00,320000.00,284305.88,0.00,0.00,281.00,14448.60,867.01,,14448.60,63224.80,20.95,45.00,9,1.00,5.00,12.00,,0.00,,1.00,313.00,0.00,0.00,0.00,,0.00,0.00,950.62,3998.00,0.00,3998.00,30.00,…,247625.61,,0.00,0.00,284305.88,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2022-09-08,2019-09-02,2016-09-05,2016-09-08,2017-04-18,"""a55475b1""","""4.0""","""11.0""","""2020.0""","""2021.0""","""a55475b1""","""a55475b1""",2021-09-08,2017-05-15,2017-04-15,2017-09-09,2020-03-16,2017-09-09,"""12.0""","""11.0""","""2020.0""","""2021.0""","""a55475b1""","""a55475b1""",2021-09-13,"""a55475b1""","""a55475b1"""


In [20]:
# Run preprocess() for "bureau_a_2" with the "int_depth" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
# NOTE(review): this frame is not among the tables joined into total_df later in the
# notebook (it is only deleted again) — confirm whether leaving it out is intentional.
total_credit_internal_depth = preprocess("bureau_a_2","int_depth")
total_credit_internal_depth

case_id,int_depth_collater_valueofguarantee_1124L,int_depth_collater_valueofguarantee_876L,int_depth_num_group1,int_depth_num_group2,int_depth_pmts_dpd_1073P,int_depth_pmts_dpd_303P,int_depth_pmts_overdue_1140A,int_depth_pmts_overdue_1152A,int_depth_collater_typofvalofguarant_298M,int_depth_collater_typofvalofguarant_407M,int_depth_collaterals_typeofguarante_359M,int_depth_collaterals_typeofguarante_669M,int_depth_pmts_month_158T,int_depth_pmts_month_706T,int_depth_pmts_year_1139T,int_depth_pmts_year_507T,int_depth_subjectroles_name_541M,int_depth_subjectroles_name_838M
u32,f32,f32,u16,u16,f32,f32,f32,f32,str,str,str,str,str,str,str,str,str,str
1279781,0.00,,2,23,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""6.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
1265082,0.00,,1,35,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""1.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
1286209,0.00,,1,23,30.00,,4012.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""9.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57633,7230000.00,0.00,8,1,0.00,0.00,0.00,0.00,"""9a0c095e""","""9a0c095e""","""c7a5ad39""","""c7a5ad39""","""2.0""","""2.0""","""2021.0""","""2021.0""","""ab3c25cf""","""ab3c25cf"""
57645,0.00,0.00,9,0,,,,,"""a55475b1""","""8fd95e4b""","""c7a5ad39""","""a55475b1""","""2.0""","""2.0""","""2020.0""","""2019.0""","""ab3c25cf""","""a55475b1"""
57549,0.00,0.00,9,0,,,,,"""a55475b1""","""9a0c095e""","""c7a5ad39""","""a55475b1""","""2.0""","""2.0""","""2022.0""","""2019.0""","""ab3c25cf""","""a55475b1"""


In [21]:
# Run preprocess() for "bureau_b_1" with the "ext_shallow" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
total_credit_external_shallow = preprocess("bureau_b_1","ext_shallow")
total_credit_external_shallow

case_id,ext_shallow_amount_1115A,ext_shallow_credlmt_1052A,ext_shallow_credlmt_228A,ext_shallow_credlmt_3940954A,ext_shallow_credquantity_1099L,ext_shallow_credquantity_984L,ext_shallow_debtpastduevalue_732A,ext_shallow_debtvalue_227A,ext_shallow_dpd_550P,ext_shallow_dpd_733P,ext_shallow_dpdmax_851P,ext_shallow_installmentamount_644A,ext_shallow_installmentamount_833A,ext_shallow_instlamount_892A,ext_shallow_interesteffectiverate_369L,ext_shallow_interestrateyearly_538L,ext_shallow_maxdebtpduevalodued_3940955A,ext_shallow_num_group1,ext_shallow_numberofinstls_810L,ext_shallow_overdueamountmax_950A,ext_shallow_pmtdaysoverdue_1135P,ext_shallow_pmtnumpending_403L,ext_shallow_residualamount_1093A,ext_shallow_residualamount_127A,ext_shallow_residualamount_3940956A,ext_shallow_totalamount_503A,ext_shallow_totalamount_881A,ext_shallow_classificationofcontr_1114M,ext_shallow_contractdate_551D,ext_shallow_contractmaturitydate_151D,ext_shallow_contractst_516M,ext_shallow_contracttype_653M,ext_shallow_credor_3940957M,ext_shallow_dpdmaxdatemonth_804T,ext_shallow_dpdmaxdateyear_742T,ext_shallow_lastupdate_260D,ext_shallow_overdueamountmaxdatemonth_494T,ext_shallow_overdueamountmaxdateyear_432T,ext_shallow_periodicityofpmts_997L,ext_shallow_periodicityofpmts_997M,ext_shallow_pmtmethod_731M,ext_shallow_purposeofcred_722M,ext_shallow_subjectrole_326M,ext_shallow_subjectrole_43M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,date,date,str,str,str,str,str,date,str,str,str,str,str,str,str,str
1575670,17521.80,,137600.00,,1.00,4.00,0.00,6101.06,0.00,0.00,0.00,0.00,6101.06,6195.00,,35.60,0.00,1,3.00,0.00,0.00,1.00,0.00,,,17521.80,502821.81,"""ea6782cc""",2019-07-24,2019-10-24,"""7241344e""","""a55475b1""","""a55475b1""","""8.0""","""2019.0""",2019-10-10,"""8.0""","""2019.0""",,"""a0b598e4""","""a55475b1""","""a55475b1""","""a55475b1""","""ab3c25cf"""
1777521,,0.00,,0.00,1.00,1.00,0.00,,0.00,0.00,0.00,0.00,0.00,,,,0.00,0,,0.00,0.00,,,0.00,0.00,0.00,9300.20,"""ea6782cc""",2016-07-07,2020-07-07,"""7241344e""","""1c9c5356""","""b619fa46""","""8.0""","""2016.0""",2020-02-13,"""8.0""","""2016.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
708901,,19998.00,0.00,10000.00,2.00,1.00,0.00,,0.00,0.00,0.00,0.00,9055.00,,,,0.00,1,,0.00,0.00,,0.00,9055.00,5722.60,19998.00,40000.00,"""ea6782cc""",2018-01-28,2020-01-28,"""7241344e""","""1c9c5356""","""b619fa46""","""8.0""","""2018.0""",2019-06-13,"""8.0""","""2015.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""a55475b1""","""ab3c25cf"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57775,3000.00,,,,,8.00,0.00,0.00,,0.00,17168.00,0.00,,0.00,30.00,2.00,1.60,6,1.00,1.60,0.00,0.00,,,,,156400.00,"""a55475b1""",2020-06-26,2021-02-19,"""7241344e""","""4257cbed""","""3e0bc39b""","""4.0""","""2019.0""",2021-12-01,"""4.0""","""2019.0""",,"""3ecc50a0""","""dbcbe8f8""","""96a8fdfe""","""a55475b1""","""ab3c25cf"""
57754,1932619.38,132032.00,38000.00,76800.00,4.00,11.00,0.00,1473622.38,0.00,0.00,0.00,0.00,1473622.38,8052.58,,39.00,0.00,4,240.00,0.00,0.00,183.00,0.00,68059.91,50039.51,1932619.38,575772.44,"""ea6782cc""",2017-06-09,2022-09-16,"""7241344e""","""1c9c5356""","""b619fa46""","""11.0""","""2019.0""",2022-06-04,"""11.0""","""2019.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""a55475b1""","""a55475b1"""
57675,1488000.00,220598.00,10000.00,200598.00,2.00,19.00,0.00,1433179.00,0.00,0.00,37403.00,0.00,1433179.00,17030.26,,,0.00,2,181.00,10.60,0.00,160.00,0.00,182816.69,165095.89,1488000.00,480262.59,"""ea6782cc""",2020-06-01,2022-06-02,"""7241344e""","""1c9c5356""","""b619fa46""","""8.0""","""2019.0""",2021-12-27,"""12.0""","""2019.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""ab3c25cf"""


In [22]:
# Run preprocess() for "bureau_b_2" with the "ext_depth" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
total_credit_external_depth = preprocess("bureau_b_2","ext_depth")
total_credit_external_depth

case_id,ext_depth_num_group1,ext_depth_num_group2,ext_depth_pmts_dpdvalue_108P,ext_depth_pmts_pmtsoverdue_635A,ext_depth_pmts_date_1107D
u32,u16,u16,f32,f32,date
1583518,2,36,0.00,0.00,2019-03-15
1791764,2,18,144.00,1.20,2020-01-15
121905,0,4,0.00,0.00,2019-03-15
…,…,…,…,…,…
752768,0,4,0.00,0.00,2019-03-15
257663,1,18,88.00,4.60,2019-07-15
57675,0,9,0.00,0.00,2020-09-13


In [23]:
# Run preprocess() for "registry_a" with the "reg_a" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
total_registry_a = preprocess("registry_a","reg_a")
total_registry_a

case_id,reg_a_amount_4527230A,reg_a_num_group1,reg_a_name_4527232M,reg_a_recorddate_4527225D
u32,f32,u16,str,date
816697,4999.80,11,"""7864e164""",2019-10-13
1743547,1641.00,4,"""6edf048a""",2020-01-25
2665568,3084.20,5,"""4943b9d1""",2020-02-24
…,…,…,…,…
2674088,4454.80,5,"""4875f0a6""",2020-03-20
57754,3807.80,7,"""cc68a90a""",2022-06-16
57675,5061.80,5,"""ff9eb829""",2022-01-07


In [24]:
# Run preprocess() for "registry_b" with the "reg_b" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
total_registry_b = preprocess("registry_b","reg_b")
total_registry_b

case_id,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_b_name_4917606M
u32,f32,u16,date,str
246627,84875.00,17,2020-03-31,"""cda1fd10"""
1880368,47744.60,10,2020-04-14,"""ac223a58"""
991029,62202.60,9,2020-01-03,"""e751f256"""
…,…,…,…,…
1900967,644.20,1,2020-08-05,"""5e180ef0"""
57543,27191.80,5,2021-02-03,"""787c689d"""
57551,14644.80,6,2020-07-30,"""d580dfef"""


In [25]:
# Run preprocess() for "registry_c" with the "reg_c" tag; in the displayed result every
# column other than case_id carries that tag as a prefix.
total_registry_c = preprocess("registry_c","reg_c")
total_registry_c

case_id,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_employername_160M,reg_c_processingdate_168D
u32,u16,f32,str,date
611514,4,5378.60,"""a754a3ed""",2018-10-11
1250309,4,3870.00,"""4121018a""",2019-01-03
729239,4,2364.20,"""a59d2703""",2019-01-25
…,…,…,…,…
2568954,5,3068.36,"""70bcf308""",2019-04-12
1388243,9,157.60,"""f6dc2cd2""",2019-04-22
163506,10,1114.00,"""d5d9440f""",2019-10-15


In [26]:
# Fully-qualified names of the feature columns removed from the merged frame
# (passed to `.drop(drop_columns)` when total_df is built).
# Every name appears exactly once so the list can be audited and diffed reliably.
drop_columns = [
    "int_shallow_subjectrole_182M",
    "int_shallow_subjectrole_93M",
    "person_depth_empls_employer_name_740M",
    "person_shallow_contaddr_matchlist_1032L",
    "person_shallow_remitter_829L",
    "static_base_applicationcnt_361L",
    "static_base_bankacctype_710L",
    "static_base_commnoinclast6m_3546845L",
    "static_base_deferredmnthsnum_166L",
    "static_base_isdebitcard_729L",
    "static_base_mastercontrelectronic_519L",
    "static_base_mastercontrexist_109L",
    "static_base_paytype1st_925L",
    "static_base_paytype_783L",
    "static_base_typesuite_864L",
    "past_depth_cacccardblochreas_147M",
    "int_shallow_financialinstitution_591M",
    "person_depth_empls_economicalst_849M",
    "static_external_assignmentdate_4527235D",
    "int_shallow_purposeofcred_426M",
    "deposit_shallow_num_group1",
    "static_base_cardtype_51L",
    "reg_a_recorddate_4527225D",
    "static_external_responsedate_4527233D",
    "int_shallow_numberofoverdueinstls_834L",
    "int_shallow_numberofoutstandinstls_520L",
    "person_shallow_housetype_905L",
    "person_depth_conts_role_79M",
    "static_base_clientscnt_493L",
    "card_shallow_num_group1",
    "static_external_responsedate_4917613D",
    "static_base_lastrejectcommodtypec_5251769M",
    "static_base_clientscnt_257L",
]

In [27]:
def _left_join_on_case_id(base, feature_tables):
    """Left-join each frame of ``feature_tables`` onto ``base``, in the given order.

    The accumulated frame is matched on ``case_id_base`` against the ``case_id``
    column of every feature table, using a left join each time.
    """
    joined = base
    for feature_table in feature_tables:
        joined = joined.join(
            feature_table,
            left_on="case_id_base",
            right_on="case_id",
            how="left",
        )
    return joined


# Integer-typed feature columns; the row id and the label are excluded from rescaling.
_int_features = pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"])
# Float-typed feature columns.
_float_features = pl.col(pl.FLOAT_DTYPES)

# NOTE(review): total_credit_internal_depth is not part of the join list — confirm this is intended.
total_df = (
    _left_join_on_case_id(
        total_base,
        # The list is passed inline so no extra long-lived reference to the tables is kept.
        [
            total_past_shallow,
            total_past_depth,
            total_static_base,
            total_static_external,
            total_person_shallow,
            total_person_depth,
            total_other_shallow,
            total_deposit_shallow,
            total_debitcard_shallow,
            total_credit_internal_shallow,
            total_credit_external_shallow,
            total_credit_external_depth,
            total_registry_a,
            total_registry_b,
            total_registry_c,
        ],
    )
    # Step 1: express every date column as a day offset from "Date" and turn strings
    # into categoricals.
    # NOTE(review): pl.col(pl.Date) appears to match "Date" itself as well (it shows up as an
    # empty f64 column in the displayed result) — confirm that this is intended.
    .with_columns(
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.String).cast(pl.Categorical)
    )
    # Step 2: discard the hand-picked columns, then apply the two selection helpers.
    .drop(drop_columns)
    .pipe(select_impuatable)
    .pipe(select_low_catcols)
    # Step 3: impute nulls — mean for numeric columns, most frequent value otherwise.
    .with_columns(
        pl.col(pl.INTEGER_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.FLOAT_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.Boolean).fill_null(value=pl.col(pl.Boolean).drop_nulls().mode().first()),
        pl.col(pl.Categorical).fill_null(value=pl.col(pl.Categorical).drop_nulls().mode().first())
    )
    # Step 4: replace categoricals by their physical (integer) codes.
    .with_columns(
        pl.col(pl.Categorical).to_physical()
    )
    # Step 5: min-max scale the integer features and standardise the float features.
    .with_columns(
        (_int_features - _int_features.min())/(_int_features.max()-_int_features.min()),
        (_float_features - _float_features.mean())/(_float_features.std())
    )
    # Step 6: move the label to the last position.
    .select(
        ~cs.by_name("target"),
        cs.by_name("target")
    )
)
total_df

case_id_base,Date,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,…,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_874M,int_shallow_refreshdate
_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
u32,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64,u8
0,,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.79,0.00,0.79,0.20,0.50,0.79,0.89,0.92,0.20,0.83,0.25,0.77,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.75,0.90,0.59,0.00,0.00,0.18,0.29,0.06,0.00,0.98,0.78,0.71,0.88,0.70,0.88,0.64,0.73,0.29,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0
1,,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.79,0.00,0.79,0.20,0.50,0.79,0.89,0.92,0.20,0.83,0.25,0.77,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.75,0.90,0.59,0.00,0.00,0.18,0.29,0.06,0.00,0.98,0.78,0.71,0.88,0.70,0.88,0.64,0.73,0.29,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0
2,,0.00,0.00,0.67,0.00,-0.01,-1.08,0.00,-1.12,0.00,-0.45,0.00,0.00,0.00,-1.06,0.00,-0.35,-1.58,0.00,0.05,0.00,0.08,0.00,0.08,0.79,0.00,0.61,0.20,0.00,0.79,0.89,0.92,0.00,0.85,0.00,0.61,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.75,0.90,0.59,0.00,0.00,0.18,0.29,0.06,0.00,0.98,0.78,0.71,0.88,0.70,0.88,0.64,0.73,0.29,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57632,,0.09,0.08,0.83,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.79,0.00,0.79,0.20,0.50,0.79,0.89,0.92,0.20,0.83,0.25,0.77,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.75,0.90,0.59,0.00,0.00,0.18,0.29,0.06,0.00,0.98,0.78,0.71,0.88,0.70,0.88,0.64,0.73,0.29,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0
57633,,0.00,0.06,0.17,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.79,0.00,0.79,0.20,0.50,0.79,0.89,0.92,0.20,0.83,0.25,0.77,…,-0.00,0.50,0.01,0.12,0.04,0.19,0.61,0.96,0.95,0.60,0.00,0.91,0.64,1.00,1.00,0.03,0.99,0.99,0.71,0.88,0.70,0.88,0.45,0.82,1.00,1.00,0.22,1.00,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0
57634,,0.00,0.06,0.33,0.67,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.79,0.00,0.79,0.20,0.50,0.79,0.89,0.92,0.20,0.83,0.25,0.77,…,0.00,0.00,0.00,0.00,0.00,0.18,0.59,0.75,0.90,0.59,0.00,0.00,0.18,0.29,0.06,0.00,0.98,0.78,0.71,0.88,0.70,0.88,0.64,0.73,0.29,0.06,0.00,0.99,0.00,0.06,0.00,0.06,0.56,0.04,0.00,0.60,0


In [28]:
# Training rows are the ones whose id does not occur in the test base table;
# the id column is dropped because it is not used as a feature.
_in_test_base = pl.col("case_id_base").is_in(test_base.select("case_id_base"))
train_total = (
    total_df
    .filter(~_in_test_base)
    .drop("case_id_base")
)
train_total.head()

Date,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_874
M,int_shallow_refreshdate_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64,u8
,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.79,0.0,0.79,0.2,0.5,0.79,0.89,0.92,0.2,0.83,0.25,0.77,0.5,…,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6,0
,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.79,0.0,0.79,0.2,0.5,0.79,0.89,0.92,0.2,0.83,0.25,0.77,0.5,…,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6,0
,0.0,0.0,0.67,0.0,-0.01,-1.08,0.0,-1.12,0.0,-0.45,0.0,0.0,0.0,-1.06,0.0,-0.35,-1.58,0.0,0.05,0.0,0.08,0.0,0.08,0.79,0.0,0.61,0.2,0.0,0.79,0.89,0.92,0.0,0.85,0.0,0.61,0.0,…,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6,0
,0.0,0.0,0.5,0.0,-0.01,0.13,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.22,0.0,-0.35,-1.5,0.0,0.0,0.0,-0.97,0.0,-0.97,0.79,0.02,1.0,0.2,0.0,0.79,0.89,0.92,0.0,0.99,0.25,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6,0
,0.0,0.0,0.67,0.0,-0.01,-0.84,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.6,0.0,-0.35,-1.36,0.0,0.0,0.0,0.08,0.0,0.08,0.79,0.03,1.0,0.2,0.0,0.79,0.89,0.92,0.2,0.83,0.25,1.0,0.0,…,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6,1


In [29]:
# Rows to score: ids that occur in the test base table. The id column is kept
# (it is needed for the submission file) while the label column is removed.
submission_df = (
    total_df
    .filter(
        pl.col("case_id_base").is_in(test_base.select("case_id_base"))
    )
    .drop("target")
)
submission_df.head()

case_id_base,Date,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,…,int_shallow_totaloutstanddebtvalue_39A,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_pur
poseofcred_874M,int_shallow_refreshdate_3813885D,reg_a_amount_4527230A,reg_a_num_group1,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D
u32,f64,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f32,f64,f64,f64,f32,f64
57543,,0.36,0.35,0.67,0.67,-0.01,-0.14,-0.54,0.56,0.0,-0.45,0.0,0.0,0.0,0.62,-0.22,0.48,-0.69,0.04,0.47,-0.28,1.4,0.0,1.4,0.82,0.0,0.61,0.2,0.5,0.55,0.93,0.93,0.6,0.83,0.25,0.61,…,-0.1,-0.0,0.0,0.03,0.0,0.04,0.18,0.6,0.65,0.96,0.56,0.0,0.73,0.18,0.57,0.94,0.5,0.97,0.99,0.68,0.88,0.65,0.88,0.36,0.18,0.86,0.94,0.17,1.0,0.0,0.06,-0.69,0.05,0.43,0.04,0.0,0.6
57549,,0.0,0.04,0.0,1.0,-0.01,-0.5,0.99,-1.12,1.21,1.01,0.18,0.84,-0.46,-0.22,-0.28,-0.35,-0.13,-0.2,0.68,-0.32,1.14,4.18,1.14,0.98,0.0,0.34,0.0,1.0,0.38,0.57,0.99,0.2,0.59,0.5,0.38,…,0.05,-0.0,0.0,0.02,0.0,0.04,0.18,0.62,0.86,0.98,0.6,0.0,0.36,0.55,1.0,0.29,0.21,0.99,0.86,0.69,0.88,0.74,0.88,0.27,0.55,1.0,0.76,0.06,1.0,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6
57551,,0.91,0.92,0.67,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.79,0.0,0.79,0.2,0.5,0.79,0.89,0.92,0.2,0.83,0.25,0.77,…,0.19,-0.0,0.0,0.01,0.0,0.04,0.17,0.6,0.85,0.98,0.6,0.0,0.45,0.73,0.29,0.06,0.03,0.99,0.97,0.87,0.88,0.94,0.88,0.91,0.0,0.29,0.0,0.11,1.0,0.0,0.06,-1.68,0.06,0.32,0.04,0.0,0.6
57552,,0.91,0.92,0.67,0.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.79,0.0,0.79,0.2,0.5,0.79,0.89,0.92,0.2,0.83,0.25,0.77,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6
57569,,1.0,0.98,0.0,0.67,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.79,0.0,0.79,0.2,0.5,0.79,0.89,0.92,0.2,0.83,0.25,0.77,…,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.59,0.75,0.9,0.59,0.0,0.0,0.18,0.29,0.06,0.0,0.98,0.78,0.71,0.88,0.7,0.88,0.64,0.73,0.29,0.06,0.0,0.99,0.0,0.06,0.0,0.06,0.56,0.04,0.0,0.6


In [30]:
# Hold out 10% of the labelled rows for validation; the split is shuffled,
# stratified on the target column and seeded for repeatability.
train_df, valid_df = train_test_split(
    train_total,
    test_size=0.1,
    shuffle=True,
    stratify=train_total.select("target"),
    random_state=420,
)

In [31]:
# Drop the references to the merged frames and to every per-source table,
# then trigger a garbage-collection pass.
del (
    total_df,
    train_total,
    total_past_shallow,
    total_past_depth,
    total_static_base,
    total_static_external,
    total_person_depth,
    total_person_shallow,
    total_other_shallow,
    total_deposit_shallow,
    total_debitcard_shallow,
    total_credit_external_depth,
    total_credit_external_shallow,
    total_credit_internal_depth,
    total_credit_internal_shallow,
    total_registry_a,
    total_registry_b,
    total_registry_c,
)
gc.collect()

0

In [32]:
# Show the (rows, columns) of the training and validation splits, one per line.
print(train_df.shape, valid_df.shape, sep="\n")

(1373993, 319)
(152666, 319)


In [33]:
# Separate features from the label for both splits, casting everything to Float32.
X_train = train_df.select((~cs.by_name("target")).cast(pl.Float32))
y_train = train_df.select(pl.col("target").cast(pl.Float32))
X_valid = valid_df.select((~cs.by_name("target")).cast(pl.Float32))
y_valid = valid_df.select(pl.col("target").cast(pl.Float32))
# The combined split frames are no longer needed once X/y exist.
del train_df, valid_df

In [34]:
# Keyword arguments forwarded to lgb.LGBMClassifier(**params).
params = dict(
    # Task and evaluation metric
    objective="binary",
    boosting_type="gbdt",
    metric="auc",
    # Tree size, step size and number of boosting rounds
    max_depth=256,
    learning_rate=0.001,
    n_estimators=25000,
    # Column subsampling and seed
    colsample_bynode=0.8,
    colsample_bytree=0.8,
    random_state=42,
    # Regularisation
    reg_alpha=0.2,
    reg_lambda=20,
    extra_trees=True,
    num_leaves=512,
    # Runtime and reporting
    device="gpu",
    importance_type="gain",
    verbose=-1,
)

In [35]:
# Train the classifier, scoring on the validation split every round:
# progress is logged every 100 rounds and training stops once the validation
# score has not improved for 2000 rounds.
model = lgb.LGBMClassifier(**params)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    callbacks=[
        lgb.log_evaluation(period=100),
        lgb.early_stopping(stopping_rounds=2000),
    ],
)



Training until validation scores don't improve for 2000 rounds
[100]	valid_0's auc: 0.805538
[200]	valid_0's auc: 0.807208
[300]	valid_0's auc: 0.808851
[400]	valid_0's auc: 0.810232
[500]	valid_0's auc: 0.811714
[600]	valid_0's auc: 0.813073
[700]	valid_0's auc: 0.814485
[800]	valid_0's auc: 0.815928
[900]	valid_0's auc: 0.817206
[1000]	valid_0's auc: 0.818472
[1100]	valid_0's auc: 0.819842
[1200]	valid_0's auc: 0.821083
[1300]	valid_0's auc: 0.822337
[1400]	valid_0's auc: 0.823609
[1500]	valid_0's auc: 0.824747
[1600]	valid_0's auc: 0.825884
[1700]	valid_0's auc: 0.827005
[1800]	valid_0's auc: 0.828124
[1900]	valid_0's auc: 0.82923
[2000]	valid_0's auc: 0.830221
[2100]	valid_0's auc: 0.831208
[2200]	valid_0's auc: 0.832146
[2300]	valid_0's auc: 0.833061
[2400]	valid_0's auc: 0.833904
[2500]	valid_0's auc: 0.834726
[2600]	valid_0's auc: 0.835521
[2700]	valid_0's auc: 0.836289
[2800]	valid_0's auc: 0.836996
[2900]	valid_0's auc: 0.837701
[3000]	valid_0's auc: 0.838354
[3100]	valid_0's 

In [36]:
# Score the submission rows (second column of predict_proba) and write them to
# ./submission.csv with case_id as the index column.
_submission_ids = submission_df['case_id_base'].to_list()
_submission_scores = model.predict_proba(submission_df.drop("case_id_base"))[:,1]
sub_df = pd.DataFrame(
    {"case_id": _submission_ids, "score": _submission_scores}
).set_index("case_id")
sub_df.to_csv("./submission.csv")