In [1]:
import os,sys,warnings,re,math,gc,time
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["KERAS_BACKEND"] = "tensorflow"
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
import lightgbm as lgb
tf.get_logger().setLevel("ERROR")
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_thousands_separator(",")
pl.Config.set_float_precision(2)
pl.Config.set_fmt_float("full")
pl.Config.set_tbl_rows(6)

polars.config.Config

In [3]:
path_to_train = "/home/sohail/Downloads/credit_risk/train"
path_to_test = "/home/sohail/Downloads/credit_risk/test"
path_to_features = "feature_definitions.csv"

In [4]:
feat_df = pl.read_csv(path_to_features)
feat_df.head()

Variable,Description
str,str
"""actualdpd_943P""","""Days Past Due (DPD) of previous contract (actual)."""
"""actualdpdtolerance_344P""","""DPD of client with tolerance."""
"""addres_district_368M""","""District of the person's address."""
"""addres_role_871L""","""Role of person's address."""
"""addres_zip_823M""","""Zip code of the address."""


In [5]:
all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = pl.DataFrame({"index":range(len(all_train_files)),"path":all_train_files})
test_files_df = pl.DataFrame({"index":range(len(all_test_files)),"path":all_test_files})
train_files_df = (
    train_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
test_files_df = (
    test_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
display(train_files_df.head())
display(test_files_df.head())

index,path,filename
i64,str,str
31,"""/home/sohail/Downloads/credit_risk/train/train_applprev_1_0.parquet""","""train_applprev_1_0.parquet"""
13,"""/home/sohail/Downloads/credit_risk/train/train_applprev_1_1.parquet""","""train_applprev_1_1.parquet"""
0,"""/home/sohail/Downloads/credit_risk/train/train_applprev_2.parquet""","""train_applprev_2.parquet"""
7,"""/home/sohail/Downloads/credit_risk/train/train_base.parquet""","""train_base.parquet"""
11,"""/home/sohail/Downloads/credit_risk/train/train_credit_bureau_a_1_0.parquet""","""train_credit_bureau_a_1_0.parquet"""


index,path,filename
i64,str,str
31,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_0.parquet""","""test_applprev_1_0.parquet"""
35,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_1.parquet""","""test_applprev_1_1.parquet"""
9,"""/home/sohail/Downloads/credit_risk/test/test_applprev_1_2.parquet""","""test_applprev_1_2.parquet"""
1,"""/home/sohail/Downloads/credit_risk/test/test_applprev_2.parquet""","""test_applprev_2.parquet"""
33,"""/home/sohail/Downloads/credit_risk/test/test_base.parquet""","""test_base.parquet"""


In [6]:
train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)
train_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
0,2019-01-03,1,1,4,0,0
1,2019-01-03,1,1,4,0,0
2,2019-01-04,1,1,5,0,0
…,…,…,…,…,…,…
2703452,2020-10-05,10,41,1,1,0
2703453,2020-10-05,10,41,1,1,0
2703454,2020-10-05,10,41,1,1,0


In [7]:
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)
test_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
57543,2020-10-06,10,41,2,1,0
57549,2020-10-06,10,41,2,1,0
57551,2020-10-06,10,41,2,1,0
…,…,…,…,…,…,…
57632,2020-10-06,10,41,2,1,0
57633,2020-10-06,10,41,2,1,0
57634,2020-10-06,10,41,2,1,0


In [8]:
total_base = (
    pl.concat(
        [
            train_base,
            test_base
        ],
        how="vertical"
    )
)

In [9]:
def reduce_dtypes(df:pl.DataFrame):
    return (
        df
        .with_columns(
            cs.by_name("case_id").cast(pl.UInt32),
            cs.contains("num_group").cast(pl.UInt16),
            cs.ends_with("D").cast(pl.Date),
            cs.ends_with("T","M").cast(pl.String),
            cs.ends_with("P","A").cast(pl.Float32),
            (cs.ends_with("L") & cs.numeric()).cast(pl.Float32),
        )
    )

def grouping(df):
    return (
        df
        .group_by("case_id")
        .agg(
            cs.numeric().max(),
            (~cs.numeric()).drop_nulls().mode().first()
        )
    )

def preprocess(filter_string:str,prefix_string:str):
    train_files_list = train_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    test_files_list = test_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    with pl.StringCache():
        train_df = (
            pl.concat(
                [
                    pl.read_parquet(_).pipe(reduce_dtypes).pipe(grouping) for _ in train_files_list
                ]
            )
        )
        test_df = (
            pl.concat(
                [
                    pl.read_parquet(_).select(train_df.columns).cast(train_df.schema).pipe(grouping) for _ in test_files_list
                ]
            )
        )
    return (
        pl.concat(
            [
                train_df,
                test_df
            ],
            how="vertical_relaxed"
        )
        .pipe(lambda df: df.rename({_:f"{prefix_string}_{_}" for _ in df.columns if not _ == 'case_id'}))
    )

def select_low_catcols(df:pl.DataFrame,thresh1=200,thresh2=2):
    col_names = []
    for col_name in df.select(cs.categorical()).columns:
        num_ = df.select(pl.col(col_name).value_counts()).shape[0]
        if (num_ > thresh1) | (num_ < thresh2):
            col_names.append(col_name)
    return df.select(~cs.by_name(col_names))
    

def select_impuatable(df:pl.DataFrame,thresh=0.95):
    cols =  (
        df
        .select(pl.all().is_null().mean())
        .transpose(include_header=True)
        .filter(pl.col("column_0") < thresh)
        ["column"].to_list()
    )
    return df.select(cols)

In [10]:
total_past_shallow = preprocess("applprev_1","past_shallow")
total_past_shallow

case_id,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_district_544M,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,past_shallow_isbidproduct_390L,past_shallow_isdebitcard_527L,past_shallow_postype_4733339M,past_shallow_profession_152M,past_shallow_rejectreason_755M,past_shallow_rejectreasonclient_4145042M,past_shallow_status_219L
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,date,str,date,str,str,date,str,date,date,str,date,str,date,str,bool,bool,str,str,str,str,str
1663422,0.00,3068.80,,1.00,,0.00,,,,40000.00,0.00,0.00,14000.00,163.00,2,0.00,24.00,,24.00,2013-11-06,"""a55475b1""",2012-09-28,,"""COL""",2012-10-18,"""P41_138_103""",2016-02-24,2016-02-24,"""P97_36_170""",2008-02-15,"""MARRIED""",2013-12-07,"""POS""",false,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
114977,0.00,3633.60,,0.00,,0.00,,,,40000.00,0.00,0.00,42000.00,1.00,2,0.00,24.00,,24.00,2017-09-10,"""a55475b1""",2017-09-10,,"""CAL""",2017-09-12,"""P217_160_113""",2018-09-20,2018-09-20,"""P97_36_170""",2017-07-15,"""MARRIED""",2017-10-11,"""CASH""",false,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
1404133,0.00,8137.60,,1.00,,0.00,,,,72994.00,0.00,0.00,130000.00,0.00,0,0.00,11.00,,11.00,2018-03-27,"""a55475b1""",2018-03-27,,"""COL""",2018-04-03,"""P131_33_167""",2018-12-27,2018-12-27,"""P33_146_175""",2007-01-15,"""MARRIED""",2018-04-27,"""POS""",false,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""K"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
101146,0.00,7800.80,1.00,0.00,,0.00,,,,200000.00,10500.46,0.00,30000.00,0.00,3,11158.20,48.00,,48.00,2022-06-24,"""a55475b1""",2016-10-06,,"""CAL""",2022-09-03,"""a55475b1""",,2022-08-23,"""a55475b1""",2015-07-15,"""SINGLE""",2015-11-23,"""CASH""",false,,"""P46_145_78""","""a55475b1""","""a55475b1""","""a55475b1""","""D"""
101005,0.00,5847.00,,,,0.00,,,,150000.00,,0.00,50000.00,,1,,60.00,,60.00,,"""P94_109_143""",2022-09-12,,"""CAL""",,"""P39_22_51""",,,"""a55475b1""",,,2022-10-15,"""CASH""",false,,"""P46_145_78""","""a55475b1""","""a55475b1""","""P94_109_143""","""D"""
100984,0.00,0.00,,,1816.71,0.00,4000.00,0.00,3.00,0.00,0.00,0.00,,0.00,0,0.00,,840473664.00,,2022-09-12,"""a55475b1""",2022-09-12,"""AC""","""REL""",,"""a55475b1""",,2022-09-20,"""a55475b1""",,,,"""NDF""",false,true,"""P67_102_161""","""a55475b1""","""a55475b1""","""a55475b1""","""N"""


In [11]:
total_past_depth = preprocess("applprev_2","past_depth")
total_past_depth

case_id,past_depth_num_group1,past_depth_num_group2,past_depth_cacccardblochreas_147M,past_depth_conts_type_509L,past_depth_credacc_cards_status_52L
u32,u16,u16,str,str,str
789127,2,0,"""a55475b1""","""PRIMARY_MOBILE""",
1335968,3,1,"""a55475b1""","""PRIMARY_MOBILE""",
1367185,1,0,"""a55475b1""","""PRIMARY_MOBILE""",
…,…,…,…,…,…
57549,0,2,,"""PHONE""",
57551,1,1,,"""PRIMARY_MOBILE""",
57552,0,2,,"""PRIMARY_MOBILE""",


In [12]:
total_static_base = preprocess("static_0","static_base")
total_static_base

case_id,static_base_actualdpdtolerance_344P,static_base_amtinstpaidbefduel24m_4187115A,static_base_annuity_780A,static_base_annuitynextmonth_57A,static_base_applicationcnt_361L,static_base_applications30d_658L,static_base_applicationscnt_1086L,static_base_applicationscnt_464L,static_base_applicationscnt_629L,static_base_applicationscnt_867L,static_base_avgdbddpdlast24m_3658932P,static_base_avgdbddpdlast3m_4187120P,static_base_avgdbdtollast24m_4525197P,static_base_avgdpdtolclosure24_3658938P,static_base_avginstallast24m_3658937A,static_base_avglnamtstart24m_4525187A,static_base_avgmaxdpdlast9m_3716943P,static_base_avgoutstandbalancel6m_4187114A,static_base_avgpmtlast12m_4525200A,static_base_clientscnt12m_3712952L,static_base_clientscnt3m_3712950L,static_base_clientscnt6m_3712949L,static_base_clientscnt_100L,static_base_clientscnt_1022L,static_base_clientscnt_1071L,static_base_clientscnt_1130L,static_base_clientscnt_136L,static_base_clientscnt_157L,static_base_clientscnt_257L,static_base_clientscnt_304L,static_base_clientscnt_360L,static_base_clientscnt_493L,static_base_clientscnt_533L,static_base_clientscnt_887L,static_base_clientscnt_946L,static_base_cntincpaycont9m_3716944L,…,static_base_credtype_322L,static_base_datefirstoffer_1144D,static_base_datelastinstal40dpd_247D,static_base_datelastunpaid_3546854D,static_base_disbursementtype_67L,static_base_dtlastpmtallstes_4499206D,static_base_equalitydataagreement_891L,static_base_equalityempfrom_62L,static_base_firstclxcampaign_1125D,static_base_firstdatedue_489D,static_base_inittransactioncode_186L,static_base_isbidproduct_1095L,static_base_isbidproductrequest_292L,static_base_isdebitcard_729L,static_base_lastactivateddate_801D,static_base_lastapplicationdate_877D,static_base_lastapprcommoditycat_1041M,static_base_lastapprcommoditytypec_5251766M,static_base_lastapprdate_640D,static_base_lastcancelreason_561M,static_base_lastdelinqdate_224D,static_base_lastrejectcommoditycat_161M,static_base_lastrejectcommodtypec_5251769M,static_base_lastrejectdate_50D,static_base_lastrejectreason_759M,static_base_lastrejectreasonclient_4145040M,static_base_lastrepayingdate_696D,static_base_lastst_736L,static_base_maxdpdinstldate_3546855D,static_base_opencred_647L,static_base_paytype1st_925L,static_base_paytype_783L,static_base_payvacationpostpone_4187118D,static_base_previouscontdistrict_112M,static_base_twobodfilling_608L,static_base_typesuite_864L,static_base_validfrom_1069D
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,str,date,date,date,str,date,bool,bool,date,date,str,bool,bool,bool,date,date,str,str,date,str,date,str,str,date,str,str,date,str,date,bool,str,str,date,str,str,str,date
678092,,,1186.40,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,…,"""REL""",,,,"""SBA""",,,,,,"""POS""",false,,false,,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,,,,"""OTHER""","""OTHER""",,"""a55475b1""","""FO""",,
1559592,0.00,0.00,1236.60,0.00,0.00,0.00,0.00,0.00,0.00,0.00,826.00,,826.00,826.00,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,…,"""COL""",2008-04-03,2017-05-30,2016-02-06,"""SBA""",2018-07-27,,,,2012-06-08,"""POS""",false,,,2015-02-11,2015-07-20,"""P159_130_59""","""a55475b1""",2015-02-06,"""a55475b1""",2016-02-06,"""P159_130_59""","""a55475b1""",2015-07-20,"""a55475b1""","""a55475b1""",,"""D""",2016-01-06,false,"""OTHER""","""OTHER""",,"""P204_99_158""","""FO""",,
1410314,0.00,65652.60,4456.60,0.00,0.00,0.00,0.00,0.00,0.00,4.00,4.00,4.00,,5.00,6565.80,,7.00,34436.38,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,11.00,…,"""COL""",2007-10-29,2018-10-30,2019-06-30,"""SBA""",,,,,2008-07-19,"""POS""",false,,,2018-07-02,2018-09-13,"""P33_29_177""","""a55475b1""",2018-06-30,"""P94_109_143""",2019-06-30,"""P33_29_177""","""a55475b1""",2018-09-13,"""P94_109_143""","""P94_109_143""",,"""D""",2018-10-30,false,"""OTHER""","""OTHER""",,"""P2_93_127""","""FO""",,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
101259,0.00,,37639.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,…,"""CAL""",,,,,,,,,,"""CASH""",false,,false,,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,,,,,,,"""a55475b1""","""FO""",,
101146,0.00,3740.00,2136.60,1870.00,0.00,0.00,0.00,0.00,0.00,2.00,-4.00,-4.00,-4.00,0.00,1870.00,13845.60,0.00,13058.20,1870.00,0.00,0.00,0.00,0.00,0.00,1.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.00,…,"""CAL""",,,,"""GBA""",2022-08-23,,,,2022-07-24,"""CASH""",true,,false,2022-09-03,2022-07-19,"""P12_6_178""","""P111_89_135""",2022-06-24,"""a55475b1""",,"""a55475b1""","""a55475b1""",2022-07-19,"""a55475b1""","""P30_86_84""",,"""D""",,false,,,,"""a55475b1""","""FO""","""AL""",
101278,0.00,,6216.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,,…,"""CAL""",,,,"""GBA""",,,,,,"""CASH""",false,,false,,,"""a55475b1""","""a55475b1""",,"""a55475b1""",,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,,,,"""OTHER""","""OTHER""",,"""a55475b1""","""FO""",,


In [13]:
total_static_external = (
    preprocess("static_cb","static_external")
    .with_columns(
        pl.col("static_external_riskassesment_302T").str.split("%").list.gather([0,1]).apply(lambda x: (int(x[0])+int(x[1].split("-")[1]))/200)
    )
)
total_static_external

case_id,static_external_contractssum_5085716L,static_external_days120_123L,static_external_days180_256L,static_external_days30_165L,static_external_days360_512L,static_external_days90_310L,static_external_firstquarter_103L,static_external_for3years_128L,static_external_for3years_504L,static_external_for3years_584L,static_external_formonth_118L,static_external_formonth_206L,static_external_formonth_535L,static_external_forquarter_1017L,static_external_forquarter_462L,static_external_forquarter_634L,static_external_fortoday_1092L,static_external_forweek_1077L,static_external_forweek_528L,static_external_forweek_601L,static_external_foryear_618L,static_external_foryear_818L,static_external_foryear_850L,static_external_fourthquarter_440L,static_external_numberofqueries_373L,static_external_pmtaverage_3A,static_external_pmtaverage_4527227A,static_external_pmtaverage_4955615A,static_external_pmtcount_4527229L,static_external_pmtcount_4955617L,static_external_pmtcount_693L,static_external_pmtscount_423L,static_external_pmtssum_45A,static_external_secondquarter_766L,static_external_thirdquarter_1082L,static_external_assignmentdate_238D,static_external_assignmentdate_4527235D,static_external_assignmentdate_4955616D,static_external_birthdate_574D,static_external_dateofbirth_337D,static_external_dateofbirth_342D,static_external_description_5085714M,static_external_education_1103M,static_external_education_88M,static_external_maritalst_385M,static_external_maritalst_893M,static_external_requesttype_4525192L,static_external_responsedate_1012D,static_external_responsedate_4527233D,static_external_responsedate_4917613D,static_external_riskassesment_302T,static_external_riskassesment_940T
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,date,date,date,date,date,date,str,str,str,str,str,str,date,date,date,f64,str
734449,,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,,,,,,,,0.00,0.00,,,,,,,6.00,30519.00,0.00,0.00,,,,1977-01-01,1977-01-01,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,2019-07-20,,,,
2594354,,1.00,1.00,1.00,4.00,1.00,1.00,,,,,,,,,,,,,,,,,0.00,4.00,17465.20,,,,,6.00,,,0.00,4.00,2009-03-17,,,1951-03-01,1951-03-01,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,2019-07-19,,,,
825962,,1.00,1.00,0.00,2.00,1.00,1.00,,,,,,,,,,,,,,,,,2.00,2.00,,,,,,,,,0.00,2.00,,,,,1988-05-01,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""DEDUCTION_6""",,2019-10-23,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57543,22130.26,9.00,9.00,9.00,10.00,9.00,4.00,,,,,,,,,,,,,,,,,4.00,10.00,,,,,,,,,1.00,6.00,,,,,1996-08-01,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-20,,
57549,,0.00,0.00,0.00,0.00,0.00,0.00,,,,,,,,,,,,,,,,,0.00,0.00,,,,,,,,,2.00,0.00,,,,,1992-03-01,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-20,,
57634,,,,,,,,,,,,,,,,,,,,,,,,,,,,6917.00,,14.00,,,,,,,,2017-12-15,,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-20,,


In [14]:
total_person_shallow = preprocess("person_1","person_shallow")
total_person_shallow

case_id,person_shallow_childnum_185L,person_shallow_mainoccupationinc_384A,person_shallow_num_group1,person_shallow_personindex_1023L,person_shallow_persontype_1072L,person_shallow_persontype_792L,person_shallow_birth_259D,person_shallow_birthdate_87D,person_shallow_contaddr_district_15M,person_shallow_contaddr_matchlist_1032L,person_shallow_contaddr_smempladdr_334L,person_shallow_contaddr_zipcode_807M,person_shallow_education_927M,person_shallow_empl_employedfrom_271D,person_shallow_empl_employedtotal_800L,person_shallow_empl_industry_691L,person_shallow_empladdr_district_926M,person_shallow_empladdr_zipcode_114M,person_shallow_familystate_447L,person_shallow_gender_992L,person_shallow_housetype_905L,person_shallow_housingtype_772L,person_shallow_incometype_1044T,person_shallow_isreference_387L,person_shallow_language1_981M,person_shallow_maritalst_703L,person_shallow_registaddr_district_1083M,person_shallow_registaddr_zipcode_184M,person_shallow_relationshiptoclient_415T,person_shallow_relationshiptoclient_642T,person_shallow_remitter_829L,person_shallow_role_1084L,person_shallow_role_993L,person_shallow_safeguarantyflag_411L,person_shallow_sex_738L,person_shallow_type_25L
u32,f32,f32,u16,f32,f32,f32,date,date,str,bool,bool,str,str,date,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,bool,str,str,bool,str,str
670893,,20000.00,3,1.00,4.00,4.00,1990-03-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2014-01-15,"""MORE_FIVE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""SALARIED_GOVT""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""SPOUSE""","""SPOUSE""",false,"""PE""",,true,"""F""","""PRIMARY_MOBILE"""
2696344,,40000.00,0,0.00,1.00,1.00,1973-05-01,,"""P121_27_170""",false,false,"""P39_16_16""","""a55475b1""",2016-08-20,,,"""a55475b1""","""a55475b1""",,,"""OWNED""",,"""EMPLOYED""",,"""P10_39_147""",,"""P121_27_170""","""P39_16_16""",,,,"""CL""",,true,"""F""","""PRIMARY_MOBILE"""
1027286,,72000.00,1,1.00,4.00,4.00,1960-07-01,,"""P153_41_170""",false,false,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""","""SINGLE""",,,,"""SALARIED_GOVT""",,"""a55475b1""",,"""P153_41_170""","""P138_102_156""","""SPOUSE""","""SPOUSE""",false,"""PE""",,false,"""M""","""PHONE"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57551,,24000.00,2,1.00,5.00,5.00,1990-08-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2019-01-15,"""MORE_FIVE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""FRIEND""","""FRIEND""",false,"""PE""",,false,"""M""","""PHONE"""
57549,,15000.00,3,1.00,5.00,5.00,1992-03-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2019-05-04,"""MORE_ONE""","""OTHER""","""a55475b1""","""a55475b1""","""SINGLE""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""SIBLING""","""SIBLING""",false,"""EM""",,false,"""M""","""PRIMARY_MOBILE"""
57543,,36000.00,2,1.00,5.00,5.00,1996-08-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2018-02-15,"""MORE_ONE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""OTHER""","""OTHER""",false,"""EM""",,false,"""M""","""PHONE"""


In [15]:
total_person_depth = preprocess("person_2","person_depth")
total_person_depth

case_id,person_depth_num_group1,person_depth_num_group2,person_depth_addres_district_368M,person_depth_addres_role_871L,person_depth_addres_zip_823M,person_depth_conts_role_79M,person_depth_empls_economicalst_849M,person_depth_empls_employedfrom_796D,person_depth_empls_employer_name_740M,person_depth_relatedpersons_role_762T
u32,u16,u16,str,str,str,str,str,date,str,str
845073,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
958719,1,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
2545988,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
…,…,…,…,…,…,…,…,…,…,…
57633,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57631,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57636,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",


In [16]:
total_other_shallow = preprocess("other_1","other_shallow")
total_other_shallow

case_id,other_shallow_amtdebitincoming_4809443A,other_shallow_amtdebitoutgoing_4809440A,other_shallow_amtdepositbalance_4809441A,other_shallow_amtdepositincoming_4809444A,other_shallow_amtdepositoutgoing_4809442A,other_shallow_num_group1
u32,f32,f32,f32,f32,f32,u16
2695269,2533.40,2533.40,0.00,0.00,0.00,0
253004,96.40,0.00,0.00,0.00,0.00,0
2702245,1848.80,1848.80,0.00,0.00,0.00,0
…,…,…,…,…,…,…
57694,0.00,0.00,246588.20,0.00,2294.80,0
57932,0.00,0.00,23212.00,0.00,178.20,0
57719,0.00,0.00,507.00,0.00,3.80,0


In [17]:
total_deposit_shallow = preprocess("deposit_1","deposit_shallow")
total_deposit_shallow

case_id,deposit_shallow_amount_416A,deposit_shallow_num_group1,deposit_shallow_contractenddate_991D,deposit_shallow_openingdate_313D
u32,f32,u16,date,date
1492237,317378.91,1,,2016-06-27
134883,0.00,0,2017-06-17,2015-06-18
2683086,505.06,1,2018-02-09,2014-02-28
…,…,…,…,…
57737,202.00,0,2018-05-16,2016-05-13
58038,78590.74,1,,2016-03-04
57968,2025.38,0,,2017-05-08


In [18]:
total_debitcard_shallow = preprocess("debitcard","card_shallow")
total_debitcard_shallow

case_id,card_shallow_last180dayaveragebalance_704A,card_shallow_last180dayturnover_1134A,card_shallow_last30dayturnover_651A,card_shallow_num_group1,card_shallow_openingdate_857D
u32,f32,f32,f32,u16,date
1596436,0.00,79.88,0.00,2,2016-01-28
1835274,,,,0,2016-03-08
205954,,,,1,2016-03-18
…,…,…,…,…,…
57925,,,,0,2017-04-07
57932,,,,0,2017-07-06
58038,,,,1,2016-03-04


In [19]:
total_credit_internal_shallow = preprocess("bureau_a_1","int_shallow")
total_credit_internal_shallow

case_id,int_shallow_annualeffectiverate_199L,int_shallow_annualeffectiverate_63L,int_shallow_contractsum_5085717L,int_shallow_credlmt_230A,int_shallow_credlmt_935A,int_shallow_debtoutstand_525A,int_shallow_debtoverdue_47A,int_shallow_dpdmax_139P,int_shallow_dpdmax_757P,int_shallow_instlamount_768A,int_shallow_instlamount_852A,int_shallow_interestrate_508L,int_shallow_monthlyinstlamount_332A,int_shallow_monthlyinstlamount_674A,int_shallow_nominalrate_281L,int_shallow_nominalrate_498L,int_shallow_num_group1,int_shallow_numberofcontrsvalue_258L,int_shallow_numberofcontrsvalue_358L,int_shallow_numberofinstls_229L,int_shallow_numberofinstls_320L,int_shallow_numberofoutstandinstls_520L,int_shallow_numberofoutstandinstls_59L,int_shallow_numberofoverdueinstlmax_1039L,int_shallow_numberofoverdueinstlmax_1151L,int_shallow_numberofoverdueinstls_725L,int_shallow_numberofoverdueinstls_834L,int_shallow_outstandingamount_354A,int_shallow_outstandingamount_362A,int_shallow_overdueamount_31A,int_shallow_overdueamount_659A,int_shallow_overdueamountmax2_14A,int_shallow_overdueamountmax2_398A,int_shallow_overdueamountmax_155A,int_shallow_overdueamountmax_35A,int_shallow_periodicityofpmts_1102L,…,int_shallow_totalamount_6A,int_shallow_totalamount_996A,int_shallow_totaldebtoverduevalue_178A,int_shallow_totaldebtoverduevalue_718A,int_shallow_totaloutstanddebtvalue_39A,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,str,str,str,str,date,date,date,date,date,str,str,str,str,str,str,str,date,date,date,date,date,date,str,str,str,str,str,str,date,str,str
1350152,,0.12,,,70000.00,115968.08,0.00,0.00,,9818.60,,,9818.60,,0.12,,16,4.00,11.00,,12.00,,10.00,12.00,,0.00,,,28336.04,,0.00,6929.05,,0.00,,,…,,33998.00,0.00,0.00,115968.08,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2020-04-03,,,2019-01-14,,"""a55475b1""",,"""2.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-04-23,,,2015-04-14,,2015-03-13,,"""5.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-05-08,"""a55475b1""","""a55475b1"""
2558305,,,,,170000.00,179641.42,0.00,0.00,,6980.60,,,6980.60,,40.00,,10,2.00,8.00,,24.00,,11.00,1.00,,0.00,,,39168.69,,0.00,2833.29,,0.00,,,…,,69504.00,0.00,0.00,179641.42,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2020-02-11,,,2018-02-12,,"""a55475b1""",,"""3.0""","""2017.0""",,"""a55475b1""","""a55475b1""",2019-04-10,,,2016-12-24,,2016-12-24,,"""5.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-04-17,"""a55475b1""","""a55475b1"""
1256674,,,,,10198.00,11879.04,0.00,0.00,,850.00,,,2168.60,,45.00,,10,2.00,,,5.00,,2.00,0.00,,0.00,,,4231.04,,0.00,0.00,,0.00,,,…,,9720.00,0.00,,11879.04,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2019-03-01,,,2014-11-22,,"""a55475b1""",,"""10.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-01-11,,,,,,,"""10.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-02-01,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
87191,,,,,,0.00,0.00,,,,,,,,,,8,,,,,,,,,,,,,,,,,,,,…,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,"""a55475b1""",,,,,"""a55475b1""","""a55475b1""",,,,,,,,,,,"""a55475b1""","""a55475b1""",2022-03-26,"""a55475b1""","""a55475b1"""
87206,,56.00,1000000.00,,200000.00,1012822.69,0.00,0.00,0.00,9068.00,,,9068.00,6391.89,20.95,,0,4.00,2.00,24.00,,0.00,,0.00,0.00,0.00,0.00,0.00,,0.00,0.00,0.00,0.00,0.00,0.00,30.00,…,100000.00,,0.00,0.00,777849.06,0.00,"""ea6782cc""","""00135d9c""","""7241344e""","""7241344e""",2023-06-13,2022-07-29,2020-07-30,2017-06-13,2020-10-29,"""a55475b1""","""8.0""","""4.0""","""2020.0""","""2020.0""","""d6a7d943""","""b619fa46""",2022-03-11,2020-11-25,,,,,"""8.0""","""4.0""","""2020.0""","""2020.0""","""60c73645""","""96a8fdfe""",2022-03-26,"""ab3c25cf""","""ab3c25cf"""
100984,,56.00,0.00,75000.00,0.00,0.00,0.00,0.00,530.00,0.00,400.00,,0.00,400.00,0.00,,9,1.00,5.00,0.00,,0.00,,3.00,589.00,0.00,0.00,0.00,,0.00,0.00,7447.60,20654.40,0.00,20654.40,30.00,…,233376.00,,0.00,0.00,0.00,0.00,"""a55475b1""","""ea6782cc""","""a55475b1""","""a55475b1""",2023-06-30,2018-06-28,2011-07-13,2019-06-30,2018-08-10,"""a55475b1""","""1.0""","""10.0""","""2020.0""","""2018.0""","""a55475b1""","""a55475b1""",2022-09-07,2018-08-11,2018-08-02,2020-06-04,2018-08-02,2020-06-04,"""7.0""","""10.0""","""2020.0""","""2018.0""","""a55475b1""","""a55475b1""",2022-09-15,"""a55475b1""","""a55475b1"""


In [20]:
total_credit_internal_depth = preprocess("bureau_a_2","int_depth")
total_credit_internal_depth

case_id,int_depth_collater_valueofguarantee_1124L,int_depth_collater_valueofguarantee_876L,int_depth_num_group1,int_depth_num_group2,int_depth_pmts_dpd_1073P,int_depth_pmts_dpd_303P,int_depth_pmts_overdue_1140A,int_depth_pmts_overdue_1152A,int_depth_collater_typofvalofguarant_298M,int_depth_collater_typofvalofguarant_407M,int_depth_collaterals_typeofguarante_359M,int_depth_collaterals_typeofguarante_669M,int_depth_pmts_month_158T,int_depth_pmts_month_706T,int_depth_pmts_year_1139T,int_depth_pmts_year_507T,int_depth_subjectroles_name_541M,int_depth_subjectroles_name_838M
u32,f32,f32,u16,u16,f32,f32,f32,f32,str,str,str,str,str,str,str,str,str,str
1259961,0.00,,1,35,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""9.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
1265725,0.00,,3,35,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""10.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
1292604,0.00,,1,35,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""4.0""",,"""2019.0""",,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
82797,0.00,0.00,0,9,0.00,0.00,0.00,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""8.0""","""10.0""","""2020.0""","""2018.0""","""a55475b1""","""a55475b1"""
85784,0.00,,0,9,0.00,,0.00,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""10.0""",,"""2020.0""",,"""a55475b1""","""a55475b1"""
91257,0.00,0.00,0,9,0.00,0.00,0.00,0.00,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""3.0""","""4.0""","""2020.0""","""2017.0""","""a55475b1""","""a55475b1"""


In [21]:
total_credit_external_shallow = preprocess("bureau_b_1","ext_shallow")
total_credit_external_shallow

case_id,ext_shallow_amount_1115A,ext_shallow_credlmt_1052A,ext_shallow_credlmt_228A,ext_shallow_credlmt_3940954A,ext_shallow_credquantity_1099L,ext_shallow_credquantity_984L,ext_shallow_debtpastduevalue_732A,ext_shallow_debtvalue_227A,ext_shallow_dpd_550P,ext_shallow_dpd_733P,ext_shallow_dpdmax_851P,ext_shallow_installmentamount_644A,ext_shallow_installmentamount_833A,ext_shallow_instlamount_892A,ext_shallow_interesteffectiverate_369L,ext_shallow_interestrateyearly_538L,ext_shallow_maxdebtpduevalodued_3940955A,ext_shallow_num_group1,ext_shallow_numberofinstls_810L,ext_shallow_overdueamountmax_950A,ext_shallow_pmtdaysoverdue_1135P,ext_shallow_pmtnumpending_403L,ext_shallow_residualamount_1093A,ext_shallow_residualamount_127A,ext_shallow_residualamount_3940956A,ext_shallow_totalamount_503A,ext_shallow_totalamount_881A,ext_shallow_classificationofcontr_1114M,ext_shallow_contractdate_551D,ext_shallow_contractmaturitydate_151D,ext_shallow_contractst_516M,ext_shallow_contracttype_653M,ext_shallow_credor_3940957M,ext_shallow_dpdmaxdatemonth_804T,ext_shallow_dpdmaxdateyear_742T,ext_shallow_lastupdate_260D,ext_shallow_overdueamountmaxdatemonth_494T,ext_shallow_overdueamountmaxdateyear_432T,ext_shallow_periodicityofpmts_997L,ext_shallow_periodicityofpmts_997M,ext_shallow_pmtmethod_731M,ext_shallow_purposeofcred_722M,ext_shallow_subjectrole_326M,ext_shallow_subjectrole_43M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,date,date,str,str,str,str,str,date,str,str,str,str,str,str,str,str
836028,9516.00,0.00,6000.00,0.00,1.00,8.00,0.00,2379.00,0.00,0.00,0.00,0.00,2379.00,2379.00,,0.00,0.00,1,4.00,0.00,0.00,1.00,0.00,0.00,0.00,9516.00,124444.40,"""ea6782cc""",2011-05-22,2019-11-11,"""7241344e""","""1c9c5356""","""b619fa46""","""6.0""","""2011.0""",2019-10-26,"""8.0""","""2011.0""",,"""a0b598e4""","""f6e26148""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
111824,55167.60,,,,1.00,1.00,0.00,35647.43,0.00,0.00,17676.00,0.00,35647.43,4216.00,,43.00,0.60,0,18.00,0.60,0.00,10.00,,,,55167.60,11955.80,"""ea6782cc""",2018-06-04,2019-12-05,"""7241344e""","""4257cbed""","""P0_31_66""","""2.0""","""2019.0""",2019-03-01,"""2.0""","""2019.0""",,"""a0b598e4""","""f6e26148""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
1391167,20338.40,8040.00,0.00,8040.00,1.00,3.00,0.00,11863.40,0.00,0.00,0.00,0.00,11863.40,1695.00,,0.00,0.00,1,12.00,0.00,0.00,7.00,0.00,5359.80,5359.80,20338.40,237079.28,"""ea6782cc""",2018-12-19,2021-05-17,"""7241344e""","""4257cbed""","""b619fa46""","""1.0""","""2019.0""",2019-06-14,"""1.0""","""2017.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57679,60000.00,0.00,,0.00,1.00,2.00,0.00,35481.71,0.00,0.00,17310.00,0.00,35481.71,3529.16,,,41.60,1,24.00,43.00,0.00,12.00,,0.00,0.00,60000.00,86800.00,"""ea6782cc""",2019-06-26,2021-10-04,"""7241344e""","""4257cbed""","""50babcd4""","""9.0""","""2020.0""",2020-10-04,"""10.0""","""2020.0""",,"""a0b598e4""","""dbcbe8f8""","""60c73645""","""ab3c25cf""","""a55475b1"""
57660,,133279.00,,113279.00,2.00,1.00,0.00,,0.00,0.00,32618.00,0.00,109099.58,,,,11.40,1,,16.20,0.00,,,109099.58,90845.98,133279.00,16800.00,"""ea6782cc""",2019-12-08,2021-03-14,"""7241344e""","""1c9c5356""","""b619fa46""","""12.0""","""2019.0""",2020-10-20,"""8.0""","""2019.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
57767,,0.00,64000.00,0.00,1.00,2.00,0.00,,0.00,0.00,0.00,0.00,0.00,,,,0.00,0,,0.00,0.00,,0.00,0.00,0.00,0.00,64000.00,"""ea6782cc""",2013-10-14,2021-10-01,"""7241344e""","""1c9c5356""","""b619fa46""","""11.0""","""2013.0""",2020-10-19,"""11.0""","""2013.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""ab3c25cf"""


In [22]:
total_credit_external_depth = preprocess("bureau_b_2","ext_depth")
total_credit_external_depth

case_id,ext_depth_num_group1,ext_depth_num_group2,ext_depth_pmts_dpdvalue_108P,ext_depth_pmts_pmtsoverdue_635A,ext_depth_pmts_date_1107D
u32,u16,u16,f32,f32,date
36862,0,3,0.00,0.00,2019-09-15
183127,1,6,0.00,0.00,2019-11-15
1884030,0,8,0.00,0.00,2020-07-15
…,…,…,…,…,…
1607294,2,30,0.00,0.00,2019-07-15
207487,0,5,0.00,0.00,2020-01-15
57660,1,34,8814.00,2.20,2020-08-15


In [23]:
total_registry_a = preprocess("registry_a","reg_a")
total_registry_a

case_id,reg_a_amount_4527230A,reg_a_num_group1,reg_a_name_4527232M,reg_a_recorddate_4527225D
u32,f32,u16,str,date
1692516,12263.60,11,"""41493a12""",2019-12-31
943517,850.00,2,"""100201b1""",2020-02-07
863833,5041.40,7,"""1e028df3""",2019-11-30
…,…,…,…,…
1817264,2235.60,10,"""9a0cb0f0""",2020-03-29
57679,3640.00,4,"""ba006408""",2020-10-20
57689,3214.80,6,"""aac4edf4""",2020-10-20


In [24]:
total_registry_b = preprocess("registry_b","reg_b")
total_registry_b

case_id,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_b_name_4917606M
u32,f32,u16,date,str
1872898,20131.80,5,2020-05-25,"""3570e91a"""
1885852,17086.20,4,2020-05-14,"""78403fe2"""
1887552,59032.80,3,2020-07-06,"""bd1743f9"""
…,…,…,…,…
1909429,134820.00,5,2020-06-01,"""38f38455"""
57543,31060.20,4,2020-09-17,"""d2142f5c"""
57549,8760.60,11,2020-05-06,"""bf93e092"""


In [25]:
total_registry_c = preprocess("registry_c","reg_c")
total_registry_c

case_id,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_employername_160M,reg_c_processingdate_168D
u32,u16,f32,str,date
1369802,5,20404.87,"""61b1e5e0""",2019-02-26
706150,4,6003.60,"""4a8b84d8""",2019-01-04
1332842,5,21563.95,"""4096c98c""",2019-02-19
…,…,…,…,…
647137,5,1193.75,"""3c947003""",2019-01-09
1285628,5,600.00,"""cf91d96f""",2018-09-07
1354448,5,7615.41,"""af8c129e""",2019-01-07


In [26]:
drop_columns = [
    "Date",
    # "static_base_commnoinclast6m_3546845L",
    # "static_base_deferredmnthsnum_166L",
    # "static_base_mastercontrelectronic_519L",
    # "static_base_mastercontrexist_109L",
    # "static_base_bankacctype_710L",
    # "static_base_isdebitcard_729L",
    # "static_base_paytype1st_925L",
    # "static_base_paytype_783L",
    # "static_base_typesuite_864L",
    # "person_shallow_contaddr_matchlist_1032L",
    # "person_shallow_remitter_829L",
    # "person_depth_empls_employer_name_740M",
    # "int_shallow_subjectrole_182M",
    # "person_shallow_relationshiptoclient_642T",
    # "int_shallow_refreshdate_3813885D",
    # "int_shallow_monthlyinstlamount_674A",
    # "static_base_maxlnamtstart6m_4525199A",
    # "static_base_dtlastpmtallstes_4499206D",
    # "static_base_cardtype_51L",
    # "static_base_applicationscnt_1086L",
    # "past_shallow_profession_152M",
    # "int_shallow_periodicityofpmts_1102L",
    # "past_shallow_annuity_853A",
    # "card_shallow_openingdate_857D",
    # "deposit_shallow_openingdate_313D",
    # "int_shallow_credlmt_230A",
    # "static_base_avgpmtlast12m_4525200A",
    # "int_shallow_monthlyinstlamount_332A",
    # "static_base_maxinstallast24m_3658928A",
    # "int_shallow_outstandingamount_354A",
    # "static_base_clientscnt_946L",
    # "int_shallow_credlmt_935A",
    # "int_shallow_dpdmaxdatemonth_442T",
    # "int_shallow_periodicityofpmts_837L",
    # "static_base_clientscnt_304L",
    # "int_shallow_totaloutstanddebtvalue_668A",
    # "int_shallow_residualamount_488A",
    # "person_shallow_contaddr_smempladdr_334L",
    # "person_depth_conts_role_79M",
    # "int_shallow_subjectrole_93M",
    # "int_shallow_contractsum_5085717L",
    # "past_depth_cacccardblochreas_147M",
    # "int_shallow_overdueamount_659A",
    # "int_shallow_debtoverdue_47A",
    # "int_shallow_totaldebtoverduevalue_178A",
    # "int_shallow_overdueamountmaxdatemonth_284T",
    # "person_shallow_empladdr_district_926M",
    # "person_depth_empls_economicalst_849M",
    # "static_base_maxpmtlast3m_4525190A",
    # "past_shallow_isbidproduct_390L",
    # "person_shallow_housetype_905L",
    # "person_shallow_empladdr_zipcode_114M",
    # "int_shallow_overdueamountmax_155A",
    # "int_shallow_overdueamountmax2_14A",
    # "int_shallow_financialinstitution_591M",
    # "static_base_totinstallast1m_4525188A",
    # "static_base_lastapprcommoditycat_1041M",
    # "static_base_applicationcnt_361L",
    # "person_shallow_safeguarantyflag_411L",
    # "person_depth_addres_zip_823M",
    # "person_depth_addres_district_368M",
    # "int_shallow_overdueamountmax_35A",
    # "static_base_clientscnt_493L",
    # "int_shallow_overdueamountmax2_398A",
    # "past_shallow_actualdpd_943P",
    # "past_depth_credacc_cards_status_52L",
    # "person_depth_num_group2",
    # "static_base_actualdpdtolerance_344P",
    ]

In [27]:
total_df = (
    total_base
    .join(
        total_past_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_past_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_base,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_external,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_other_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_deposit_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_debitcard_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_a,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_b,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_c,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .with_columns(
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.String).cast(pl.Categorical)
    )
    .drop(drop_columns)
    .pipe(select_impuatable)
    .pipe(select_low_catcols)
    .with_columns(
        pl.col(pl.INTEGER_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.FLOAT_DTYPES).fill_null(strategy="mean"),
        pl.col(pl.Boolean).fill_null(value=pl.col(pl.Boolean).drop_nulls().mode().first()),
        pl.col(pl.Categorical).fill_null(value=pl.col(pl.Categorical).drop_nulls().mode().first())
    )
    .with_columns(
        pl.col(pl.Categorical).to_physical()
    )
    .with_columns(
        (pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]) - pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min())/(pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).max()-pl.col(pl.INTEGER_DTYPES).exclude(["case_id_base","target"]).min()),
        (pl.col(pl.FLOAT_DTYPES) - pl.col(pl.FLOAT_DTYPES).mean())/(pl.col(pl.FLOAT_DTYPES).std())
    )
    .select(
        ~cs.by_name("target"),
        cs.by_name("target")
    )
)
total_df

case_id_base,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M,reg_a_amount_4527230A,reg_a_num_group1,reg_a_recorddate_4527225D,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
u32,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f32,f64,f64,f32,f64,f64,f64,f32,f64,u8
0,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0
1,0.00,0.00,0.50,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0
2,0.00,0.00,0.67,0.00,-0.01,-1.08,0.00,-1.12,0.00,-0.45,0.00,0.00,0.00,-1.06,0.00,-0.35,-1.58,0.00,0.05,0.00,0.08,0.00,0.08,0.78,0.00,0.61,0.20,0.00,0.78,0.88,0.91,0.00,0.85,0.00,0.61,0.00,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57632,0.82,0.78,0.17,1.00,-0.01,0.19,0.00,0.00,0.00,-0.45,0.00,0.00,0.00,0.54,0.00,-0.35,0.99,0.00,0.00,0.00,0.08,0.00,0.08,0.78,0.12,1.00,0.20,0.00,0.78,0.88,0.91,0.20,0.83,0.25,1.00,0.00,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0
57633,0.82,0.78,0.17,1.00,-0.01,0.50,0.00,0.00,0.00,8.13,0.00,0.00,0.00,2.45,0.00,-0.35,3.77,0.00,0.16,0.00,1.14,0.00,1.14,0.78,0.02,0.77,0.20,0.00,0.78,0.88,0.91,0.20,0.90,0.25,0.77,0.00,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0
57634,0.82,0.78,0.17,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.21,0.00,0.00,0.00,0.00,0.78,0.00,0.78,0.20,0.50,0.78,0.88,0.91,0.20,0.83,0.25,0.76,0.50,…,0.18,0.59,0.76,0.87,0.59,0.00,0.00,0.45,0.40,0.06,0.00,0.00,0.99,0.79,0.71,0.88,0.71,0.88,0.73,0.45,0.40,0.06,0.00,0.00,0.99,,0.00,0.00,0.06,0.91,0.00,0.06,0.55,0.04,0.00,0.63,0


In [28]:
total_df.write_parquet("total_df.parquet")

In [28]:
train_total = total_df.filter(pl.col("case_id_base").is_in(test_base.select("case_id_base")).not_()).drop("case_id_base")
train_total.head()

month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,past_shallow_isbidproduct_390L,…,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M,reg_a_amount_4527230A,reg_a_num_group1,reg_a_recorddate_4527225D,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D,target
f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,u32,f64,u32,u32,f64,f64,f64,u32,f64,u32,f64,u32,bool,…,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,f64,u32,u32,f32,f64,f64,f32,f64,f64,f64,f32,f64,u8
0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0,0.78,1,1,0.77,0.88,0.91,1,0.83,1,0.76,1,False,…,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62,0
0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0,0.78,1,1,0.77,0.88,0.91,1,0.83,1,0.76,1,False,…,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62,0
0.0,0.0,0.67,0.0,-0.01,-1.08,0.0,-1.12,0.0,-0.45,0.0,0.0,0.0,-1.06,0.0,-0.35,-1.58,0.0,0.05,0.0,0.08,0.0,0.08,0.78,0,0.61,1,0,0.77,0.88,0.91,0,0.85,0,0.61,0,False,…,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62,0
0.0,0.0,0.5,0.0,-0.01,0.13,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.22,0.0,-0.35,-1.5,0.0,0.0,0.0,-0.97,0.0,-0.97,0.78,1,1.0,1,0,0.77,0.88,0.91,0,0.99,1,1.0,0,False,…,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62,0
0.0,0.0,0.67,0.0,-0.01,-0.84,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.6,0.0,-0.35,-1.36,0.0,0.0,0.0,0.08,0.0,0.08,0.78,2,1.0,1,0,0.77,0.88,0.91,1,0.83,1,1.0,0,False,…,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62,1


In [29]:
submission_df = total_df.filter(pl.col("case_id_base").is_in(test_base.select("case_id_base"))).drop(["target"])
submission_df.head()

case_id_base,month,week,weekday,year,past_shallow_actualdpd_943P,past_shallow_annuity_853A,past_shallow_byoccupationinc_3656910L,past_shallow_childnum_21L,past_shallow_credacc_actualbalance_314A,past_shallow_credacc_credlmt_575A,past_shallow_credacc_maxhisbal_375A,past_shallow_credacc_minhisbal_90A,past_shallow_credacc_transactions_402L,past_shallow_credamount_590A,past_shallow_currdebt_94A,past_shallow_downpmt_134A,past_shallow_mainoccupationinc_437A,past_shallow_maxdpdtolerance_577P,past_shallow_num_group1,past_shallow_outstandingdebt_522A,past_shallow_pmtnum_8L,past_shallow_revolvingaccount_394A,past_shallow_tenor_203L,past_shallow_approvaldate_319D,past_shallow_cancelreason_3545846M,past_shallow_creationdate_885D,past_shallow_credacc_status_367L,past_shallow_credtype_587L,past_shallow_dateactivated_425D,past_shallow_dtlastpmt_581D,past_shallow_dtlastpmtallstes_3545839D,past_shallow_education_1138M,past_shallow_employedfrom_700D,past_shallow_familystate_726L,past_shallow_firstnonzeroinstldate_307D,past_shallow_inittransactioncode_279L,…,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M,reg_a_amount_4527230A,reg_a_num_group1,reg_a_recorddate_4527225D,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_processingdate_168D
u32,f64,f64,f64,f64,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f64,f32,f32,f32,f32,f64,u32,f64,u32,u32,f64,f64,f64,u32,f64,u32,f64,u32,…,u32,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,f64,u32,u32,f32,f64,f64,f32,f64,f64,f64,f32,f64
57543,0.82,0.78,0.17,1.0,-0.01,-1.12,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.64,0.0,-0.35,-0.97,0.0,0.0,0.0,0.7,0.0,0.7,0.78,1,0.85,1,1,0.77,0.88,0.91,1,0.83,1,0.84,1,…,0,0.39,0.59,0.76,0.82,0.59,0,0,2,3,1,0,0,1.0,0.79,0.7,1.0,0.7,0.95,8,5,0,1,0,0,0.67,0,0,0.0,0.06,0.91,-0.39,0.04,0.84,0.04,0.0,0.62
57549,0.82,0.78,0.17,1.0,-0.01,-0.98,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.98,0.0,-0.35,-1.47,0.0,0.0,0.0,-0.97,0.0,-0.97,0.78,1,0.9,1,0,0.77,0.88,0.91,0,0.94,0,0.9,0,…,1,0.4,0.61,0.64,0.87,0.6,0,7,2,2,16,5,0,1.0,0.98,0.98,0.88,0.92,0.88,2,2,2,14,0,3,0.99,0,1,0.0,0.06,0.91,-2.14,0.11,0.76,0.04,0.0,0.62
57551,0.82,0.78,0.17,1.0,-0.01,-0.07,0.0,0.0,0.0,-0.45,0.0,0.0,0.0,-0.22,0.0,-0.35,-0.41,0.0,0.05,0.0,-0.44,0.0,-0.44,0.78,1,0.94,1,0,0.77,0.88,0.91,0,0.95,1,0.94,0,…,0,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62
57552,0.82,0.78,0.33,1.0,-0.01,-0.23,0.0,-0.28,0.0,-0.45,0.0,0.0,0.0,-0.41,0.0,-0.35,-1.36,0.0,0.0,0.0,-0.44,0.0,-0.44,0.78,1,0.75,1,0,0.77,0.88,0.91,0,0.83,1,0.75,0,…,0,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62
57569,0.82,0.78,0.17,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.78,0,0.78,1,1,0.77,0.88,0.91,1,0.83,1,0.76,1,…,0,0.4,0.59,0.76,0.87,0.59,0,0,2,2,1,0,0,1.0,0.79,0.7,0.88,0.7,0.88,8,2,2,1,0,0,0.99,0,0,0.0,0.06,0.91,0.0,0.06,0.55,0.04,0.0,0.62


In [30]:
train_df,valid_df = train_test_split(
    train_total,
    stratify=train_total.select("target"),
    test_size=0.1,
    random_state=420,
    shuffle=True
    )

In [31]:
print(train_df.shape)
print(valid_df.shape)

(1373993, 351)
(152666, 351)


In [32]:
params = {
    "objective":"binary",
    "metric":"auc",
    "max_depth": 32,
    "learning_rate": 0.05,
    "n_estimators": 3000,
    "colsample_bynode": 0.9,
    "colsample_bytree": 0.9,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.2,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": "gpu",
    "verbose": -1
}

In [33]:
X_train,y_train = train_df.select((~(cs.by_name("target"))).cast(pl.Float32)),train_df.select(pl.col("target").cast(pl.Float32))
X_valid,y_valid = valid_df.select((~(cs.by_name("target"))).cast(pl.Float32)),valid_df.select(pl.col("target").cast(pl.Float32))

In [34]:
model = lgb.LGBMClassifier(**params)
model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],callbacks=[lgb.log_evaluation(100)])

[100]	valid_0's auc: 0.838438
[200]	valid_0's auc: 0.848866
[300]	valid_0's auc: 0.85368
[400]	valid_0's auc: 0.856222
[500]	valid_0's auc: 0.857579
[600]	valid_0's auc: 0.858419
[700]	valid_0's auc: 0.85884
[800]	valid_0's auc: 0.859397
[900]	valid_0's auc: 0.859706
[1000]	valid_0's auc: 0.860029
[1100]	valid_0's auc: 0.860197
[1200]	valid_0's auc: 0.860348
[1300]	valid_0's auc: 0.860624
[1400]	valid_0's auc: 0.860788
[1500]	valid_0's auc: 0.860881
[1600]	valid_0's auc: 0.861002
[1700]	valid_0's auc: 0.861038
[1800]	valid_0's auc: 0.86113
[1900]	valid_0's auc: 0.861207
[2000]	valid_0's auc: 0.861237
[2100]	valid_0's auc: 0.861228
[2200]	valid_0's auc: 0.861313
[2300]	valid_0's auc: 0.861311
[2400]	valid_0's auc: 0.861336
[2500]	valid_0's auc: 0.861278
[2600]	valid_0's auc: 0.861144
[2700]	valid_0's auc: 0.861124
[2800]	valid_0's auc: 0.861172
[2900]	valid_0's auc: 0.861084
[3000]	valid_0's auc: 0.861057


In [35]:
train_df.select(pl.col("int_shallow_financialinstitution_591M").value_counts()).unnest(cs.all()).sort(by="count")

int_shallow_financialinstitution_591M,count
u32,u32
25,1
36,1
34,1
…,…
6,86
4,156
0,1373406


In [36]:
train_df.select(pl.col("target").value_counts()).unnest(cs.all())

target,count
u8,u32
0,1330798
1,43195


In [37]:
for num,col_name in sorted(zip(model.feature_importances_,X_train.columns)):
    print(num,col_name)

0 int_shallow_subjectrole_182M
0 int_shallow_subjectrole_93M
0 past_depth_cacccardblochreas_147M
0 person_depth_empls_employer_name_740M
0 person_shallow_contaddr_matchlist_1032L
0 person_shallow_remitter_829L
0 static_base_applicationcnt_361L
0 static_base_bankacctype_710L
0 static_base_commnoinclast6m_3546845L
0 static_base_deferredmnthsnum_166L
0 static_base_isdebitcard_729L
0 static_base_mastercontrelectronic_519L
0 static_base_mastercontrexist_109L
0 static_base_paytype1st_925L
0 static_base_paytype_783L
0 static_base_typesuite_864L
2 static_external_assignmentdate_4527235D
5 deposit_shallow_num_group1
5 int_shallow_purposeofcred_426M
8 static_base_cardtype_51L
10 person_depth_empls_economicalst_849M
11 int_shallow_numberofoverdueinstls_834L
15 static_base_clientscnt3m_3712950L
17 person_shallow_housetype_905L
17 reg_a_recorddate_4527225D
23 static_base_lastrejectcommodtypec_5251769M
25 static_external_responsedate_4527233D
28 int_shallow_contractst_964M
29 int_shallow_numberofout

In [38]:
sub_df = pd.DataFrame({
    "case_id": submission_df['case_id_base'].to_list(),
    "score": model.predict_proba(submission_df.drop("case_id_base"))[:,1]
}).set_index("case_id")
sub_df.to_csv("./submission.csv")