In [1]:
import os,sys,warnings,re,math,gc,time
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["KERAS_BACKEND"] = "tensorflow"
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from glob import glob
from sklearn.metrics import roc_auc_score,auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split,StratifiedKFold,StratifiedGroupKFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import tensorflow as tf
import keras
tf.get_logger().setLevel("ERROR")
%xmode Minimal

Exception reporting mode: Minimal


In [2]:
pl.Config.set_fmt_str_lengths(100)
pl.Config.set_thousands_separator(",")
pl.Config.set_fmt_float("full")
pl.Config.set_tbl_rows(6)

polars.config.Config

In [3]:
path_to_train = "Downloads/credit_risk/train"
path_to_test = "Downloads/credit_risk/test"
path_to_features = "Downloads/feature_definitions.csv"

In [4]:
feat_df = pl.read_csv(path_to_features)
feat_df.head()

Variable,Description
str,str
"""actualdpd_943P""","""Days Past Due (DPD) of previous contract (actual)."""
"""actualdpdtolerance_344P""","""DPD of client with tolerance."""
"""addres_district_368M""","""District of the person's address."""
"""addres_role_871L""","""Role of person's address."""
"""addres_zip_823M""","""Zip code of the address."""


In [5]:
all_train_files = glob(path_to_train+"/*.parquet")
all_test_files = glob(path_to_test+"/*.parquet")
train_files_df = pl.DataFrame({"index":range(len(all_train_files)),"path":all_train_files})
test_files_df = pl.DataFrame({"index":range(len(all_test_files)),"path":all_test_files})
train_files_df = (
    train_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
test_files_df = (
    test_files_df
    .with_columns(
        (pl.col("path").str.split("/").list.get(-1)).alias("filename")
    )
    .sort(by="filename")
)
display(train_files_df.head())
display(test_files_df.head())

index,path,filename
i64,str,str
31,"""Downloads/credit_risk/train/train_applprev_1_0.parquet""","""train_applprev_1_0.parquet"""
13,"""Downloads/credit_risk/train/train_applprev_1_1.parquet""","""train_applprev_1_1.parquet"""
0,"""Downloads/credit_risk/train/train_applprev_2.parquet""","""train_applprev_2.parquet"""
7,"""Downloads/credit_risk/train/train_base.parquet""","""train_base.parquet"""
11,"""Downloads/credit_risk/train/train_credit_bureau_a_1_0.parquet""","""train_credit_bureau_a_1_0.parquet"""


index,path,filename
i64,str,str
31,"""Downloads/credit_risk/test/test_applprev_1_0.parquet""","""test_applprev_1_0.parquet"""
35,"""Downloads/credit_risk/test/test_applprev_1_1.parquet""","""test_applprev_1_1.parquet"""
9,"""Downloads/credit_risk/test/test_applprev_1_2.parquet""","""test_applprev_1_2.parquet"""
1,"""Downloads/credit_risk/test/test_applprev_2.parquet""","""test_applprev_2.parquet"""
33,"""Downloads/credit_risk/test/test_base.parquet""","""test_base.parquet"""


In [6]:
train_base = (
    pl.read_parquet(path_to_train+"/train_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
        pl.col("target").cast(pl.UInt8)
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year")
    )
    .select(~cs.contains("target"),cs.contains("target"))
)
train_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
0,2019-01-03,1,1,4,0,0
1,2019-01-03,1,1,4,0,0
2,2019-01-04,1,1,5,0,0
…,…,…,…,…,…,…
2703452,2020-10-05,10,41,1,1,0
2703453,2020-10-05,10,41,1,1,0
2703454,2020-10-05,10,41,1,1,0


In [7]:
test_base = (
    pl.read_parquet(path_to_test+"/test_base.parquet")
    .select(
        pl.col("case_id").cast(pl.UInt32).alias("case_id_base"),
        cs.contains("date").str.to_date().alias("Date"),
    )
    .with_columns(
        pl.col("Date").dt.month().alias("month"),
        pl.col("Date").dt.week().alias("week"),
        pl.col("Date").dt.weekday().alias("weekday"),
        (pl.col("Date").dt.year() - 2019).alias("year"),
        pl.lit(0).cast(pl.UInt8).alias("target")
    )
)
test_base

case_id_base,Date,month,week,weekday,year,target
u32,date,i8,i8,i8,i32,u8
57543,2020-10-06,10,41,2,1,0
57549,2020-10-06,10,41,2,1,0
57551,2020-10-06,10,41,2,1,0
…,…,…,…,…,…,…
57632,2020-10-06,10,41,2,1,0
57633,2020-10-06,10,41,2,1,0
57634,2020-10-06,10,41,2,1,0


In [8]:
total_base = (
    pl.concat(
        [
            train_base,
            test_base
        ],
        how="vertical"
    )
)

In [9]:
def reduce_dtypes(df:pl.DataFrame):
    return (
        df
        .with_columns(
            cs.by_name("case_id").cast(pl.UInt32),
            cs.contains("num_group").cast(pl.UInt16),
            cs.ends_with("D").cast(pl.Date),
            cs.ends_with("T","M").cast(pl.String),
            cs.ends_with("P","A").cast(pl.Float32),
            (cs.ends_with("L") & cs.numeric()).cast(pl.Float32),
        )
    )

def grouping(df):
    return (
        df
        .group_by("case_id")
        .agg(
            cs.numeric().max(),
            (~cs.numeric()).drop_nulls().mode().first()
        )
    )

def preprocess(filter_string:str,prefix_string:str):
    train_files_list = train_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    test_files_list = test_files_df.filter(pl.col("filename").str.contains(filter_string))["path"].to_list()
    with pl.StringCache():
        train_df = (
            pl.concat(
                [
                    pl.read_parquet(_).pipe(reduce_dtypes).pipe(grouping) for _ in train_files_list
                ]
            )
        )
        test_df = (
            pl.concat(
                [
                    pl.read_parquet(_).select(train_df.columns).cast(train_df.schema).pipe(grouping) for _ in test_files_list
                ]
            )
        )
    return (
        pl.concat(
            [
                train_df,
                test_df
            ],
            how="vertical_relaxed"
        )
        .pipe(lambda df: df.rename({_:f"{prefix_string}_{_}" for _ in df.columns if not _ == 'case_id'}))
    )


def select_impuatable(df:pl.DataFrame,thresh=0.95):
    cols =  (
        df
        .select(pl.all().is_null().mean())
        .transpose(include_header=True)
        .filter(pl.col("column_0") < thresh)
        ["column"].to_list()
    )
    return df.select(cols)

In [10]:
total_past_shallow = preprocess("applprev_1","past_shallow")
total_past_shallow

In [None]:
total_past_depth = preprocess("applprev_2","past_depth")
total_past_depth

case_id,past_depth_num_group1,past_depth_num_group2,past_depth_cacccardblochreas_147M,past_depth_conts_type_509L,past_depth_credacc_cards_status_52L
u32,u16,u16,str,str,str
1434046,2,3,"""a55475b1""","""PRIMARY_MOBILE""",
1487381,5,2,"""a55475b1""","""PRIMARY_MOBILE""",
1352644,1,2,"""a55475b1""","""PRIMARY_MOBILE""",
…,…,…,…,…,…
57543,0,0,,"""PRIMARY_MOBILE""",
57551,1,1,,"""PRIMARY_MOBILE""",
57552,0,2,,"""PRIMARY_MOBILE""",


In [None]:
total_static_base = preprocess("static_0","static_base")
total_static_base

case_id,static_base_actualdpdtolerance_344P,static_base_amtinstpaidbefduel24m_4187115A,static_base_annuity_780A,static_base_annuitynextmonth_57A,static_base_applicationcnt_361L,static_base_applications30d_658L,static_base_applicationscnt_1086L,static_base_applicationscnt_464L,static_base_applicationscnt_629L,static_base_applicationscnt_867L,static_base_avgdbddpdlast24m_3658932P,static_base_avgdbddpdlast3m_4187120P,static_base_avgdbdtollast24m_4525197P,static_base_avgdpdtolclosure24_3658938P,static_base_avginstallast24m_3658937A,static_base_avglnamtstart24m_4525187A,static_base_avgmaxdpdlast9m_3716943P,static_base_avgoutstandbalancel6m_4187114A,static_base_avgpmtlast12m_4525200A,static_base_clientscnt12m_3712952L,static_base_clientscnt3m_3712950L,static_base_clientscnt6m_3712949L,static_base_clientscnt_100L,static_base_clientscnt_1022L,static_base_clientscnt_1071L,static_base_clientscnt_1130L,static_base_clientscnt_136L,static_base_clientscnt_157L,static_base_clientscnt_257L,static_base_clientscnt_304L,static_base_clientscnt_360L,static_base_clientscnt_493L,static_base_clientscnt_533L,static_base_clientscnt_887L,static_base_clientscnt_946L,static_base_cntincpaycont9m_3716944L,…,static_base_credtype_322L,static_base_datefirstoffer_1144D,static_base_datelastinstal40dpd_247D,static_base_datelastunpaid_3546854D,static_base_disbursementtype_67L,static_base_dtlastpmtallstes_4499206D,static_base_equalitydataagreement_891L,static_base_equalityempfrom_62L,static_base_firstclxcampaign_1125D,static_base_firstdatedue_489D,static_base_inittransactioncode_186L,static_base_isbidproduct_1095L,static_base_isbidproductrequest_292L,static_base_isdebitcard_729L,static_base_lastactivateddate_801D,static_base_lastapplicationdate_877D,static_base_lastapprcommoditycat_1041M,static_base_lastapprcommoditytypec_5251766M,static_base_lastapprdate_640D,static_base_lastcancelreason_561M,static_base_lastdelinqdate_224D,static_base_lastrejectcommoditycat_161M,static_base_lastrejectcommodtypec_5251769M,static_base_lastrejectdate_50D,static_base_lastrejectreason_759M,static_base_lastrejectreasonclient_4145040M,static_base_lastrepayingdate_696D,static_base_lastst_736L,static_base_maxdpdinstldate_3546855D,static_base_opencred_647L,static_base_paytype1st_925L,static_base_paytype_783L,static_base_payvacationpostpone_4187118D,static_base_previouscontdistrict_112M,static_base_twobodfilling_608L,static_base_typesuite_864L,static_base_validfrom_1069D
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,str,date,date,date,str,date,bool,bool,date,date,str,bool,bool,bool,date,date,str,str,date,str,date,str,str,date,str,str,date,str,date,bool,str,str,date,str,str,str,date
1300184,0,,4581,0,0,0,0,0,0,1,0,,,0,9029,,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,0,…,"""COL""",,,2018-05-29,"""SBA""",,,,,2017-12-29,"""POS""",false,,,2018-01-29,2017-11-29,"""P159_130_59""","""a55475b1""",2017-11-29,"""a55475b1""",2018-05-29,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,"""K""",2018-05-29,false,"""OTHER""","""OTHER""",,"""P192_103_107""","""FO""",,
1492892,0,21747,1497.800048828125,0,0,0,0,0,0,2,-2,,,0,2145.400146484375,,0,,,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,1,…,"""COL""",2016-03-01,,2018-12-05,"""SBA""",2018-12-06,,,2016-03-01,2015-05-11,"""POS""",false,,,2018-01-11,2017-12-05,"""P159_130_59""","""a55475b1""",2017-12-05,"""a55475b1""",2018-12-05,"""a55475b1""","""a55475b1""",,"""a55475b1""","""a55475b1""",,"""K""",2018-02-10,false,"""OTHER""","""OTHER""",,"""P98_137_111""","""FO""",,
1631428,0,12377,458.3999938964844,1579.5999755859375,0,0,0,0,0,4,-5,-5,-5,0,1074.5999755859375,9996,0,28420.373046875,1074.5999755859375,0,0,0,0,1,0,0,,0,0,0,0,0,0,0,0,10,…,"""COL""",,,,"""SBA""",2019-11-17,,,,2019-01-21,"""POS""",false,,,2019-10-31,2019-10-13,"""P12_6_178""","""a55475b1""",2019-10-13,"""a55475b1""",,"""P12_6_178""","""a55475b1""",2017-12-11,"""P99_56_166""","""P94_109_143""",,"""A""",,false,"""OTHER""","""OTHER""",,"""P173_115_85""","""FO""",,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
101065,0,,7795.80029296875,0,0,2,0,0,0,3,,,,,,,,,,0,0,0,0,0,0,0,,0,0,0,0,0,1,0,0,,…,"""CAL""",,,,,,,,,,"""CASH""",false,,false,2022-09-20,2022-09-20,"""a55475b1""","""a55475b1""",2022-09-20,"""P19_105_83""",,"""a55475b1""","""a55475b1""",2017-06-27,"""P198_131_9""","""P94_109_143""",,"""T""",,true,,,,"""a55475b1""","""FO""",,
101302,0,,6781.39990234375,0,0,0,0,0,0,3,,,,,,,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,,…,"""CAL""",,,,"""GBA""",,,,,,"""CASH""",false,,false,,2019-03-19,"""a55475b1""","""a55475b1""",,"""P141_135_146""",,"""P109_133_183""","""P78_110_79""",2018-09-20,"""P94_109_143""","""P94_109_143""",,"""T""",,false,,,,"""a55475b1""","""FO""","""AL""",
101005,0,,1943.4000244140625,0,0,2,0,0,0,2,,,,,,,,,,0,0,0,0,0,0,0,,0,0,0,0,0,0,0,0,,…,"""CAL""",,,,"""GBA""",,,,,,"""CASH""",false,,false,,2022-09-13,"""a55475b1""","""a55475b1""",,"""P94_109_143""",,"""a55475b1""","""a55475b1""",2022-09-13,"""a55475b1""","""P94_109_143""",,"""D""",,false,"""OTHER""","""OTHER""",,"""a55475b1""","""FO""",,


In [None]:
total_static_external = (
    preprocess("static_cb","static_external")
    .with_columns(
        pl.col("static_external_riskassesment_302T").str.split("%").list.gather([0,1]).apply(lambda x: (int(x[0])+int(x[1].split("-")[1]))/200)
    )
)
total_static_external

case_id,static_external_contractssum_5085716L,static_external_days120_123L,static_external_days180_256L,static_external_days30_165L,static_external_days360_512L,static_external_days90_310L,static_external_firstquarter_103L,static_external_for3years_128L,static_external_for3years_504L,static_external_for3years_584L,static_external_formonth_118L,static_external_formonth_206L,static_external_formonth_535L,static_external_forquarter_1017L,static_external_forquarter_462L,static_external_forquarter_634L,static_external_fortoday_1092L,static_external_forweek_1077L,static_external_forweek_528L,static_external_forweek_601L,static_external_foryear_618L,static_external_foryear_818L,static_external_foryear_850L,static_external_fourthquarter_440L,static_external_numberofqueries_373L,static_external_pmtaverage_3A,static_external_pmtaverage_4527227A,static_external_pmtaverage_4955615A,static_external_pmtcount_4527229L,static_external_pmtcount_4955617L,static_external_pmtcount_693L,static_external_pmtscount_423L,static_external_pmtssum_45A,static_external_secondquarter_766L,static_external_thirdquarter_1082L,static_external_assignmentdate_238D,static_external_assignmentdate_4527235D,static_external_assignmentdate_4955616D,static_external_birthdate_574D,static_external_dateofbirth_337D,static_external_dateofbirth_342D,static_external_description_5085714M,static_external_education_1103M,static_external_education_88M,static_external_maritalst_385M,static_external_maritalst_893M,static_external_requesttype_4525192L,static_external_responsedate_1012D,static_external_responsedate_4527233D,static_external_responsedate_4917613D,static_external_riskassesment_302T,static_external_riskassesment_940T
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,date,date,date,date,date,date,str,str,str,str,str,str,date,date,date,f64,str
1783085,,7,10,4,16,7,17,,,,,,,,,,,,,,,,,6,16,,,,,,,,,10,10,,,,,1978-03-01,,"""a55475b1""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""","""DEDUCTION_6""",,2020-02-26,,,
1282621,,,,,,,,,,,,,,,,,,,,,,,,,,18319.732421875,,,,,6,,,,,2008-04-09,,,1950-04-01,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,2019-02-28,,,,
112470,,3,3,0,6,3,5,,,,,,,,,,,,,,,,,5,6,,,,,,,7,10208.7998046875,4,1,,,,1990-10-01,1990-10-01,,"""a55475b1""","""6b2ae0fa""","""a55475b1""","""3439d993""","""a55475b1""",,2019-03-07,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57552,,0,0,0,0,0,0,,,,,,,,,,,,,,,,,0,0,,,16327,,14,,,,0,0,,,2012-03-02,,1948-10-01,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-19,,
57634,,,,,,,,,,,,,,,,,,,,,,,,,,,,6917,,14,,,,,,,,2017-12-15,,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-20,,
57631,750875.4375,6,7,1,12,1,6,,,,,,,,,,,,,,,,,2,12,,,16863,,12,,,,6,0,,,2013-04-12,,1955-03-01,,"""2fc785b2""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,2020-10-20,,


In [None]:
total_person_shallow = preprocess("person_1","person_shallow")
total_person_shallow

case_id,person_shallow_childnum_185L,person_shallow_mainoccupationinc_384A,person_shallow_num_group1,person_shallow_personindex_1023L,person_shallow_persontype_1072L,person_shallow_persontype_792L,person_shallow_birth_259D,person_shallow_birthdate_87D,person_shallow_contaddr_district_15M,person_shallow_contaddr_matchlist_1032L,person_shallow_contaddr_smempladdr_334L,person_shallow_contaddr_zipcode_807M,person_shallow_education_927M,person_shallow_empl_employedfrom_271D,person_shallow_empl_employedtotal_800L,person_shallow_empl_industry_691L,person_shallow_empladdr_district_926M,person_shallow_empladdr_zipcode_114M,person_shallow_familystate_447L,person_shallow_gender_992L,person_shallow_housetype_905L,person_shallow_housingtype_772L,person_shallow_incometype_1044T,person_shallow_isreference_387L,person_shallow_language1_981M,person_shallow_maritalst_703L,person_shallow_registaddr_district_1083M,person_shallow_registaddr_zipcode_184M,person_shallow_relationshiptoclient_415T,person_shallow_relationshiptoclient_642T,person_shallow_remitter_829L,person_shallow_role_1084L,person_shallow_role_993L,person_shallow_safeguarantyflag_411L,person_shallow_sex_738L,person_shallow_type_25L
u32,f32,f32,u16,f32,f32,f32,date,date,str,bool,bool,str,str,date,str,str,str,str,str,str,str,str,str,bool,str,str,str,str,str,str,bool,str,str,bool,str,str
753462,,30000,2,1,5,5,1994-09-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2019-04-15,"""LESS_ONE""","""EDUCATION""","""a55475b1""","""a55475b1""","""SINGLE""",,,,"""PRIVATE_SECTOR_EMPLOYEE""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""PARENT""","""PARENT""",false,"""CL""",,true,"""M""","""PHONE"""
807455,,80000,2,1,4,4,1996-01-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2018-08-15,"""MORE_ONE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""PRIVATE_SECTOR_EMPLOYEE""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""SPOUSE""","""SPOUSE""",false,"""PE""",,true,"""F""","""PRIMARY_MOBILE"""
3933,,40200,2,2,5,5,1957-04-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",,,,"""a55475b1""","""a55475b1""","""MARRIED""",,,,"""RETIRED_PENSIONER""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""CHILD""","""CHILD""",false,"""PE""",,true,"""F""","""PHONE"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57551,,24000,2,1,5,5,1990-08-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2019-01-15,"""MORE_FIVE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""FRIEND""","""FRIEND""",false,"""EM""",,false,"""M""","""PHONE"""
57549,,15000,3,1,5,5,1992-03-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2019-05-04,"""MORE_ONE""","""OTHER""","""a55475b1""","""a55475b1""","""SINGLE""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""SIBLING""","""SIBLING""",false,"""EM""",,false,"""M""","""PRIMARY_MOBILE"""
57543,,36000,2,1,5,5,1996-08-01,,"""a55475b1""",false,false,"""a55475b1""","""a55475b1""",2018-02-15,"""MORE_ONE""","""OTHER""","""a55475b1""","""a55475b1""","""MARRIED""",,,,"""EMPLOYED""",,"""a55475b1""",,"""a55475b1""","""a55475b1""","""OTHER""","""OTHER""",false,"""CL""",,false,"""M""","""PHONE"""


In [None]:
total_person_depth = preprocess("person_2","person_depth")
total_person_depth

case_id,person_depth_num_group1,person_depth_num_group2,person_depth_addres_district_368M,person_depth_addres_role_871L,person_depth_addres_zip_823M,person_depth_conts_role_79M,person_depth_empls_economicalst_849M,person_depth_empls_employedfrom_796D,person_depth_empls_employer_name_740M,person_depth_relatedpersons_role_762T
u32,u16,u16,str,str,str,str,str,date,str,str
843132,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
887705,2,1,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
1661597,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
…,…,…,…,…,…,…,…,…,…,…
57631,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57569,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",
57636,0,0,"""a55475b1""",,"""a55475b1""","""a55475b1""","""a55475b1""",,"""a55475b1""",


In [None]:
total_other_shallow = preprocess("other_1","other_shallow")
total_other_shallow

case_id,other_shallow_amtdebitincoming_4809443A,other_shallow_amtdebitoutgoing_4809440A,other_shallow_amtdepositbalance_4809441A,other_shallow_amtdepositincoming_4809444A,other_shallow_amtdepositoutgoing_4809442A,other_shallow_num_group1
u32,f32,f32,f32,f32,f32,u16
215261,0,0,0,0,2.799999952316284,0
237534,25000,25000,0,0,0,0
1840115,4000,4000,0,0,0,0
…,…,…,…,…,…,…
57823,0,0,101817,0,33950.80078125,0
57774,0,0,988.6000366210938,0,8.199999809265137,0
57635,0,0,12028.7998046875,0,4011,0


In [None]:
total_deposit_shallow = preprocess("deposit_1","deposit_shallow")
total_deposit_shallow

case_id,deposit_shallow_amount_416A,deposit_shallow_num_group1,deposit_shallow_contractenddate_991D,deposit_shallow_openingdate_313D
u32,f32,u16,date,date
173582,393.0220031738281,0,2017-11-13,2013-11-13
147733,827.2440795898438,2,2016-12-17,2016-07-13
196245,269.7220153808594,0,2017-12-16,2014-12-17
…,…,…,…,…
57951,289.17401123046875,0,2017-10-09,2013-10-10
57925,208.1840057373047,0,,2017-04-07
57932,1008.9960327148438,0,,2017-07-06


In [None]:
total_debitcard_shallow = preprocess("debitcard","card_shallow")
total_debitcard_shallow

case_id,card_shallow_last180dayaveragebalance_704A,card_shallow_last180dayturnover_1134A,card_shallow_last30dayturnover_651A,card_shallow_num_group1,card_shallow_openingdate_857D
u32,f32,f32,f32,u16,date
2697761,,,,2,2013-12-26
2644831,,,,0,2014-05-26
197701,0,20000,0,4,2014-02-21
…,…,…,…,…,…
58038,,,,1,2016-03-04
57925,,,,0,2017-04-07
57719,,,,0,2016-09-22


In [None]:
total_credit_internal_shallow = preprocess("bureau_a_1","int_shallow")
total_credit_internal_shallow

case_id,int_shallow_annualeffectiverate_199L,int_shallow_annualeffectiverate_63L,int_shallow_contractsum_5085717L,int_shallow_credlmt_230A,int_shallow_credlmt_935A,int_shallow_debtoutstand_525A,int_shallow_debtoverdue_47A,int_shallow_dpdmax_139P,int_shallow_dpdmax_757P,int_shallow_instlamount_768A,int_shallow_instlamount_852A,int_shallow_interestrate_508L,int_shallow_monthlyinstlamount_332A,int_shallow_monthlyinstlamount_674A,int_shallow_nominalrate_281L,int_shallow_nominalrate_498L,int_shallow_num_group1,int_shallow_numberofcontrsvalue_258L,int_shallow_numberofcontrsvalue_358L,int_shallow_numberofinstls_229L,int_shallow_numberofinstls_320L,int_shallow_numberofoutstandinstls_520L,int_shallow_numberofoutstandinstls_59L,int_shallow_numberofoverdueinstlmax_1039L,int_shallow_numberofoverdueinstlmax_1151L,int_shallow_numberofoverdueinstls_725L,int_shallow_numberofoverdueinstls_834L,int_shallow_outstandingamount_354A,int_shallow_outstandingamount_362A,int_shallow_overdueamount_31A,int_shallow_overdueamount_659A,int_shallow_overdueamountmax2_14A,int_shallow_overdueamountmax2_398A,int_shallow_overdueamountmax_155A,int_shallow_overdueamountmax_35A,int_shallow_periodicityofpmts_1102L,…,int_shallow_totalamount_6A,int_shallow_totalamount_996A,int_shallow_totaldebtoverduevalue_178A,int_shallow_totaldebtoverduevalue_718A,int_shallow_totaloutstanddebtvalue_39A,int_shallow_totaloutstanddebtvalue_668A,int_shallow_classificationofcontr_13M,int_shallow_classificationofcontr_400M,int_shallow_contractst_545M,int_shallow_contractst_964M,int_shallow_dateofcredend_289D,int_shallow_dateofcredend_353D,int_shallow_dateofcredstart_181D,int_shallow_dateofcredstart_739D,int_shallow_dateofrealrepmt_138D,int_shallow_description_351M,int_shallow_dpdmaxdatemonth_442T,int_shallow_dpdmaxdatemonth_89T,int_shallow_dpdmaxdateyear_596T,int_shallow_dpdmaxdateyear_896T,int_shallow_financialinstitution_382M,int_shallow_financialinstitution_591M,int_shallow_lastupdate_1112D,int_shallow_lastupdate_388D,int_shallow_numberofoverdueinstlmaxdat_148D,int_shallow_numberofoverdueinstlmaxdat_641D,int_shallow_overdueamountmax2date_1002D,int_shallow_overdueamountmax2date_1142D,int_shallow_overdueamountmaxdatemonth_284T,int_shallow_overdueamountmaxdatemonth_365T,int_shallow_overdueamountmaxdateyear_2T,int_shallow_overdueamountmaxdateyear_994T,int_shallow_purposeofcred_426M,int_shallow_purposeofcred_874M,int_shallow_refreshdate_3813885D,int_shallow_subjectrole_182M,int_shallow_subjectrole_93M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,f32,f32,f32,f32,f32,f32,str,str,str,str,date,date,date,date,date,str,str,str,str,str,str,str,date,date,date,date,date,date,str,str,str,str,str,str,date,str,str
1301783,,,,,44798,39023.75390625,0,10,,2938.688232421875,,,2938.688232421875,,43,,10,2,,,18,,15,17,,0,,,16584.6953125,,0,2380.35400390625,,2380.35400390625,,,…,,18540,0,,39023.75390625,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2020-05-15,,,2014-01-25,,"""a55475b1""",,"""12.0""","""2017.0""",,"""a55475b1""","""a55475b1""",2019-03-05,,,2014-10-12,,2018-12-29,,"""1.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-03-21,"""a55475b1""","""a55475b1"""
618576,,,,,20176,23115.552734375,0,0,,3362.800048828125,,,3362.800048828125,,42.5,,10,2,,,11,,9,0,,0,,,2939.552001953125,,0,0,,0,,,…,,3374,0,,23115.552734375,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2020-07-20,,,2018-10-29,,"""a55475b1""",,"""8.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-01-29,,,,,,,"""8.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-02-08,"""a55475b1""","""a55475b1"""
1281195,,,,,38000,35179.28515625,0,0,,4541.80029296875,,,4541.80029296875,,39,,10,2,3,,,,,0,,0,,,,,0,0,,0,,,…,,,0,0,35179.28515625,0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2021-07-14,,,2018-10-04,,"""a55475b1""",,"""10.0""","""2019.0""",,"""a55475b1""","""a55475b1""",2019-02-07,,,,,,,"""10.0""","""2018.0""",,"""a55475b1""","""a55475b1""",2019-02-27,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
87191,,,,,,0,0,,,,,,,,,,8,,,,,,,,,,,,,,,,,,,,…,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,,,,"""a55475b1""",,,,,"""a55475b1""","""a55475b1""",,,,,,,,,,,"""a55475b1""","""a55475b1""",2022-03-26,"""a55475b1""","""a55475b1"""
87206,,56,1000000,,200000,1012822.6875,0,0,0,9068,,,9068,6391.89013671875,20.950000762939453,,0,4,2,24,,0,,0,0,0,0,0,,0,0,0,0,0,0,30,…,100000,,0,0,777849.0625,0,"""ea6782cc""","""00135d9c""","""7241344e""","""7241344e""",2023-06-13,2022-07-29,2020-07-30,2017-06-13,2020-10-29,"""a55475b1""","""8.0""","""4.0""","""2020.0""","""2020.0""","""d6a7d943""","""b619fa46""",2022-03-11,2020-11-25,,,,,"""8.0""","""4.0""","""2020.0""","""2020.0""","""60c73645""","""96a8fdfe""",2022-03-26,"""ab3c25cf""","""ab3c25cf"""
100984,,56,0,75000,0,0,0,0,530,0,400,,0,400,0,,9,1,5,0,,0,,3,589,0,0,0,,0,0,7447.60009765625,20654.3984375,0,20654.3984375,30,…,233376,,0,0,0,0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2023-06-30,2017-05-14,2011-07-13,2019-06-30,2018-08-10,"""a55475b1""","""1.0""","""10.0""","""2020.0""","""2018.0""","""a55475b1""","""a55475b1""",2022-09-07,2018-08-11,2018-08-02,2020-06-04,2018-08-02,2020-06-04,"""1.0""","""10.0""","""2020.0""","""2019.0""","""a55475b1""","""a55475b1""",2022-09-15,"""a55475b1""","""a55475b1"""


In [None]:
total_credit_internal_depth = preprocess("bureau_a_2","int_depth")
total_credit_internal_depth

case_id,int_depth_collater_valueofguarantee_1124L,int_depth_collater_valueofguarantee_876L,int_depth_num_group1,int_depth_num_group2,int_depth_pmts_dpd_1073P,int_depth_pmts_dpd_303P,int_depth_pmts_overdue_1140A,int_depth_pmts_overdue_1152A,int_depth_collater_typofvalofguarant_298M,int_depth_collater_typofvalofguarant_407M,int_depth_collaterals_typeofguarante_359M,int_depth_collaterals_typeofguarante_669M,int_depth_pmts_month_158T,int_depth_pmts_month_706T,int_depth_pmts_year_1139T,int_depth_pmts_year_507T,int_depth_subjectroles_name_541M,int_depth_subjectroles_name_838M
u32,f32,f32,u16,u16,f32,f32,f32,f32,str,str,str,str,str,str,str,str,str,str
1251053,0,,0,23,0,,0,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""5.0""",,"""2018.0""",,"""a55475b1""","""a55475b1"""
630628,16188,,1,35,0,,0,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""7.0""",,"""2018.0""",,"""a55475b1""","""a55475b1"""
642448,0,,2,35,21,,1229.3919677734375,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""11.0""",,"""2018.0""",,"""a55475b1""","""a55475b1"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
82797,0,0,0,9,0,0,0,0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""4.0""","""2.0""","""2020.0""","""2018.0""","""a55475b1""","""a55475b1"""
85784,0,,0,9,0,,0,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""2.0""",,"""2020.0""",,"""a55475b1""","""a55475b1"""
91257,0,0,0,9,0,0,0,0,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""10.0""","""10.0""","""2020.0""","""2017.0""","""a55475b1""","""a55475b1"""


In [None]:
total_credit_external_shallow = preprocess("bureau_b_1","ext_shallow")
total_credit_external_shallow

case_id,ext_shallow_amount_1115A,ext_shallow_credlmt_1052A,ext_shallow_credlmt_228A,ext_shallow_credlmt_3940954A,ext_shallow_credquantity_1099L,ext_shallow_credquantity_984L,ext_shallow_debtpastduevalue_732A,ext_shallow_debtvalue_227A,ext_shallow_dpd_550P,ext_shallow_dpd_733P,ext_shallow_dpdmax_851P,ext_shallow_installmentamount_644A,ext_shallow_installmentamount_833A,ext_shallow_instlamount_892A,ext_shallow_interesteffectiverate_369L,ext_shallow_interestrateyearly_538L,ext_shallow_maxdebtpduevalodued_3940955A,ext_shallow_num_group1,ext_shallow_numberofinstls_810L,ext_shallow_overdueamountmax_950A,ext_shallow_pmtdaysoverdue_1135P,ext_shallow_pmtnumpending_403L,ext_shallow_residualamount_1093A,ext_shallow_residualamount_127A,ext_shallow_residualamount_3940956A,ext_shallow_totalamount_503A,ext_shallow_totalamount_881A,ext_shallow_classificationofcontr_1114M,ext_shallow_contractdate_551D,ext_shallow_contractmaturitydate_151D,ext_shallow_contractst_516M,ext_shallow_contracttype_653M,ext_shallow_credor_3940957M,ext_shallow_dpdmaxdatemonth_804T,ext_shallow_dpdmaxdateyear_742T,ext_shallow_lastupdate_260D,ext_shallow_overdueamountmaxdatemonth_494T,ext_shallow_overdueamountmaxdateyear_432T,ext_shallow_periodicityofpmts_997L,ext_shallow_periodicityofpmts_997M,ext_shallow_pmtmethod_731M,ext_shallow_purposeofcred_722M,ext_shallow_subjectrole_326M,ext_shallow_subjectrole_43M
u32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u16,f32,f32,f32,f32,f32,f32,f32,f32,f32,str,date,date,str,str,str,str,str,date,str,str,str,str,str,str,str,str
188522,612455.875,20000,40000,20000,2,9,0,559850.625,0,0,28517,0,563349.8125,6357.60205078125,4.590000152587891,39,0.6000000238418579,2,120,2,0,107,0,19986.291015625,19986.291015625,626453.875,1523398.375,"""ea6782cc""",2019-04-26,2020-01-02,"""7241344e""","""4257cbed""","""P0_31_66""","""8.0""","""2019.0""",2019-12-20,"""5.0""","""2019.0""",,"""a0b598e4""","""f6e26148""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
1826601,,0,,0,1,,0,,0,,0,,0,,,39,0,0,,0,0,,,0,0,0,,"""ea6782cc""",2019-03-14,2022-01-14,"""7241344e""","""1c9c5356""","""P0_31_66""","""4.0""","""2019.0""",2020-04-04,"""4.0""","""2019.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""ab3c25cf""","""a55475b1"""
1336900,101280,100000,0,100000,2,5,187.68800354003906,2880.93212890625,844,0,16378,0,81657.515625,3657.60009765625,22.950000762939453,45,1.2000000476837158,2,36,4,0,3,0,81657.515625,81657.515625,117080,170753.40625,"""ea6782cc""",2018-07-07,2019-04-14,"""7241344e""","""4257cbed""","""P0_31_66""","""4.0""","""2018.0""",2019-04-17,"""1.0""","""2018.0""",,"""a0b598e4""","""f6e26148""","""60c73645""","""ab3c25cf""","""ab3c25cf"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
57660,,133279,,113279,2,1,0,,0,0,32618,0,109099.578125,,,,11.40000057220459,1,,16.200000762939453,0,,,109099.578125,90845.9765625,133279,16800,"""ea6782cc""",2019-12-08,2021-03-14,"""7241344e""","""1c9c5356""","""b619fa46""","""10.0""","""2017.0""",2020-10-10,"""12.0""","""2019.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""a55475b1""","""a55475b1"""
57689,,144958.796875,,114958.8046875,2,,0,,0,,0,,82776.8046875,,,,0,1,,0,0,,,82776.8046875,72465.8046875,144958.796875,,"""ea6782cc""",2019-08-20,2021-12-31,"""7241344e""","""1c9c5356""","""b619fa46""","""2.0""","""2020.0""",2020-10-10,"""2.0""","""2020.0""",,"""a55475b1""","""a55475b1""","""60c73645""","""a55475b1""","""a55475b1"""
57700,425600,0,,0,2,,6062.72607421875,678996,27282,,27911,,843961.6875,18861,,,4,2,53,4,3,36,,0,0,625600,,"""ea6782cc""",2020-10-12,2023-10-12,"""8f3a197f""","""4257cbed""","""74bd67a8""","""10.0""","""2020.0""",2020-10-14,"""8.0""","""2020.0""",,"""a0b598e4""","""e914c86c""","""60c73645""","""ab3c25cf""","""a55475b1"""


In [None]:
total_credit_external_depth = preprocess("bureau_b_2","ext_depth")
total_credit_external_depth

case_id,ext_depth_num_group1,ext_depth_num_group2,ext_depth_pmts_dpdvalue_108P,ext_depth_pmts_pmtsoverdue_635A,ext_depth_pmts_date_1107D
u32,u16,u16,f32,f32,date
2575210,1,36,0,0,2018-11-15
855690,1,8,0,0,2019-10-15
104753,2,35,52730,2,2018-10-15
…,…,…,…,…,…
1686588,1,6,0,0,2019-09-15
1946270,1,35,0,0,2020-09-15
57660,1,34,8814,2.200000047683716,2020-08-15


In [None]:
total_registry_a = preprocess("registry_a","reg_a")
total_registry_a

case_id,reg_a_amount_4527230A,reg_a_num_group1,reg_a_name_4527232M,reg_a_recorddate_4527225D
u32,f32,u16,str,date
1552673,4837.60009765625,5,"""88a34b2a""",2019-10-09
939084,1596.2000732421875,8,"""1dfc94c5""",2020-02-01
857527,3412.800048828125,15,"""d4ad0a11""",2019-11-24
…,…,…,…,…
1704672,2596,5,"""5670e628""",2020-01-06
57679,3640,4,"""ba006408""",2020-10-20
57689,3214.800048828125,6,"""aac4edf4""",2020-10-20


In [None]:
total_registry_b = preprocess("registry_b","reg_b")
total_registry_b

case_id,reg_b_amount_4917619A,reg_b_num_group1,reg_b_deductiondate_4917603D,reg_b_name_4917606M
u32,f32,u16,date,str
240940,10135,5,2020-06-05,"""b0356c56"""
1946693,8910,5,2020-05-07,"""cb8cda49"""
985345,18630,5,2020-02-03,"""4e7d21f5"""
…,…,…,…,…
1899732,20873.80078125,5,2020-07-14,"""f9a186d6"""
57549,8760.6005859375,11,2020-08-06,"""bf93e092"""
57543,31060.201171875,4,2020-09-17,"""d2142f5c"""


In [None]:
total_registry_c = preprocess("registry_c","reg_c")
total_registry_c

case_id,reg_c_num_group1,reg_c_pmtamount_36A,reg_c_employername_160M,reg_c_processingdate_168D
u32,u16,f32,str,date
135303,9,1331.5999755859375,"""18ac4741""",2019-04-09
718866,4,1400,"""b37ec2f9""",2019-06-10
1365205,12,4499.60009765625,"""d4fe4906""",2019-03-29
…,…,…,…,…
1530489,1,5535.80029296875,"""09de35c5""",2019-09-24
701130,5,850,"""527d1b27""",2019-04-19
18197,2,850,"""587ca2ba""",2019-04-25


In [None]:
drop_columns = [
    "Date",
    "static_base_commnoinclast6m_3546845L",
    "static_base_deferredmnthsnum_166L",
    "static_base_mastercontrelectronic_519L",
    "static_base_mastercontrexist_109L",
    "static_base_bankacctype_710L",
    "static_base_isdebitcard_729L",
    "static_base_paytype1st_925L",
    "static_base_paytype_783L",
    "static_base_typesuite_864L",
    "person_shallow_contaddr_matchlist_1032L",
    "person_shallow_remitter_829L",
    "person_depth_empls_employer_name_740M",
    "int_shallow_subjectrole_182M",
    "person_shallow_relationshiptoclient_642T",
    "int_shallow_refreshdate_3813885D",
    "int_shallow_monthlyinstlamount_674A",
    "static_base_maxlnamtstart6m_4525199A",
    "static_base_dtlastpmtallstes_4499206D",
    "static_base_cardtype_51L",
    "static_base_applicationscnt_1086L",
    "past_shallow_profession_152M",
    "int_shallow_periodicityofpmts_1102L",
    "past_shallow_annuity_853A",
    "card_shallow_openingdate_857D",
    "deposit_shallow_openingdate_313D",
    "int_shallow_credlmt_230A",
    "static_base_avgpmtlast12m_4525200A",
    "int_shallow_monthlyinstlamount_332A",
    "static_base_maxinstallast24m_3658928A",
    "int_shallow_outstandingamount_354A",
    "static_base_clientscnt_946L",
    "int_shallow_credlmt_935A",
    "int_shallow_dpdmaxdatemonth_442T",
    "int_shallow_periodicityofpmts_837L",
    "static_base_clientscnt_304L",
    "int_shallow_totaloutstanddebtvalue_668A",
    "int_shallow_residualamount_488A",
    "person_shallow_contaddr_smempladdr_334L",
    "person_depth_conts_role_79M",
    "int_shallow_subjectrole_93M",
    "int_shallow_contractsum_5085717L",
    "past_depth_cacccardblochreas_147M",
    "int_shallow_overdueamount_659A",
    "int_shallow_debtoverdue_47A",
    "int_shallow_totaldebtoverduevalue_178A",
    "int_shallow_overdueamountmaxdatemonth_284T",
    "person_shallow_empladdr_district_926M",
    "person_depth_empls_economicalst_849M",
    "static_base_maxpmtlast3m_4525190A",
    "past_shallow_isbidproduct_390L",
    "person_shallow_housetype_905L",
    "person_shallow_empladdr_zipcode_114M",
    "int_shallow_overdueamountmax_155A",
    "int_shallow_overdueamountmax2_14A",
    "int_shallow_financialinstitution_591M",
    "static_base_totinstallast1m_4525188A",
    "static_base_lastapprcommoditycat_1041M",
    "static_base_applicationcnt_361L",
    "person_shallow_safeguarantyflag_411L",
    "person_depth_addres_zip_823M",
    "person_depth_addres_district_368M",
    "int_shallow_overdueamountmax_35A",
    "static_base_clientscnt_493L",
    "int_shallow_overdueamountmax2_398A",
    "past_shallow_actualdpd_943P",
    "past_depth_credacc_cards_status_52L",
    "person_depth_num_group2",
    "static_base_actualdpdtolerance_344P",
    ]

In [None]:
gain_df = pl.read_csv("/home/sohail/Downloads/gains.csv")
drop_list = gain_df.filter(pl.col("gain") < 5000)["col_name"].to_list()
select_list = gain_df.filter(pl.col("gain") > 20000)["col_name"].to_list()

In [None]:
df = (
    total_base
    .join(
        total_past_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_past_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_static_base,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_person_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_other_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_deposit_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_debitcard_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_internal_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_shallow,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_credit_external_depth,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_a,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_b,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .join(
        total_registry_c,
        left_on="case_id_base",
        right_on="case_id",
        how="left"
    )
    .with_columns(
        (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
        pl.col(pl.String).cast(pl.Categorical)
    )
    .drop("Date")
    # .with_columns(
    #     pl.col(pl.INTEGER_DTYPES).fill_null(strategy="mean"),
    #     pl.col(pl.FLOAT_DTYPES).fill_null(strategy="mean"),
    #     pl.col(pl.Boolean).fill_null(value=pl.col(pl.Boolean).drop_nulls().mode().first()),
    #     pl.col(pl.Categorical).fill_null(value=pl.col(pl.Categorical).drop_nulls().mode().first())
    # )
)

In [None]:
df.write_parquet("/home/sohail/Downloads/total_df.parquet")

In [30]:
# corr_df = pd.DataFrame()
# for i,col in enumerate(total_df.columns):
#     corr_df.loc[i,"col"] = col
#     corr_df.loc[i,"val"] = (
#         total_df
#         .select(pl.corr("target",col))
#         .item()
#     )
# corr_df = pl.from_pandas(corr_df).sort(by="val")
# cols_list = corr_df.filter((pl.col("val") > -0.0025) & (pl.col("val") < 0.0025))["col"].to_list()

In [31]:
test_base.select("case_id_base").to_series().to_list()

[57543, 57549, 57551, 57552, 57569, 57630, 57631, 57632, 57633, 57634]

In [32]:
submission_df = df[df["case_id_base"].isin(test_base.select("case_id_base").sort(by="case_id_base").to_series().to_list())].drop(columns=["target"])
submission_df

AttributeError: 'Series' object has no attribute 'isin'

In [None]:
X_total = df.loc[~df["case_id_base"].isin(test_base.select("case_id_base").sort(by="case_id_base").to_series().to_list())].drop(columns=["target","case_id_base"])
y_total = df.loc[~df["case_id_base"].isin(test_base.select("case_id_base").sort(by="case_id_base").to_series().to_list()),["target"]]

In [None]:
del df
del total_past_shallow
del total_past_depth
del total_static_base
del total_person_depth
del total_person_shallow
del total_other_shallow
del total_deposit_shallow
del total_debitcard_shallow
del total_credit_external_depth
del total_credit_external_shallow
del total_credit_internal_depth
del total_credit_internal_shallow
del total_registry_a
del total_registry_b
del total_registry_c
gc.collect()

27

In [None]:
params = {
    "objective":"binary",
    "metric":"auc",
    "max_depth": 16,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees": True,
    "num_leaves": 64,
    "device": "gpu",
    "verbose": -1
}

In [None]:
X_train,X_valid,y_train,y_valid = train_test_split(X_total,y_total,stratify=y_total,random_state=69,shuffle=True,test_size=0.15)
del X_total,y_total

In [None]:
model = lgb.LGBMClassifier(**params)
model.fit(X_train,y_train,eval_set=[(X_valid,y_valid)],callbacks=[lgb.log_evaluation(100),lgb.early_stopping(500)])

Training until validation scores don't improve for 500 rounds
[100]	valid_0's auc: 0.832495
[200]	valid_0's auc: 0.843628
[300]	valid_0's auc: 0.848076
[400]	valid_0's auc: 0.850703
[500]	valid_0's auc: 0.85213
[600]	valid_0's auc: 0.853176
[700]	valid_0's auc: 0.853795
[800]	valid_0's auc: 0.854136
[900]	valid_0's auc: 0.854428
[1000]	valid_0's auc: 0.854753
[1100]	valid_0's auc: 0.854902
[1200]	valid_0's auc: 0.855
[1300]	valid_0's auc: 0.855121
[1400]	valid_0's auc: 0.855187
[1500]	valid_0's auc: 0.85526
[1600]	valid_0's auc: 0.855392
[1700]	valid_0's auc: 0.855437
[1800]	valid_0's auc: 0.855495
[1900]	valid_0's auc: 0.855499
[2000]	valid_0's auc: 0.855426
Did not meet early stopping. Best iteration is:
[1823]	valid_0's auc: 0.85554


In [None]:
train_ds = tf.data.Dataset.from_tensor_slices(X_train,y_train)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
model = keras.Sequential()
model.add(keras.layers.Dense(100))
model.add(keras.layers.Dense(100))
model.add(keras.layers.Dense(100))
model.add(keras.layers.Dense(100))
model.add(keras.layers.Dense(1))
model.compile(
    loss="binary_crossentropy",
    metrics=[keras.metrics.AUC()],
    optimizer=keras.optimizers.Adam()
)
model.fit(X_train.to_numpy(),y_train.to_numpy(),validation_data=(X_valid.to_numpy(),y_valid.to_numpy()),batch_size=32)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
# splits = 3
# models = [
#     lgb.LGBMClassifier(**params) for _ in range(splits)
# ]
# cv = StratifiedGroupKFold(n_splits=splits)
# for i,(train_ind,valid_ind) in enumerate(cv.split(X_total,y_total,groups=X_total["week"])):
#     X_train,y_train = X_total.iloc[train_ind],y_total.iloc[train_ind]
#     X_valid,y_valid = X_total.iloc[valid_ind],y_total.iloc[valid_ind]
#     models[i].fit(X_train,y_train,eval_set=[(X_valid,y_valid)],callbacks=[lgb.log_evaluation(100),lgb.early_stopping(50)])
#     y_pred = models[i].predict_proba(X_valid)[:,1]
#     print(f"<-------- Roc_Auc score for split {i+1} is {roc_auc_score(y_true=y_valid,y_score=y_pred)} -------->")

In [None]:
# cols = []
# for col in X_train.columns:
#     if "person_shallow" in col:
#         cols.append(col)
# X_train[cols]

In [None]:
# class Model:

#     def __init__(self,_models:list) -> None:
#         self.models = _models

#     def predict(self,X):
#         return np.mean([_.predict(X) for _ in self.models],axis=0)
    
#     def predict_proba(self,X):
#         return np.mean([_.predict_proba(X)[:,1] for _ in self.models],axis=0)

In [None]:
# model = Model(models)
# y_pred = model.predict_proba(X_valid)
# roc_auc_score(y_valid,y_pred)

In [None]:
sub_df = pd.DataFrame({
    "case_id": submission_df['case_id_base'].to_list(),
    "score": model.predict_proba(submission_df.drop(columns=["case_id_base"]))
}).set_index("case_id")
sub_df.to_csv("./submission.csv")

ValueError: Per-column arrays must each be 1-dimensional