In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

ROOT = '/home/sohail/Downloads/credit_risk'

In [2]:
from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [3]:
class Pipeline:

    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))  #!!?
                df = df.with_columns(pl.col(col).dt.total_days()) # t - t-1
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()
                if isnull > 0.7:
                    df = df.drop(col)
        
        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.String):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)
        
        return df

In [4]:
class Aggregator:
    
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max
    
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]  # max & replace col name
        return expr_max
    
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs

In [5]:
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df)) 
    return df

def read_files(regex_path, depth=None):
    chunks = []
    
    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)
    
    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

In [6]:


def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base



In [7]:


def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols



In [8]:


ROOT            = Path("/home/sohail/Downloads/credit_risk")

TRAIN_DIR       = ROOT / "train"
TEST_DIR        = ROOT / "test"



In [9]:
train_credit = read_files(TRAIN_DIR/"train_credit_bureau_a_1_*.parquet",1)
train_credit.head()

case_id,max_credlmt_230A,max_credlmt_935A,max_debtoutstand_525A,max_debtoverdue_47A,max_dpdmax_139P,max_dpdmax_757P,max_instlamount_768A,max_instlamount_852A,max_monthlyinstlamount_332A,max_monthlyinstlamount_674A,max_outstandingamount_354A,max_outstandingamount_362A,max_overdueamount_31A,max_overdueamount_659A,max_overdueamountmax2_14A,max_overdueamountmax2_398A,max_overdueamountmax_155A,max_overdueamountmax_35A,max_residualamount_488A,max_residualamount_856A,max_totalamount_6A,max_totalamount_996A,max_totaldebtoverduevalue_178A,max_totaldebtoverduevalue_718A,max_totaloutstanddebtvalue_39A,max_totaloutstanddebtvalue_668A,max_dateofcredend_289D,max_dateofcredend_353D,max_dateofcredstart_181D,max_dateofcredstart_739D,max_dateofrealrepmt_138D,max_lastupdate_1112D,max_lastupdate_388D,max_numberofoverdueinstlmaxdat_148D,max_numberofoverdueinstlmaxdat_641D,max_overdueamountmax2date_1002D,…,max_contractst_964M,max_description_351M,max_financialinstitution_382M,max_financialinstitution_591M,max_purposeofcred_426M,max_purposeofcred_874M,max_subjectrole_182M,max_subjectrole_93M,max_annualeffectiverate_199L,max_annualeffectiverate_63L,max_contractsum_5085717L,max_dpdmaxdatemonth_442T,max_dpdmaxdatemonth_89T,max_dpdmaxdateyear_596T,max_dpdmaxdateyear_896T,max_interestrate_508L,max_nominalrate_281L,max_nominalrate_498L,max_numberofcontrsvalue_258L,max_numberofcontrsvalue_358L,max_numberofinstls_229L,max_numberofinstls_320L,max_numberofoutstandinstls_520L,max_numberofoutstandinstls_59L,max_numberofoverdueinstlmax_1039L,max_numberofoverdueinstlmax_1151L,max_numberofoverdueinstls_725L,max_numberofoverdueinstls_834L,max_overdueamountmaxdatemonth_284T,max_overdueamountmaxdatemonth_365T,max_overdueamountmaxdateyear_2T,max_overdueamountmaxdateyear_994T,max_periodicityofpmts_1102L,max_periodicityofpmts_837L,max_prolongationcount_1120L,max_prolongationcount_599L,max_num_group1
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,date,date,date,date,date,date,date,date,date,date,…,str,str,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64
1570096,0.0,80000.0,130906.98,0.0,0.0,1032.0,6169.8003,0.0,6169.8003,8822.601,0.0,45613.12,0.0,0.0,2273.0461,89100.7,0.0,49502.656,0.0,80000.0,112500.0,47365.6,0.0,0.0,130906.98,0.0,2021-10-05,2019-08-04,2019-05-05,2019-10-05,2019-08-05,2019-10-15,2019-08-05,2018-12-27,2017-03-04,2016-11-01,…,"""b83056f9""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""e19fdece""","""ab3c25cf""","""ab3c25cf""",39.44,0.13,,8.0,11.0,2019.0,2019.0,,45.0,31.43,4.0,7.0,36.0,12.0,0.0,11.0,33.0,1174.0,0.0,0.0,11.0,11.0,2019.0,2019.0,30.0,30.0,,,10
141859,,0.0,20332.45,0.0,0.0,,0.0,,3729.8,,,20332.45,,0.0,0.0,,0.0,,,0.0,,35524.2,0.0,,20332.45,,2020-06-26,,,2018-12-14,,2019-07-11,,,,,…,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""ab3c25cf""","""a55475b1""",,,,,7.0,2019.0,,,45.0,,2.0,,,12.0,,6.0,0.0,,0.0,,,7.0,2019.0,,,30.0,,,8
1012334,,326000.0,265916.47,0.0,0.0,0.0,9747.438,,9747.438,4200.492,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,265916.47,110000.0,,0.0,0.0,265916.47,0.0,2023-08-30,2020-04-23,2019-05-06,2019-08-30,2019-08-30,2020-09-03,2019-09-28,,,,…,"""a55475b1""","""a55475b1""","""d6a7d943""","""a55475b1""","""a55475b1""","""e19fdece""","""ab3c25cf""","""ab3c25cf""",,,1329600.0,9.0,9.0,2019.0,2019.0,,,,1.0,6.0,12.0,,0.0,,0.0,0.0,0.0,0.0,9.0,9.0,2019.0,2019.0,30.0,,,,8
1793193,24798.0,20000.0,0.0,0.0,9.0,52.0,0.0,626.626,0.0,27831.475,0.0,,0.0,0.0,11507.524,2044.0,11507.524,2044.0,0.0,0.0,32445.201,,0.0,0.0,0.0,0.0,2022-03-05,2021-02-14,2019-06-04,2018-10-21,2020-01-08,2020-03-04,2020-01-08,2019-12-04,2018-06-15,2019-11-04,…,"""a55475b1""","""a55475b1""","""a55475b1""","""b619fa46""","""a55475b1""","""a55475b1""","""ab3c25cf""","""ab3c25cf""",,,,12.0,11.0,2018.0,2019.0,,,45.0,2.0,9.0,7.0,,0.0,,11.0,57.0,0.0,0.0,12.0,11.0,2019.0,2019.0,30.0,,0.0,,10
242387,,0.0,63980.54,0.0,0.0,23.0,0.0,,8344.32,2450.8,0.0,63980.54,0.0,0.0,4994.56,2307.2,0.0,50.4,,0.0,28860.0,99600.0,0.0,0.0,63980.54,0.0,2021-08-31,2020-06-28,2019-12-28,2019-12-28,2020-06-28,2020-08-12,2020-06-28,2016-02-09,2017-07-14,2016-01-20,…,"""a55475b1""","""a55475b1""","""b619fa46""","""b619fa46""","""a55475b1""","""a55475b1""","""ab3c25cf""","""ab3c25cf""",,,319902.7,12.0,9.0,2020.0,2020.0,,,40.05,2.0,6.0,24.0,16.0,0.0,9.0,11.0,25.0,0.0,0.0,12.0,9.0,2020.0,2020.0,30.0,30.0,,,8


In [9]:


data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}


In [10]:
df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)

train data shape:	 (1526659, 487)


In [11]:
df_train

case_id,WEEK_NUM,target,month_decision,weekday_decision,assignmentdate_238D,assignmentdate_4527235D,assignmentdate_4955616D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,dateofbirth_342D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,for3years_128L,for3years_504L,for3years_584L,formonth_118L,formonth_206L,formonth_535L,forquarter_1017L,forquarter_462L,forquarter_634L,fortoday_1092L,forweek_1077L,forweek_528L,forweek_601L,foryear_618L,foryear_818L,foryear_850L,…,max_pmts_pmtsoverdue_635A,max_pmts_date_1107D,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_subjectroles_name_541M,max_subjectroles_name_838M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_pmts_month_158T,max_pmts_month_706T,max_pmts_year_1139T,max_pmts_year_507T,max_num_group1_13,max_num_group2_13,max_cacccardblochreas_147M,max_conts_type_509L,max_credacc_cards_status_52L,max_num_group1_14,max_num_group2_14,max_empls_employedfrom_796D,max_addres_district_368M,max_addres_zip_823M,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,max_addres_role_871L,max_relatedpersons_role_762T,max_num_group1_15,max_num_group2_15
i64,i64,i64,i8,i8,i64,i64,i64,i64,f64,i64,i64,f64,f64,f64,f64,f64,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,i64,i64,i64,f64,f64,f64,f64,str,str,str,str,str,str,f64,f64,f64,f64,f64,f64,i64,i64,str,str,str,i64,i64,i64,str,str,str,str,str,str,str,i64,i64
0,0,0,1,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0,0,1,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0,0,1,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,"""PRIMARY_MOBILE…",,1,1,,,,,,,,,,
3,0,0,1,4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,"""PRIMARY_MOBILE…",,0,2,,,,,,,,,,
4,0,1,1,5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,"""PRIMARY_MOBILE…",,0,1,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2703450,91,0,10,1,,,-998,,52863.59,-22193,,0.0,0.0,0.0,0.0,0.0,"""2fc785b2""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,…,,,,,0.0,44.0,0.0,4316.44,"""a55475b1""","""a55475b1""","""c7a5ad39""","""c7a5ad39""","""ab3c25cf""","""ab3c25cf""",0.0,0.0,12.0,12.0,2021.0,2021.0,10,35,"""a55475b1""","""PRIMARY_MOBILE…","""INACTIVE""",12,2,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,0,0
2703451,91,0,10,1,,,-5591,,324608.52,-25541,,0.0,0.0,0.0,0.0,0.0,"""2fc785b2""","""a55475b1""","""a55475b1""",1.0,,,,,,,,,,,,,,,,,…,,,,,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""c7a5ad39""","""c7a5ad39""","""ab3c25cf""","""ab3c25cf""",0.0,0.0,12.0,12.0,2021.0,2021.0,3,23,"""a55475b1""","""PRIMARY_MOBILE…","""ACTIVE""",5,3,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,0,0
2703452,91,0,10,1,,,,,102738.76,-15771,,2.0,2.0,0.0,3.0,2.0,"""2fc785b2""","""a55475b1""","""a55475b1""",0.0,,,,,,,,,,,,,,,,,…,,,,,16.0,0.0,4884.2983,0.0,"""a55475b1""","""a55475b1""","""c7a5ad39""","""c7a5ad39""","""ab3c25cf""","""ab3c25cf""",0.0,0.0,12.0,12.0,2021.0,2020.0,2,35,"""a55475b1""","""PRIMARY_MOBILE…",,2,1,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,0,0
2703453,91,0,10,1,,,-4616,,212683.29,-25814,,2.0,2.0,1.0,4.0,1.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",1.0,,,,,,,,,,,,,,,,,…,,,,,0.0,23.0,0.0,2693.2,"""a55475b1""","""a55475b1""","""c7a5ad39""","""c7a5ad39""","""ab3c25cf""","""ab3c25cf""",0.0,0.0,12.0,12.0,2021.0,2020.0,12,35,"""a55475b1""","""PRIMARY_MOBILE…","""ACTIVE""",8,3,,"""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",,,0,0


In [12]:
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [13]:
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)

test data shape:	 (10, 486)


In [14]:
drop_list = ['max_empl_employedtotal_800L', 'monthsannuity_845L', 'lastactivateddate_801D', 
             'max_numberofoverdueinstls_725L', 'requesttype_4525192L', 'max_pmts_year_507T', 
             'lastrejectcommodtypec_5251769M', 'numinstpaidlate1d_3546852L', 'numinstmatpaidtearly2d_4499204L', 
             'max_overdueamountmaxdateyear_2T', 'max_overdueamountmaxdateyear_994T', 'twobodfilling_608L', 
             'maxdpdlast12m_727P', 'numinsttopaygrest_4493213L', 'currdebtcredtyperange_828A', 'maxdpdlast9m_1059P', 
             'numinstpaid_4499208L', 'applicationscnt_867L', 'numinstlswithoutdpd_562L', 'fourthquarter_440L', 
             'max_num_group1_6', 'max_safeguarantyflag_411L', 'max_dpdmaxdateyear_896T', 'numinstregularpaid_973L', 
             'avgdbdtollast24m_4525197P', 'numinstpaidearly5dest_4493211L', 'numinstpaidearly5dobd_4499205L', 
             'homephncnt_628L', 'max_role_1084L', 'max_remitter_829L', 'numrejects9m_859L', 
             'numinstlallpaidearly3d_817L', 'numinstpaidearly3dest_4493216L', 'annuitynextmonth_57A', 
             'numinstregularpaidest_4493210L', 'firstquarter_103L', 'clientscnt_533L', 'maxdpdlast3m_392P', 
             'sellerplacescnt_216L', 'secondquarter_766L', 'max_periodicityofpmts_1102L', 'numinstlsallpaid_934L', 
             'opencred_647L', 'numinstls_657L', 'numactivecredschannel_414L', 'numinstpaidearly3d_3546850L', 
             'numinstpaidearlyest_4493214L', 'max_totaldebtoverduevalue_718A', 'paytype1st_925L', 
             'max_inittransactioncode_279L', 'max_contractst_545M', 'max_cancelreason_3545846M', 
             'max_rejectreason_755M', 'max_personindex_1023L', 'max_subjectroles_name_838M', 'maxdpdlast6m_474P', 
             'max_subjectrole_182M', 'actualdpdtolerance_344P', 'max_num_group1_9', 'max_collaterals_typeofguarante_669M', 
             'numinstpaidearly_338L', 'clientscnt_887L', 'maritalst_893M', 'max_subjectrole_93M', 'max_type_25L', 
             'max_refreshdate_3813885D', 'numinstpaidearly5d_1087L', 'max_actualdpd_943P', 'max_description_351M', 
             'education_88M', 'clientscnt_946L', 'clientscnt12m_3712952L', 'numactiverelcontr_750L', 
             'max_education_927M', 'applicationscnt_1086L', 'sellerplacecnt_915L', 'max_purposeofcred_426M', 
             'max_subjectroles_name_541M', 'clientscnt_1022L', 'clientscnt_360L', 'max_totaloutstanddebtvalue_668A', 
             'applicationscnt_629L', 'max_outstandingamount_354A', 'clientscnt_1071L', 'numactivecreds_622L', 
             'clientscnt_493L', 'paytype_783L', 'clientscnt6m_3712949L', 'clientscnt_304L', 'max_classificationofcontr_13M', 
             'numnotactivated_1143L', 'commnoinclast6m_3546845L', 'max_numberofoutstandinstls_520L', 
             'applicationscnt_464L', 'clientscnt_1130L', 'max_numberofoverdueinstls_834L', 'clientscnt3m_3712950L', 
             'max_rejectreasonclient_4145042M', 'max_contaddr_smempladdr_334L', 'numpmtchanneldd_318L', 
             'numcontrs3months_479L', 'max_overdueamount_31A', 'max_collaterals_typeofguarante_359M', 
             'clientscnt_257L', 'clientscnt_157L', 'applications30d_658L', 'clientscnt_100L', 
             'max_collater_typofvalofguarant_298M', 'max_pmts_month_706T', 'max_pmts_month_158T', 
             'mastercontrexist_109L', 'max_collater_typofvalofguarant_407M', 'mastercontrelectronic_519L', 
             'applicationcnt_361L', 'max_persontype_1072L', 'max_empladdr_district_926M', 'deferredmnthsnum_166L', 
             'max_empladdr_zipcode_114M', 'max_persontype_792L', 'max_contaddr_matchlist_1032L']

In [15]:
_ = df_train[["clientscnt_493L","target","max_persontype_792L","max_isbidproduct_390L"]]
_.corr()

clientscnt_493L,target,max_persontype_792L,max_isbidproduct_390L
f64,f64,f64,f64
1.0,0.002045,0.007691,
0.002045,1.0,0.014058,
0.007691,0.014058,1.0,
,,,


In [16]:
df_train = df_train.drop(drop_list)
df_test = df_test.drop(drop_list)

In [17]:
df_train = df_train.pipe(Pipeline.filter_cols)
df_test = df_test.select([col for col in df_train.columns if col != "target"])

print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

train data shape:	 (1526659, 207)
test data shape:	 (10, 206)


In [18]:
df_train, cat_cols = to_pandas(df_train)
df_test, cat_cols = to_pandas(df_test, cat_cols)

In [19]:
del data_store
gc.collect()

0

In [20]:
cat_list = [col for col in df_train.columns if df_train[col].dtype.name == 'category']

catfreq_dict = {}
catcatfreq_dict = {}

for col in cat_list:
    catfreq_dict[col] = len(list(df_train[col].value_counts()))
    catcatfreq_dict[col] = {}
    for d in dict(df_train[col].value_counts()).items():
        catcatfreq_dict[col][d[0]] = d[1]

catfreq_df = pd.DataFrame.from_dict(catfreq_dict, orient='index', columns=['Categories'])
display(catfreq_df.sort_values(by="Categories", ascending=False).head())
display(catfreq_df.sort_values(by="Categories", ascending=True).head())

Unnamed: 0,Categories
max_financialinstitution_382M,148
max_contractst_964M,141
max_classificationofcontr_400M,118
max_financialinstitution_591M,85
lastcancelreason_561M,74


Unnamed: 0,Categories
max_empls_employer_name_740M,1
max_sex_738L,2
max_isbidproduct_390L,2
description_5085714M,2
disbursementtype_67L,3


In [21]:
ordinal_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)
df_train[cat_list] = ordinal_enc.fit_transform(df_train[cat_list])
df_test[cat_list] = ordinal_enc.transform(df_test[cat_list])
df_train[cat_list].head()

Unnamed: 0,description_5085714M,education_1103M,maritalst_385M,credtype_322L,disbursementtype_67L,inittransactioncode_186L,lastapprcommoditycat_1041M,lastcancelreason_561M,lastrejectcommoditycat_161M,lastrejectreason_759M,...,max_familystate_447L,max_incometype_1044T,max_relationshiptoclient_415T,max_relationshiptoclient_642T,max_sex_738L,max_cacccardblochreas_147M,max_conts_type_509L,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M
0,,,,0.0,1.0,0.0,44.0,73.0,44.0,17.0,...,2.0,7.0,9.0,9.0,0.0,,,,,
1,,,,0.0,1.0,0.0,44.0,73.0,44.0,17.0,...,0.0,7.0,8.0,8.0,1.0,,,,,
2,,,,0.0,1.0,0.0,44.0,73.0,44.0,17.0,...,2.0,0.0,9.0,9.0,0.0,,4.0,,,
3,,,,0.0,1.0,0.0,44.0,67.0,44.0,15.0,...,2.0,0.0,9.0,9.0,0.0,,4.0,,,
4,,,,0.0,1.0,0.0,44.0,41.0,44.0,17.0,...,2.0,0.0,8.0,8.0,0.0,,4.0,,,


In [22]:
nan_list = []
for col, boo in df_train.isnull().any().items():
    if boo == True:
        nan_list.append(col)

print(f"Number of col contains Nan value: {len(nan_list)}")
for i, v in df_train.isna().sum().items():
    if v/len(df_train)>0.6:
        print(f"{i} : \t {round((v/len(df_train))*100)}% Nan ")

Number of col contains Nan value: 186
birthdate_574D : 	 60% Nan 
pmtscount_423L : 	 62% Nan 
pmtssum_45A : 	 62% Nan 
avgdbddpdlast3m_4187120P : 	 62% Nan 
avgpmtlast12m_4525200A : 	 67% Nan 
dtlastpmtallstes_4499206D : 	 64% Nan 
lastdelinqdate_224D : 	 64% Nan 
maxdbddpdlast1m_3658939P : 	 63% Nan 
maxlnamtstart6m_4525199A : 	 68% Nan 
mindbdtollast24m_4525191P : 	 64% Nan 
max_byoccupationinc_3656910L : 	 63% Nan 
max_pmtamount_36A : 	 68% Nan 
max_processingdate_168D : 	 68% Nan 
max_num_group1_5 : 	 68% Nan 
max_credlmt_230A : 	 68% Nan 
max_residualamount_488A : 	 68% Nan 
max_numberofoverdueinstlmaxdat_641D : 	 66% Nan 
max_overdueamountmax2date_1142D : 	 66% Nan 
max_empl_employedfrom_271D : 	 63% Nan 
max_empl_industry_691L : 	 66% Nan 
max_relationshiptoclient_415T : 	 61% Nan 
max_relationshiptoclient_642T : 	 61% Nan 


In [23]:
from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
df_train[nan_list] = imp.fit_transform(df_train[nan_list])
df_test[nan_list] = imp.transform(df_test[nan_list])

In [27]:
X = df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
y = df_train["target"]
weeks = df_train["WEEK_NUM"]

cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.05,
    "n_estimators": 2000,  
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": "gpu", 
    "verbose": -1,
}

fitted_models = []
cv_scores = []

for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(60)] )
    fitted_models.append(model)
    
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores.append(auc_score)
    
print("CV AUC scores: ", cv_scores)
print("Maximum CV AUC score: ", max(cv_scores))

Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.840531
[400]	valid_0's auc: 0.846133
[600]	valid_0's auc: 0.848306
[800]	valid_0's auc: 0.849228
[1000]	valid_0's auc: 0.849683
[1200]	valid_0's auc: 0.850028
[1400]	valid_0's auc: 0.850256
[1600]	valid_0's auc: 0.850358
Early stopping, best iteration is:
[1557]	valid_0's auc: 0.850386
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.842538
[400]	valid_0's auc: 0.848707
[600]	valid_0's auc: 0.850751
[800]	valid_0's auc: 0.851871
[1000]	valid_0's auc: 0.852405
[1200]	valid_0's auc: 0.852642
[1400]	valid_0's auc: 0.852863
Early stopping, best iteration is:
[1397]	valid_0's auc: 0.852873
Training until validation scores don't improve for 60 rounds
[200]	valid_0's auc: 0.845587
[400]	valid_0's auc: 0.851824
[600]	valid_0's auc: 0.854178
[800]	valid_0's auc: 0.85535
[1000]	valid_0's auc: 0.856141
[1200]	valid_0's auc: 0.856488
[1400]	valid_0's auc: 0.856786
[1600]	valid_0's

In [28]:


class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators
        
    def fit(self, X, y=None):
        return self
    
    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)
    
    def predict_proba(self, X):
        y_preds = [estimator.predict_proba(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models)

In [29]:
X_test = df_test.drop(columns=["WEEK_NUM"])
X_test = X_test.set_index("case_id")

lgb_pred = pd.Series(model.predict_proba(X_test)[:, 1], index=X_test.index)

In [30]:
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

df_subm["score"] = lgb_pred

FileNotFoundError: [Errno 2] No such file or directory: '/home/sohail/Downloads/credit_risk/sample_submission.csv'

In [None]:
df_subm.to_csv("submission.csv")