In [9]:
import numpy as np
import pandas as pd
import polars as pl
import os

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

## Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data) <br>
[Discussion on how the data is setup](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/473950) <br>
[Starter Notebook](https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook)
* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [11]:
def set_table_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    for col in df.columns:
        if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
            df = df.with_columns(pl.col(col).cast(pl.Int64))
        elif col in ["date_decision"]:
            df = df.with_columns(pl.col(col).cast(pl.Date))
        elif col[-1] in ("P", "A"):
            df = df.with_columns(pl.col(col).cast(pl.Float64))
        elif col[-1] in ("M",):
            df = df.with_columns(pl.col(col).cast(pl.String))
        elif col[-1] in ("D",):
            df = df.with_columns(pl.col(col).cast(pl.Date))
    return df

def convert_strings(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:  
        if df[col].dtype.name in ['object', 'string']:
            df[col] = df[col].astype("string").astype('category')
            current_categories = df[col].cat.categories
            new_categories = current_categories.to_list() + ["Unknown"]
            new_dtype = pd.CategoricalDtype(categories=new_categories, ordered=True)
            df[col] = df[col].astype(new_dtype)
    return df

    

In [30]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples   = None, 
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        


        self.parent_path = parent_path
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.features  = []
        
        self.run()

    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]
    
    def create_base_dataset(self):
        
        # load in the training dataset 
        if self.n_samples:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(set_table_dtypes).sample(n=self.n_samples).with_columns(pl.lit('train').alias('partition'))
        else:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(set_table_dtypes).with_columns(pl.lit('train').alias('partition'))
        
        # load in the test dataset
        test =  pl.read_parquet(f"{self.parent_path}/parquet_files/test/test_base.parquet")\
                .pipe(set_table_dtypes).with_columns(pl.lit('test').alias('partition'))
        
        # concat train and test
        self.df = pl.concat([train,test],how='diagonal_relaxed')
        
        # get all case_ids
        self.case_ids = self.df.get_column('case_id').to_list()
        

    def read_in_files_with_criteria(self, criteria:str):
        train_df  = pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/train/{x}").pipe(set_table_dtypes).filter(pl.col('case_id').is_in(self.case_ids))
                       for x in os.listdir(f"{self.parent_path}/parquet_files/train") if (criteria in x)],how='diagonal_relaxed')
        test_df  =  pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/test/{x}").pipe(set_table_dtypes)
                       for x in os.listdir(f"{self.parent_path}/parquet_files/test") if (criteria in x)],how='diagonal_relaxed')
        
        # for some reason, being in train partition doesnt gaurentee it is in the test partition, so we have to ensure it 
        columns_in_common = list(set(train_df.columns).intersection(test_df.columns))
        
        df = pl.concat([train_df.select(columns_in_common),
                         test_df.select(columns_in_common)],how='diagonal_relaxed')
        
        
        return df

    
    def read_in_depth0(self):
        """
        These files can be used as is except for the dates, so just collect them, do feature engineering on the dates, then 
        throw out the date columns
        """
        depth0_criterias = ["static_0","static_cb_0"]
        for criteria in depth0_criterias:
            self.df = self.df.join(self.read_in_files_with_criteria(criteria), on='case_id', how='inner')

        
    def run(self):
        self.create_base_dataset()
        self.read_in_depth0()

In [31]:
test = DatasetBuilder(n_samples=100)
test.df

case_id,date_decision,MONTH,WEEK_NUM,target,partition,numinstunpaidmaxest_4493212L,avgdbddpdlast3m_4187120P,clientscnt6m_3712949L,posfpd10lastmonth_333P,inittransactionamount_650A,cntincpaycont9m_3716944L,numinstpaid_4499208L,sellerplacecnt_915L,currdebtcredtyperange_828A,numinstlswithoutdpd_562L,avgoutstandbalancel6m_4187114A,isbidproductrequest_292L,paytype1st_925L,daysoverduetolerancedd_3976961L,numincomingpmts_3546848L,mastercontrelectronic_519L,avgdpdtolclosure24_3658938P,lastrepayingdate_696D,applications30d_658L,numinstpaidearly_338L,numrejects9m_859L,maxdpdinstlnum_3546846P,maxdbddpdlast1m_3658939P,maxpmtlast3m_4525190A,clientscnt_1071L,datefirstoffer_1144D,numinsttopaygrest_4493213L,commnoinclast6m_3546845L,avglnamtstart24m_4525187A,inittransactioncode_186L,avgmaxdpdlast9m_3716943P,…,fourthquarter_440L,forquarter_1017L,foryear_818L,requesttype_4525192L,formonth_118L,pmtcount_4527229L,pmtscount_423L,forquarter_634L,maritalst_893M,pmtcount_693L,formonth_206L,pmtssum_45A,assignmentdate_4955616D,days180_256L,thirdquarter_1082L,days120_123L,for3years_128L,education_88M,assignmentdate_4527235D,pmtaverage_3A,responsedate_4917613D,firstquarter_103L,foryear_618L,maritalst_385M,days30_165L,riskassesment_940T,riskassesment_302T,pmtcount_4955617L,pmtaverage_4955615A,fortoday_1092L,numberofqueries_373L,responsedate_1012D,birthdate_574D,days90_310L,formonth_535L,forweek_1077L,description_5085714M
i64,date,i64,i64,i64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,bool,str,f64,f64,f64,f64,date,f64,f64,f64,f64,f64,f64,f64,date,f64,f64,f64,str,f64,…,f64,f64,f64,str,f64,f64,f64,f64,str,f64,f64,f64,date,f64,f64,f64,f64,str,date,f64,date,f64,f64,str,f64,f64,str,f64,f64,f64,f64,date,date,f64,f64,f64,str
28399,2019-08-28,201908,34,0,"""train""",,,0.0,0.0,,,,0.0,0.0,,,,"""OTHER""",,,0.0,,,0.0,,0.0,,,,0.0,,,0.0,,"""CASH""",,…,6.0,,,,,,6.0,,"""a55475b1""",,,10765.4,,1.0,4.0,1.0,,"""a55475b1""",,,,2.0,,"""a55475b1""",0.0,,,,,,3.0,2019-09-11,1960-08-01,0.0,,,"""a55475b1"""
121491,2019-04-09,201904,14,0,"""train""",,,0.0,0.0,,0.0,,0.0,0.0,70.0,,,"""OTHER""",17.0,94.0,0.0,1.0,,0.0,37.0,2.0,8.0,,,0.0,2008-04-03,,0.0,,"""CASH""",,…,3.0,,,,,,,,"""a55475b1""",6.0,,,,2.0,5.0,0.0,,"""a55475b1""",,15608.0,,1.0,,"""b6cabe76""",0.0,,,,,,5.0,2019-04-23,1953-12-01,0.0,,,"""a55475b1"""
136988,2019-06-19,201906,24,0,"""train""",,-18.0,0.0,0.0,,16.0,,0.0,49454.6,26.0,69719.15,,"""OTHER""",0.0,29.0,0.0,0.0,,2.0,27.0,1.0,13.0,13.0,,0.0,2019-06-23,,0.0,,"""CASH""",0.0,…,7.0,,,,,,8.0,,"""a55475b1""",,,2136.83,,1.0,1.0,0.0,,"""a55475b1""",,,,11.0,,"""3439d993""",0.0,,,,,,2.0,2019-07-02,1984-07-01,0.0,,,"""a55475b1"""
157739,2019-09-09,201909,35,0,"""train""",27.0,-13.0,0.0,0.0,,20.0,234.0,1.0,112027.44,206.0,110628.58,,"""OTHER""",46.0,234.0,0.0,0.0,,0.0,101.0,2.0,18.0,-6.0,5325.6,0.0,2007-10-29,38.0,0.0,,"""CASH""",0.0,…,3.0,,,"""PENSION_6""",,6.0,,,"""a55475b1""",6.0,,,,4.0,3.0,3.0,,"""a55475b1""",2019-09-23,7553.0,,7.0,,"""a7fcb6e5""",0.0,,,,,,6.0,2019-09-23,,3.0,,,"""a55475b1"""
161305,2019-09-23,201909,37,0,"""train""",0.0,,0.0,0.0,,0.0,4.0,0.0,0.0,4.0,,,"""OTHER""",0.0,3.0,0.0,0.0,,0.0,4.0,0.0,,,,0.0,2007-10-03,0.0,0.0,,"""CASH""",,…,6.0,,,"""DEDUCTION_6""",,,6.0,,"""a55475b1""",,,70761.2,,9.0,9.0,6.0,,"""a55475b1""",,,,10.0,,"""3439d993""",3.0,,,,,,12.0,2019-10-07,,6.0,,,"""a55475b1"""
165315,2019-10-08,201910,40,0,"""train""",17.0,-16.0,0.0,0.0,,20.0,65.0,0.0,163890.0,58.0,190855.72,,"""OTHER""",9.0,68.0,0.0,0.0,,0.0,38.0,1.0,6.0,-16.0,11606.8,0.0,2007-10-29,29.0,0.0,,"""CASH""",0.0,…,4.0,,,"""PENSION_6""",,6.0,,,"""a55475b1""",6.0,,,,0.0,1.0,0.0,,"""a55475b1""",2019-10-22,19563.4,,1.0,,"""3439d993""",0.0,,,,,,2.0,2019-10-22,,0.0,,,"""a55475b1"""
599112,2019-01-01,201901,0,0,"""train""",,,0.0,0.0,,,,0.0,0.0,,,,"""OTHER""",,,0.0,,,0.0,,0.0,,,,0.0,,,,,"""POS""",,…,,0.0,0.0,,0.0,,7.0,0.0,"""46b968c3""",,0.0,13530.8,,,,,0.0,"""a55475b1""",,,,,0.0,"""a55475b1""",,,,,,0.0,,2019-01-15,1965-07-01,,0.0,0.0,"""a55475b1"""
607980,2019-01-11,201901,1,0,"""train""",,,0.0,0.0,,4.0,,0.0,11545.2,3.0,,,"""OTHER""",0.0,4.0,0.0,0.0,,0.0,0.0,0.0,,0.0,,0.0,,,0.0,,"""POS""",0.0,…,1.0,,,,,,5.0,,"""a55475b1""",,,5348.4,,1.0,0.0,1.0,,"""a55475b1""",,,,0.0,,"""a55475b1""",0.0,,,,,,1.0,2019-01-25,1994-04-01,0.0,,,"""a55475b1"""
617243,2019-01-23,201901,3,0,"""train""",,,0.0,0.0,3980.0,,,1.0,0.0,,,,"""OTHER""",,,0.0,,,0.0,,0.0,,,,0.0,,,0.0,,"""POS""",,…,4.0,,,,,,,,"""a55475b1""",,,,,4.0,2.0,4.0,,"""a55475b1""",,,,0.0,,"""a55475b1""",0.0,,,,,,6.0,2019-02-06,1975-10-01,2.0,,,"""a55475b1"""
633085,2019-02-15,201902,6,0,"""train""",,,0.0,0.0,,,,0.0,0.0,,,,"""OTHER""",,,,,,0.0,,0.0,,,,0.0,,,,,"""POS""",,…,,,,,,,6.0,,"""a55475b1""",,,3823.8,,,,,,"""a55475b1""",,,,,,"""a55475b1""",,,,,,,,2019-03-01,1970-08-01,,,,"""a55475b1"""


In [None]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples = None, 
                 partition = "train",
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        
        assert partition in ["train","test"], "partition can only be 'train','test' "
        
        self.parent_path = parent_path
        self.partition = partition
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.features = []
        # run process
        self.run()
 
    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def create_base_dataset(self):
        if self.n_samples:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv").sample(n=self.n_samples)
        else:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv")
        
        self.df['date_decision'] = pd.to_datetime(self.df['date_decision'])
        self.df['MONTH'] = pd.to_datetime(self.df['MONTH'].astype(str).str[:4] + '-' + self.df['MONTH'].astype(str).str[-2:] +'-01')
        self.base_cols = self.df.columns.tolist()
        self.case_ids = self.df['case_id'].unique().tolist()
    
    def read_in_file(self, file_name:str):
        df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
        df = df[df['case_id'].isin(self.case_ids)]
        return df
    
    def add_df_to_dataset(self,df_to_add: pd.DataFrame,convert_types = True):
        if convert_types:
            for col in df_to_add.columns:
                if (df_to_add[col].dtype == 'object'):
                    if ('date' in col) or (col in ['dtlastpmtallstes_4499206D','firstclxcampaign_1125D']):
                        self.date_cols.append(col)
                        df_to_add[col] = pd.to_datetime(df_to_add[col])
        self.df = self.df.merge(df_to_add,on='case_id',how='left')
        
    def find_all_files_that_contain(self,criteria:str):
        files = [x for x in os.listdir(f"{self.parent_path}/csv_files/{self.partition}") if (criteria in x) and (self.partition in x)]
        return files 
    
    def add_all_level0_files(self):
        level0_criterias = ["static_0","static_cb_0"]
        for crit in level0_criterias:
            df_to_concat = []
            for file in self.find_all_files_that_contain(crit):
                print(f"adding {file}...")
                df_to_concat.append(self.read_in_file(file))
            self.add_df_to_dataset(pd.concat(df_to_concat,axis=0))
    

    def make_level0_features(self):
        print("making level0 features...")
        # features that can be used as is are already in numeric or categorical format
        provided_feats = [col for col in self.df.columns[6:] if self.df[col].dtype in ['int64','float64','category']]
            
        # date transformations
        ## number of days prior to the decision date, cannot be negative otherwise we wouldnt have this information when the decision was made 
        date_feats = []
        for dt_col in self.date_cols:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (self.df['date_decision'] - self.df[dt_col]).dt.days
            dt_feat_series = (dt_feat_series).mask(dt_feat_series < 0, np.nan)
            self.df[new_col] = dt_feat_series
            date_feats.append(new_col)
        
        self.features = provided_feats + date_feats

        
    def process_level1_files(self):
        
        print("adding previous applications...")
        # previous applications
        ## gather all files and concat
        appl_prev = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("applprev_1")],axis=0)
        ## only focus on the person for group 0 because that is the person we are making the decision on
        ## additionally, only focus on the most recent application that existed prior to the case's decision date
        ## it might be worth revisting this to look at all past applications because someone might be approved in a previous application but we are only going to
        ## look at the most recent application
        
        appl_prev['creationdate_885D'] = pd.to_datetime(appl_prev['creationdate_885D'])
        appl_prev = appl_prev.merge(self.df[['case_id','date_decision']],on='case_id')
        appl_prev = appl_prev[  (appl_prev['creationdate_885D'] < appl_prev['date_decision']) 
                              & (appl_prev['num_group1'] == 0)].sort_values('creationdate_885D',ascending=False).drop_duplicates(subset=['case_id'])
        ## convert the date columns to days since
        for dt_col in ["creationdate_885D","approvaldate_319D","dateactivated_425D","employedfrom_700D","firstnonzeroinstldate_307D","dtlastpmt_581D","dtlastpmtallstes_3545839D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (appl_prev['date_decision'] - pd.to_datetime(appl_prev[dt_col]) ).dt.days
            appl_prev[new_col] = dt_feat_series
            del appl_prev[dt_col]
        ## change the names so it is clear that this is the most recent application
        del appl_prev['num_group1']
        del appl_prev['date_decision']
        appl_prev.columns = ['case_id'] + [f"{x}_MOST_RECENT_APPLICATION" for x in appl_prev.columns[1:]]
        ## add previous applications to the dataframe
        self.features.extend(appl_prev.columns.tolist()[1:])
        self.add_df_to_dataset(appl_prev)
        ## free up memory
        del appl_prev
        
        print("adding other...")        
        # other file
        ## going to keep it real simple, just grab the first record for the person who we are making decision on
        other = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("other_1")],axis=0)
        other = other[other['num_group1']==0].drop_duplicates(subset=['case_id'])
        ## add other to the dataframe
        self.features.extend(other.columns.tolist()[1:])
        self.add_df_to_dataset(other)        
        ## free up memory
        del other       
        
        print("adding tax registry a...")             
        # tax registry a, look at individual and across all groups
        tra = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_a")],axis=0)
        tra = tra.merge(self.df[['case_id','date_decision']],on='case_id')
        tra['recorddate_4527225D'] = pd.to_datetime(tra['recorddate_4527225D'])
        tra = tra[tra['recorddate_4527225D']<tra['date_decision']]
        ## individual
        individual_tra = tra[tra['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["recorddate_4527225D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_tra['date_decision'] - pd.to_datetime(individual_tra[dt_col]) ).dt.days
            individual_tra[new_col] = dt_feat_series
            del individual_tra[dt_col]
        del individual_tra['date_decision']
        individual_tra.columns = ['case_id'] + [f"{x}_TRA_INDIV" for x in individual_tra.columns[1:]]
        ## add individual_tra to the dataframe
        self.features.extend(individual_tra.columns.tolist()[1:])
        self.add_df_to_dataset(individual_tra)        
        ## free up memory
        del individual_tra   
        ## agg
        agg_tra = tra.groupby('case_id',as_index=False).agg(
            amount_4527230A_MIN = ('amount_4527230A','min'),
            amount_4527230A_MAX = ('amount_4527230A','max'),
            amount_4527230A_STD = ('amount_4527230A','std'),
            amount_4527230A_AVG = ('amount_4527230A','mean'),
            amount_4527230A_MEDIAN = ('amount_4527230A','median'),
            amount_4527230A_SUM = ('amount_4527230A','sum'),    
        )
        self.features.extend(agg_tra.columns.tolist()[1:])
        self.add_df_to_dataset(agg_tra)        
        ## free up memory
        del agg_tra        
        
        
        print("adding tax registry b...") 
        # tax registry b, look at individual and across all groups
        trb = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_b")],axis=0)
        trb = trb.merge(self.df[['case_id','date_decision']],on='case_id')
        trb['deductiondate_4917603D'] = pd.to_datetime(trb['deductiondate_4917603D'])
        trb = trb[trb['deductiondate_4917603D']<trb['date_decision']]
        ## individual
        individual_trb = trb[trb['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["deductiondate_4917603D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trb['date_decision'] - pd.to_datetime(individual_trb[dt_col]) ).dt.days
            individual_trb[new_col] = dt_feat_series
            del individual_trb[dt_col]
        del individual_trb['date_decision']
        individual_trb.columns = ['case_id'] + [f"{x}_TRB_INDIV" for x in individual_trb.columns[1:]]
        ## add individual_trb to the dataframe
        self.features.extend(individual_trb.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trb)        
        ## free up memory
        del individual_trb   
        ## agg
        agg_trb = trb.groupby('case_id',as_index=False).agg(
            amount_4917619A_MIN = ('amount_4917619A','min'),
            amount_4917619A_MAX = ('amount_4917619A','max'),
            amount_4917619A_STD = ('amount_4917619A','std'),
            amount_4917619A_AVG = ('amount_4917619A','mean'),
            amount_4917619A_MEDIAN = ('amount_4917619A','median'),
            amount_4917619A_SUM = ('amount_4917619A','sum'),    
        )
        self.features.extend(agg_trb.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trb)        
        ## free up memory
        del agg_trb              
        
        print("adding tax registry c...")        
        # tax registry c, look at individual and across all groups
        trc = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_c")],axis=0)
        trc = trc.merge(self.df[['case_id','date_decision']],on='case_id')
        trc['processingdate_168D'] = pd.to_datetime(trc['processingdate_168D'])
        trc = trc[trc['processingdate_168D']<trc['date_decision']]
        ## individual
        individual_trc = trc[trc['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["processingdate_168D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trc['date_decision'] - pd.to_datetime(individual_trc[dt_col]) ).dt.days
            individual_trc[new_col] = dt_feat_series
            del individual_trc[dt_col]
        del individual_trc['date_decision']
        individual_trc.columns = ['case_id'] + [f"{x}_TRC_INDIV" for x in individual_trc.columns[1:]]
        ## add individual_trc to the dataframe
        self.features.extend(individual_trc.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trc)        
        ## free up memory
        del individual_trc   
        ## agg
        agg_trc = trc.groupby('case_id',as_index=False).agg(
            pmtamount_36A_MIN = ('pmtamount_36A','min'),
            pmtamount_36A_MAX = ('pmtamount_36A','max'),
            pmtamount_36A_STD = ('pmtamount_36A','std'),
            pmtamount_36A_AVG = ('pmtamount_36A','mean'),
            pmtamount_36A_MEDIAN = ('pmtamount_36A','median'),
            pmtamount_36A_SUM = ('pmtamount_36A','sum'),    
        )
        self.features.extend(agg_trc.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trc)        
        ## free up memory
        del agg_trc            
        
        
        
        
        
        
#         level1_criterias = [,"other_",
#                             "tax_registry_a","tax_registry_b","tax_registry_c",
#                             "credit_bureau_a_1","credit_bureau_b_1",
#                             "deposit_1","person_1"]

            
        
    def run(self):
        self.create_base_dataset()
        
        self.add_all_level0_files()
        self.make_level0_features()
        
        self.process_level1_files()
    
    def get_modeling_dataset(self):
        return self.df[self.base_cols + self.features]

In [None]:
# train_ds_builder = DatasetBuilder(n_samples = 10)
# train_ds = train_ds_builder.get_modeling_dataset()

In [None]:
# test = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_0.csv")
# # test.head()

# test 

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [None]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [None]:
train_ds_builder = DatasetBuilder(partition="train")
test_ds_builder = DatasetBuilder(partition="test")

In [None]:
train_ds = train_ds_builder.get_modeling_dataset()
test_ds = test_ds_builder.get_modeling_dataset()

# for some reason, the same features arent available across the two partition
feats = list(set(train_ds_builder.features).intersection(set(test_ds_builder.features)))

ds = pd.concat([train_ds,test_ds],axis=0).reset_index(drop=True)


In [None]:
for col in feats:
    if train_ds[col].dtype == 'float64':
        ds[col] = ds[col].astype(float)
    else:
        ds[col] = ds[col].astype('category')

train = ds.iloc[:train_ds.shape[0],:]
test = ds.iloc[train_ds.shape[0]:,:]

In [None]:
idx = np.arange(len(train))
np.random.shuffle(idx)
train_idx = idx[:int(len(train)*.7)]
valid_idx = idx[int(len(train)*.7):]

X_train = train.loc[train_idx,:]
y_train = train.loc[train_idx,'target']

X_valid = train.loc[valid_idx,:]
y_valid = train.loc[valid_idx,'target']

In [None]:
lgb_train = lgb.Dataset(X_train[feats].copy(), label=y_train)
lgb_valid = lgb.Dataset(X_valid[feats].copy(), label=y_valid, reference=lgb_train)
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)

Evaluation with AUC and then comparison with the stability metric is shown below.

In [None]:
X_train[feats]

In [None]:
eval_train = X_train.copy()
eval_train['score'] = gbm.predict(X_train[feats], num_iteration=gbm.best_iteration)

eval_valid = X_valid.copy()
eval_valid['score'] = gbm.predict(X_valid[feats], num_iteration=gbm.best_iteration)

In [None]:
from sklearn.metrics import roc_auc_score 
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(eval_train)
stability_score_valid = gini_stability(eval_valid)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 


## Submission


In [None]:

test_ds['score'] = gbm.predict(test[feats], num_iteration=gbm.best_iteration)

In [None]:
submission = test_ds[['case_id','score']]
submission.to_csv('submission.csv', index=False)
submission.head()