In [1]:
import numpy as np
import pandas as pd
import os

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
def calculate_woe_iv_categorical(data, feature, target):
    # Create a DataFrame for counts per category
    df = data.groupby([feature])[target].agg(['count', 'sum'])
    df.columns = ['Total', 'Bad']
    
    # Calculate the number of good outcomes
    df['Good'] = df['Total'] - df['Bad']
    
    # Handle cases where the count is 0 to avoid division by zero in WoE calculation
    df['Bad'] = np.where(df['Bad'] == 0, 0.0001, df['Bad'])
    df['Good'] = np.where(df['Good'] == 0, 0.0001, df['Good'])
    
    # Calculate the percentage of bads and goods
    df['Distr_Bad'] = df['Bad'] / df['Bad'].sum()
    df['Distr_Good'] = df['Good'] / df['Good'].sum()
    
    # Calculate WoE
    df['WoE'] = np.log(df['Distr_Good'] / df['Distr_Bad'])
    
    # Calculate IV
    df['IV'] = (df['Distr_Good'] - df['Distr_Bad']) * df['WoE']
    
    # Sum the IV values for the feature
    IV = df['IV'].sum()
    
    # Prepare a report
    report = df.reset_index()[[feature, 'WoE', 'IV']]
    
    return IV, report

def calculate_woe_iv_numeric(data, feature, target, bins=10):
    # Bin the data
    data['binned'] = pd.qcut(data[feature], q=bins, duplicates='drop')

    # Group by the binned feature
    grouped = data.groupby('binned')[target].agg(['count', 'sum'])
    grouped.columns = ['Total', 'Bad']

    # Calculate the number of good outcomes
    grouped['Good'] = grouped['Total'] - grouped['Bad']
    
    # Handle cases where the count is 0 to avoid division by zero in WoE calculation
    grouped['Bad'] = np.where(grouped['Bad'] == 0, 0.0001, grouped['Bad'])
    grouped['Good'] = np.where(grouped['Good'] == 0, 0.0001, grouped['Good'])

    # Calculate the distribution of bads and goods
    grouped['Distr_Bad'] = grouped['Bad'] / grouped['Bad'].sum()
    grouped['Distr_Good'] = grouped['Good'] / grouped['Good'].sum()

    # Calculate WoE
    grouped['WoE'] = np.log(grouped['Distr_Good'] / grouped['Distr_Bad'])

    # Calculate IV
    grouped['IV'] = (grouped['Distr_Good'] - grouped['Distr_Bad']) * grouped['WoE']

    # Sum the IV values for the feature
    IV = grouped['IV'].sum()

    # Prepare a report
    report = grouped.reset_index()[['binned', 'WoE', 'IV']]

    return IV, report

## Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data) <br>
[Discussion on how the data is setup](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/473950)
* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [3]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples = None, 
                 partition = "train",
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        
        assert partition in ["train","test"], "partition can only be 'train','test' "
        
        self.parent_path = parent_path
        self.partition = partition
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.features = []
        # run process
        self.run()
 
    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def create_base_dataset(self):
        if self.n_samples:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv").sample(n=self.n_samples)
        else:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv")
        
        self.df['date_decision'] = pd.to_datetime(self.df['date_decision'])
        self.df['MONTH'] = pd.to_datetime(self.df['MONTH'].astype(str).str[:4] + '-' + self.df['MONTH'].astype(str).str[-2:] +'-01')
        self.base_cols = self.df.columns.tolist()
        self.case_ids = self.df['case_id'].unique().tolist()
    
    def read_in_file(self, file_name:str):
        df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
        df = df[df['case_id'].isin(self.case_ids)]
        return df
    
    def add_df_to_dataset(self,df_to_add: pd.DataFrame,convert_types = True):
        if convert_types:
            for col in df_to_add.columns:
                if (df_to_add[col].dtype == 'object'):
                    if ('date' in col) or (col in ['dtlastpmtallstes_4499206D','firstclxcampaign_1125D']):
                        self.date_cols.append(col)
                        df_to_add[col] = pd.to_datetime(df_to_add[col])
        self.df = self.df.merge(df_to_add,on='case_id',how='left')
        
    def find_all_files_that_contain(self,criteria:str):
        files = [x for x in os.listdir(f"{self.parent_path}/csv_files/{self.partition}") if (criteria in x) and (self.partition in x)]
        return files 
    
    def add_all_level0_files(self):
        level0_criterias = ["static_0","static_cb_0"]
        for crit in level0_criterias:
            df_to_concat = []
            for file in self.find_all_files_that_contain(crit):
                print(f"adding {file}...")
                df_to_concat.append(self.read_in_file(file))
            self.add_df_to_dataset(pd.concat(df_to_concat,axis=0))
    

    def make_level0_features(self):
        print("making level0 features...")
        # features that can be used as is are already in numeric or categorical format
        provided_feats = [col for col in self.df.columns[6:] if self.df[col].dtype in ['int64','float64','category']]
            
        # date transformations
        ## number of days prior to the decision date, cannot be negative otherwise we wouldnt have this information when the decision was made 
        date_feats = []
        for dt_col in self.date_cols:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (self.df['date_decision'] - self.df[dt_col]).dt.days
            dt_feat_series = (dt_feat_series).mask(dt_feat_series < 0, np.nan)
            self.df[new_col] = dt_feat_series
            date_feats.append(new_col)
        
        self.features = provided_feats + date_feats

        
    def process_level1_files(self):
        
        print("adding previous applications...")
        # previous applications
        ## gather all files and concat
        appl_prev = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("applprev_1")],axis=0)
        ## only focus on the person for group 0 because that is the person we are making the decision on
        ## additionally, only focus on the most recent application that existed prior to the case's decision date
        ## it might be worth revisting this to look at all past applications because someone might be approved in a previous application but we are only going to
        ## look at the most recent application
        
        appl_prev['creationdate_885D'] = pd.to_datetime(appl_prev['creationdate_885D'])
        appl_prev = appl_prev.merge(self.df[['case_id','date_decision']],on='case_id')
        appl_prev = appl_prev[  (appl_prev['creationdate_885D'] < appl_prev['date_decision']) 
                              & (appl_prev['num_group1'] == 0)].sort_values('creationdate_885D',ascending=False).drop_duplicates(subset=['case_id'])
        ## convert the date columns to days since
        for dt_col in ["creationdate_885D","approvaldate_319D","dateactivated_425D","employedfrom_700D","firstnonzeroinstldate_307D","dtlastpmt_581D","dtlastpmtallstes_3545839D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (appl_prev['date_decision'] - pd.to_datetime(appl_prev[dt_col]) ).dt.days
            appl_prev[new_col] = dt_feat_series
            del appl_prev[dt_col]
        ## change the names so it is clear that this is the most recent application
        del appl_prev['num_group1']
        del appl_prev['date_decision']
        appl_prev.columns = ['case_id'] + [f"{x}_MOST_RECENT_APPLICATION" for x in appl_prev.columns[1:]]
        ## add previous applications to the dataframe
        self.features.extend(appl_prev.columns.tolist()[1:])
        self.add_df_to_dataset(appl_prev)
        ## free up memory
        del appl_prev
        
        print("adding other...")        
        # other file
        ## going to keep it real simple, just grab the first record for the person who we are making decision on
        other = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("other_1")],axis=0)
        other = other[other['num_group1']==0].drop_duplicates(subset=['case_id'])
        ## add other to the dataframe
        self.features.extend(other.columns.tolist()[1:])
        self.add_df_to_dataset(other)        
        ## free up memory
        del other       
        
        print("adding tax registry a...")             
        # tax registry a, look at individual and across all groups
        tra = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_a")],axis=0)
        tra = tra.merge(self.df[['case_id','date_decision']],on='case_id')
        tra['recorddate_4527225D'] = pd.to_datetime(tra['recorddate_4527225D'])
        tra = tra[tra['recorddate_4527225D']<tra['date_decision']]
        ## individual
        individual_tra = tra[tra['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["recorddate_4527225D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_tra['date_decision'] - pd.to_datetime(individual_tra[dt_col]) ).dt.days
            individual_tra[new_col] = dt_feat_series
            del individual_tra[dt_col]
        del individual_tra['date_decision']
        individual_tra.columns = ['case_id'] + [f"{x}_TRA_INDIV" for x in individual_tra.columns[1:]]
        ## add individual_tra to the dataframe
        self.features.extend(individual_tra.columns.tolist()[1:])
        self.add_df_to_dataset(individual_tra)        
        ## free up memory
        del individual_tra   
        ## agg
        agg_tra = tra.groupby('case_id',as_index=False).agg(
            amount_4527230A_MIN = ('amount_4527230A','min'),
            amount_4527230A_MAX = ('amount_4527230A','max'),
            amount_4527230A_STD = ('amount_4527230A','std'),
            amount_4527230A_AVG = ('amount_4527230A','mean'),
            amount_4527230A_MEDIAN = ('amount_4527230A','median'),
            amount_4527230A_SUM = ('amount_4527230A','sum'),    
        )
        self.features.extend(agg_tra.columns.tolist()[1:])
        self.add_df_to_dataset(agg_tra)        
        ## free up memory
        del agg_tra        
        
        
        print("adding tax registry b...") 
        # tax registry b, look at individual and across all groups
        trb = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_b")],axis=0)
        trb = trb.merge(self.df[['case_id','date_decision']],on='case_id')
        trb['deductiondate_4917603D'] = pd.to_datetime(trb['deductiondate_4917603D'])
        trb = trb[trb['deductiondate_4917603D']<trb['date_decision']]
        ## individual
        individual_trb = trb[trb['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["deductiondate_4917603D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trb['date_decision'] - pd.to_datetime(individual_trb[dt_col]) ).dt.days
            individual_trb[new_col] = dt_feat_series
            del individual_trb[dt_col]
        del individual_trb['date_decision']
        individual_trb.columns = ['case_id'] + [f"{x}_TRB_INDIV" for x in individual_trb.columns[1:]]
        ## add individual_trb to the dataframe
        self.features.extend(individual_trb.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trb)        
        ## free up memory
        del individual_trb   
        ## agg
        agg_trb = trb.groupby('case_id',as_index=False).agg(
            amount_4917619A_MIN = ('amount_4917619A','min'),
            amount_4917619A_MAX = ('amount_4917619A','max'),
            amount_4917619A_STD = ('amount_4917619A','std'),
            amount_4917619A_AVG = ('amount_4917619A','mean'),
            amount_4917619A_MEDIAN = ('amount_4917619A','median'),
            amount_4917619A_SUM = ('amount_4917619A','sum'),    
        )
        self.features.extend(agg_trb.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trb)        
        ## free up memory
        del agg_trb              
        
        print("adding tax registry c...")        
        # tax registry c, look at individual and across all groups
        trc = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_c")],axis=0)
        trc = trc.merge(self.df[['case_id','date_decision']],on='case_id')
        trc['processingdate_168D'] = pd.to_datetime(trc['processingdate_168D'])
        trc = trc[trc['processingdate_168D']<trc['date_decision']]
        ## individual
        individual_trc = trc[trc['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["processingdate_168D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trc['date_decision'] - pd.to_datetime(individual_trc[dt_col]) ).dt.days
            individual_trc[new_col] = dt_feat_series
            del individual_trc[dt_col]
        del individual_trc['date_decision']
        individual_trc.columns = ['case_id'] + [f"{x}_TRC_INDIV" for x in individual_trc.columns[1:]]
        ## add individual_trc to the dataframe
        self.features.extend(individual_trc.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trc)        
        ## free up memory
        del individual_trc   
        ## agg
        agg_trc = trc.groupby('case_id',as_index=False).agg(
            pmtamount_36A_MIN = ('pmtamount_36A','min'),
            pmtamount_36A_MAX = ('pmtamount_36A','max'),
            pmtamount_36A_STD = ('pmtamount_36A','std'),
            pmtamount_36A_AVG = ('pmtamount_36A','mean'),
            pmtamount_36A_MEDIAN = ('pmtamount_36A','median'),
            pmtamount_36A_SUM = ('pmtamount_36A','sum'),    
        )
        self.features.extend(agg_trc.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trc)        
        ## free up memory
        del agg_trc            
        
        
        
        
        
        
#         level1_criterias = [,"other_",
#                             "tax_registry_a","tax_registry_b","tax_registry_c",
#                             "credit_bureau_a_1","credit_bureau_b_1",
#                             "deposit_1","person_1"]

            
        
    def run(self):
        self.create_base_dataset()
        
        self.add_all_level0_files()
        self.make_level0_features()
        
        self.process_level1_files()
    
    def get_modeling_dataset(self):
        return self.df[self.base_cols + self.features]

In [4]:
# train_ds_builder = DatasetBuilder(n_samples = 10)
# train_ds = train_ds_builder.get_modeling_dataset()

In [5]:
# test = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_0.csv")
# # test.head()

# test 

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [6]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [7]:
train_ds_builder = DatasetBuilder(partition="train")
test_ds_builder = DatasetBuilder(partition="test")

adding train_static_0_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_0_1.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_cb_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


making level0 features...
adding previous applications...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding other...
adding tax registry a...
adding tax registry b...
adding tax registry c...
adding test_static_0_0.csv...
adding test_static_0_2.csv...
adding test_static_0_1.csv...
adding test_static_cb_0.csv...
making level0 features...
adding previous applications...
adding other...
adding tax registry a...
adding tax registry b...
adding tax registry c...


In [8]:
train_ds = train_ds_builder.get_modeling_dataset()
test_ds = test_ds_builder.get_modeling_dataset()

# for some reason, the same features arent available across the two partition
feats = list(set(train_ds_builder.features).intersection(set(test_ds_builder.features)))

ds = pd.concat([train_ds,test_ds],axis=0).reset_index(drop=True)


In [9]:
for col in feats:
    if train_ds[col].dtype == 'float64':
        ds[col] = ds[col].astype(float)
    else:
        ds[col] = ds[col].astype('category')

train = ds.iloc[:train_ds.shape[0],:]
test = ds.iloc[train_ds.shape[0]:,:]

In [10]:
idx = np.arange(len(train))
np.random.shuffle(idx)
train_idx = idx[:int(len(train)*.7)]
valid_idx = idx[int(len(train)*.7):]

X_train = train.loc[train_idx,:]
y_train = train.loc[train_idx,'target']

X_valid = train.loc[valid_idx,:]
y_valid = train.loc[valid_idx,'target']

In [11]:
lgb_train = lgb.Dataset(X_train[feats].copy(), label=y_train)
lgb_valid = lgb.Dataset(X_valid[feats].copy(), label=y_valid, reference=lgb_train)
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.05,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.768005
[100]	valid_0's auc: 0.785727
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.785727


Evaluation with AUC and then comparison with the stability metric is shown below.

In [12]:
X_train[feats]

Unnamed: 0,maxdebt4_972A,avgpmtlast12m_4525200A,amount_4917619A_MIN,pmtaverage_4527227A,for3years_504L,pctinstlsallpaidearl3d_427L,clientscnt_946L,numincomingpmts_3546848L,clientscnt12m_3712952L,numinstpaidearlyest_4493214L,amount_4917619A_MEDIAN,amount_4527230A_AVG,clientscnt_1022L,days_since_dateofbirth_337D,pmtamount_36A_MEDIAN,familystate_726L_MOST_RECENT_APPLICATION,isdebitcard_527L_MOST_RECENT_APPLICATION,avgdbddpdlast24m_3658932P,outstandingdebt_522A_MOST_RECENT_APPLICATION,applications30d_658L,clientscnt_304L,numberofqueries_373L,days_since_datefirstoffer_1144D,pctinstlsallpaidlat10d_839L,days_since_firstclxcampaign_1125D,avgmaxdpdlast9m_3716943P,clientscnt_1071L,numinstlswithdpd10_728L,district_544M_MOST_RECENT_APPLICATION,education_1138M_MOST_RECENT_APPLICATION,credacc_transactions_402L_MOST_RECENT_APPLICATION,posfpd10lastmonth_333P,homephncnt_628L,clientscnt_533L,lastapprcredamount_781A,avginstallast24m_3658937A,forquarter_1017L,price_1097A,pmtcount_4527229L,applicationscnt_464L,days_since_firstnonzeroinstldate_307D_MOST_RECENT_APPLICATION,currdebt_22A,isbidproduct_390L_MOST_RECENT_APPLICATION,contractssum_5085716L,cntincpaycont9m_3716944L,deferredmnthsnum_166L,forweek_528L,revolvingaccount_394A_MOST_RECENT_APPLICATION,numinstpaidlastcontr_4325080L,for3years_584L,days_since_lastactivateddate_801D,formonth_535L,foryear_618L,rejectreason_755M_MOST_RECENT_APPLICATION,annuitynextmonth_57A,pmtamount_36A_MAX,numinstregularpaid_973L,days_since_firstdatedue_489D,amount_4917619A_STD,numactivecreds_622L,maxannuity_4075009A,pctinstlsallpaidlate4d_3546849L,clientscnt3m_3712950L,numinstpaidearly_338L,pmtamount_36A_STD,sellerplacescnt_216L,downpmt_116A,totinstallast1m_4525188A,amount_4527230A_STD,downpmt_134A_MOST_RECENT_APPLICATION,actualdpd_943P_MOST_RECENT_APPLICATION,credacc_minhisbal_90A_MOST_RECENT_APPLICATION,amount_4527230A_MIN,avgdpdtolclosure24_3658938P,days_since_deductiondate_4917603D_TRB_INDIV,maxdbddpdlast1m_3658939P,credacc_maxhisbal_375A_MOST_RECENT_APPLICATION,numnotactivated_1143L,maxinstallast24m_3658928A,clientscnt_493L,applicationscnt_629L,numactivecredschannel_414L,maxdbddpdtollast12m_3658940P,childnum_21L_MOST_RECENT_APPLICATION,days_since_dtlastpmt_581D_MOST_RECENT_APPLICATION,lastdependentsnum_448L,eir_270L,amount_4917619A_SUM,days_since_dtlastpmtallstes_3545839D_MOST_RECENT_APPLICATION,maxdbddpdtollast6m_4187119P,avgoutstandbalancel6m_4187114A,numinstlallpaidearly3d_817L,secondquarter_766L,clientscnt_136L,mainoccupationinc_437A_MOST_RECENT_APPLICATION,maxdpdlast3m_392P,maxannuity_159A,num_group1,amtdepositincoming_4809444A,foryear_850L,credacc_status_367L_MOST_RECENT_APPLICATION,numinstunpaidmaxest_4493212L,mastercontrelectronic_519L,avglnamtstart24m_4525187A,maxdpdinstlnum_3546846P,days360_512L,days_since_datelastinstal40dpd_247D,tenor_203L_MOST_RECENT_APPLICATION,maxdpdlast6m_474P,name_4917606M_TRB_INDIV,pmtnum_254L,formonth_118L,days_since_assignmentdate_4955616D,days_since_lastdelinqdate_224D,forquarter_634L,days_since_lastapprdate_640D,inittransactioncode_279L_MOST_RECENT_APPLICATION,pctinstlsallpaidlate1d_3546856L,amtdebitincoming_4809443A,sumoutstandtotal_3546847A,monthsannuity_845L,sellerplacecnt_915L,days_since_creationdate_885D_MOST_RECENT_APPLICATION,mastercontrexist_109L,avgdbddpdlast3m_4187120P,numinstlswithoutdpd_562L,firstquarter_103L,maxdpdtolerance_577P_MOST_RECENT_APPLICATION,numpmtchanneldd_318L,name_4527232M_TRA_INDIV,days90_310L,numinstregularpaidest_4493210L,sumoutstandtotalest_4493215A,numcontrs3months_479L,days_since_datelastunpaid_3546854D,numinstpaidearly5d_1087L,pmtssum_45A,avgdbdtollast24m_4525197P,amount_4917619A_AVG,amount_4917619A_MAX,amtdebitoutgoing_4809440A,numinstmatpaidtearly2d_4499204L,credamount_770A,days_since_approvaldate_319D_MOST_RECENT_APPLICATION,rejectreasonclient_4145042M_MOST_RECENT_APPLICATION,commnoinclast6m_3546845L,amtdepositoutgoing_4809442A,numactiverelcontr_750L,clientscnt_887L,maxdpdlast24m_143P,maxoutstandbalancel12m_4187113A,numinstpaidearly3d_3546850L,fourthquarter_440L,byoccupationinc_3656910L_MOST_RECENT_APPLICATION,mindbdtollast24m_4525191P,mindbddpdlast24m_3658935P,numinstlswithdpd5_4187116L,thirdquarter_1082L,profession_152M_MOST_RECENT_APPLICATION,numinstpaidearly3dest_4493216L,days_since_responsedate_4917613D,credamount_590A_MOST_RECENT_APPLICATION,amtdepositbalance_4809441A,numinstpaidearly5dobd_4499205L,days_since_lastapplicationdate_877D,posfstqpd30lastmonth_3976962P,days_since_maxdpdinstldate_3546855D,mobilephncnt_593L,days_since_employedfrom_700D_MOST_RECENT_APPLICATION,applicationscnt_1086L,maxdpdlast9m_1059P,pmtamount_36A_MIN,maininc_215A,clientscnt_100L,amount_4527230A_MAX,formonth_206L,foryear_818L,lastrejectcredamount_222A,fortoday_1092L,pmtscount_423L,pmtcount_693L,currdebtcredtyperange_828A,totaldebt_9A,days_since_recorddate_4527225D_TRA_INDIV,pmtamount_36A_TRC_INDIV,status_219L_MOST_RECENT_APPLICATION,clientscnt_1130L,numinstunpaidmax_3546851L,days30_165L,numinstpaidlate1d_3546852L,inittransactionamount_650A,pmtnum_8L_MOST_RECENT_APPLICATION,pmtamount_36A_AVG,forweek_601L,days_since_dtlastpmtallstes_4499206D,credacc_actualbalance_314A_MOST_RECENT_APPLICATION,numinstpaidearly5dest_4493211L,days120_123L,numinstpaid_4499208L,clientscnt6m_3712949L,annuity_853A_MOST_RECENT_APPLICATION,interestrategrace_34L,pmtaverage_4955615A,clientscnt_360L,cancelreason_3545846M_MOST_RECENT_APPLICATION,amount_4917619A_TRB_INDIV,clientscnt_157L,numinstlsallpaid_934L,applicationcnt_361L,riskassesment_940T,daysoverduetolerancedd_3976961L,pmtcount_4955617L,days180_256L,totalsettled_863A,numinsttopaygrest_4493213L,currdebt_94A_MOST_RECENT_APPLICATION,days_since_dateactivated_425D_MOST_RECENT_APPLICATION,postype_4733339M_MOST_RECENT_APPLICATION,numinstls_657L,amount_4527230A_MEDIAN,maxdpdlast12m_727P,lastotherinc_902A,maxlnamtstart6m_4525199A,pctinstlsallpaidlate6d_3546844L,applicationscnt_867L,disbursedcredamount_1113A,lastotherlnsexpense_631A,maxdpdfrom6mto36m_3546853P,for3years_128L,maxpmtlast3m_4525190A,cntpmts24_3658933L,credacc_credlmt_575A_MOST_RECENT_APPLICATION,annuity_780A,numrejects9m_859L,days_since_lastrejectdate_50D,pmtamount_36A_SUM,numinsttopaygr_769L,credtype_587L_MOST_RECENT_APPLICATION,interestrate_311L,forquarter_462L,forweek_1077L,maxdpdtolerance_374P,amount_4527230A_SUM,posfpd30lastmonth_3976960P,pmtaverage_3A,clientscnt_257L,days_since_processingdate_168D_TRC_INDIV
997343,28274.0,,,,,0.57143,0.0,13.0,0.0,6.0,,,1.0,10977.0,,MARRIED,,-12.0,0.0,0.0,0.0,4.0,775.0,0.21429,775.0,,0.0,3.0,P197_47_166,a55475b1,,0.0,1.0,0.0,28274.0,5656.6,,27836.0,,1.0,588.0,0.0,False,,0.0,0.0,,,5.0,,616.0,,,a55475b1,0.0000,,14.0,1375.0,,0.0,,0.35714,0.0,6.0,,2.0,0.0,,,0.0,0.0,,,5.0,,,,0.0,9425.8,0.0,0.0,0.0,,,493.0,,,,493.0,,,8.0,4.0,,22000.0,0.0,16600.00,,,,,0.0,0.0,,8.0,4.0,,6.0,0.0,,24.0,,,1131.0,,620.0,POS,0.35714,,0.0,14.0,0.0,620.0,0.0,,15.0,3.0,0.0,0.0,,2.0,14.0,0.0,0.0,1131.0,4.0,,-12.0,,,,8.0,41836.0,620.0,a55475b1,0.0,,1.0,0.0,0.0,,8.0,1.0,,-15.0,-15.0,5.0,2.0,a55475b1,8.0,,28274.0,,6.0,620.0,0.0,1131.0,3.0,,0.0,0.0,,22000.0,1.0,,,,60000.0,,,,0.0,0.0,,,K,0.0,0.0,1.0,5.0,27836.0,6.0,,,493.0,,4.0,2.0,14.0,0.0,4714.2000,,,0.0,a55475b1,,0.0,9.0,0.0,,18.0,,3.0,44080.000,0.0,0.00,616.0,a55475b1,0.0,,0.0,,28274.000,0.35714,4.0,27836.0,,0.0,,,5.0,0.0,1670.0000,0.0,1477.0,,0.0,COL,,,,16.0,,0.0,,0.0,
636633,0.0,,27554.6,,,,0.0,,0.0,,65860.3,,0.0,10793.0,,,,,,0.0,0.0,5.0,,,,,0.0,,P149_105_2,a55475b1,,0.0,0.0,0.0,,,,21972.0,,0.0,1332.0,0.0,False,0.0,,0.0,,,,,,,,P99_56_166,0.0000,,,,21843.079365,0.0,,,0.0,,,0.0,0.0,,,0.0,0.0,,,,168.0,,,0.0,,0.0,0.0,0.0,,,,,,349725.802,,,,,6.0,,30000.0,0.0,0.00,,,,,,0.0,,,5.0,,6.0,0.0,e4d71d32,36.0,,,,,,POS,,,,,0.0,1363.0,0.0,,,6.0,,0.0,,1.0,,,0.0,,,,,58287.633667,79035.0,,,21972.0,,P94_109_143,0.0,,1.0,0.0,0.0,,,7.0,,,,,2.0,a55475b1,,,25098.0,,,1363.0,0.0,,2.0,,0.0,0.0,,,0.0,,,,25098.0,,,,0.0,0.0,,,D,0.0,,0.0,,21972.0,6.0,,,,,,1.0,,0.0,4435.8003,,,0.0,P94_109_143,27554.6,0.0,,0.0,,,,1.0,0.000,,,,P177_117_192,0.0,,0.0,,,,0.0,21972.0,,0.0,,,,0.0,1318.2001,0.0,1363.0,,,COL,,,,0.0,,0.0,,0.0,
18649,,,,,,,0.0,,0.0,,,,0.0,26341.0,,,,,,0.0,0.0,0.0,,,,,0.0,,,,,0.0,1.0,0.0,,,,,,0.0,,0.0,,,,0.0,,,,,,,,,0.0000,,,,,0.0,,,0.0,,,0.0,0.0,,,,,,,,,,,0.0,,0.0,0.0,0.0,,,,,0.4500,,,,,,0.0,,,,,,,,,,,,,0.0,,,,,12.0,,,,,,,,,,,0.0,,,,,0.0,,0.0,,0.0,,,0.0,,,,,,,,,40000.0,,,,,0.0,0.0,,,,0.0,,,,,0.0,,,,,,,,0.0,,1.0,,0.0,,,,0.0,,,,,,,6.0,0.0,0.0,,,,0.0,,0.0,,,,,,,,,0.0,,0.0,,,,0.0,,,0.0,,0.0,,,,0.0,0.000,,,,,0.0,,,,,,0.0,40000.0,,,,,,,5108.8003,0.0,,,,,0.4500,,,,,0.0,12853.8,0.0,
445135,0.0,3910.2,,,0.0,0.00000,0.0,1.0,0.0,0.0,,,0.0,16266.0,,LIVING_WITH_PARTNER,,3.0,42922.0,0.0,0.0,2.0,,0.00000,,3.0,0.0,0.0,P123_39_170,P17_36_170,,0.0,1.0,1.0,44942.0,3910.2,0.0,44800.0,,0.0,1.0,42922.0,False,,1.0,0.0,0.0,,1.0,0.0,11.0,0.0,0.0,a55475b1,3910.2000,,1.0,1.0,,1.0,,0.00000,0.0,0.0,,2.0,0.0,3910.2000,,0.0,0.0,,,3.0,,3.0,,0.0,3910.2,0.0,0.0,0.0,3.0,,,,0.4500,,-2.0,3.0,45588.668,0.0,0.0,,20200.0,3.0,4000.00,,,0.0,,11.0,0.0,46922.0,1.0,2.0,,12.0,3.0,,12.0,0.0,,1.0,0.0,31.0,POS,1.00000,,42922.0,1.0,1.0,31.0,0.0,3.0,0.0,1.0,1.0,0.0,,1.0,1.0,42922.0,1.0,1.0,0.0,,3.0,,,,0.0,44800.0,31.0,a55475b1,0.0,,0.0,3.0,3.0,46922.000,0.0,0.0,,3.0,3.0,0.0,2.0,a55475b1,0.0,,44942.0,,0.0,31.0,,1.0,2.0,636.0,0.0,3.0,,20200.0,0.0,,0.0,0.0,40000.0,0.0,,,42922.0,42922.0,,,A,1.0,11.0,0.0,1.0,,12.0,,0.0,,,0.0,1.0,1.0,0.0,3910.2000,,,0.0,a55475b1,,0.0,0.0,0.0,0.0,3.0,,1.0,4000.000,11.0,42922.00,11.0,a55475b1,12.0,,3.0,,46922.000,0.00000,2.0,44800.0,,0.0,0.0,3910.2000,1.0,0.0,5124.6000,1.0,219.0,,11.0,COL,0.4500,0.0,0.0,3.0,,,,0.0,
1200504,49380.0,4777.6,,,,0.78571,0.0,11.0,0.0,10.0,,,0.0,16154.0,,,,-29.0,20556.0,0.0,0.0,11.0,,0.00000,,0.0,0.0,0.0,P99_153_174,a55475b1,,0.0,0.0,0.0,30976.0,4814.0,,67100.0,,0.0,216.0,20556.0,False,,5.0,0.0,,,8.0,,242.0,,,a55475b1,1290.8000,,14.0,408.0,,1.0,,0.00000,0.0,10.0,,1.0,0.0,2581.6000,,0.0,0.0,,,0.0,,-2.0,,0.0,31908.0,0.0,0.0,0.0,-1.0,,,,0.3900,,3.0,-2.0,21241.715,11.0,8.0,,38000.0,0.0,73285.43,,,,,16.0,0.0,,,11.0,,24.0,0.0,,12.0,,,,,246.0,POS,0.00000,,20556.0,13.0,0.0,246.0,0.0,-76.0,15.0,13.0,0.0,0.0,,2.0,14.0,20556.0,0.0,,6.0,,-30.0,,,,12.0,67100.0,246.0,a55475b1,0.0,,0.0,0.0,0.0,46642.715,11.0,1.0,,-126.0,-126.0,0.0,1.0,a55475b1,11.0,,30976.0,,10.0,246.0,0.0,,1.0,,0.0,0.0,,38000.0,0.0,,,,44296.0,,,,20556.0,20556.0,,,A,0.0,16.0,1.0,0.0,,24.0,,,3.0,,6.0,2.0,14.0,0.0,1290.8000,,,0.0,a55475b1,,0.0,14.0,0.0,,0.0,,2.0,72262.720,16.0,20556.00,242.0,P177_117_192,24.0,,0.0,,38863.402,0.00000,3.0,67100.0,,0.0,,1290.8000,9.0,0.0,6835.6000,0.0,1286.0,,16.0,COL,0.3900,,,0.0,,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1140568,39660.0,4357.0,,,,0.20000,0.0,13.0,0.0,0.0,,,0.0,8345.0,,SINGLE,,0.0,0.0,0.0,0.0,1.0,,0.00000,,1.0,0.0,0.0,P91_34_173,P97_36_170,,0.0,0.0,0.0,37680.0,4357.0,,22137.6,,0.0,330.0,0.0,False,,10.0,0.0,,,10.0,,358.0,,,a55475b1,0.0000,,10.0,330.0,,0.0,,0.00000,0.0,0.0,,0.0,0.0,,,20000.0,0.0,,,1.0,,,,0.0,4357.2,0.0,0.0,0.0,1.0,,58.0,,0.0012,,58.0,0.0,6500.000,2.0,0.0,,20000.0,0.0,8640.00,,,,,0.0,0.0,,6.0,1.0,,10.0,0.0,,9.0,,,149.0,,361.0,POS,0.10000,,0.0,10.0,0.0,361.0,0.0,0.0,10.0,0.0,1.0,0.0,,1.0,10.0,0.0,0.0,179.0,0.0,,0.0,,,,3.0,22137.6,361.0,a55475b1,0.0,,0.0,0.0,1.0,39180.000,2.0,1.0,,-2.0,-2.0,0.0,0.0,a55475b1,2.0,,37680.0,,0.0,361.0,1.0,179.0,2.0,1054.0,0.0,1.0,,20000.0,0.0,,,,,,,,0.0,0.0,,,K,0.0,0.0,0.0,1.0,,10.0,,,58.0,,0.0,1.0,10.0,0.0,4357.2000,,,0.0,a55475b1,,0.0,9.0,0.0,,1.0,,1.0,43580.000,0.0,0.00,358.0,P149_40_170,0.0,,1.0,,39660.000,0.00000,0.0,22137.6,,1.0,,4354.6000,10.0,0.0,2461.2000,0.0,,,0.0,COL,0.0012,,,1.0,,0.0,,0.0,
347578,,,,,,,0.0,,0.0,,,,0.0,8037.0,1070.6,,,,,0.0,0.0,4.0,,,,,0.0,,,,,0.0,0.0,0.0,,,,120000.0,,0.0,,0.0,,,,0.0,,,,,,,,,0.0000,1403.0,,,,0.0,,,0.0,,182.062978,0.0,0.0,,,,,,,,,,,0.0,,0.0,0.0,0.0,,,,,0.0000,,,,,,1.0,,,,,,,,,,,,,4.0,,,,,18.0,,,,,,,,,,,0.0,,,,,0.0,,0.0,,2.0,,,0.0,,,6297.8003,,,,,,120000.0,,,,,0.0,0.0,,,,2.0,,,,,1.0,,,,,,,,0.0,,1.0,,0.0,,1070.6,,1.0,,,,,,6.0,,0.0,0.0,,,,0.0,,1.0,,,,1203.56,,,,,2.0,,0.0,,,,0.0,,,0.0,,0.0,,,,2.0,0.000,,,,,0.0,,,,,,0.0,120000.0,,,,,,,8253.4000,0.0,,6017.8,,,0.0000,,,,,0.0,,0.0,
1337547,19879.0,,,,,0.86667,0.0,16.0,0.0,11.0,,,0.0,25189.0,,MARRIED,,,0.0,0.0,0.0,0.0,2155.0,0.00000,1691.0,,1.0,0.0,P112_89_137,P97_36_170,,,1.0,0.0,17100.0,,,35982.0,,0.0,1703.0,0.0,False,,0.0,0.0,,,9.0,,1732.0,,,a55475b1,0.0000,,15.0,2293.0,,0.0,,0.06667,0.0,11.0,,2.0,0.0,,,1900.0,0.0,,,0.0,,,,0.0,,0.0,0.0,0.0,,0.0,1469.0,,0.4500,,1469.0,,,14.0,0.0,,33600.0,0.0,12440.00,,,,,0.0,0.0,,4.0,0.0,,10.0,0.0,,12.0,,3995.0,,,1734.0,POS,0.06667,,0.0,14.0,0.0,1734.0,0.0,,17.0,0.0,0.0,0.0,,0.0,15.0,0.0,0.0,,6.0,,,,,,14.0,35982.0,1734.0,a55475b1,0.0,,0.0,0.0,0.0,,13.0,0.0,23982.0,,,0.0,0.0,a55475b1,13.0,,17100.0,,11.0,1734.0,,2170.0,1.0,,0.0,0.0,,33600.0,0.0,,,,,,,,0.0,0.0,,,K,0.0,0.0,0.0,1.0,,10.0,,,1469.0,,6.0,0.0,15.0,0.0,2209.2000,,25181.8,0.0,a55475b1,,0.0,15.0,0.0,,0.0,14.0,0.0,30740.000,0.0,0.00,1732.0,P60_146_156,0.0,,0.0,,,0.06667,2.0,35982.0,,0.0,,,0.0,0.0,3775.6000,0.0,,,0.0,COL,0.4500,,,0.0,,,,0.0,
174487,28656.6,3061.0,,,,0.77778,0.0,18.0,0.0,13.0,,,1.0,15851.0,,,,-17.0,53192.6,0.0,0.0,4.0,,0.00000,,0.0,0.0,0.0,P91_50_167,a55475b1,,0.0,2.0,0.0,36796.0,3031.6,,,,0.0,53.0,53192.6,False,,6.0,0.0,,,3.0,,76.0,,,a55475b1,2533.4001,,18.0,695.0,,1.0,,0.00000,0.0,13.0,,1.0,0.0,5066.8003,,0.0,0.0,,,0.0,,-2.0,,0.0,3605.2,0.0,0.0,0.0,0.0,,,,0.4500,,-5.0,0.0,38174.734,14.0,0.0,,100000.0,0.0,3606.00,,,,,21.0,0.0,45558.2,,4.0,,24.0,0.0,,24.0,,,,,84.0,POS,0.00000,,53192.6,17.0,0.0,84.0,0.0,-9.0,19.0,2.0,0.0,0.0,,2.0,18.0,53192.6,0.0,,1.0,,-17.0,,,,17.0,50000.0,84.0,a55475b1,0.0,,0.0,0.0,0.0,58261.600,14.0,1.0,,-30.0,-30.0,0.0,1.0,a55475b1,14.0,,36796.0,,13.0,84.0,0.0,,3.0,,0.0,0.0,,100000.0,0.0,,,,,,,,0.0,53192.6,,,A,0.0,21.0,0.0,0.0,,24.0,,,,,1.0,3.0,18.0,0.0,2533.4001,,,0.0,a55475b1,,0.0,18.0,0.0,,0.0,,3.0,54572.402,21.0,41186.47,76.0,P177_117_192,24.0,,0.0,,27800.201,0.00000,1.0,50000.0,,0.0,,2533.4001,18.0,0.0,3705.4001,0.0,,,21.0,COL,0.4500,,,0.0,,0.0,,0.0,


In [13]:
eval_train = X_train.copy()
eval_train['score'] = gbm.predict(X_train[feats], num_iteration=gbm.best_iteration)

eval_valid = X_valid.copy()
eval_valid['score'] = gbm.predict(X_valid[feats], num_iteration=gbm.best_iteration)

In [14]:
from sklearn.metrics import roc_auc_score 
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(eval_train)
stability_score_valid = gini_stability(eval_valid)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 


The stability score on the train set is: 0.5545560996536345
The stability score on the valid set is: 0.5445156158746891


## Submission


In [16]:

test_ds['score'] = gbm.predict(test[feats], num_iteration=gbm.best_iteration)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_ds['score'] = gbm.predict(test[feats], num_iteration=gbm.best_iteration)


In [17]:
submission = test_ds[['case_id','score']]
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,case_id,score
0,57543,0.026965
1,57549,0.033734
2,57551,0.038186
3,57552,0.017421
4,57569,0.118731
