In [1]:
import numpy as np
import pandas as pd
import os

pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
def calculate_woe_iv_categorical(data, feature, target):
    # Create a DataFrame for counts per category
    df = data.groupby([feature])[target].agg(['count', 'sum'])
    df.columns = ['Total', 'Bad']
    
    # Calculate the number of good outcomes
    df['Good'] = df['Total'] - df['Bad']
    
    # Handle cases where the count is 0 to avoid division by zero in WoE calculation
    df['Bad'] = np.where(df['Bad'] == 0, 0.0001, df['Bad'])
    df['Good'] = np.where(df['Good'] == 0, 0.0001, df['Good'])
    
    # Calculate the percentage of bads and goods
    df['Distr_Bad'] = df['Bad'] / df['Bad'].sum()
    df['Distr_Good'] = df['Good'] / df['Good'].sum()
    
    # Calculate WoE
    df['WoE'] = np.log(df['Distr_Good'] / df['Distr_Bad'])
    
    # Calculate IV
    df['IV'] = (df['Distr_Good'] - df['Distr_Bad']) * df['WoE']
    
    # Sum the IV values for the feature
    IV = df['IV'].sum()
    
    # Prepare a report
    report = df.reset_index()[[feature, 'WoE', 'IV']]
    
    return IV, report

def calculate_woe_iv_numeric(data, feature, target, bins=10):
    # Bin the data
    data['binned'] = pd.qcut(data[feature], q=bins, duplicates='drop')

    # Group by the binned feature
    grouped = data.groupby('binned')[target].agg(['count', 'sum'])
    grouped.columns = ['Total', 'Bad']

    # Calculate the number of good outcomes
    grouped['Good'] = grouped['Total'] - grouped['Bad']
    
    # Handle cases where the count is 0 to avoid division by zero in WoE calculation
    grouped['Bad'] = np.where(grouped['Bad'] == 0, 0.0001, grouped['Bad'])
    grouped['Good'] = np.where(grouped['Good'] == 0, 0.0001, grouped['Good'])

    # Calculate the distribution of bads and goods
    grouped['Distr_Bad'] = grouped['Bad'] / grouped['Bad'].sum()
    grouped['Distr_Good'] = grouped['Good'] / grouped['Good'].sum()

    # Calculate WoE
    grouped['WoE'] = np.log(grouped['Distr_Good'] / grouped['Distr_Bad'])

    # Calculate IV
    grouped['IV'] = (grouped['Distr_Good'] - grouped['Distr_Bad']) * grouped['WoE']

    # Sum the IV values for the feature
    IV = grouped['IV'].sum()

    # Prepare a report
    report = grouped.reset_index()[['binned', 'WoE', 'IV']]

    return IV, report

## Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data)

* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [3]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples = None, 
                 partition = "train",
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        
        assert partition in ["train","test"], "partition can only be 'train','test' "
        
        self.parent_path = parent_path
        self.partition = partition
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.features = []
        # run process
        self.run()
 
    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def create_base_dataset(self):
        if self.n_samples:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv").sample(n=self.n_samples)
        else:
            self.df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{self.partition}_base.csv")
        
        self.df['date_decision'] = pd.to_datetime(self.df['date_decision'])
        self.df['MONTH'] = pd.to_datetime(self.df['MONTH'].astype(str).str[:4] + '-' + self.df['MONTH'].astype(str).str[-2:] +'-01')
        self.base_cols = self.df.columns.tolist()
        self.case_ids = self.df['case_id'].unique().tolist()
    
    def read_in_file(self, file_name:str):
        df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
        df = df[df['case_id'].isin(self.case_ids)]
        return df
    
    def add_df_to_dataset(self,df_to_add: pd.DataFrame,convert_types = True):
        if convert_types:
            for col in df_to_add.columns:
                if (df_to_add[col].dtype == 'object'):
                    if ('date' in col) or (col in ['dtlastpmtallstes_4499206D','firstclxcampaign_1125D']):
                        self.date_cols.append(col)
                        df_to_add[col] = pd.to_datetime(df_to_add[col])
                    else:
                        df_to_add[col] = df_to_add[col].astype("category")
        self.df = self.df.merge(df_to_add,on='case_id',how='left')
        
    def find_all_files_that_contain(self,criteria:str):
        files = [x for x in os.listdir(f"{self.parent_path}/csv_files/{self.partition}") if (criteria in x) and (self.partition in x)]
        return files 
    
    def add_all_level0_files(self):
        level0_criterias = ["static_0","static_cb_0"]
        for crit in level0_criterias:
            df_to_concat = []
            for file in self.find_all_files_that_contain(crit):
                print(f"adding {file}...")
                df_to_concat.append(self.read_in_file(file))
            self.add_df_to_dataset(pd.concat(df_to_concat,axis=0))
    

    def make_level0_features(self):
        print("making level0 features...")
        # features that can be used as is are already in numeric or categorical format
        provided_feats = [col for col in self.df.columns[6:] if self.df[col].dtype in ['int64','float64','category']]
            
        # date transformations
        ## number of days prior to the decision date, cannot be negative otherwise we wouldnt have this information when the decision was made 
        date_feats = []
        for dt_col in self.date_cols:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (self.df['date_decision'] - self.df[dt_col]).dt.days
            dt_feat_series = (dt_feat_series).mask(dt_feat_series < 0, np.nan)
            self.df[new_col] = dt_feat_series
            date_feats.append(new_col)
        
        self.features = provided_feats + date_feats

        
    def process_level1_files(self):
        
        print("adding previous applications...")
        # previous applications
        ## gather all files and concat
        appl_prev = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("applprev_1")],axis=0)
        ## only focus on the person for group 0 because that is the person we are making the decision on
        ## additionally, only focus on the most recent application that existed prior to the case's decision date
        ## it might be worth revisting this to look at all past applications because someone might be approved in a previous application but we are only going to
        ## look at the most recent application
        
        appl_prev['creationdate_885D'] = pd.to_datetime(appl_prev['creationdate_885D'])
        appl_prev = appl_prev.merge(self.df[['case_id','date_decision']],on='case_id')
        appl_prev = appl_prev[  (appl_prev['creationdate_885D'] < appl_prev['date_decision']) 
                              & (appl_prev['num_group1'] == 0)].sort_values('creationdate_885D',ascending=False).drop_duplicates(subset=['case_id'])
        ## convert the date columns to days since
        for dt_col in ["creationdate_885D","approvaldate_319D","dateactivated_425D","employedfrom_700D","firstnonzeroinstldate_307D","dtlastpmt_581D","dtlastpmtallstes_3545839D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (appl_prev['date_decision'] - pd.to_datetime(appl_prev[dt_col]) ).dt.days
            appl_prev[new_col] = dt_feat_series
            del appl_prev[dt_col]
        ## change the names so it is clear that this is the most recent application
        del appl_prev['num_group1']
        del appl_prev['date_decision']
        appl_prev.columns = ['case_id'] + [f"{x}_MOST_RECENT_APPLICATION" for x in appl_prev.columns[1:]]
        ## add previous applications to the dataframe
        self.features.extend(appl_prev.columns.tolist()[1:])
        self.add_df_to_dataset(appl_prev)
        ## free up memory
        del appl_prev
        
        print("adding other...")        
        # other file
        ## going to keep it real simple, just grab the first record for the person who we are making decision on
        other = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("other_1")],axis=0)
        other = other[other['num_group1']==0].drop_duplicates(subset=['case_id'])
        ## add other to the dataframe
        self.features.extend(other.columns.tolist()[1:])
        self.add_df_to_dataset(other)        
        ## free up memory
        del other       
        
        print("adding tax registry a...")             
        # tax registry a, look at individual and across all groups
        tra = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_a")],axis=0)
        tra = tra.merge(self.df[['case_id','date_decision']],on='case_id')
        tra['recorddate_4527225D'] = pd.to_datetime(tra['recorddate_4527225D'])
        tra = tra[tra['recorddate_4527225D']<tra['date_decision']]
        ## individual
        individual_tra = tra[tra['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["recorddate_4527225D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_tra['date_decision'] - pd.to_datetime(individual_tra[dt_col]) ).dt.days
            individual_tra[new_col] = dt_feat_series
            del individual_tra[dt_col]
        del individual_tra['date_decision']
        individual_tra.columns = ['case_id'] + [f"{x}_TRA_INDIV" for x in individual_tra.columns[1:]]
        ## add individual_tra to the dataframe
        self.features.extend(individual_tra.columns.tolist()[1:])
        self.add_df_to_dataset(individual_tra)        
        ## free up memory
        del individual_tra   
        ## agg
        agg_tra = tra.groupby('case_id',as_index=False).agg(
            amount_4527230A_MIN = ('amount_4527230A','min'),
            amount_4527230A_MAX = ('amount_4527230A','max'),
            amount_4527230A_STD = ('amount_4527230A','std'),
            amount_4527230A_AVG = ('amount_4527230A','mean'),
            amount_4527230A_MEDIAN = ('amount_4527230A','median'),
            amount_4527230A_SUM = ('amount_4527230A','sum'),    
        )
        self.features.extend(agg_tra.columns.tolist()[1:])
        self.add_df_to_dataset(agg_tra)        
        ## free up memory
        del agg_tra        
        
        
        print("adding tax registry b...") 
        # tax registry b, look at individual and across all groups
        trb = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_b")],axis=0)
        trb = trb.merge(self.df[['case_id','date_decision']],on='case_id')
        trb['deductiondate_4917603D'] = pd.to_datetime(trb['deductiondate_4917603D'])
        trb = trb[trb['deductiondate_4917603D']<trb['date_decision']]
        ## individual
        individual_trb = trb[trb['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["deductiondate_4917603D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trb['date_decision'] - pd.to_datetime(individual_trb[dt_col]) ).dt.days
            individual_trb[new_col] = dt_feat_series
            del individual_trb[dt_col]
        del individual_trb['date_decision']
        individual_trb.columns = ['case_id'] + [f"{x}_TRB_INDIV" for x in individual_trb.columns[1:]]
        ## add individual_trb to the dataframe
        self.features.extend(individual_trb.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trb)        
        ## free up memory
        del individual_trb   
        ## agg
        agg_trb = trb.groupby('case_id',as_index=False).agg(
            amount_4917619A_MIN = ('amount_4917619A','min'),
            amount_4917619A_MAX = ('amount_4917619A','max'),
            amount_4917619A_STD = ('amount_4917619A','std'),
            amount_4917619A_AVG = ('amount_4917619A','mean'),
            amount_4917619A_MEDIAN = ('amount_4917619A','median'),
            amount_4917619A_SUM = ('amount_4917619A','sum'),    
        )
        self.features.extend(agg_trb.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trb)        
        ## free up memory
        del agg_trb              
        
        print("adding tax registry c...")        
        # tax registry c, look at individual and across all groups
        trc = pd.concat([self.read_in_file(file) for file in self.find_all_files_that_contain("tax_registry_c")],axis=0)
        trc = trc.merge(self.df[['case_id','date_decision']],on='case_id')
        trc['processingdate_168D'] = pd.to_datetime(trc['processingdate_168D'])
        trc = trc[trc['processingdate_168D']<trc['date_decision']]
        ## individual
        individual_trc = trc[trc['num_group1']==0].drop(columns='num_group1')
        for dt_col in ["processingdate_168D"]:
            new_col = f"days_since_{dt_col}"
            dt_feat_series = (individual_trc['date_decision'] - pd.to_datetime(individual_trc[dt_col]) ).dt.days
            individual_trc[new_col] = dt_feat_series
            del individual_trc[dt_col]
        del individual_trc['date_decision']
        individual_trc.columns = ['case_id'] + [f"{x}_TRC_INDIV" for x in individual_trc.columns[1:]]
        ## add individual_trc to the dataframe
        self.features.extend(individual_trc.columns.tolist()[1:])
        self.add_df_to_dataset(individual_trc)        
        ## free up memory
        del individual_trc   
        ## agg
        agg_trc = trc.groupby('case_id',as_index=False).agg(
            pmtamount_36A_MIN = ('pmtamount_36A','min'),
            pmtamount_36A_MAX = ('pmtamount_36A','max'),
            pmtamount_36A_STD = ('pmtamount_36A','std'),
            pmtamount_36A_AVG = ('pmtamount_36A','mean'),
            pmtamount_36A_MEDIAN = ('pmtamount_36A','median'),
            pmtamount_36A_SUM = ('pmtamount_36A','sum'),    
        )
        self.features.extend(agg_trc.columns.tolist()[1:])
        self.add_df_to_dataset(agg_trc)        
        ## free up memory
        del agg_trc            
        
        
        
        
        
        
#         level1_criterias = [,"other_",
#                             "tax_registry_a","tax_registry_b","tax_registry_c",
#                             "credit_bureau_a_1","credit_bureau_b_1",
#                             "deposit_1","person_1"]

            
        
    def run(self):
        self.create_base_dataset()
        
        self.add_all_level0_files()
        self.make_level0_features()
        
        self.process_level1_files()
    
    def get_modeling_dataset(self):
        return self.df[self.base_cols + self.features]

In [4]:
train_ds_builder = DatasetBuilder(n_samples = 10)
train_ds = train_ds_builder.get_modeling_dataset()

adding train_static_0_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_0_1.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_cb_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


making level0 features...
adding previous applications...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding other...
adding tax registry a...
adding tax registry b...
adding tax registry c...


In [5]:
# test = pd.read_csv("/kaggle/input/home-credit-credit-risk-model-stability/csv_files/train/train_applprev_1_0.csv")
# # test.head()

# test 

## Training LightGBM

Minimal example of LightGBM training is shown below.

In [6]:
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [7]:
train_ds_builder = DatasetBuilder(partition="train")
train_ds = train_ds_builder.get_modeling_dataset()

X_train, X_valid, y_train, y_valid = train_test_split(train_ds,train_ds['target'],stratify=train_ds['target'],train_size=.8)


adding train_static_0_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_0_1.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding train_static_cb_0.csv...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


making level0 features...
adding previous applications...


  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")
  df = pd.read_csv(f"{self.parent_path}/csv_files/{self.partition}/{file_name}")


adding other...
adding tax registry a...
adding tax registry b...
adding tax registry c...


In [None]:
lgb_train = lgb.Dataset(X_train[train_ds_builder.features], label=y_train)
lgb_valid = lgb.Dataset(X_valid[train_ds_builder.features], label=y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 3,
    "num_leaves": 31,
    "learning_rate": 0.01,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 5,
    "n_estimators": 1000,
    "verbose": -1,
}

gbm = lgb.train(
    params,
    lgb_train,
    valid_sets=lgb_valid,
    callbacks=[lgb.log_evaluation(50), lgb.early_stopping(10)]
)



Training until validation scores don't improve for 10 rounds
[50]	valid_0's auc: 0.720335
[100]	valid_0's auc: 0.73812
[150]	valid_0's auc: 0.751975
[200]	valid_0's auc: 0.76271
[250]	valid_0's auc: 0.76905


Evaluation with AUC and then comparison with the stability metric is shown below.

In [None]:
eval_train = X_train.copy()
eval_train['score'] = gbm.predict(X_train[train_ds_builder.features], num_iteration=gbm.best_iteration)

eval_valid = X_valid.copy()
eval_valid['score'] = gbm.predict(X_valid[train_ds_builder.features], num_iteration=gbm.best_iteration)

In [None]:
from sklearn.metrics import roc_auc_score 
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

stability_score_train = gini_stability(eval_train)
stability_score_valid = gini_stability(eval_valid)

print(f'The stability score on the train set is: {stability_score_train}') 
print(f'The stability score on the valid set is: {stability_score_valid}') 


## Submission


In [None]:
train_ds_builder = DatasetBuilder(partition="train")
train_ds = train_ds_builder.get_modeling_dataset()

In [None]:
predictions = xgb_classifier.predict_proba(test_data)[:, 1]
test_Id = test_base_df["case_id"]
submission = pd.DataFrame({
    'case_id': test_Id,
    'score': predictions
})

submission.to_csv('submission.csv', index=False)
submission.head()

In [None]:
submission = pd.DataFrame({
    "case_id": data_submission["case_id"].to_numpy(),
    "score": y_submission_pred
}).set_index('case_id')
submission.to_csv("./submission.csv")

Best of luck, and most importantly, enjoy the process of learning and discovery! 

<img src="https://i.imgur.com/obVWIBh.png" alt="Image" width="700"/>