In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc
from sklearn.preprocessing import LabelEncoder


pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 1000)

In [2]:
def calculate_woe_iv_categorical(feature, response):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'bin': feature.fillna('missing'), 'response': response})
    
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_woe_iv_numeric(feature, response,quantiles = 50):
    # Calculate the total number of events (positive responses) and non-events (negative responses)
    total_events = response.sum()
    total_non_events = response.count() - total_events
    
    # Create a new DataFrame with the feature and response values
    df = pd.DataFrame({'feature': feature, 'response': response})
    
    # we want to support missing values
    df['bin'] = -1
    df.loc[df['feature'].notnull(),'bin'] = pd.qcut(df.loc[df['feature'].notnull(),'feature'], q=quantiles,duplicates='drop',labels=False)

    del df['feature']
    # Calculate the percentage of events and non-events for each bin of the feature
    bin_summary = df.groupby('bin')['response'].agg(['sum', 'count']).reset_index()
    bin_summary.columns = ['bin', 'events', 'total']
    bin_summary['non-events'] = (bin_summary['total'] - bin_summary['events']) 
    bin_summary['event_rate'] = (bin_summary['events'] / total_events)
    bin_summary['non-event_rate'] = (bin_summary['non-events'] / total_non_events) + 1e-10 # epsilon so that that the non event rate is not 0

    # Calculate the Weight of Evidence (WOE) and Information Value (IV) for each bin
    bin_summary['WOE'] = np.log1p(bin_summary['event_rate'] / bin_summary['non-event_rate'])
    bin_summary['IV'] = (bin_summary['event_rate'] - bin_summary['non-event_rate']) * bin_summary['WOE']

    # # Calculate the total Information Value (IV) for the feature
    total_IV = bin_summary['IV'].sum()
    
    return total_IV

def calculate_psi_categorical(old,new): 
    # series 1 = old, series 2 = new
    old = old.fillna("missing")
    new = new.fillna("missing")    
    
    bins = list(set(old.tolist()+new.tolist())) 
    bin_summary = pd.DataFrame(bins,columns=['bin'])
    bin_summary['prop_old'] = (bin_summary['bin'].apply(lambda x: (old==x).sum()) / len(old)) + 1e-10 # epsilon
    bin_summary['prop_new'] = (bin_summary['bin'].apply(lambda x: (new==x).sum()) / len(new)) + 1e-10 # epsilon

    
    return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))

def calculate_psi_numeric(old,new,q=10): 

    old = pd.DataFrame(old,columns=['val'])
    new = pd.DataFrame(new,columns=['val'])
    
    # set up initial bins for missing values
    old['bin'] = -1
    new['bin'] = -1
    
    
    # we will only generate a score if we have enough unique values
    if (old['val'].dropna().nunique() > 1) and (new['val'].dropna().nunique() > 1):
        # assign each value to a quantile 
        old.loc[old.notnull(),'bin'] = pd.qcut(old.loc[old.notnull(),'val'], q=quantiles,duplicates='drop',labels=False)
        new.loc[new.notnull(),'bin'] = pd.qcut(new.loc[old.notnull(),'val'], q=quantiles,duplicates='drop',labels=False)
        
    
        bins = list(set(old['bin'].tolist()+new['bin'].tolist())) 
        bin_summary = pd.DataFrame(bins,columns=['bin'])
        bin_summary['prop_old'] = (bin_summary['bin'].apply(lambda x: (old['bin']==x).sum()) / len(old)) + 1e-10 # epsilon
        bin_summary['prop_new'] = (bin_summary['bin'].apply(lambda x: (new['bin']==x).sum()) / len(new)) + 1e-10 # epsilon
    
        return np.sum((bin_summary['prop_old'] - bin_summary['prop_new']) * np.log(bin_summary['prop_old']/bin_summary['prop_new']))
    
    else:
        return np.nan

# Preprocessing

[Data Info](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/data) <br>
[Discussion on how the data is setup](https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/473950) <br>
[Starter Notebook](https://www.kaggle.com/code/jetakow/home-credit-2024-starter-notebook)
* depth=0 - These are static features directly tied to a specific case_id.
* depth=1 - Each case_id has an associated historical record, indexed by num_group1.
* depth=2 - Each case_id has an associated historical record, indexed by both num_group1 and num_group2.

In [3]:
class Aggregator:
    # Please add or subtract features yourself, be aware that too many features will take up too much space.
    def __init__(self,numeric_cols,string_cols,date_cols,criteria):
        self.numeric_cols = numeric_cols
        self.string_cols  = string_cols
        self.date_cols    = date_cols
        self.criteria = criteria
        
    def num_expr(self,col):
        
        expr_max    = [pl.max(col).alias(f"{col}_MAX")]
        expr_min    = [pl.min(col).alias(f"{col}_MIN")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST")]
        expr_mean   = [pl.mean(col).alias(f"{col}_MEAN")]
        expr_median = [pl.median(col).alias(f"{col}_MEDIAN")]
        expr_var    = [pl.var(col).alias(f"{col}_VAR")]

        return expr_max + expr_last + expr_mean + expr_median + expr_var + expr_min

    def date_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST")]
        expr_mean   = [pl.mean(col).alias(f"{col}_MEAN")]

        return expr_max + expr_last + expr_mean 

    def str_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX")]
        expr_last   = [pl.last(col).alias(f"{col}_LAST")]
        return expr_max + expr_last 

    def count_expr(self,col):
        expr_max    = [pl.max(col).alias(f"{col}_MAX_{self.criteria}")]

        return expr_max

    def get_exprs(self,df):
        expr = []
        new_date_cols = []
        for col in df.columns:
            if 'num_group' in col:
                expr.extend(self.count_expr(col))
            elif col in self.numeric_cols:
                expr.extend(self.num_expr(col))
            elif col in self.string_cols:
                expr.extend(self.str_expr(col))
            elif col in self.date_cols:
                new_date_cols.extend([f"{col}_MAX",f"{col}_LAST",f"{col}_MEAN"])
                expr.extend(self.date_expr(col))

        return expr, new_date_cols

In [4]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns[7:]:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
class DatasetBuilder:
    """ This class is used to create the dataset """
    def __init__(self, 
                 n_samples   = None, 
                 parent_path = "/kaggle/input/home-credit-credit-risk-model-stability"):
        


        self.parent_path = parent_path
        self.n_samples = n_samples

        self.feat_info = pd.read_csv(f"{parent_path}/feature_definitions.csv")
        self.date_cols = []
        self.string_cols = []
        self.numeric_cols = []
        
        self.run()

    def explain_feat(self,feat_name:str):
        assert feat_name in self.feat_info['Variable'].unique(), "feature not found in feature info dataframe"
        return self.feat_info[self.feat_info['Variable']==feat_name]['Description'].values[0]

    def set_table_dtypes(self,df):
        for col in df.columns:
            
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int32))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float32))
                if col not in self.numeric_cols:
                    self.numeric_cols.append(col)                
            elif (col[-1] in ("M",)) or (col in self.string_cols):
                df = df.with_columns(pl.col(col).cast(pl.String))
                if col not in self.string_cols:
                    self.string_cols.append(col)
            elif col[-1] in ("L","T"): # we dont know the transform needed, just going to assume its either float and if not, then string
                try:
                    df = df.with_columns(pl.col(col).cast(pl.Float32))
                    if col not in self.numeric_cols:
                        self.numeric_cols.append(col) 
                except:
                    df = df.with_columns(pl.col(col).cast(pl.String))
                    if col not in self.string_cols:
                        self.string_cols.append(col) 
                    continue
                
            elif col[-1] in ("D",) or (col in self.date_cols):
                df = df.with_columns(pl.col(col).cast(pl.Date))
                if col not in self.date_cols:
                    self.date_cols.append(col)
                
        return df

    def feature_engineer_dates(self,df,date_cols=None):
        if date_cols is None:
            date_cols = self.date_cols
        for col in date_cols:
            if col in df.columns:
                df = df.with_columns((pl.col("date_decision") - pl.col(col)).dt.total_days().alias(f'{col}_DAYS_SINCE'))  # days since
                df = df.drop(col)
        
        return df
    
    def create_base_dataset(self):
        
        # load in the training dataset 
        if self.n_samples:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(self.set_table_dtypes).sample(n=self.n_samples).with_columns(pl.lit('train').alias('partition'))
        else:
            train = pl.read_parquet(f"{self.parent_path}/parquet_files/train/train_base.parquet") \
            .pipe(self.set_table_dtypes).with_columns(pl.lit('train').alias('partition'))
        
        # load in the test dataset
        test =  pl.read_parquet(f"{self.parent_path}/parquet_files/test/test_base.parquet")\
                .pipe(self.set_table_dtypes).with_columns(pl.lit('test').alias('partition'))
        
        # concat train and test
        self.df = pl.concat([train,test],how='diagonal_relaxed')
        
        # get all case_ids
        self.case_ids = self.df.get_column('case_id').to_list()
        
        # store base cols
        self.base_df_cols = self.df.columns
        

    def read_in_files_with_criteria(self, criteria:str):
        train_df  = pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/train/{x}").pipe(self.set_table_dtypes).filter(pl.col('case_id').is_in(self.case_ids))
                       for x in os.listdir(f"{self.parent_path}/parquet_files/train") if (criteria in x)],how='diagonal_relaxed')
        test_df  =  pl.concat([pl.read_parquet(f"{self.parent_path}/parquet_files/test/{x}").pipe(self.set_table_dtypes)
                       for x in os.listdir(f"{self.parent_path}/parquet_files/test") if (criteria in x)],how='diagonal_relaxed')
        
        # being in train partition doesnt gaurentee it is in the test partition, so we have to ensure it 
        columns_in_common = list(set(train_df.columns).intersection(test_df.columns))
        
        df = pl.concat([train_df.select(columns_in_common),
                         test_df.select(columns_in_common)],how='diagonal_relaxed')
        
        
        return df

    
    def process_depth0(self):
        """
        These files can be used as is except for the dates, so just collect them, do feature engineering on the dates, then 
        throw out the date columns
        """
        depth0_criterias = ["static_0","static_cb_0"]
        df = self.df[['case_id','target','date_decision']]
        for criteria in depth0_criterias:
            df = df.join(self.read_in_files_with_criteria(criteria), on=['case_id'], how='left')
            
        df = self.feature_engineer_dates(df)
        depth0_feats = self.select_features(df,score="woe_iv",top_k=100)
        self.df = self.df.join(df[['case_id']+depth0_feats], on='case_id', how='left')


    

    def process_depth1(self):
        depth1_criterias = ["applprev_1","other_1",
                            "tax_registry_a_1","tax_registry_b_1","tax_registry_c_1",
                            "credit_bureau_a_1","credit_bureau_b_1",
                            "deposit_1","person_1","debitcard_1"]
        df = self.df[['case_id','target','date_decision']]
        agg_dt_cols_coll = []
        for criteria in depth1_criterias:
            criteria_df = self.read_in_files_with_criteria(criteria)
            aggr = Aggregator(self.numeric_cols,self.string_cols,self.date_cols,criteria.upper())
            agg_expr, agg_dt_cols = aggr.get_exprs(criteria_df)
            agg_dt_cols_coll.extend(agg_dt_cols)
            criteria_df = criteria_df.group_by("case_id").agg(agg_expr)
            df = df.join(criteria_df, on=['case_id'], how='inner')
        
        df = self.feature_engineer_dates(df,date_cols=agg_dt_cols_coll)
        depth1_feats = self.select_features(df,score="woe_iv",top_k=100)
        self.df = self.df.join(df[['case_id']+depth1_feats], on='case_id', how='left')        
        
    
    
    def evaluate_features(self,df):
        feats = [x for x in df.columns if x not in self.base_df_cols]
        
        # predictive power - woe*iv
        woeivs  = []
        for col in feats:
            if col in self.string_cols:
                woeiv = calculate_woe_iv_categorical(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)
            else:
                woeiv = calculate_woe_iv_numeric(df[col].to_pandas(), df['target'].to_pandas())
                woeivs.append(woeiv)

        
#         # stability - psi and woe*iv
#         psi_res = {x:[] for x in feats}
#         woe_res = {x:[] for x in feats}
#         for i in range(len(year_months)-1):
#             psis = []
#             old = df.filter((pl.col("ym_decision") == year_months[i]))
#             new = df.filter((pl.col("ym_decision") == year_months[i+1]))
#             for col in feats:
#                 if col in self.string_cols:
#                     psi = calculate_psi_categorical(old[col].to_pandas(),new[col].to_pandas())
#                 else:
#                     psi = calculate_psi_numeric(old[col].to_pandas(),new[col].to_pandas())
                    
#                 psi_res[col].append(psi)
                    


            
        feature_scores = pd.DataFrame(feats,columns=['feature'])
        feature_scores['prop_null'] = feature_scores['feature'].apply(lambda feat: df[feat].to_pandas().isna().sum()) / len(self.df)
        feature_scores['woe_iv'] = woeivs
        
        # lots of these ended up as nulls, will deal with later
#         feature_scores['eligible_psi'] = feature_scores['feature'].apply(lambda feat: sum([0 if np.isnan(x) else 1 for x in psi_res[feat]]))
#         feature_scores['avg_psi'] = feature_scores['feature'].apply(lambda feat: np.nanmean(psi_res[feat]))
#         feature_scores['std_psi'] = feature_scores['feature'].apply(lambda feat: np.nanstd(psi_res[feat]))
#         feature_scores['max_psi'] = feature_scores['feature'].apply(lambda feat: np.max(psi_res[feat]))

        return feature_scores
        
    
    def select_features(self,df,score="woe_iv",top_k=150):
        feature_scores = self.evaluate_features(df)
        top_k = min(top_k,len(feature_scores)-1)
        chosen_features = feature_scores.sort_values(score,ascending=False).reset_index(drop=True).loc[:top_k,'feature'].to_list()
        print(f"selected {top_k}/{len(feature_scores)} features for the model dataset")
        return chosen_features
        
    def run(self):
        self.create_base_dataset()
        self.process_depth0()
        self.process_depth1()

    
    def to_pandas(self,df_data):
        df_data = df_data.to_pandas()
        df_data[self.string_cols] = df_data[self.string_cols].astype("category")
        
#         enc = LabelEncoder()
#         for col in self.string_cols:
#             df_data[col] = enc.fit_transform(df_data[col])
#             df_data[col] = df_data[col].astype(np.int8)
        return df_data
    
    def get_datasets(self):
        ds = self.to_pandas(self.df)
        ds = reduce_mem_usage(ds)
        return {"train":ds[ds['partition']=='train'].reset_index(drop=True), 
                "test": ds[ds['partition']=='test'].reset_index(drop=True), 
                "features": [x for x in ds.columns if x not in self.base_df_cols],
                "cat_features": self.string_cols}

In [6]:
DSBuilder = DatasetBuilder()
ds = DSBuilder.get_datasets()
del DSBuilder

gc.collect()

selected 100/219 features for the model dataset
selected 100/975 features for the model dataset


KeyError: "['bankacctype_710L', 'cardtype_51L', 'credtype_322L', 'disbursementtype_67L', 'inittransactioncode_186L', 'lastapprcommoditycat_1041M', 'lastapprcommoditytypec_5251766M', 'lastrejectcommodtypec_5251769M', 'paytype1st_925L', 'paytype_783L', 'twobodfilling_608L', 'typesuite_864L', 'description_5085714M', 'education_88M', 'maritalst_385M', 'maritalst_893M', 'cancelreason_3545846M', 'credacc_status_367L', 'credtype_587L', 'district_544M', 'education_1138M', 'familystate_726L', 'inittransactioncode_279L', 'postype_4733339M', 'profession_152M', 'rejectreason_755M', 'rejectreasonclient_4145042M', 'status_219L', 'name_4527232M', 'name_4917606M', 'employername_160M', 'classificationofcontr_13M', 'classificationofcontr_400M', 'contractst_545M', 'contractst_964M', 'description_351M', 'financialinstitution_382M', 'financialinstitution_591M', 'purposeofcred_426M', 'purposeofcred_874M', 'subjectrole_182M', 'subjectrole_93M', 'classificationofcontr_1114M', 'contractst_516M', 'contracttype_653M', 'credor_3940957M', 'periodicityofpmts_997L', 'periodicityofpmts_997M', 'pmtmethod_731M', 'purposeofcred_722M', 'subjectrole_326M', 'subjectrole_43M', 'contaddr_district_15M', 'contaddr_zipcode_807M', 'education_927M', 'empl_employedtotal_800L', 'empl_industry_691L', 'empladdr_district_926M', 'empladdr_zipcode_114M', 'familystate_447L', 'gender_992L', 'housetype_905L', 'housingtype_772L', 'incometype_1044T', 'language1_981M', 'maritalst_703L', 'registaddr_district_1083M', 'registaddr_zipcode_184M', 'relationshiptoclient_415T', 'relationshiptoclient_642T', 'role_1084L', 'role_993L', 'sex_738L', 'type_25L'] not in index"

# Training XGBoost

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,StratifiedKFold
import xgboost as xgb
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .sort_values("WEEK_NUM")\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

In [None]:
search_space = {
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.5, 1),
    'colsample_bynode': hp.uniform('colsample_bynode', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'gamma': hp.loguniform('gamma',np.log(.00001), np.log(100)),
    'max_depth': scope.int(hp.uniform('max_depth', 5, 50)),
    'min_child_weight': hp.loguniform('min_child_weight', np.log(.00001), np.log(100)),
    'reg_alpha': hp.loguniform('reg_alpha', np.log(.00001), np.log(100)),
    'reg_lambda':hp.loguniform('reg_lambda',np.log(.00001), np.log(100)),
    'scale_pos_weight': hp.uniform('scale_pos_weight',1,10),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'learning_rate' : hp.loguniform('learning_rate', np.log(.00001), np.log(.5)),
    'n_estimators':scope.int(hp.uniform('n_estimators', 100, 1000)),
    'max_cat_to_onehot': scope.int(hp.uniform('max_cat_to_onehot', 2, 10)),
    'tree_method':'hist',
    'enable_categorical':True,
    'random_state': 185,
    'objective': 'binary:logistic',
#     'device': 'cuda',
    'n_jobs': 10,
}

In [None]:
def trial_fn(params,
             feats = [],
             ds = [],
             k_folds=4):
    skf = StratifiedKFold(n_splits=k_folds)
    idx = np.arange(len(ds))
    ds['score'] = 0.0

    for train_idx, valid_idx in skf.split(idx,ds['target']):
        mod = xgb.XGBClassifier(**params)
        mod.fit(ds.loc[train_idx,feats],ds.loc[train_idx,'target'])
        ds.loc[valid_idx,'score']  = mod.predict_proba(ds.loc[valid_idx,feats])[:,1] # p(Y=1|X)


    score = roc_auc_score(ds['target'],ds['score'])

    return {"status": STATUS_OK, "loss": -score}

In [None]:
best_params = fmin(fn=partial(trial_fn, feats = ds['features'], ds = ds['train']),
                    space=search_space,
                    algo=tpe.suggest,
                    max_evals=10,
                    timeout=60*60 # seconds
                  )
int_params = ['max_depth','n_estimators']
for k,v in best_params.items():
    best_params[k] = int(best_params[k])

best_params

In [None]:
mod = xgb.XGBClassifier(**best_params)
mod.fit(ds['train'][ds['features']], ds['train']['target'])

## Submission


In [None]:
ds['test']['score'] = mod.predict_proba(ds['test'][ds['features']])[:,1]

In [None]:
submission = ds['test'][['case_id','score']]
submission.to_csv('submission.csv', index=False)
submission.head()