## Training Notebooks
- lgb https://www.kaggle.com/code/arthurroland/lgb-train-notebook
- cat https://www.kaggle.com/code/arthurroland/cat-train-notebook

## Reference
- https://www.kaggle.com/code/xiaoleilian/home-credit-ensemble-infer-lgb-cat


In [1]:
import joblib
from pathlib import Path
import gc
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
import warnings
import datetime
# Silence every warning category for the whole notebook (this also hides
# deprecation warnings coming from pandas / polars).
warnings.filterwarnings('ignore')

# Competition data root as a plain string. The "Prepare df_test" cell later
# rebinds ROOT to a pathlib.Path pointing at the same directory.
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'

In [2]:
class Pipeline:
    """Stateless polars preprocessing steps, grouped in a class as a namespace.

    Every method takes a polars DataFrame and returns a new one, so each can
    be used with ``df.pipe(Pipeline.<step>)``. Column roles are derived from
    the last character of the column name (``P``/``A``, ``M``, ``D``).
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to canonical dtypes based on their name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * ``date_decision`` and columns ending in ``D`` -> Date
        * columns ending in ``P`` or ``A`` -> Float64
        * columns ending in ``M`` -> String

        Columns matching none of the rules keep their dtype.
        """
        casts = []
        for col in df.columns:
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                dtype = pl.Int64
            elif col == "date_decision":
                dtype = pl.Date
            elif col[-1] in ("P", "A"):
                dtype = pl.Float64
            elif col[-1] == "M":
                dtype = pl.String
            elif col[-1] == "D":
                dtype = pl.Date
            else:
                continue
            casts.append(pl.col(col).cast(dtype))
        # One with_columns call instead of one per column; the casts are
        # independent of each other, so the result is the same.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Turn every ``*D`` date column into day counts.

        For each column ending in ``D`` that exists on entry:

        * a new column ``<col>_days_since_1900_D`` holds the number of days
          between the date and 1900-01-01;
        * the column itself is replaced by the number of days between the
          date and ``date_decision``.

        ``date_decision`` and ``MONTH`` are dropped afterwards.
        """
        base_date = datetime.datetime(1900, 1, 1)
        # Snapshot the names first: columns added below also end in "D" and
        # must not be processed again.
        date_cols = [col for col in df.columns if col[-1] == "D"]
        for col in date_cols:
            df = df.with_columns(
                # total_days() replaces the deprecated Expr.dt.days().
                (pl.col(col) - pl.lit(base_date))
                .dt.total_days()
                .alias(col + "_days_since_1900_D"),
                (pl.col(col) - pl.col("date_decision")).dt.total_days(),
            )
        df = df.drop("date_decision", "MONTH")
        return df

    @staticmethod
    def filter_cols(df):
        """Drop sparse and uninformative columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Other
        columns are removed when more than 98% of their values are null, and
        String columns are removed when they have exactly one or more than
        200 distinct values.

        Not called anywhere in the visible part of this notebook.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        for col in df.columns:
            if col not in protected:
                isnull = df[col].is_null().mean()
                if isnull > 0.98:
                    df = df.drop(col)

        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    df = df.drop(col)

        return df



class Aggregator:
    """Builds the polars aggregation expressions used to collapse depth-1/2
    tables to one row per ``case_id``.

    Each ``*_expr`` method selects columns by name pattern and returns a list
    of aliased expressions named ``<agg>_<column>``.

    Add or remove aggregations as needed (other polars reducers such as
    ``pl.min``, ``pl.first``, ``pl.median``, ``pl.var`` or ``pl.count`` can be
    passed to ``_build`` the same way), but be aware that every extra
    aggregation adds one column per source column and increases memory use.
    The set of aggregations determines the produced feature names, so it has
    to match what the loaded models expect.
    """

    @staticmethod
    def _build(cols, **aggs):
        """Return ``fn(col).alias(f"{name}_{col}")`` for every aggregation
        ``name=fn`` in ``aggs`` and every column, grouped by aggregation in
        the order the aggregations were given."""
        return [
            fn(col).alias(f"{name}_{col}")
            for name, fn in aggs.items()
            for col in cols
        ]

    @staticmethod
    def num_expr(df):
        """max / last / mean / sum of columns ending in ``P`` or ``A``."""
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        return Aggregator._build(
            cols, max=pl.max, last=pl.last, mean=pl.mean, sum=pl.sum
        )

    @staticmethod
    def date_expr(df):
        """max / last / mean of columns ending in ``D``."""
        cols = [col for col in df.columns if col[-1] in ("D",)]
        return Aggregator._build(cols, max=pl.max, last=pl.last, mean=pl.mean)

    @staticmethod
    def str_expr(df):
        """max / last of columns ending in ``M``."""
        cols = [col for col in df.columns if col[-1] in ("M",)]
        return Aggregator._build(cols, max=pl.max, last=pl.last)

    @staticmethod
    def other_expr(df):
        """max / last of columns ending in ``T`` or ``L``."""
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        return Aggregator._build(cols, max=pl.max, last=pl.last)

    @staticmethod
    def count_expr(df):
        """max / last of columns whose name contains ``num_group``."""
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._build(cols, max=pl.max, last=pl.last)

    @staticmethod
    def get_exprs(df):
        """Concatenate all expression groups: numeric, date, string, other,
        then count."""
        return (
            Aggregator.num_expr(df)
            + Aggregator.date_expr(df)
            + Aggregator.str_expr(df)
            + Aggregator.other_expr(df)
            + Aggregator.count_expr(df)
        )

In [3]:
def read_file(path, depth=None):
    """Load one parquet file and normalise its column dtypes.

    When ``depth`` is 1 or 2 the table is additionally collapsed to one row
    per ``case_id`` using the expressions from ``Aggregator.get_exprs``; for
    any other ``depth`` the typed table is returned unchanged.
    """
    typed = Pipeline.set_table_dtypes(pl.read_parquet(path))
    if depth not in (1, 2):
        return typed
    return typed.group_by("case_id").agg(Aggregator.get_exprs(typed))

def read_files(regex_path, depth=None):
    """Load every parquet file matching a glob pattern into one table.

    Each file goes through ``read_file`` (dtype normalisation plus, for
    ``depth`` 1 or 2, aggregation to one row per ``case_id``). The per-file
    results are concatenated and de-duplicated on ``case_id``.

    Args:
        regex_path: glob pattern (str or Path) selecting the parquet files.
        depth: forwarded to ``read_file``.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # glob() returns matches in arbitrary order; sort so chunks are always
    # concatenated in the same order.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    # NOTE(review): with depth 1/2 every file is aggregated on its own, so a
    # case_id spread over several files would keep the aggregates of only one
    # of them after unique() — confirm that the files partition case_ids.
    chunks = [read_file(path, depth) for path in paths]

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df


def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Assemble the model input table.

    Adds ``month_decision`` and ``weekday_decision`` (from ``date_decision``)
    to the base table, left-joins every table in ``depth_0``, ``depth_1`` and
    ``depth_2`` (in that order) on ``case_id``, then runs
    ``Pipeline.handle_dates`` on the result. A clashing column coming from the
    i-th joined table gets the suffix ``_<i>``.
    """
    decision = pl.col("date_decision")
    merged = df_base.with_columns(
        month_decision=decision.dt.month(),
        weekday_decision=decision.dt.weekday(),
    )

    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        merged = merged.join(
            table, how="left", on="case_id", suffix=f"_{position}"
        )

    return Pipeline.handle_dates(merged)


def to_pandas(df_data, cat_cols=None):
    """Convert to pandas and mark categorical columns.

    ``df_data`` only needs a ``to_pandas()`` method. When ``cat_cols`` is not
    given, every object-dtype column of the converted frame is treated as
    categorical. Returns ``(frame, cat_cols)``.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols


def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Columns whose dtype name starts with ``int`` are cast to the first of
    int8/int16/int32/int64 whose range strictly contains the column's min and
    max. All other numeric columns (floats, and also bool / unsigned /
    nullable integer dtypes, which do not match the ``int`` prefix) are cast
    to float16, then float32 on the same rule, and otherwise to float64.

    Non-numeric columns (object, category, datetime, timedelta, string, ...)
    are left untouched; previously datetime-like columns raised a TypeError
    when compared against the float limits.

    NOTE: float16 keeps only about three significant decimal digits. The
    downcasting rules are intentionally unchanged because the loaded models
    presumably saw features processed the same way — confirm against the
    training notebook before altering them.

    Prints the memory usage before and after, and returns the same ``df``.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Skips object and category as before, plus every other dtype that
        # cannot be compared with numeric limits.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for target in int_targets:
                limits = np.iinfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            # No match (e.g. values touching the int64 limits): keep the dtype.
        else:
            for target in float_targets:
                limits = np.finfo(target)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(target)
                    break
            else:
                # Also reached for all-NaN columns, where every comparison is False.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Load Models

In [4]:
# Metadata dict saved alongside the LightGBM models; the keys read below are
# 'notebook_start_time', 'description', 'cols' and 'cat_cols'.
# NOTE: joblib.load unpickles arbitrary Python objects, so only artifacts from
# a trusted source should be loaded this way.
lgb_notebook_info = joblib.load('/kaggle/input/lgb-train-notebook/notebook_info.joblib')
print(f"- [lgb] notebook_start_time: {lgb_notebook_info['notebook_start_time']}")
print(f"- [lgb] description: {lgb_notebook_info['description']}")

# Feature columns fed to the models, and the subset handled as categorical.
lgb_cols = lgb_notebook_info['cols']
lgbcat_cols = lgb_notebook_info['cat_cols']
# Disabled experiment: prune a hand-picked list of features from the categorical columns.
# less_important_features=['applicationcnt_361L', 'clientscnt_157L', 'clientscnt_257L', 'deferredmnthsnum_166L', 'for3years_128L', 'formonth_206L', 'forquarter_1017L', 'forquarter_462L', 'forweek_1077L', 'forweek_601L', 'foryear_818L', 'max_pmts_month_158T', 'last_classificationofcontr_13M', 'last_classificationofcontr_400M', 'last_contractst_545M', 'last_description_351M', 'last_financialinstitution_591M', 'last_purposeofcred_426M', 'last_subjectrole_93M', 'max_contracttype_653M', 'max_pmtmethod_731M', 'max_purposeofcred_722M', 'max_subjectrole_326M', 'last_classificationofcontr_1114M', 'last_periodicityofpmts_997M', 'last_purposeofcred_722M', 'last_subjectrole_326M', 'min_contracttype_653M', 'max_empladdr_district_926M', 'max_empladdr_zipcode_114M', 'last_education_927M', 'last_empladdr_district_926M', 'last_empladdr_zipcode_114M', 'max_contaddr_matchlist_1032L', 'max_collater_typofvalofguarant_298M', 'max_collater_typofvalofguarant_407M', 'last_collater_typofvalofguarant_298M', 'last_collater_typofvalofguarant_407M', 'last_collaterals_typeofguarante_359M', 'last_collaterals_typeofguarante_669M', 'last_subjectroles_name_541M', 'last_subjectroles_name_838M', 'max_cacccardblochreas_147M', 'max_empls_economicalst_849M', 'last_empls_economicalst_849M']
# cat_cols = [item for item in cat_cols if item not in less_important_features]
print(f"- [lgb] len(cols): {len(lgb_cols)}")
print(f"- [lgb] len(cat_cols): {len(lgbcat_cols)}")

# The fitted models (the cell output shows a list of LGBMClassifier objects).
# The trailing bare name only displays the value in the notebook.
lgb_models = joblib.load('/kaggle/input/lgb-train-notebook/lgb_models.joblib')
lgb_models

- [lgb] notebook_start_time: 2024-09-11 11:25:32.647316
- [lgb] description: Add notebook info dict to store cols and cat_cols
- [lgb] len(cols): 610
- [lgb] len(cat_cols): 162


[LGBMClassifier(boosting_type='goss', colsample_bynode=0.65,
                colsample_bytree=0.65, device='cpu', learning_rate=0.03,
                max_depth=10, metric='auc', n_estimators=6000, num_leaves=64,
                objective='binary', random_state=42, reg_alpha=0.4,
                reg_lambda=10, scaled_pos_weight=20, verbose=-1),
 LGBMClassifier(boosting_type='goss', colsample_bynode=0.65,
                colsample_bytree=0.65, device='cpu', learning_rate=0.03,
                max_depth=10, metric='auc', n_estimators=6000, num_leaves=64,
                objective='binary', random_state=42, reg_alpha=0.4,
                reg_lambda=10, scaled_pos_weight=20, verbose=-1),
 LGBMClassifier(boosting_type='goss', colsample_bynode=0.65,
                colsample_bytree=0.65, device='cpu', learning_rate=0.03,
                max_depth=10, metric='auc', n_estimators=6000, num_leaves=64,
                objective='binary', random_state=42, reg_alpha=0.4,
                reg_lambda=1

In [5]:
# cat_notebook_info = joblib.load('/kaggle/input/cat-model-15000iter/notebook_info (10).joblib')
# print(f"- [cat] notebook_start_time: {cat_notebook_info['notebook_start_time']}")
# print(f"- [cat] description: {cat_notebook_info['description']}")
# cab_cols = cat_notebook_info['cols']
# cabcat_cols = cat_notebook_info['cat_cols']
# print(f"- [cat] len(cols): {len(cab_cols)}")
# print(f"- [cat] len(cat_cols): {len(cabcat_cols)}")
# cat_models = joblib.load('/kaggle/input/cat-model-15000iter/cat_models (11).joblib')
# cat_models

In [6]:
# xgb_notebook_info=joblib.load('/kaggle/input/xgb-model-ensemble/notebook_info')
# best_iteration=xgb_notebook_info['best_iteration']
# xgb_model=joblib.load('/kaggle/input/xgb-model-ensemble/xgb_models.joblib')

## Prepare df_test

In [7]:
# Rebind ROOT as a Path so the "/" operator can be used to build sub-paths.
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

TEST_DIR        = ROOT / "parquet_files" / "test"

# Keys match the parameter names of feature_eng so the dict can be unpacked
# into it with **. Tables read with a second argument of 1 or 2 are aggregated
# to one row per case_id by read_file / read_files; the others are only
# dtype-normalised. Names containing "*" are split across several files and go
# through read_files.
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [8]:
# Join all tables into one polars frame, then free the per-table frames.
df_test = feature_eng(**data_store)
print("test data shape:\t", df_test.shape)
del data_store
gc.collect()

# De-duplicate the feature list while keeping first-seen order.
# dict.fromkeys replaces pd.unique(list), which is deprecated for plain lists
# in pandas >= 2.1.
cols = list(dict.fromkeys(lgb_cols))
# Keep only case_id plus the features the models expect.
df_test = df_test.select(['case_id'] + cols)

# Convert to pandas using the stored categorical column list, downcast the
# numeric columns, and index the frame by case_id.
df_test, cat_cols = to_pandas(df_test, lgbcat_cols)
df_test = reduce_mem_usage(df_test)
df_test = df_test.set_index('case_id')
print("test data shape:\t", df_test.shape)

gc.collect()

test data shape:	 (10, 1061)
Memory usage of dataframe is 0.05 MB
Memory usage after optimization is: 0.04 MB
Decreased by 32.9%
test data shape:	 (10, 610)


0

In [9]:
# for i in cab_cols:
#     if i not in df_test.columns:
#         print(i)

In [10]:
df_test

Unnamed: 0_level_0,month_decision,weekday_decision,credamount_770A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_867L,clientscnt_1022L,clientscnt_100L,...,max_addres_district_368M,max_addres_zip_823M,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,last_addres_district_368M,last_addres_zip_823M,last_conts_role_79M,last_empls_economicalst_849M,last_empls_employer_name_740M
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
57543,5,5,20000.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,,,,,,,,,,
57549,1,1,75000.0,0.0,2.0,0.0,0.0,10.0,0.0,0.0,...,,,,,,,,,,
57551,11,5,27095.201172,0.0,1.0,0.0,0.0,2.0,0.0,0.0,...,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1
57552,11,5,100000.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,...,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1
57569,12,1,60000.0,0.0,1.0,0.0,0.0,6.0,0.0,0.0,...,a55475b1,P96_113_139,a55475b1,a55475b1,a55475b1,P28_121_188,P47_66_61,a55475b1,a55475b1,a55475b1
57630,3,2,96174.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
57631,6,6,24920.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
57632,2,6,25998.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,,,,,,,,,,
57633,1,2,200000.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,,,,,,,,,,
57634,1,3,12108.200195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


## Voting Model

In [11]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Averages the outputs of already-fitted estimators.

    ``fit`` is a no-op. ``predict`` averages over every estimator, while
    ``predict_proba`` uses only the first five estimators and reads the
    module-level ``lgb_cols`` feature list.
    """

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Nothing to train; returns ``self``."""
        return self

    def predict(self, X):
        """Element-wise mean of ``estimator.predict(X)`` over all estimators."""
        per_model = []
        for est in self.estimators:
            per_model.append(est.predict(X))
        return np.mean(per_model, axis=0)

    def predict_proba(self, X):
        """Element-wise mean of ``predict_proba`` over the first five estimators.

        Only the ``lgb_cols`` columns of ``X`` are used, and missing values in
        the non-categorical ones are replaced with 0 before predicting.
        """
        # LightGBM models
        features = X[lgb_cols]
        non_cat = features.select_dtypes(exclude='category').columns
        features[non_cat] = features[non_cat].fillna(0)

        probas = []
        for est in self.estimators[:5]:
            probas.append(est.predict_proba(features))

        # Release the feature copy before averaging.
        del features
        gc.collect()

        # CatBoost branch, currently disabled:
        # X[cabcat_cols] = X[cabcat_cols].astype(str)
        # probas += [estimator.predict_proba(X[cab_cols]) for estimator in self.estimators[-5:]]

        return np.mean(probas, axis=0)
        

In [12]:
def getDate(regex_path):
    """Derive one month-start timestamp per ``case_id`` from parquet files.

    For every file matching ``regex_path`` and every ``case_id`` it takes the
    maximum ``dpdmaxdateyear_596T`` and the largest ``dpdmaxdatemonth_89T``
    among the rows that carry that maximum year. Cases without any year are
    dropped; a missing month is treated as December.

    Returns:
        pandas DataFrame indexed by ``case_id`` with a single ``datetime``
        column (first day of the derived year/month).
    """
    year_col = pl.col("dpdmaxdateyear_596T")
    exps = [
        year_col.max().alias("year"),
        # Largest month within the latest year.
        pl.col("dpdmaxdatemonth_89T")
        .filter(year_col == year_col.max())
        .max()
        .alias("month"),
    ]

    chunks = []
    for path in glob(str(regex_path)):
        chunks.append(pl.read_parquet(path).group_by("case_id").agg(exps))

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])

    df = df.to_pandas()

    # Drop cases that have no year at all.
    df = df[~df["year"].isna()].copy()
    # df["year"] = df["year"].fillna(2019)
    # Assign the result back: a chained ``df["month"].fillna(..., inplace=True)``
    # does not modify ``df`` under pandas Copy-on-Write. A numeric 12 also keeps
    # the column numeric instead of mixing in the string "12".
    df["month"] = df["month"].fillna(12)

    df["year"] = df["year"].astype(int).astype(str)
    df["month"] = df["month"].astype(int).astype(str)
    df["datetime"] = pd.to_datetime(df["year"] + "-" + df["month"], format="%Y-%m")
    df = df.drop(columns=["year", "month"])
    # df = df.drop(index=df.index[df["datetime"] == pd.to_datetime("2019-01-01")])

    df = df.set_index("case_id", drop=True)

    return df
# Build the per-case timestamp table from the credit_bureau_a_1 test files.
# The trailing bare name only displays the frame in the notebook.
regex_path = TEST_DIR / "test_credit_bureau_a_1_*.parquet"
df_test_date = getDate(regex_path)
df_test_date

Unnamed: 0_level_0,datetime
case_id,Unnamed: 1_level_1
57760,2020-11-01
57543,2021-02-01
57551,2020-07-01
57633,2022-02-01
57549,2022-06-01


In [13]:
# 20% quantile of the derived timestamps; used further down as the cut-off
# below which scores are lowered.
splitDate = df_test_date["datetime"].quantile(0.20)
print(splitDate)

2020-10-07 09:36:00


In [14]:
# Wrap the loaded LightGBM models in the averaging ensemble; the second line
# only displays how many estimators it holds.
model = VotingModel(lgb_models)
len(model.estimators)

5

In [15]:
# Column 1 of the averaged predict_proba output, keyed by case_id.
y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)
df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

# Both sides are indexed by case_id, so the assignment aligns on it.
df_subm['score'] = y_pred
df_subm

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.00348
57549,0.036231
57551,0.001907
57552,0.010732
57569,0.099219
57630,0.013177
57631,0.090652
57632,0.01614
57633,0.104649
57634,0.078044


In [16]:
# Attach the derived timestamp to every submission row (case_id is the index
# name on both frames).
df_subm = df_subm.merge(df_test_date, on='case_id', how='left')
# Cases without a timestamp get a far-future sentinel so the comparison below
# never selects them.
df_subm["datetime"] = df_subm["datetime"].fillna(pd.to_datetime('2077-01-01'))
# Lower the score by 0.2 for cases dated before splitDate, then clamp to [0, 1].
# NOTE(review): presumably a post-processing step aimed at the competition
# metric rather than a modelling step — confirm before reusing elsewhere.
df_subm.loc[df_subm['datetime'] < splitDate, 'score'] -= 0.2
df_subm["score"]=df_subm["score"].clip(lower=0,upper=1)

# The helper column is not part of the submission file.
df_subm=df_subm.drop(columns=['datetime'])
df_subm.to_csv("submission.csv")
df_subm

Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.00348
57549,0.036231
57551,0.0
57552,0.010732
57569,0.099219
57630,0.013177
57631,0.090652
57632,0.01614
57633,0.104649
57634,0.078044
