In [1]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Competition data root as a plain string. ROOT is rebound to a pathlib.Path
# with the same location in the configuration cell further down.
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'
from IPython.display import clear_output

In [2]:
from sklearn.model_selection import  StratifiedGroupKFold #TimeSeriesSplit, GroupKFold,
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb


In [3]:
import os, psutil  

def cpu_stats():
    """Print the memory footprint of the current Python process in GiB.

    The value is the first field of psutil's ``memory_info()`` for this
    process, converted from bytes to GiB, increased by a constant 0.1 and
    rounded to two decimals. Nothing is returned.
    """
    process = psutil.Process(os.getpid())
    used_bytes = process.memory_info()[0]
    used_gib = used_bytes / 2. ** 30
    reported = np.round(used_gib + 0.1, 2)
    print('memory GB:' + str(reported))

In [4]:
class Pipeline:
    """Stateless polars helpers for dtype normalisation, date handling and column pruning.

    Every method is a ``staticmethod`` taking and returning a polars DataFrame,
    so it can be called as ``Pipeline.method(df)`` or via ``df.pipe(Pipeline.method)``.
    """

    @staticmethod
    def set_table_dtypes(df):
        """Cast columns to a canonical dtype chosen from the column name.

        * ``case_id``, ``WEEK_NUM``, ``num_group1``, ``num_group2`` -> Int64
        * ``date_decision`` -> Date
        * names ending in ``P`` or ``A`` -> Float64
        * names ending in ``M`` -> String
        * names ending in ``D`` -> Date

        Columns matching none of the rules are left untouched.

        Args:
            df: polars DataFrame to normalise.

        Returns:
            A DataFrame with the same columns, in the same order, after casting.
        """
        casts = []
        for col in df.columns:
            # The explicit-name rules come first: "WEEK_NUM" ends in "M" but must stay integer.
            if col in ("case_id", "WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.Int64))
            elif col == "date_decision":
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))
            elif col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
        # One with_columns call instead of one per column: each cast targets a
        # distinct column, so the result is the same.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def handle_dates(df):
        """Turn every ``*D`` date column into a day distance from ``date_decision``.

        Each column whose name ends in ``D`` is replaced (same name) by the
        absolute number of whole days between it and ``date_decision``; the
        sign of the difference is discarded. ``date_decision`` and ``MONTH``
        are then dropped.

        Args:
            df: polars DataFrame containing ``date_decision`` and ``MONTH``.

        Returns:
            The transformed DataFrame without ``date_decision`` and ``MONTH``.
        """
        date_cols = [col for col in df.columns if col.endswith("D")]
        if date_cols:
            df = df.with_columns(
                [
                    (pl.col(col) - pl.col("date_decision")).dt.total_days().abs()
                    for col in date_cols
                ]
            )
        return df.drop("date_decision", "MONTH")

    @staticmethod
    def filter_cols(df):
        """Drop mostly-empty columns and uninformative / high-cardinality String columns.

        ``target``, ``case_id`` and ``WEEK_NUM`` are never dropped. Among the
        remaining columns this removes:

        1. columns whose null fraction is above 0.95;
        2. String columns with exactly one unique value or more than 200.

        Args:
            df: polars DataFrame to prune.

        Returns:
            The DataFrame without the rejected columns.
        """
        protected = ("target", "case_id", "WEEK_NUM")

        mostly_null = []
        for col in df.columns:
            if col in protected:
                continue
            null_fraction = df[col].is_null().mean()
            # mean() yields None on a zero-row frame; treat that as "nothing to drop".
            if null_fraction is not None and null_fraction > 0.95:
                mostly_null.append(col)
        if mostly_null:
            df = df.drop(mostly_null)

        rejected_strings = []
        for col in df.columns:
            if col not in protected and df[col].dtype == pl.String:
                freq = df[col].n_unique()
                if freq == 1 or freq > 200:
                    rejected_strings.append(col)
        if rejected_strings:
            df = df.drop(rejected_strings)

        return df

In [5]:
class Aggregator:
    """Builds the polars expressions used to aggregate depth-1/2 tables per ``case_id``.

    Column groups are selected by name suffix (``P``/``A``, ``D``, ``M``,
    ``T``/``L``) and each group gets a fixed set of aggregations whose output
    is named ``<agg>_<column>``. All methods are static; the ``df`` argument
    is only inspected for its column names.

    ``fill_null_columns`` (where accepted) lists columns whose nulls are
    replaced by 0 *inside the expression*, which gives the same result as
    filling the frame before aggregating it.
    """
#     #Please add or subtract features yourself, be aware that too many features will take up too much space.
#     def age_calculate(df):
#         df = df.with_columns(pl.lit(current_date).alias("now_date"))
#         df = (df.with_columns(pl.coalesce(['birthdate_574D', 'dateofbirth_337D', 'dateofbirth_342D']).alias('birth_date'))
#                            .drop(['birthdate_574D', 'dateofbirth_337D', 'dateofbirth_342D'])
#                             .with_columns(pl.col('birth_date').cast(pl.Date))
#                           .with_columns(pl.col('now_date').cast(pl.Date)))
#         df = df.with_columns((pl.col('now_date').dt.year() - pl.col('birth_date').dt.year()).alias('age')).drop(['now_date', 'birth_date'])

    @staticmethod
    def _col(col, fill_null_columns=None):
        """Return ``pl.col(col)``, with nulls replaced by 0 if ``col`` is listed in ``fill_null_columns``."""
        expr = pl.col(col)
        if fill_null_columns and col in fill_null_columns:
            expr = expr.fill_null(0)
        return expr

    @staticmethod
    def _agg_exprs(cols, aggs, fill_null_columns=None):
        """Build ``<agg>_<col>`` expressions, grouped by aggregation (all max, then all last, ...)."""
        exprs = []
        for agg in aggs:
            for col in cols:
                base = Aggregator._col(col, fill_null_columns)
                # pl.max(col) / pl.last(col) / pl.mean(col) are shorthand for
                # pl.col(col).max() / .last() / .mean(), which is what this calls.
                exprs.append(getattr(base, agg)().alias(f"{agg}_{col}"))
        return exprs

    @staticmethod
    def num_expr(df, fill_null_columns=None):
        """max / last / mean for columns whose name ends in ``P`` or ``A``."""
        cols = [col for col in df.columns if col.endswith(("P", "A"))]
        return Aggregator._agg_exprs(cols, ("max", "last", "mean"), fill_null_columns)

    @staticmethod
    def date_expr(df, fill_null_columns=None):
        """max / last / mean for columns whose name ends in ``D``."""
        cols = [col for col in df.columns if col.endswith("D")]
        return Aggregator._agg_exprs(cols, ("max", "last", "mean"), fill_null_columns)

    @staticmethod
    def str_expr(df, fill_null_columns=None):
        """max / last for columns whose name ends in ``M``."""
        cols = [col for col in df.columns if col.endswith("M")]
        return Aggregator._agg_exprs(cols, ("max", "last"), fill_null_columns)

    @staticmethod
    def other_expr(df, fill_null_columns=None):
        """max / last for columns whose name ends in ``T`` or ``L``."""
        cols = [col for col in df.columns if col.endswith(("T", "L"))]
        return Aggregator._agg_exprs(cols, ("max", "last"), fill_null_columns)

    @staticmethod
    def count_expr(df, fill_null_columns=None):
        """max / last for columns whose name contains ``num_group``.

        Not included by ``get_exprs`` (the call there is commented out).
        """
        cols = [col for col in df.columns if "num_group" in col]
        return Aggregator._agg_exprs(cols, ("max", "last"), fill_null_columns)

    @staticmethod
    def subtract_expr(df, col1, col2, fill_null_columns=None):
        """Single-element list with ``col1 - col2`` named ``{col1}_minus_{col2}``.

        The parentheses matter: without them ``.alias`` binds to ``col2`` only
        and the difference is not given the intended name.
        """
        left = Aggregator._col(col1, fill_null_columns)
        right = Aggregator._col(col2, fill_null_columns)
        return [(left - right).alias(f"{col1}_minus_{col2}")]

    @staticmethod
    def divide_expr(df, col1, col2, fill_null_columns=None):
        """Single-element list with ``col1 / col2`` named ``{col1}_divided_by_{col2}``."""
        left = Aggregator._col(col1, fill_null_columns)
        right = Aggregator._col(col2, fill_null_columns)
        return [(left / right).alias(f"{col1}_divided_by_{col2}")]

    @staticmethod
    def fill_null_with_zero(df, columns):
        """Return ``df`` with nulls in ``columns`` replaced by 0.

        Args:
            df: polars DataFrame.
            columns: iterable of column names to fill.

        Returns:
            A new DataFrame; the input frame is not modified.
        """
        columns = list(columns)
        if not columns:
            return df
        return df.with_columns([pl.col(col).fill_null(0) for col in columns])

    @staticmethod
    def get_exprs(df, custom_ops=None, fill_null_columns=None):
        """Assemble every aggregation expression for ``df``.

        Args:
            df: polars DataFrame whose column names drive the selection.
            custom_ops: optional list of dicts with keys ``"type"``
                (``"subtract"`` or ``"divide"``), ``"col1"`` and ``"col2"``.
                Entries with any other ``"type"`` are ignored.
            fill_null_columns: optional collection of column names whose nulls
                are treated as 0 in every returned expression.

        Returns:
            A list of polars expressions suitable for ``group_by(...).agg(...)``.
        """
        exprs = (
            Aggregator.num_expr(df, fill_null_columns)
            + Aggregator.date_expr(df, fill_null_columns)
            + Aggregator.str_expr(df, fill_null_columns)
            + Aggregator.other_expr(df, fill_null_columns)
            # + Aggregator.count_expr(df, fill_null_columns)
        )

        # NOTE(review): the custom expressions are element-wise, not reductions;
        # inside group_by().agg() they presumably yield one list per group —
        # confirm that this is the intended shape.
        for op in custom_ops or ():
            if op["type"] == "subtract":
                exprs += Aggregator.subtract_expr(df, op["col1"], op["col2"], fill_null_columns)
            elif op["type"] == "divide":
                exprs += Aggregator.divide_expr(df, op["col1"], op["col2"], fill_null_columns)

        return exprs
    


In [6]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble wrapper that averages the outputs of the estimators it is given."""

    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the wrapped estimators are used as supplied."""
        return self

    def _average(self, method_name, X):
        """Call ``method_name(X)`` on every estimator and average the results along axis 0."""
        outputs = []
        for model in self.estimators:
            outputs.append(getattr(model, method_name)(X))
        return np.mean(outputs, axis=0)

    def predict(self, X):
        """Element-wise mean of each estimator's ``predict(X)``."""
        return self._average("predict", X)

    def predict_proba(self, X):
        """Element-wise mean of each estimator's ``predict_proba(X)``."""
        return self._average("predict_proba", X)

In [7]:
def read_file(path, depth=None, drop_columns=None, custom_ops=None, fill_null_columns=None):
    """Load one parquet table, normalise its dtypes and optionally aggregate it.

    Args:
        path: location of the parquet file.
        depth: when 1 or 2 the table is grouped by ``case_id`` and reduced
            with the expressions from ``Aggregator.get_exprs``; any other
            value returns the table unaggregated.
        drop_columns: optional columns removed right after the dtype cast.
        custom_ops: forwarded to ``Aggregator.get_exprs``.
        fill_null_columns: forwarded to ``Aggregator.get_exprs``.

    Returns:
        A polars DataFrame.
    """
    table = Pipeline.set_table_dtypes(pl.read_parquet(path))

    if drop_columns:
        table = table.drop(drop_columns)

    if depth not in (1, 2):
        return table

    agg_exprs = Aggregator.get_exprs(table, custom_ops, fill_null_columns)
    return table.group_by("case_id").agg(agg_exprs)

def read_files(regex_path, depth=None, custom_ops=None, fill_null_columns=None):
    """Load every parquet file matching a glob pattern and stack the results.

    Each file is processed independently through ``read_file`` (dtype cast
    and, for ``depth`` 1 or 2, aggregation per ``case_id``). The per-file
    frames are then concatenated and reduced to one row per ``case_id``.

    Args:
        regex_path: glob pattern (str or Path) selecting the parquet files.
        depth: forwarded to ``read_file``.
        custom_ops: forwarded to ``read_file``.
        fill_null_columns: forwarded to ``read_file``.

    Returns:
        A polars DataFrame with unique ``case_id`` values.

    Raises:
        FileNotFoundError: if the pattern matches no file.
    """
    # Sorted so the processing order does not depend on the filesystem listing.
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"No parquet files match pattern: {regex_path}")

    chunks = [
        read_file(path, depth, custom_ops=custom_ops, fill_null_columns=fill_null_columns)
        for path in paths
    ]

    df = pl.concat(chunks, how="vertical_relaxed")
    # NOTE(review): aggregation happens per file, so a case_id present in two
    # files would keep only one of its aggregated rows here — presumably the
    # files partition case_ids; confirm.
    df = df.unique(subset=["case_id"])
    return df

In [8]:
def feature_eng(df_base, depth_0, depth_1, depth_2):
    """Merge all feature tables onto the base table and convert date columns.

    Adds ``month_decision`` and ``weekday_decision`` derived from
    ``date_decision``, left-joins every frame of ``depth_0``, ``depth_1`` and
    ``depth_2`` (in that order) on ``case_id``, then applies
    ``Pipeline.handle_dates``.

    Args:
        df_base: base polars DataFrame containing ``case_id`` and ``date_decision``.
        depth_0: list of depth-0 frames.
        depth_1: list of depth-1 frames.
        depth_2: list of depth-2 frames.

    Returns:
        The merged polars DataFrame.
    """
    decision_date = pl.col("date_decision")
    merged = df_base.with_columns(
        decision_date.dt.month().alias("month_decision"),
        decision_date.dt.weekday().alias("weekday_decision"),
    )

    tables = depth_0 + depth_1 + depth_2
    for position, table in enumerate(tables):
        # Clashing column names from the joined table get the table's position as suffix.
        merged = merged.join(table, how="left", on="case_id", suffix=f"_{position}")

    return Pipeline.handle_dates(merged)

In [9]:
def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and cast its categorical columns.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars DataFrame in this notebook).
        cat_cols: column names to cast to ``category``. When ``None``, every
            column of dtype ``object`` in the converted frame is used.

    Returns:
        Tuple ``(pandas_frame, cat_cols)`` with the list of columns that were cast.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame, cat_cols

In [10]:
def reduce_mem_usage(df):
    """Downcast the numeric columns of a pandas DataFrame to smaller dtypes.

    For each numeric column the min and max are compared (strictly) against
    the limits of successively wider dtypes and the first one that fits is
    used:

    * dtype names starting with ``int``: int8, int16, int32, int64 (a column
      fitting none of them is left unchanged);
    * every other numeric dtype (floats, and also bool / unsigned columns):
      float16, float32, otherwise float64.

    Non-numeric columns (category, object, datetime, ...) are skipped.

    Note that float16 has very limited precision, so values may be rounded.

    Args:
        df: pandas DataFrame. It is modified in place.

    Returns:
        The same DataFrame object, after downcasting. Memory usage before and
        after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Skipping every non-numeric dtype also protects datetime / timedelta /
        # string columns, whose min/max cannot be compared with numeric limits.
        if not pd.api.types.is_numeric_dtype(col_type):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (this includes all-NaN and infinite values).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [11]:
# Root of the competition dataset as a pathlib.Path, so the `/` joins below work.
ROOT            = Path("/kaggle/input/home-credit-credit-risk-model-stability")

# Directories holding the train and test parquet tables.
TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

# NOTE(review): not referenced by any other code in this notebook — confirm whether it is still needed.
is_test = True



In [12]:
# Example `custom_ops` list for read_file / read_files (currently unused).
# Aggregator.get_exprs dispatches on the "type" key of each entry.
# custom_operations = [
#     {'type': 'divide', 'col1': 'numinstlallpaidearly3d_817L', 'col2': 'numinstls_657L'},
#     {'type': 'divide', 'col1': 'numinstlsallpaid_934L', 'col2': 'numinstls_657L'},
#     {'type': 'divide', 'col1': 'numinstlswithdpd10_728L', 'col2': 'numinstls_657L'},
#     {'type': 'divide', 'col1': 'numinstlswithdpd5_4187116L', 'col2': 'numinstls_657L'},
#     {'type': 'divide', 'col1': 'numinstlswithoutdpd_562L', 'col2': 'numinstls_657L'},
    
# ]

In [13]:
from hc_fev1_depth_0 import FEv1

In [14]:
import pickle
# NOTE: pickle.load can execute arbitrary code; only load artefacts from a trusted source.

# Column-name list stored next to the FEv1 utility script; used further down
# to select and order the test columns.
with open('/kaggle/usr/lib/hc_fev1_depth_0/feature_list.pkl', 'rb') as file:
    df_train_columns = pickle.load(file)

# Names of the columns that to_pandas casts to `category`.
with open('/kaggle/usr/lib/hc_fev1_depth_0/cat_cols.pkl', 'rb') as file:
    cat_cols = pickle.load(file)

In [15]:
# All test tables, keyed by the parameter names of feature_eng (the dict is
# unpacked into that call). Passing depth 1 or 2 makes read_file / read_files
# aggregate the table per case_id; tables read without a depth are kept as is.
# The order of the entries matters: feature_eng joins them in sequence and uses
# each table's position as the suffix for clashing column names.
data_store = {
    "df_base": read_file(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        read_file(TEST_DIR / "test_static_cb_0.parquet"),
        read_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        read_file(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        read_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        read_file(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        read_file(TEST_DIR / "test_other_1.parquet", 1),
        read_file(TEST_DIR / "test_person_1.parquet", 1),
        read_file(TEST_DIR / "test_deposit_1.parquet", 1),
        read_file(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
        read_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        read_file(TEST_DIR / "test_applprev_2.parquet", 2),
        read_file(TEST_DIR / "test_person_2.parquet", 2)
    ]
}

In [16]:
# Join every table onto the base frame and convert the date columns (see feature_eng).
df_test = feature_eng(**data_store)
# The individual tables are no longer needed once merged; free them.
del data_store
gc.collect()


0

In [17]:
# Read train_person_1.parquet and count rows per case_id
df_person_1 = pl.read_parquet(TEST_DIR / "test_person_1.parquet")
df_count_person = df_person_1.group_by("case_id").count().select(pl.col("case_id"), pl.col("count").alias("count_person"))
df_person_1_client = df_person_1.filter(pl.col('num_group1') == 0).select(['case_id', 'birth_259D', 'mainoccupationinc_384A'])

# Merge the count_person with df_train
df_test = df_test.join(df_count_person, how="left", on="case_id").join(df_person_1_client, how='left', on='case_id')
del df_count_person, df_person_1_client
gc.collect()


df_test = FEv1.transform(df_test)

In [18]:
# Sanity check: (rows, columns) of the engineered test frame before column selection.
df_test.shape

(10, 810)

In [19]:
# Keep only the columns named in df_train_columns, in that order, leaving out "target".
# select() fails if any of those columns is missing from df_test.
df_test = df_test.select([col for col in df_train_columns if col != "target"])

# print("train data shape:\t", df_train.shape)
print("test data shape:\t", df_test.shape)

# Convert to pandas (casting the stored categorical columns), then downcast numeric columns.
df_test, cat_cols = to_pandas(df_test, cat_cols)
df_test = reduce_mem_usage(df_test)

gc.collect()

test data shape:	 (10, 437)
Memory usage of dataframe is 0.04 MB
Memory usage after optimization is: 0.02 MB
Decreased by 39.4%


0

In [20]:
# df_train = pd.read_csv('/kaggle/usr/lib/hc_fev1_depth_0/train.csv')

In [21]:
# Show the stored column list (it still contains "target").
list(df_train_columns)

['case_id',
 'WEEK_NUM',
 'target',
 'month_decision',
 'weekday_decision',
 'annuity_780A',
 'applications30d_658L',
 'applicationscnt_1086L',
 'applicationscnt_464L',
 'applicationscnt_867L',
 'clientscnt_533_1022plus107_L',
 'clientscnt_257L',
 'disbursedcredamount_1113A',
 'deferredmnthsnum_166L',
 'downpmt_116A',
 'equalitydataagreement_891L',
 'equalityempfrom_62L',
 'homephncnt_628L',
 'isbidproduct_1095L',
 'mobilephncnt_593L',
 'numactiverelcontr_750L',
 'numcontrs3months_479L',
 'numinstpaidlastcontr_4325080L',
 'numnotactivated_1143L',
 'numpmtchanneldd_318L',
 'numrejects9m_859L',
 'mean_mainoccupationinc_384A',
 'count_person',
 'numinstregularpaid_L',
 'clientscnt_100_493_L',
 'clientscnt_157_887_L',
 'clientscnt_360_946_L',
 'clientscnt_1022_304_L',
 'clientscnt_1071_1130_L',
 'applicationcnt_L',
 'annuity_maininc_ratio_A',
 'numactivecreds_L',
 'sellerplacescnt_L',
 'downpmt_credamount_ratio_A',
 'assignmentdate_238D',
 'assignmentdate_4527235D',
 'pmtaverage_4527227A',

In [22]:
# set(df_train.columns) - set(df_test.columns)

In [23]:
# Display the prepared pandas frame that is fed to the models.
df_test

Unnamed: 0,case_id,WEEK_NUM,month_decision,weekday_decision,annuity_780A,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_867L,clientscnt_533_1022plus107_L,...,max_conts_type_509L,max_credacc_cards_status_52L,last_conts_type_509L,max_conts_role_79M,max_empls_economicalst_849M,max_empls_employer_name_740M,last_conts_role_79M,last_empls_economicalst_849M,last_empls_employer_name_740M,paytype_L
0,57543,100,5,5,3674.0,0.0,0.0,0.0,9.0,0.0,...,PRIMARY_MOBILE,,,,,,,,,
1,57549,100,1,1,5744.0,2.0,0.0,0.0,10.0,0.0,...,,,,,,,,,,
2,57551,100,11,5,2844.0,1.0,0.0,0.0,2.0,0.0,...,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,OTHER
3,57552,100,11,5,6300.0,0.0,0.0,0.0,9.0,0.0,...,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,OTHER
4,57569,100,12,1,4684.0,1.0,0.0,0.0,6.0,0.0,...,,,,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,a55475b1,
5,57630,100,3,2,8904.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
6,57631,100,6,6,2540.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
7,57632,100,2,6,4732.0,0.0,0.0,0.0,1.0,0.5,...,,,,,,,,,,
8,57633,100,1,2,8272.0,0.0,0.0,0.0,3.0,0.0,...,,,,,,,,,,
9,57634,100,1,3,1166.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [24]:
# y = df_train["target"]
# weeks = df_train["WEEK_NUM"]
# df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
# cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

In [25]:
# Hyper-parameter sets for the three model families. In this notebook the
# fitted models are loaded from disk, so these dicts are kept for reference.
DEVICE = 'cpu' #cpu gpu

# LightGBM parameters.
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,  
    "learning_rate": 0.03,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    'max_bin': 250,
    "verbose": -1,  # listed once; the literal previously repeated this key
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
#     "device": 'gpu', 
}


# CatBoost parameters.
cb_params = {
        'eval_metric': 'AUC',
#         'task_type': 'GPU',
        'learning_rate': 0.03,
        'random_seed': 42,
        'iterations': 6000, #6000
}

# XGBoost parameters.
# NOTE(review): "device" is hard-coded to "cuda" while DEVICE above is 'cpu' — confirm which is intended.
xgb_params = {
    "device":"cuda",
    "objective":'binary:logistic',
    "tree_method":"hist",
    "enable_categorical":True,
    "eval_metric":'auc',
    "subsample":1,
    "colsample_bytree":1,
    "min_child_weight":1,
    "max_depth":20,
    #gamma=0.7,
    #reg_alpha=0.7,
    "n_estimators":1200,
    "random_state":42,
}



In [26]:
%%time
from catboost import CatBoostClassifier, Pool

import pickle

# NOTE: pickle.load can execute arbitrary code; only load model files from a trusted source.
# Fitted LightGBM models.
with open('/kaggle/input/hc-model/lgb_fitted_models.pkl', 'rb') as file:
    lgb_fitted_models = pickle.load(file)

# with open('/kaggle/input/hc-model/lgb_model_fitted_all.pkl', 'rb') as file:
#     lgb_model_fitted_all = pickle.load(file)

# Fitted CatBoost models.
with open('/kaggle/input/hc-model/cb_fitted_models.pkl', 'rb') as file:
    cb_fitted_models = pickle.load(file)


# Each VotingModel averages the predictions of the models in its list.
lgb_model = VotingModel(lgb_fitted_models)
cb_model = VotingModel(cb_fitted_models)
print('predict test set')
cpu_stats()
# NOTE: df_test is rebound here, so re-running this cell without re-running the
# earlier ones fails because "WEEK_NUM" and "case_id" are no longer columns.
df_test = df_test.drop(columns=["WEEK_NUM"])
df_test = df_test.set_index("case_id")

#     cb_df_test = df_test.copy()

# Column 1 of predict_proba is taken as the score; the Series is indexed by case_id.
lgb_y_pred = pd.Series(lgb_model.predict_proba(df_test)[:, 1], index=df_test.index)
# lgb_all_y_pred = pd.Series(lgb_model_fitted_all.predict_proba(df_test)[:, 1], index=df_test.index)

# The categorical columns are converted to strings only after the LightGBM
# prediction above, so this statement must stay between the two predictions.
df_test[cat_cols] = df_test[cat_cols].astype(str)
cb_y_pred = pd.Series(cb_model.predict_proba(df_test)[:, 1], index=df_test.index)
# Final score: unweighted mean of the two ensembles.
y_pred = (lgb_y_pred + cb_y_pred)/2

df_subm = pd.read_csv(ROOT / "sample_submission.csv")
df_subm = df_subm.set_index("case_id")

# Assignment aligns on the case_id index; a case_id missing from y_pred would get NaN.
df_subm["score"] = y_pred
df_subm.to_csv("submission.csv")
df_subm

predict test set
memory GB:0.69
CPU times: user 1.91 s, sys: 499 ms, total: 2.41 s
Wall time: 3.59 s


Unnamed: 0_level_0,score
case_id,Unnamed: 1_level_1
57543,0.007745
57549,0.047251
57551,0.002198
57552,0.025276
57569,0.122002
57630,0.012751
57631,0.017138
57632,0.004196
57633,0.020431
57634,0.014309


In [27]:
# features = X_train.columns
# importances = fitted_models[2].feature_importances_
# feature_importance = pd.DataFrame({'importance':importances,'features':features}).sort_values('importance', ascending=False).reset_index(drop=True)
# feature_importance

# drop_list = []
# for i, f in feature_importance.iterrows():
#     if f['importance']<80:
#         drop_list.append(f['features'])
# print(f"Number of features which are not important: {len(drop_list)} ")

# print(drop_list)