In [1]:
%%writefile data_file.py
import os,sys,warnings,time,re,math,gc
warnings.filterwarnings("ignore")
from glob import glob
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from sklearn.metrics import roc_auc_score,auc
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cgb

print("scrpit start")

path_to_train = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/train"
path_to_test = "/kaggle/input/home-credit-credit-risk-model-stability/parquet_files/test"


def _list_parquet_files(paths):
    """Tabulate parquet paths for later lookup by file name.

    The returned frame has three columns: "index" (position of the path in
    ``paths``), "path" (the path itself) and "filename" (the last "/"-separated
    component of the path). Rows are sorted by "filename".
    """
    listing = pl.DataFrame({"index":range(len(paths)),"path":paths})
    basename = pl.col("path").str.split("/").list.get(-1)
    return listing.with_columns(basename.alias("filename")).sort(by="filename")


# Every parquet file sitting directly inside each split directory.
all_train_files = glob(f"{path_to_train}/*.parquet")
all_test_files = glob(f"{path_to_test}/*.parquet")

train_files_df = _list_parquet_files(all_train_files)
test_files_df = _list_parquet_files(all_test_files)



# Columns read from both base tables: the case key, the single column whose
# name contains "date" (parsed to a Date and renamed "Date"), and WEEK_NUM.
_base_exprs = [
    pl.col("case_id").cast(pl.UInt64),
    cs.contains("date").str.to_date().alias("Date"),
    pl.col("WEEK_NUM").cast(pl.UInt8).alias("week_num"),
]

# Calendar parts of "Date"; "year" is stored as an offset from 2018.
_calendar_exprs = [
    pl.col("Date").dt.month().alias("month"),
    pl.col("Date").dt.weekday().alias("weekday"),
    pl.col("Date").dt.week().alias("week"),
    (pl.col("Date").dt.year() - 2018).cast(pl.UInt8).alias("year"),
]

# Column order that pushes every "target"-named column to the end.
_target_last = [~cs.contains("target"), cs.contains("target")]


# Training base table: key, date features and the real target.
train_base = (
    pl.read_parquet(f"{path_to_train}/train_base.parquet")
    .select(*_base_exprs, pl.col("target").cast(pl.UInt8))
    .with_columns(*_calendar_exprs)
    .select(*_target_last)
)


# Test base table: same layout, with a constant 0 placeholder as "target".
test_base = (
    pl.read_parquet(f"{path_to_test}/test_base.parquet")
    .select(*_base_exprs)
    .with_columns(
        *_calendar_exprs,
        pl.lit(0).cast(pl.UInt8).alias("target"),
    )
    .select(*_target_last)
)


# Per-table column renames: each listed column has its trailing "T" replaced
# by "D". Keys of the outer dict are the table prefixes passed as ``name``.
_SUFFIX_RENAMES = {
    "intshallow": {
        "dpdmaxdatemonth_442T":"dpdmaxdatemonth_442D",
        "dpdmaxdatemonth_89T":"dpdmaxdatemonth_89D",
        "dpdmaxdateyear_596T":"dpdmaxdateyear_596D",
        "dpdmaxdateyear_896T":"dpdmaxdateyear_896D",
        "overdueamountmaxdatemonth_284T":"overdueamountmaxdatemonth_284D",
        "overdueamountmaxdatemonth_365T":"overdueamountmaxdatemonth_365D",
        "overdueamountmaxdateyear_2T":"overdueamountmaxdateyear_2D",
        "overdueamountmaxdateyear_994T":"overdueamountmaxdateyear_994D",
    },
    "intdepth": {
        "pmts_month_158T":"pmts_month_158D",
        "pmts_month_706T":"pmts_month_706D",
        "pmts_year_1139T":"pmts_year_1139D",
        "pmts_year_507T":"pmts_year_507D",
    },
    "extshallow": {
        "dpdmaxdatemonth_804T":"dpdmaxdatemonth_804D",
        "dpdmaxdateyear_742T":"dpdmaxdateyear_742D",
        "overdueamountmaxdatemonth_494T":"overdueamountmaxdatemonth_494D",
        "overdueamountmaxdateyear_432T":"overdueamountmaxdateyear_432D",
    },
}


def rename_cols(df:pl.DataFrame,name:str):
    """Rename the "T"-suffixed month/year columns of selected tables to "D".

    Args:
        df: frame read from one source parquet file.
        name: table prefix; only "intshallow", "intdepth" and "extshallow"
            trigger a rename.

    Returns:
        The renamed frame, or ``df`` itself for any other ``name``.
    """
    # NOTE(review): a "D" suffix versus a "T" suffix decides whether a later
    # suffix-based cast treats the column as a date or as a string.
    mapping = _SUFFIX_RENAMES.get(name)
    if mapping is None:
        return df
    return df.rename(mapping)
    
    
    

def convert_dtype(df:pl.DataFrame):
    """Keep and cast columns according to their name suffix and current dtype.

    Only columns matched by one of the selectors below are kept; they appear
    in the output in the order the selectors are listed.

    Args:
        df: frame whose column names follow the suffix convention used here.

    Returns:
        A frame holding the selected columns with their new dtypes.
    """
    ends_l = cs.ends_with("L")
    ends_pa = cs.ends_with("P","A")
    text_like = cs.ends_with("T","M") | (ends_l & cs.string())

    casts = [
        # Keys and group counters.
        cs.by_name("case_id").cast(pl.UInt64),
        cs.contains("num_group").cast(pl.UInt16),
        # "D" columns become dates.
        cs.ends_with("D").cast(pl.Date),
        # "T"/"M" columns and string-typed "L" columns become strings.
        text_like.cast(pl.String),
        # Remaining "L" columns are narrowed by numeric kind.
        (ends_l & cs.integer()).cast(pl.Int32),
        (ends_l & cs.float()).cast(pl.Float32),
        # "P"/"A" columns are narrowed by numeric kind as well.
        (ends_pa & cs.unsigned_integer()).cast(pl.UInt32),
        (ends_pa & cs.signed_integer()).cast(pl.Int32),
        (ends_pa & cs.float()).cast(pl.Float32),
        # Boolean columns pass through unchanged.
        pl.col(pl.Boolean),
    ]
    return df.select(casts)


def grouping(df:pl.DataFrame):
    """Collapse a table to one row per ``case_id``.

    Aggregations, each written to a column named ``<agg>_<original name>``:
      * numeric columns: max, mean, first, last
      * date columns: first, last
      * string and boolean columns: mode (nulls dropped, first mode kept),
        last, first

    The order of the aggregations is significant for callers that strip the
    ``<agg>_`` prefix to recover the source column names and dtypes, so it is
    kept exactly as listed.

    Args:
        df: frame containing a "case_id" column.

    Returns:
        The aggregated frame, one row per distinct "case_id".
    """
    numeric = cs.numeric()
    dates = cs.date()
    labels = cs.string() | cs.boolean()
    # ``Expr.prefix`` is deprecated in favour of ``Expr.name.prefix`` and is
    # gone in polars 1.x, so the ``name`` namespace is used for output names.
    return (
        df
        .group_by("case_id")
        .agg(
            numeric.max().name.prefix("max_"),
            numeric.mean().name.prefix("mean_"),
            numeric.first().name.prefix("first_"),
            numeric.last().name.prefix("last_"),
            dates.first().name.prefix("first_"),
            dates.last().name.prefix("last_"),
            labels.drop_nulls().mode().first().name.prefix("mode_"),
            labels.last().name.prefix("last_"),
            labels.first().name.prefix("first_")
        )
    )


def _percent_range_to_midpoint(df:pl.DataFrame):
    """Convert every column whose name contains "302T" to a Float32 number.

    Each string is split on "%"; the integer before the first "%" and the
    second space-separated token of the following piece are added and divided
    by 200. For a value such as "2% - 4%" this yields 0.03.
    NOTE(review): assumes every non-null value has that "<a>% - <b>%" shape.
    """
    return df.with_columns(
        cs.contains("302T").str.split("%").list.gather([0,1])
        .map_elements(
            lambda x: (int(x[0]) + int(x[1].strip().split(" ")[1]))/200,return_dtype=pl.Float32
        )
    )


def _encode_and_prefix(df:pl.DataFrame,prefix_string:str):
    """Integer-encode string columns and prefix every non-key column name.

    String columns are cast to Categorical and replaced by their dense rank;
    all columns except "case_id" are then dtype-shrunk and renamed to
    ``<prefix_string>_<name>``.
    """
    # NOTE(review): the dense rank is computed on whichever frame is passed
    # in, so the same string can receive different codes in the train and
    # test frames when their sets of observed values differ.
    return (
        df
        .with_columns(pl.col(pl.String).cast(pl.Categorical).rank("dense"))
        .select(
            pl.col("case_id"),
            pl.all().exclude("case_id").shrink_dtype().name.prefix(f"{prefix_string}_")
        )
    )


def preprocess(
        string_name:str,
        prefix_string:str,
        train_base_df:pl.DataFrame,
        test_base_df:pl.DataFrame,
        cat_cols:list,
        num_cols:list
        ):
    """Aggregate one source table and left-join it onto both base frames.

    Files are taken from the module-level ``train_files_df`` /
    ``test_files_df`` listings, keeping those whose file name contains
    ``string_name``. Each file is renamed, cast and grouped per ``case_id``;
    the per-file results are concatenated.

    Args:
        string_name: substring identifying the table's parquet files.
        prefix_string: prefix put in front of every joined column; also
            selects table-specific handling ("staticexternal" gets its
            "302T" columns converted to numbers).
        train_base_df: training frame to extend (joined on "case_id").
        test_base_df: test frame to extend (joined on "case_id").
        cat_cols: names of categorical feature columns collected so far.
        num_cols: names of numeric/date feature columns collected so far.

    Returns:
        ``(train_base_df, test_base_df, cat_cols, num_cols)`` where the frames
        carry the new prefixed columns and the lists are new list objects
        extended with the new column names (the input lists are not mutated).
    """
    name_matches = pl.col("filename").str.contains(string_name)
    train_files_list = train_files_df.filter(name_matches)["path"].to_list()
    test_files_list = test_files_df.filter(name_matches)["path"].to_list()

    # One string cache for both splits so Categorical casts share a cache.
    with pl.StringCache():
        train_df = pl.concat(
            [
                pl.read_parquet(train_file)
                .pipe(rename_cols,prefix_string)
                .pipe(convert_dtype)
                .pipe(grouping)
                for train_file in train_files_list
            ],
            parallel=False,
            rechunk=False
        )

        # Recover the source column names by dropping the leading "<agg>_"
        # token. Several aggregated columns map to the same source name; the
        # dict keeps the dtype of the last one seen. Captured before the
        # 302T conversion so the test files are cast to the raw dtypes.
        test_schema = {
            (col_name if col_name == "case_id" else col_name.partition("_")[2]):dtype
            for col_name,dtype in train_df.schema.items()
        }
        # A list (not a set) keeps the selected column order deterministic.
        test_columns = list(test_schema)

        if prefix_string == "staticexternal":
            train_df = _percent_range_to_midpoint(train_df)

        # Anything that is neither numeric nor a date counts as categorical.
        numeric_or_date = cs.numeric() | cs.date()
        new_cat = train_df.select(~numeric_or_date).columns
        new_num = train_df.select(numeric_or_date.exclude("case_id")).columns
        cat_cols = cat_cols + [f"{prefix_string}_{name}" for name in new_cat]
        num_cols = num_cols + [f"{prefix_string}_{name}" for name in new_num]

        train_base_df = train_base_df.join(
            _encode_and_prefix(train_df,prefix_string),
            on="case_id",
            how="left"
        )

        test_df = pl.concat(
            [
                pl.read_parquet(test_file)
                .pipe(rename_cols,prefix_string)
                .select(test_columns)
                .cast(test_schema)
                .pipe(grouping)
                for test_file in test_files_list
            ],
            parallel=False,
            rechunk=False
        )
        if prefix_string == "staticexternal":
            test_df = _percent_range_to_midpoint(test_df)

        test_base_df = test_base_df.join(
            _encode_and_prefix(test_df,prefix_string),
            on="case_id",
            how="left"
        )
    return train_base_df,test_base_df,cat_cols,num_cols


# File-name substrings and the column prefix used for each source table; the
# two lists are paired positionally. The last substring must be "registry_c":
# repeating "registry_b" would load the registry_b files a second time under
# the "regc" prefix.
string_list = ["applprev_1","applprev_2","static_0","static_cb","person_1","person_2","other_1","deposit_1","debitcard","bureau_a_1","bureau_a_2","bureau_b_1","bureau_b_2","registry_a","registry_b","registry_c"]
prefix_string_list = ["pastshallow","pastdepth","staticbase","staticexternal","personshallow","persondepth","othershallow","depositshallow","cardshallow","intshallow","intdepth","extshallow","extdepth","rega","regb","regc"]


cat_cols = []
num_cols = []
for count,(string_name,prefix_name) in enumerate(zip(string_list,prefix_string_list),start=1):
    train_base,test_base,cat_cols,num_cols = preprocess(string_name,prefix_name,train_base,test_base,cat_cols,num_cols)
    print(f"done: {count} for {prefix_name}")
    gc.collect()

# Date handling shared by both splits: every Date column becomes the number of
# days between it and "Date", and each Date column other than "Date" also
# yields weekday / month / week-of-year columns under a descriptive prefix.
date_feature_exprs = [
    (pl.col(pl.Date) - pl.col("Date")).dt.total_days(),
    pl.col(pl.Date).exclude("Date").dt.weekday().name.prefix("weekdaydate_"),
    pl.col(pl.Date).exclude("Date").dt.month().name.prefix("monthdate_"),
    pl.col(pl.Date).exclude("Date").dt.week().name.prefix("weekdate_"),
]

(
    train_base
    .with_columns(*date_feature_exprs)
    .drop("Date","case_id")
    .with_columns(pl.col(pl.Boolean).cast(pl.UInt8))
    .write_parquet("/kaggle/working/train_df.parquet")
)

# NOTE(review): train_base itself is not reassigned above, so it holds no
# weekdaydate_/monthdate_/weekdate_ columns and this selection is empty. The
# derived columns are instead picked up from the written parquet by
# preprocess.py, which adds them to num_cols; filling this list here as well
# would register them twice.
all_date_cols = train_base.select(cs.contains(["weekdaydate","monthdate","weekdate"])).columns
base_cols = ["week","year","month","weekday"]
cat_cols += base_cols
cat_cols += all_date_cols

(
    test_base
    .with_columns(*date_feature_exprs)
    .drop("Date")
    .with_columns(pl.col(pl.Boolean).cast(pl.UInt8))
    .write_parquet("/kaggle/working/test_df.parquet")
)

joblib.dump((cat_cols,num_cols),"cols.pkl")
print("first scrip done")

Writing data_file.py


In [None]:
!python data_file.py

scrpit start
done: 1 for pastshallow
done: 2 for pastdepth
done: 3 for staticbase
done: 4 for staticexternal
done: 5 for personshallow
done: 6 for persondepth
done: 7 for othershallow
done: 8 for depositshallow
done: 9 for cardshallow
done: 10 for intshallow


In [None]:
%%writefile preprocess.py

import gc
import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
import joblib

print("scrpit start")

cat_cols,num_cols = joblib.load("/kaggle/working/cols.pkl")

print("Pre-processed num cols: ",len(num_cols))
print("Total cat cols: ",len(cat_cols))

# Lazy scan used only to look up column names.
train_df = (
    pl.scan_parquet("/kaggle/working/train_df.parquet")
    .select(pl.all().shrink_dtype())
)

# The date-part columns written by data_file.py are treated as numeric here.
num_cols +=  train_df.select(cs.contains(["weekdaydate","weekdate","monthdate"])).columns

# Materialise only the numeric columns as a pandas frame.
train_df = (
    pl.read_parquet("/kaggle/working/train_df.parquet")
    .select(pl.all().shrink_dtype())
    .select(num_cols)
    .to_pandas()
)
gc.collect()

print("Total columns in train:",len(train_df.columns))

print(f"Total categorical columns: {len(cat_cols)} and Total numerical columns {len(num_cols)}")

# Bucket the numeric columns by how many missing values they contain:
# {nan_count: [column, ...]}.
nan_counts = train_df.isna().sum()
nans_groups = {}
for col in num_cols:
    # setdefault replaces the former bare ``except:``, which would also have
    # hidden unrelated errors (including KeyboardInterrupt).
    nans_groups.setdefault(nan_counts[col],[]).append(col)
del nan_counts

gc.collect()

def reduce_grps(grps):
    """Pick one representative column from each group.

    For every group the column with the most distinct values in the
    module-level ``train_df`` is chosen; on ties the earliest column in the
    group wins. Groups are expected to be non-empty.

    Args:
        grps: iterable of lists of column names.

    Returns:
        List with one column name per group, in group order.
    """
    def distinct_values(name):
        return train_df[name].nunique()

    # max() returns the first maximal element, matching a strict ">" scan.
    return [max(members, key=distinct_values) for members in grps]

def group_col_by_corr(matrix,thresh=0.8):
    """Greedily partition columns into groups of highly correlated columns.

    The first unassigned column becomes the anchor of a new group; every other
    unassigned column whose correlation with the anchor is at least ``thresh``
    joins it. The process repeats on the columns left over.

    Args:
        matrix: pandas DataFrame whose columns are to be grouped.
        thresh: minimum (signed) correlation with the anchor to join a group.

    Returns:
        List of groups, each a list of column names starting with its anchor.
    """
    corr = matrix.corr()
    pending = list(matrix.columns)
    groups = []
    while pending:
        anchor, *others = pending
        # Comparison is signed; NaN correlations compare False and stay out.
        joined = [name for name in others if corr.loc[anchor, name] >= thresh]
        groups.append([anchor, *joined])
        pending = [name for name in others if name not in joined]

    return groups

# Within each bucket of columns sharing the same NaN count, keep one column
# per correlation group; buckets holding a single column are kept as they are.
uses = []
for bucket in nans_groups.values():
    if len(bucket) > 1:
        corr_groups = group_col_by_corr(train_df[bucket], thresh=0.8)
        uses.extend(reduce_grps(corr_groups))
    else:
        uses.extend(bucket)

print("Post-processed num cols: ",len(uses))
print("Total cols",len(cat_cols+uses))

# Persist the categorical names together with the reduced numeric names.
joblib.dump((cat_cols,uses),"all_cols.pkl")
print("second script done")

In [None]:
!python preprocess.py

In [None]:
%%writefile training.py


import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import joblib,gc
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

# Keyword arguments passed unchanged to lgb.LGBMClassifier.
class_params = {
    "objective":"binary",
    "boosting_type": "gbdt",
    "metric":"auc",
    "max_depth": 128,
    "learning_rate": 0.01,
    "n_estimators": 5000,
    "colsample_bynode": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 420,
    "reg_alpha": 0.15,
    "reg_lambda": 15,
    "extra_trees": True,
    "num_leaves": 256,
    "device": "gpu",
    "importance_type": "gain",
    "verbose": -1
}

# Must start at column 0: a stray leading space on this module-level statement
# is an IndentationError and stops the script before anything runs.
cat_cols, num_cols = joblib.load("/kaggle/working/all_cols.pkl")

# Read only the selected features plus the label and the grouping column.
total_df = (
    pl.read_parquet(
        "/kaggle/working/train_df.parquet",
        low_memory=True,
        columns=cat_cols + num_cols +["target","week_num"]
    )
    .select(pl.all().shrink_dtype())
)

# week_num is kept aside as the CV group key and removed from the features.
week_num = total_df["week_num"]

total_df = total_df.drop("week_num")

gc.collect()

def filter_ind(df:pl.DataFrame,indexes:np.array):
    """Return the rows of ``df`` whose positional index is in ``indexes``.

    A temporary "index" row-number column is added, used for the membership
    filter, and dropped again, so the result has the same columns as ``df``
    with rows in their original order.

    Args:
        df: frame to subset.
        indexes: row positions to keep.
    """
    numbered = df.with_row_index()
    wanted = pl.col("index").is_in(indexes)
    return numbered.filter(wanted).drop("index")


# Stratified on the target, grouped by week_num so a week never straddles
# train and validation. The shuffle is seeded with the model seed so that the
# fold membership (and therefore the saved models) is reproducible.
cv = StratifiedGroupKFold(n_splits=5,shuffle=True,random_state=class_params["random_state"])

# Loop-invariant views of the data, built once instead of once per fold.
features = total_df.drop("target")
labels = total_df.select("target")


def _fold_xy(indexes):
    """Return (feature frame, 1-D target array) for the given row positions."""
    fold_x = features.pipe(filter_ind,indexes)
    fold_y = (
        labels
        .pipe(filter_ind,indexes)
        .to_series()
        .to_numpy()
        .ravel()
    )
    return fold_x,fold_y


for i,(train_ind,valid_ind) in enumerate(
    cv.split(total_df,labels,groups=week_num),
    start=1):
    print(f"Training start for LGBClassifier: {i}")
    train_x,train_y = _fold_xy(train_ind)
    valid_x,valid_y = _fold_xy(valid_ind)
    lgb_model = lgb.LGBMClassifier(**class_params)
    lgb_model.fit(
        train_x,
        train_y,
        eval_set=[(valid_x,valid_y)],
        # Log every 500 rounds; stop after 800 rounds without improvement.
        callbacks=[lgb.log_evaluation(500),lgb.early_stopping(800)]
    )
    joblib.dump(lgb_model,f"/kaggle/working/lgb_model_{i}.pkl")
    # Release this fold's frames before the next fold is materialised.
    del train_x,train_y,valid_x,valid_y,lgb_model
    gc.collect()

In [None]:
!python training.py