In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc, warnings
from glob import glob
from pathlib import Path
from typing import Any

warnings.filterwarnings("ignore")

ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# preprocessing

In [2]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("category")

        return df, cat_cols

In [3]:
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"{col}_MAX") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"{col}_MIN") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"{col}_MEAN") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"{col}_VAR") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"{col}_MODE") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

In [4]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in [1, 2]:
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()

        df = df.unique(subset=["case_id"]) 
        
        return df

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base


In [5]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if (col[-1] == 'D') or ('D_' in col):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

            
    df = df.drop("MONTH")
    df = df.drop("WEEK_NUM")

    df = df.with_columns(pl.col("date_decision").dt.year().alias("year").cast(pl.Int16))
    df = df.with_columns(pl.col("date_decision").dt.month().alias("month").cast(pl.Int16))
    df = df.with_columns(pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8))
    df = df.with_columns(pl.col("date_decision").dt.weekday().alias("weekday").cast(pl.UInt8))
    df = df.with_columns(((pl.col("date_decision") - pl.lit("2019-01-01").cast(pl.Date)).dt.total_days() / 7).floor().alias("week_num").cast(pl.Int32))

#     df = df.drop("date_decision")
    return df 

In [6]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

Memory usage of dataframe "df_train" is 4712.6755 MB.
Memory usage of dataframe "df_train" became 2668.5421 MB.
Train data shape: (1526659, 474)


case_id,date_decision,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,…,openingdate_313D_MAX,amount_416A_MEAN,openingdate_313D_MEAN,num_group1_MAX_11,openingdate_857D_MAX,openingdate_857D_MEAN,collater_typofvalofguarant_298M_MAX,collater_typofvalofguarant_407M_MAX,collater_valueofguarantee_1124L_MAX,collater_valueofguarantee_876L_MAX,collaterals_typeofguarante_359M_MAX,collaterals_typeofguarante_669M_MAX,num_group1_MAX_12,num_group2_MAX,pmts_dpd_1073P_MAX,pmts_dpd_303P_MAX,pmts_month_158T_MAX,pmts_month_706T_MAX,pmts_overdue_1140A_MAX,pmts_overdue_1152A_MAX,pmts_year_1139T_MAX,pmts_year_507T_MAX,subjectroles_name_541M_MAX,subjectroles_name_838M_MAX,pmts_dpd_1073P_MEAN,pmts_dpd_303P_MEAN,pmts_overdue_1140A_MEAN,pmts_overdue_1152A_MEAN,pmts_dpd_1073P_VAR,pmts_dpd_303P_VAR,pmts_overdue_1140A_VAR,pmts_overdue_1152A_VAR,year,month,day,weekday,week_num
u32,date,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,f32,…,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8,u8,u8,u8
1313115,2019-03-19,0,,,-11003.0,,-11003,0.0,1.0,0.0,3.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",5.0,3.0,"""3439d993""","""a55475b1""",3.0,,,,,6.0,21815.285156,,14.0,,,1.0,3.0,0.0,26914.201172,1738.200073,3456.400146,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2,35,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,3,19,2,11
1400948,2019-06-13,0,-8550.0,,-25184.0,,-25184,1.0,1.0,0.0,2.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",1.0,1.0,"""3439d993""","""a55475b1""",2.0,7553.0,,,6.0,,,,14.0,,,0.0,0.0,0.0,0.0,1798.400024,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",,0.0,"""c7a5ad39""","""a55475b1""",5,35,,20.0,,12.0,,2757.600098,,2020.0,"""ab3c25cf""","""a55475b1""",,0.179487,,44.408546,,3.424403,,114814.132812,2019,6,13,4,23
1664932,2019-12-02,0,,,,,-14367,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,0.0,0.0,12841.400391,740.200012,1167.400024,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,82980.0,"""c7a5ad39""","""c7a5ad39""",7,35,3.0,858.0,12.0,12.0,10930.053711,36943.277344,2020.0,2018.0,"""ab3c25cf""","""ab3c25cf""",0.05,14.536232,182.167572,15074.893555,0.15,10627.487305,1991100.0,212962608.0,2019,12,2,1,47
2594619,2019-07-10,0,-3978.0,,-25180.0,,-25180,1.0,2.0,1.0,4.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",4.0,17767.400391,,,6.0,,,,14.0,,,2.0,3.0,0.0,92579.984375,5704.600098,1101.0,…,-1225.0,224.184998,-1333.0,1.0,-1225.0,-1333.0,"""a55475b1""","""a55475b1""",0.0,2008657.0,"""c7a5ad39""","""c7a5ad39""",14,35,0.0,121.0,12.0,12.0,0.0,4229.184082,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,2.295858,0.0,25.032047,0.0,196.590515,0.0,105833.9375,2019,7,10,3,27
1607830,2019-11-02,0,,,,,-13788,0.0,1.0,0.0,6.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",4.0,3.0,"""38c061ee""","""a55475b1""",6.0,,,,,,,"""DEDUCTION_6""",,14.0,,4.0,5.0,0.0,18232.201172,2326.800049,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",9,35,1.0,11.0,12.0,12.0,99.828003,3492.005859,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.012658,0.8,1.263646,304.817169,0.012658,5.019048,126.147209,663927.5,2019,11,2,6,43
1737265,2020-01-06,0,,,,,-18967,0.0,2.0,0.0,2.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",2.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,2.0,0.0,5157.0,2353.0,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",4,23,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,1,6,1,52
221966,2020-05-05,0,,,,,-21796,1.0,1.0,0.0,2.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,1.0,"""3439d993""","""a55475b1""",2.0,,,,,,,,,,13.0,0.0,0.0,0.0,19823.201172,7713.399902,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",3,23,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2015.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,5,5,2,70
778841,2019-08-23,0,,,-17707.0,,-17707,0.0,2.0,0.0,10.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",4.0,5.0,"""3439d993""","""a55475b1""",10.0,,,,,7.0,8580.600586,,14.0,,,1.0,2.0,,,1647.800049,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",3,35,0.0,71.0,12.0,12.0,0.0,4361.700195,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,6.303571,0.0,420.233612,0.0,253.851624,0.0,1345287.5,2019,8,23,5,33
256267,2020-09-23,0,,,,1216700.0,-22303,4.0,4.0,2.0,7.0,2.0,"""2fc785b2""","""39a0853f""","""a55475b1""",10.0,4.0,"""a7fcb6e5""","""a55475b1""",7.0,,,,,,,,,,0.0,8.0,6.0,0.0,248451.953125,1560.599976,5479.200195,…,-2020.0,398.947998,-2020.0,0.0,-2020.0,-2020.0,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",23,35,0.0,21.0,12.0,12.0,0.0,696.0,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.185185,0.0,3.3,0.0,2.877175,0.0,2242.813477,2020,9,23,3,90
1767794,2020-01-31,0,,,,,-18233,2.0,3.0,1.0,10.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,5.0,"""3439d993""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,9.0,0.0,309026.6875,4223.800293,12073.600586,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",11,35,2.0,22.0,12.0,12.0,15680.629883,4057.600098,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.086957,0.515385,681.766541,31.255646,0.173913,8.143172,10690529.0,126644.382812,2020,1,31,5,56


In [7]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

Memory usage of dataframe "df_test" is 0.0298 MB.
Memory usage of dataframe "df_test" became 0.0173 MB.
Test data shape: (10, 473)


In [8]:
if 'target' not in df_test.columns:
    df_test = df_test.with_columns(pl.lit(0).alias('target').cast(pl.Int8))

In [9]:
df, cat_cols = Utility.to_pandas(
                        pl.concat([
                                 df_train.with_columns(pl.lit('train').alias('partition')),
                                 df_test.select(df_train.columns).with_columns(pl.lit('test').alias('partition'))
                                    ],how='vertical_relaxed')
                                )

In [10]:
df_train = df[df['partition']=='train'].reset_index(drop=True)
df_test  = df[df['partition']=='test'].reset_index(drop=True)
n_train = df_train.shape[0]

features = df_train.columns[5:-2]

In [11]:
del df
gc.collect()

0

In [12]:
df_train['score'] = 0.0
df_test['score'] = 0.0

# exploration

In [None]:
# placeholder for exploration code 

# ideas
    # feature score over time
    # rf feature importance
        # 

In [38]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from tqdm import tqdm

def fit_random_forest_and_get_feature_importances(X, y):
    # Identify categorical and numeric features
    categorical_features = X.select_dtypes(include=['category','string','object']).columns.tolist()
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()

    # Create transformers for both categorical and numeric data with imputation
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),  # Impute NaNs with -1 for categorical
        ('encoder', OrdinalEncoder())  # Then encode categorically
    ])
    numeric_transformer = SimpleImputer(strategy='constant', fill_value=-1)  # Impute NaNs with -1 for numeric

    # Apply ColumnTransformer to handle both categorical and numeric data
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features),
            ('num', numeric_transformer, numeric_features)
        ])

    # Create a pipeline with preprocessing and the classifier
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=100, random_state=42,max_features=100,class_weight='balanced',bootstrap=False))
    ])

    # Fit the classifier to the data
    clf.fit(X, y)

    # Retrieve the feature importances
    feature_importances = clf.named_steps['classifier'].feature_importances_

    # Get feature names from the preprocessor after fitting
    feature_names = preprocessor.get_feature_names_out()

    # Create a dictionary of feature names and their importances
    feature_importances_dict = dict(zip(feature_names, feature_importances))

    return feature_importances_dict

In [39]:
numeric = [x for x in features if x not in cat_cols]
df_train['ym'] = df_train['year']*100 + df_train['month']

In [None]:
outs = []
for ym in tqdm(df_train['ym'].unique()):
    df_train[cat_cols] = df_train[cat_cols].astype(str)
    out = fit_random_forest_and_get_feature_importances(df_train.loc[df_train['ym']==ym,features], df_train.loc[df_train['ym']==ym,'target'])
    outs.append(out) 

  9%|▉         | 2/22 [19:12<3:21:14, 603.74s/it]

In [None]:
fi = pd.DataFrame(outs)
fi.columns = [x.split('__')[-1] for x in fs.columns]

In [None]:
def rank_elements_descending(arr):
    # Get indices that would sort the array
    sorted_indices = np.argsort(arr)
    # Reverse the ranks (largest gets rank 1, etc.)
    ranks = np.argsort(sorted_indices) + 1  # Adding 1 to convert from 0-based to 1-based index
    # Reverse the rank values so highest has rank 1
    return len(arr) - ranks + 1

In [None]:
feature_ranks = fi.apply(lambda x: rank_elements(x),axis=1)

In [None]:
feature_ranks.mean(axis=0).sort_values()[:100]

In [None]:
feature_ranks.std(axis=0).sort_values()[:100]

In [None]:
(feature_ranks.std(axis=0) / feature_ranks.mean(axis=0)).sort_values()[:100] # stability coefficient of rank

In [None]:
feature_ranks.to_csv('feature_ranks.csv')
fi.to_csv('feature_importances.csv')

# modeling

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, StratifiedKFold
import lightgbm as lgb 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["week_num", "target", "score"]]\
        .sort_values("week_num")\
        .groupby("week_num")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

## tune - lgbm

In [None]:
def get_lgbm_base_params():
    base_params = {
        'boosting_type':'gbdt',
        'random_state': 117,
        'objective': 'binary',
        'metric': 'auc',
        'extra_trees':True,
        'verbose': -1,
        'max_bin': 128,
#         'device_type': 'gpu', 'gpu_use_dp':True,
        
    }
    return base_params

In [None]:
# set up search space - turn off for submission
lgbm_search_space_setup = {
    'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
    'max_depth': scope.int(hp.uniform('max_depth', 3, 25)),
    'l1_regularization': hp.loguniform('l1_regularization', np.log(.001), np.log(1000)),
    'l2_regularization':hp.loguniform('l2_regularization',np.log(.001), np.log(100)),
    'cat_l2': hp.loguniform('cat_l2', np.log(.001), np.log(100)),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
    'bagging_freq': scope.int(hp.uniform('bagging_freq', 2, 10)),
    'learning_rate' : hp.loguniform('learning_rate', np.log(0.001), np.log(.1)),
    'n_estimators':scope.int(hp.uniform('n_estimators', 500, 2000)),
    'num_leaves': scope.int(hp.uniform('num_leaves', 50, 5000)),
}
lgbm_search_space = get_lgbm_base_params()
for k,v in lgbm_search_space_setup.items():
    lgbm_search_space[k] = v

In [None]:
#### do splits ahead of time to improve trial speed - turn off for submission
k              = 5

# # split by week num
group_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
                      StratifiedGroupKFold(n_splits=k).split(np.arange(n_train),
                                                             df_train['target'],
                                                             groups = df_train['week_num'])]
# # split by target
# strat_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
#                       StratifiedKFold(n_splits=k).split(np.arange(n_train),
#                                                              df_train['target'])]


# # single split
# train_idx, test_idx, _, _ = train_test_split(np.arange(n_train),df_train['target'], test_size=0.1, random_state=117,stratify = df_train['target'])
# single_splits = [(train_idx, test_idx)]

In [None]:
def trial_fn_lgbm_gini_stability(params,splits = None):
    

    for train_idx, valid_idx in splits:
        model = lgb.LGBMClassifier(**params)  
        model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
                  eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
                  eval_metric='auc',
                  callbacks=[lgb.early_stopping(50)])
        df_train.loc[valid_idx,'score'] = model.predict_proba(df_train.loc[valid_idx,features])
    
    
    score = gini_stability(df_train)
        
    out = {"status": STATUS_OK, "loss": -score} # always minimizes
    return out

In [None]:
def trial_fn_lgbm_auc(params,splits = None):
    
    scores = []
    for train_idx, valid_idx in splits:
        model = lgb.LGBMClassifier(**params)  
        model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
                  eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
                  eval_metric='auc',
                  callbacks=[lgb.early_stopping(50)])

        score = roc_auc_score(df_train.loc[valid_idx,'target'],model.predict_proba(df_train.loc[valid_idx,features]))
        scores.append(score)
    
    score = np.mean(scores) - np.std(scores)
    out = {"status": STATUS_OK, "loss": -score} # always minimizes
    return out

In [None]:
# best_params = fmin(fn=partial(trial_fn_lgbm_auc, splits = group_splits),
#                     space=search_space,
#                     algo=tpe.suggest,
#                     max_evals=100,
#                     timeout=60*60*3 # seconds
#                   )
# int_params = ['max_depth','n_estimators','bagging_freq','num_leaves']
# bestp = get_base_params()
# for k,v in best_params.items():
#     if k in int_params:
#         bestp[k] = int(v)
#     else:
#         bestp[k] = v
# bestp

In [None]:
# best_params = fmin(fn=partial(trial_fn_lgbm_gini_stability, splits = group_splits),
#                     space=search_space,
#                     algo=tpe.suggest,
#                     max_evals=100,
#                     timeout=60*60*3 # seconds
#                   )
# int_params = ['max_depth','n_estimators','bagging_freq','num_leaves']
# bestp = get_base_params()
# for k,v in best_params.items():
#     if k in int_params:
#         bestp[k] = int(v)
#     else:
#         bestp[k] = v
# bestp

## train - lgbm

In [None]:
bestp = {
 'random_state': 117,
 'objective': 'binary',
 'metric': 'auc',
 'extra_trees': True,
 'verbose': -1,
 'max_bin': 128,
 'num_estimators': 5000,
 'bagging_fraction': 0.8785738092050028,
 'bagging_freq': 2,
 'cat_l2': 0.00894542670433897,
 'feature_fraction': 0.8461039894582675,
 'lambda_l1': 0.00016035395823821462,
 'lambda_l2': 0.00026472977300383716,
 'learning_rate': 0.07658316454314493,
 'min_data_in_leaf': 1336,
 'num_leaves': 3427
}

In [None]:
bestp = {
'random_state': 117,
'objective': 'binary',
'metric': 'auc',
'extra_trees': True,
'verbose': -1,
'max_bin': 128,
'num_estimators': 5000,
'bagging_fraction': 0.9954906383257235,
'bagging_freq': 4,
'cat_l2': 0.012031967105637895,
'feature_fraction': 0.7225434843853376,
'lambda_l1': 0.016218649958706272,
'lambda_l2': 6.861494353196064e-06,
'learning_rate': 0.09833094991052575,
'min_data_in_leaf': 3394,
'num_leaves': 2095
}

In [None]:
# group stratify ensemble
for train_idx, valid_idx in group_splits:
    model = lgb.LGBMClassifier(**bestp)  
    model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
              eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(10)])
    df_test['score'] += model.predict_proba(df_test[features])[:,1] / k

# submission

In [None]:
# why are rules not implemented to avoid people having to do this hack?
# hosts are ok with hacking... so I guess we hack. 
# https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/497337


# condition = df_test['week_num'] < (df_test['week_num'].max()-df_test['week_num'].min())/2 + df_test['week_num'].min() 

# offset = 0.1 * df_test.loc[condition, 'score'].mean()

# df_test.loc[condition, 'score'] = (df_test.loc[condition, 'score'] - offset).clip(0)

In [None]:
df_test[['case_id','score']].to_csv('submission.csv', index=False)
df_test[['case_id','score']].head()