In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc, warnings
from glob import glob
from pathlib import Path
from typing import Any
from itertools import combinations, permutations
from tqdm import tqdm

# Silence every warning for the rest of the session; note that this also hides
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# Root folder of the competition data and its train / test parquet folders.
ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# preprocessing

In [2]:
class Utility:
    """Helpers for inspecting feature definitions and for shrinking / converting frames."""

    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Prints the feature definitions whose variable name ends with a given suffix.

        Args:
        - ending_with (str): Suffix used to filter the "Variable" column.

        Returns:
        - None: The filtered definitions are printed, not returned.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        # Native string predicate instead of a per-row Python lambda.
        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").str.ends_with(ending_with)
        )

        # Widen the string/row limits so the whole table is printed.
        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the first occurrence if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str | None:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type (class or instance).

        Returns:
        - str | None: Name of the data type, or None when it is not in the table.
        """
        # NOTE(review): if pl.Utf8 is an alias of pl.String in the installed
        # polars, the later "Utf8" entry overwrites "String" - confirm which
        # label is wanted.
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        name = dtype_map.get(dtype)
        if name is None and not isinstance(dtype, type):
            # A dtype *instance* that did not match directly: retry with its
            # class, which is what the table is keyed on.
            name = dtype_map.get(type(dtype))
        return name

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Glob pattern (it is passed to ``glob``) matching Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: Feature definitions sorted by "Variable", with the data
          types and file stems in which each feature was found.
        """
        # Keep the *sorted* frame: the per-feature lists built below are
        # attached positionally, so rows and lists must share one order.
        feat_defs: pl.DataFrame = (
            pl.read_csv(ROOT / "feature_definitions.csv")
            .filter(pl.col("Variable").str.ends_with(ending_with))
            .sort(by="Variable")
        )

        feats: list[str] = feat_defs["Variable"].to_list()

        # Position of the first occurrence of every feature name.
        feat_pos: dict[str, int] = {}
        for position, feat in enumerate(feats):
            feat_pos.setdefault(feat, position)

        dtype_sets: list[set] = [set() for _ in feats]
        file_sets: list[set] = [set() for _ in feats]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)
            file_stem = Path(path).stem

            for feat, dtype in df_schema.items():
                index = feat_pos.get(feat)
                if index is not None:
                    dtype_sets[index].add(Utility.dtype_to_str(dtype))
                    file_sets[index].add(file_stem)

        data_types: list[list] = [list(found) for found in dtype_sets]
        file_locs: list[list] = [list(found) for found in file_sets]

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    @staticmethod
    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Integer columns are cast to the narrowest unsigned type (when the
        minimum is non-negative) or signed type that holds their min/max;
        float columns are cast to Float32 when their min/max lie strictly
        inside the Float32 range. Columns whose min or max is None are left
        untouched. The size before and after is printed.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame (used in the printed messages only).

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        # Candidate targets, narrowest first, paired with the numpy type that
        # supplies the representable range.
        unsigned_steps = [
            (pl.UInt8, np.uint8),
            (pl.UInt16, np.uint16),
            (pl.UInt32, np.uint32),
            (pl.UInt64, np.uint64),
        ]
        signed_steps = [
            (pl.Int8, np.int8),
            (pl.Int16, np.int16),
            (pl.Int32, np.int32),
            (pl.Int64, np.int64),
        ]
        int_types = [pl_type for pl_type, _ in signed_steps + unsigned_steps]
        float_types = [pl.Float32, pl.Float64]
        float32_limits = np.finfo(np.float32)

        casts: list[pl.Series] = []
        for col in df.columns:
            col_type = df[col].dtype
            is_int = col_type in int_types
            if not is_int and col_type not in float_types:
                continue

            c_min = df[col].min()
            c_max = df[col].max()
            if c_min is None or c_max is None:
                continue

            if is_int:
                steps = unsigned_steps if c_min >= 0 else signed_steps
                for pl_type, np_type in steps:
                    limits = np.iinfo(np_type)
                    if c_min >= limits.min and c_max <= limits.max:
                        casts.append(df[col].cast(pl_type))
                        break
            elif c_min > float32_limits.min and c_max < float32_limits.max:
                casts.append(df[col].cast(pl.Float32))

        # Apply every replacement at once instead of rebuilding the frame per column.
        if casts:
            df = df.with_columns(casts)

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    @staticmethod
    def to_pandas(
        df: pl.DataFrame, cat_cols: list[str] | None = None
    ) -> tuple[pd.DataFrame, list[str]]:
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): Columns to convert to the pandas "category"
          dtype. When None, every column of dtype "object" is used.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        pandas_df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(pandas_df.select_dtypes("object").columns)

        pandas_df[cat_cols] = pandas_df[cat_cols].astype("category")

        return pandas_df, cat_cols

In [3]:
class Aggregator:
    """Builds aggregation expressions, choosing columns by their name suffix."""

    @staticmethod
    def _pick_cols(
        df: pl.LazyFrame, suffixes: tuple[str, ...], with_num_group: bool = False
    ) -> list[str]:
        """Returns the columns ending in one of ``suffixes`` (optionally also any
        column whose name contains "num_group")."""
        return [
            col
            for col in df.columns
            if col.endswith(suffixes) or (with_num_group and "num_group" in col)
        ]

    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Columns ending in P, M, A, D, T or L, and columns containing
        "num_group", are selected; each result is aliased "<col>_MAX".

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: List of expressions for maximum values.
        """
        cols = Aggregator._pick_cols(
            df, ("P", "M", "A", "D", "T", "L"), with_num_group=True
        )
        return [pl.col(col).max().alias(f"{col}_MAX") for col in cols]

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Uses the same column selection as ``max_expr``; each result is aliased
        "<col>_MIN".

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: List of expressions for minimum values.
        """
        cols = Aggregator._pick_cols(
            df, ("P", "M", "A", "D", "T", "L"), with_num_group=True
        )
        return [pl.col(col).min().alias(f"{col}_MIN") for col in cols]

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating mean values for specific columns.

        Columns ending in P, A or D are selected; each result is aliased
        "<col>_MEAN".

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: List of expressions for mean values.
        """
        cols = Aggregator._pick_cols(df, ("P", "A", "D"))
        return [pl.col(col).mean().alias(f"{col}_MEAN") for col in cols]

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating variance for specific columns.

        Columns ending in P, A or D are selected; each result is aliased
        "<col>_VAR".

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: List of expressions for variance.
        """
        cols = Aggregator._pick_cols(df, ("P", "A", "D"))
        return [pl.col(col).var().alias(f"{col}_VAR") for col in cols]

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Generates expressions for calculating mode values for specific columns.

        Columns ending in M are selected; nulls are dropped before taking the
        mode, and the first mode is kept. Each result is aliased "<col>_MODE".

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: List of expressions for mode values.
        """
        cols = Aggregator._pick_cols(df, ("M",))
        return [
            pl.col(col).drop_nulls().mode().first().alias(f"{col}_MODE")
            for col in cols
        ]

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Expr]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        ``min_expr`` and ``mode_expr`` are not part of the combined list.

        Args:
        - df (pl.LazyFrame): Input frame (only its ``columns`` are read).

        Returns:
        - list[pl.Expr]: Maximum, then mean, then variance expressions.
        """
        return (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

In [4]:
class SchemaGen:
    """Reads the competition parquet tables, normalises dtypes and joins them."""

    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        The target type is picked from the column name: "case_id" -> UInt32;
        "WEEK_NUM"/"num_group1"/"num_group2" -> UInt16; "date_decision" or a
        name ending in "D" -> Date; ending in "P" or "A" -> Float64; ending
        in "M" -> String. Other columns are left as they are.

        Args:
        - df (pl.LazyFrame): Input frame.

        Returns:
        - pl.LazyFrame: Frame with modified data types.
        """
        casts: list[pl.Expr] = []
        for col in df.columns:
            if col == "case_id":
                casts.append(pl.col(col).cast(pl.UInt32))
            elif col in ("WEEK_NUM", "num_group1", "num_group2"):
                casts.append(pl.col(col).cast(pl.UInt16))
            elif col == "date_decision" or col.endswith("D"):
                casts.append(pl.col(col).cast(pl.Date))
            elif col.endswith(("P", "A")):
                casts.append(pl.col(col).cast(pl.Float64))
            elif col.endswith("M"):
                casts.append(pl.col(col).cast(pl.String))

        # One pass over the frame instead of one new frame per column.
        return df.with_columns(casts) if casts else df

    @staticmethod
    def scan_files(glob_path: str, depth: int | None = None) -> pl.DataFrame:
        """
        Reads every parquet file matching a glob pattern into one frame.

        Each file is read eagerly, its dtypes are normalised with
        ``change_dtypes`` and, when ``depth`` is 1 or 2, it is aggregated per
        "case_id" with ``Aggregator.get_exprs``. The per-file frames are then
        concatenated and reduced to one row per "case_id".

        Args:
        - glob_path (str): Glob pattern (or path) of the parquet file(s).
        - depth (int | None): 1 or 2 to aggregate per case_id; anything else
          keeps the rows as read.

        Returns:
        - pl.DataFrame: Concatenated frame with unique "case_id" values.

        Raises:
        - ValueError: If no file matches ``glob_path``.
        """
        # Sorted so the files are always processed in the same order.
        paths = sorted(glob(str(glob_path)))
        if not paths:
            raise ValueError(f"No parquet files match: {glob_path}")

        chunks = []
        for path in paths:
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in (1, 2):
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)

        df = pl.concat(chunks, how="vertical_relaxed")
        # Release the per-file frames before the de-duplication below.
        del chunks
        gc.collect()

        return df.unique(subset=["case_id"])

    @staticmethod
    def join_dataframes(
        df_base: pl.DataFrame,
        depth_0: list[pl.DataFrame],
        depth_1: list[pl.DataFrame],
        depth_2: list[pl.DataFrame],
    ) -> pl.DataFrame:
        """
        Left-joins every table onto the base table on "case_id".

        Args:
        - df_base (pl.DataFrame): Base table; its rows are preserved.
        - depth_0, depth_1, depth_2 (list[pl.DataFrame]): Tables to join, in
          that order.

        Returns:
        - pl.DataFrame: The base table with all other tables joined in. A
          column name that clashes with an existing one gets the suffix
          "_<i>", where i is the position of the joined table.
        """
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base


In [5]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    A column is dropped when more than 95% of its values are null, or when it
    is a String column with more than 200 distinct values or exactly one.
    Key columns are never dropped.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns (unchanged when it has no rows).
    """
    # Key columns under both their raw names and the names they carry after
    # handle_dates has renamed them.
    protected = {
        "case_id",
        "year",
        "month",
        "week_num",
        "target",
        "MONTH",
        "WEEK_NUM",
        "date_decision",
    }

    # With no rows the null ratio is undefined, so there is nothing to judge.
    if df.height == 0:
        return df

    to_drop: list[str] = []
    for col in df.columns:
        if col in protected:
            continue

        series = df[col]
        if series.is_null().mean() > 0.95:
            to_drop.append(col)
        elif series.dtype == pl.String:
            freq = series.n_unique()
            if freq > 200 or freq == 1:
                to_drop.append(col)

    return df.drop(to_drop) if to_drop else df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    If the column "riskassesment_302T" exists, its "<low>% - <high>%" strings
    are split on " - " and turned into two columns, "riskassesment_302T_rng"
    (high - low) and "riskassesment_302T_mean" ((low + high) / 2); the
    original column is then removed. When the column has the Null dtype, both
    new columns are built from it directly as UInt8.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" not in df.columns:
        return df

    raw: pl.Series = df["riskassesment_302T"]

    if raw.dtype == pl.Null:
        df = df.with_columns(
            [
                pl.Series("riskassesment_302T_rng", raw, pl.UInt8),
                pl.Series("riskassesment_302T_mean", raw, pl.UInt8),
            ]
        )
    else:
        parts: pl.Series = raw.str.split(" - ")

        def _bound(position: int) -> pl.Series:
            # Take one side of the range and strip the percent sign.
            return parts.map_elements(
                lambda x: x[position].replace("%", ""), return_dtype=pl.String
            ).cast(pl.UInt8)

        pct_low: pl.Series = _bound(0)
        pct_high: pl.Series = _bound(1)

        diff: pl.Series = pct_high - pct_low
        avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

        df = df.with_columns(
            [
                diff.alias("riskassesment_302T_rng"),
                avg.alias("riskassesment_302T_mean"),
            ]
        )

    # drop() returns a new frame, so the result has to be kept.
    df = df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Every column whose name ends in "D" or contains "D_" is replaced by its
    distance from "date_decision" in whole days (Int32). "MONTH" and
    "WEEK_NUM" are renamed to lower case, "year" (Int16) and "day" (UInt8)
    are derived from "date_decision", and finally "date_decision" and
    "month" are dropped.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    decision = pl.col("date_decision")

    day_offsets = [
        (pl.col(name) - decision).dt.total_days().cast(pl.Int32)
        for name in df.columns
        if (name[-1] == 'D') or ('D_' in name)
    ]
    if day_offsets:
        df = df.with_columns(day_offsets)

    df = df.rename({"MONTH": "month", "WEEK_NUM": "week_num"})

    year = decision.dt.year().alias("year").cast(pl.Int16)
    day = decision.dt.day().alias("day").cast(pl.UInt8)
    df = df.with_columns([year, day])

    return df.drop("date_decision").drop("month")

In [6]:
def reduce_group(grps, data=None):
    """
    Picks one representative column from each group of columns.

    The representative is the column with the most distinct values; on a tie
    the column listed first in the group wins.

    Args:
    - grps: Iterable of non-empty lists of column names.
    - data: Frame the columns are read from. Defaults to the module-level
      ``df_train``. Both pandas (``nunique``) and polars (``n_unique``)
      columns are supported.

    Returns:
    - list: One column name per group, in group order.
    """
    if data is None:
        data = df_train

    def distinct_count(col):
        series = data[col]
        # pandas spells it nunique(), polars n_unique().
        counter = getattr(series, "nunique", None) or series.n_unique
        return counter()

    # max() keeps the first maximal element, matching a strict ">" scan.
    return [max(g, key=distinct_count) for g in grps]

def group_columns_by_correlation(matrix, threshold=0.8):
    """
    Greedily partitions the columns of ``matrix`` into correlated groups.

    The first remaining column becomes the anchor of a new group; every other
    remaining column whose correlation with the anchor is at least
    ``threshold`` joins it. This repeats on the leftovers until no column
    remains.

    Args:
    - matrix: Frame providing ``corr()`` and ``columns``.
    - threshold: Minimum (signed) correlation with the anchor to join a group.

    Returns:
    - list[list]: Groups of column names; each column appears exactly once.
    """
    corr = matrix.corr()
    pending = list(matrix.columns)
    clusters = []

    while pending:
        anchor, *candidates = pending
        members = [anchor]
        leftovers = []
        for name in candidates:
            if corr.loc[anchor, name] >= threshold:
                members.append(name)
            else:
                leftovers.append(name)
        clusters.append(members)
        pending = leftovers

    return clusters

In [7]:
# Load every train table. The keys mirror the parameters of
# SchemaGen.join_dataframes so the dict can be splatted into it below.
# Tables read with depth 1 or 2 are aggregated per case_id by scan_files;
# the base and depth-0 tables are read without aggregation.
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

# Join everything onto the base table, then: drop sparse / uninformative
# columns, expand the risk-assessment range column, turn dates into day
# offsets, and finally downcast numeric columns.
df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

# The individual tables are no longer needed once joined.
del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

Memory usage of dataframe "df_train" is 4699.5721 MB.
Memory usage of dataframe "df_train" became 2659.8065 MB.
Train data shape: (1526659, 471)


case_id,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,…,mainoccupationinc_384A_MEAN,amount_416A_MAX,num_group1_MAX_10,openingdate_313D_MAX,amount_416A_MEAN,openingdate_313D_MEAN,num_group1_MAX_11,openingdate_857D_MAX,openingdate_857D_MEAN,collater_typofvalofguarant_298M_MAX,collater_typofvalofguarant_407M_MAX,collater_valueofguarantee_1124L_MAX,collater_valueofguarantee_876L_MAX,collaterals_typeofguarante_359M_MAX,collaterals_typeofguarante_669M_MAX,num_group1_MAX_12,num_group2_MAX,pmts_dpd_1073P_MAX,pmts_dpd_303P_MAX,pmts_month_158T_MAX,pmts_month_706T_MAX,pmts_overdue_1140A_MAX,pmts_overdue_1152A_MAX,pmts_year_1139T_MAX,pmts_year_507T_MAX,subjectroles_name_541M_MAX,subjectroles_name_838M_MAX,pmts_dpd_1073P_MEAN,pmts_dpd_303P_MEAN,pmts_overdue_1140A_MEAN,pmts_overdue_1152A_MEAN,pmts_dpd_1073P_VAR,pmts_dpd_303P_VAR,pmts_overdue_1140A_VAR,pmts_overdue_1152A_VAR,year,day
u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
633272,6,0,,,-16423.0,,-16423.0,0.0,0.0,0.0,6.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",6.0,,,,,6.0,3393.600098,,14.0,,,2.0,4.0,,,1513.0,0.0,…,120000.0,,,,,,,,,"""a55475b1""","""a55475b1""",10113000.0,,"""a55475b1""","""c7a5ad39""",4.0,35.0,78.0,,12.0,,44888.414062,,2020.0,,"""a55475b1""","""ab3c25cf""",6.304878,,5714.283203,,240.337997,,112599512.0,,2019,16
1839502,71,0,,,,,-19126.0,1.0,3.0,1.0,3.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",2.0,4.0,"""3439d993""","""a55475b1""",3.0,,,,,,,,,,14.0,2.0,2.0,0.0,30607.0,6150.200195,3276.0,…,54000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,3854.0,220.0,12.0,12.0,24174.669922,5234.624023,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",1305.909058,17.328571,6954.425293,858.877136,3048951.5,2296.774414,91777560.0,2149400.0,2020,13
238999,81,0,,,,896256.1875,-14477.0,2.0,5.0,1.0,16.0,2.0,"""2fc785b2""","""717ddd49""","""a55475b1""",9.0,2.0,"""3439d993""","""a55475b1""",16.0,,,,,,,,,,12.0,5.0,11.0,0.0,118177.601562,1374.800049,137.400009,…,52000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,21.0,26.0,12.0,12.0,3879.0,4521.007812,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",1.522727,1.777778,319.399994,630.443542,16.720402,22.197254,1040700.0,1885700.0,2020,21
1663056,47,0,,,,,-9285.0,2.0,2.0,1.0,6.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,6.0,"""3439d993""","""a55475b1""",6.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,6.0,0.0,107023.773438,6581.399902,5860.399902,…,120000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,23.0,0.0,1.0,12.0,12.0,0.0,574.400024,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.076923,0.0,44.184616,0.0,0.076923,0.0,25379.642578,2019,2
2544202,7,0,-5026.0,,-26469.0,,-26469.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",0.0,5538.533203,,,5.0,,,,14.0,,,0.0,1.0,0.0,,6358.600098,2740.600098,…,30000.0,0.0,0.0,-1562.0,0.0,-1562.0,0.0,-1562.0,-1562.0,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,19
1725049,52,0,,,,,-11779.0,6.0,6.0,3.0,21.0,5.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",21.0,7.0,"""a7fcb6e5""","""a55475b1""",21.0,,,,,,,"""DEDUCTION_6""",,14.0,,4.0,4.0,0.0,125632.1875,5832.600098,0.0,…,90000.0,,,,,,,,,"""a55475b1""","""a55475b1""",21258140.0,0.0,"""c7a5ad39""","""a55475b1""",11.0,35.0,0.0,3.0,12.0,12.0,0.0,6259.0,2020.0,2020.0,"""ab3c25cf""","""daf49a8a""",0.0,0.023622,0.0,49.283466,,0.070866,,308465.21875,2019,31
891486,48,0,,,,,-13947.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,0.0,,,1869.800049,0.0,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,35.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,8
620354,3,0,,,-15580.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,6.0,14979.200195,,14.0,,,,,,,1772.200073,0.0,…,40000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,27
990737,75,0,,,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,,,,,,14.0,,,,,2734.600098,0.0,…,30000.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020,10
812716,38,0,,,,,-18591.0,0.0,1.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",1.0,,,,,5.0,4128.600098,"""DEDUCTION_6""",14.0,14.0,,4.0,0.0,0.0,18684.0,2000.0,6228.0,…,36000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,0.0,14.0,12.0,12.0,0.0,3701.917969,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.337349,0.0,188.610916,0.0,2.811637,0.0,629034.1875,2019,26


In [8]:
# Load the test tables with the same layout as the train tables: the keys
# mirror the parameters of SchemaGen.join_dataframes, and depth 1 / 2 tables
# are aggregated per case_id by scan_files.
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

# filter_cols is not applied to the test data; instead the columns are
# restricted to the ones kept in df_train (minus "target"), so both frames
# end up with the same features in the same order.
df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

# The individual tables are no longer needed once joined.
del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

Memory usage of dataframe "df_test" is 0.0297 MB.
Memory usage of dataframe "df_test" became 0.0172 MB.
Test data shape: (10, 470)


In [9]:
# The test frame was selected without "target"; add a constant Int8
# placeholder so it can be stacked under the train frame.
if 'target' not in df_test.columns:
    df_test = df_test.with_columns(
        pl.lit(0).cast(pl.Int8).alias('target')
    )

In [10]:
# Tag each split, stack train on top of test (test columns reordered to the
# train layout) and convert the result to pandas with categorical columns.
train_part = df_train.with_columns(pl.lit('train').alias('partition'))
test_part = df_test.select(df_train.columns).with_columns(
    pl.lit('test').alias('partition')
)
df, cat_cols = Utility.to_pandas(
    pl.concat([train_part, test_part], how='vertical_relaxed')
)
del train_part, test_part
df.shape

(1526669, 472)

In [11]:
# Notebook display of the combined train + test pandas frame.
df

Unnamed: 0,case_id,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,pmts_dpd_303P_MEAN,pmts_overdue_1140A_MEAN,pmts_overdue_1152A_MEAN,pmts_dpd_1073P_VAR,pmts_dpd_303P_VAR,pmts_overdue_1140A_VAR,pmts_overdue_1152A_VAR,year,day,partition
0,633272,6,0,,,-16423.0,,-16423.0,0.0,0.0,...,,5714.283203,,2.403380e+02,,1.125995e+08,,2019,16,train
1,1839502,71,0,,,,,-19126.0,1.0,3.0,...,17.328571,6954.425293,858.877136,3.048952e+06,2296.774414,9.177756e+07,2.149396e+06,2020,13,train
2,238999,81,0,,,,8.962562e+05,-14477.0,2.0,5.0,...,1.777778,319.399994,630.443542,1.672040e+01,22.197254,1.040736e+06,1.885690e+06,2020,21,train
3,1663056,47,0,,,,,-9285.0,2.0,2.0,...,0.076923,0.000000,44.184616,0.000000e+00,0.076923,0.000000e+00,2.537964e+04,2019,2,train
4,2544202,7,0,-5026.0,,-26469.0,,-26469.0,0.0,0.0,...,,0.000000,,0.000000e+00,,0.000000e+00,,2019,19,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526664,57631,100,0,,,,4.803345e+05,-12999.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,0.000000e+00,2022,4,test
1526665,57634,100,0,,,,1.526365e+04,-16281.0,2.0,2.0,...,,,,,,,,2021,27,test
1526666,57630,100,0,,,,4.999750e+05,-19767.0,1.0,2.0,...,0.000000,0.000000,0.000000,,,,,2021,16,test
1526667,57569,100,0,,,,,-26408.0,4.0,4.0,...,2328.571533,,33346.402344,,3341.619141,,0.000000e+00,2021,20,test


In [12]:
# Candidate features: every column except the first three and the last one
# (in the frame displayed above these are case_id, week_num, target and
# partition).
features = list(df.columns[3:-1])

In [13]:
# Numeric features are the candidate features that are not categorical.
categorical_lookup = frozenset(cat_cols)
numeric_cols = [name for name in features if name not in categorical_lookup]
del categorical_lookup

In [14]:
# Decorrelate features, step 1: bucket the numeric columns by how many
# missing values they have in the train partition.
nan_counts = df.loc[df['partition'] == 'train', numeric_cols].isna().sum()
nans_groups = {}
for col in numeric_cols:
    nans_groups.setdefault(nan_counts[col], []).append(col)
del nan_counts
gc.collect()

0

In [None]:
# Decorrelate features, step 2: inside every bucket of columns sharing the
# same number of missing values, group correlated columns and keep one
# representative per group. Buckets with a single column are kept as they are.
uncorrelated_feats = []
train_rows = df['partition'] == 'train'  # loop-invariant row mask
for v in tqdm(nans_groups.values()):
    if len(v) > 1:
        # Correlate only the columns of this bucket (not all numeric columns).
        grps = group_columns_by_correlation(df.loc[train_rows, v], threshold=0.8)
        use = reduce_group(grps)
        uncorrelated_feats.extend(use)
    else:
        uncorrelated_feats.extend(v)

In [None]:
# cat_cols holds every object column of the combined frame, which includes
# the 'partition' tag; keep bookkeeping columns out of the feature list so
# they are neither used as features nor duplicated in df.
reserved_cols = ['case_id', 'target', 'partition']
features = uncorrelated_feats + [c for c in cat_cols if c not in reserved_cols]
df = df[reserved_cols + features]
df.shape

# exploration

In [None]:
# placeholder for exploration code 

# training

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split,StratifiedGroupKFold
import lightgbm as lgb 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [None]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    """
    Computes the gini stability score of predictions over time.

    The gini (2 * AUC - 1) is computed per "WEEK_NUM", a straight line is
    fitted through the weekly values, and the result is the mean weekly gini
    plus ``w_fallingrate`` times the slope (only when the slope is negative)
    plus ``w_resstd`` times the standard deviation of the fit residuals.

    Args:
    - base: Frame with the columns "WEEK_NUM", "target" and "score".
    - w_fallingrate: Weight applied to a negative slope.
    - w_resstd: Weight applied to the residual standard deviation.

    Returns:
    - The stability score.
    """
    def _weekly_gini(week_rows):
        return 2*roc_auc_score(week_rows["target"], week_rows["score"])-1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    gini_in_time = (
        ordered.groupby("WEEK_NUM")[["target", "score"]].apply(_weekly_gini).tolist()
    )

    weeks = np.arange(len(gini_in_time))
    slope, intercept = np.polyfit(weeks, gini_in_time, 1)
    fitted = slope*weeks + intercept
    res_std = np.std(gini_in_time - fitted)

    return np.mean(gini_in_time) + w_fallingrate * min(0, slope) + w_resstd * res_std

In [None]:
def get_base_params():
    """Returns a fresh dict of the fixed LightGBM parameters shared by every run.

    GPU settings ('device_type': 'gpu', 'gpu_use_dp': True) are intentionally
    left out.
    """
    return dict(
        boosting_type='gbdt',
        random_state=117,
        objective='binary',
        metric='auc',
        extra_trees=True,
        verbose=-1,
        max_bin=200,
    )

In [None]:
# Tunable hyper-parameters and their sampling distributions.
search_space_setup = {
    'feature_fraction': hp.uniform('feature_fraction', 0.3, .9),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.3, .9),
    'lambda_l1': hp.loguniform('lambda_l1', np.log(.000001), np.log(1000)),
    'lambda_l2': hp.loguniform('lambda_l2', np.log(.000001), np.log(1000)),
    'bagging_freq': scope.int(hp.uniform('bagging_freq', 2, 10)),
    'min_data_in_leaf': scope.int(hp.uniform('min_data_in_leaf', 100, 10000)),
    'learning_rate' : hp.uniform('learning_rate', 0.001, .1),
    'num_leaves': scope.int(hp.uniform('num_leaves', 20, 5000)),
    'min_gain_to_split': hp.uniform('min_gain_to_split', 0, 15),
}
# Full search space: the fixed base parameters followed by the tunable ones.
search_space = {**get_base_params(), **search_space_setup}

In [None]:
def trial_fn_lgbm_auc(params, splits=None):
    """
    Hyperopt objective: cross-validated AUC of a LightGBM classifier.

    For every (train_idx, valid_idx) pair a classifier is fitted on the train
    rows of the module-level frame ``df`` with early stopping on the
    validation rows, and the validation AUC is recorded. The reported loss is
    the negative of (mean AUC - std AUC).

    Args:
    - params: Keyword arguments for ``lgb.LGBMClassifier``.
    - splits: Iterable of (train_idx, valid_idx) pairs. The indices are used
      as labels with ``df.loc``, so they must be index labels of ``df``.
      Assumes the train rows come first under a default 0..n-1 index, so
      that positional train indices are valid labels - TODO confirm.

    Returns:
    - dict: {"status": STATUS_OK, "loss": -(mean - std)}.

    Raises:
    - TypeError: If ``splits`` is not provided.
    """
    if splits is None:
        raise TypeError("splits must be an iterable of (train_idx, valid_idx) pairs")

    scores = []
    for train_idx, valid_idx in splits:
        # Train and validation rows come from the same frame.
        train_X, train_y = df.loc[train_idx, features], df.loc[train_idx, 'target']
        valid_X, valid_y = df.loc[valid_idx, features], df.loc[valid_idx, 'target']

        model = lgb.LGBMClassifier(**params)
        model.fit(train_X, train_y,
                  eval_set=[(valid_X, valid_y)],
                  eval_metric='auc',
                  callbacks=[lgb.early_stopping(50)])

        # predict_proba returns one column per class; AUC needs the
        # positive-class probability only.
        positive_proba = model.predict_proba(valid_X)[:, 1]
        scores.append(roc_auc_score(valid_y, positive_proba))

    score = np.mean(scores) - np.std(scores)
    out = {"status": STATUS_OK, "loss": -score} # always minimizes
    return out

In [None]:
# Build the training set, test features and submission frame from the
# combined pandas frame `df`, so that the pandas-style operations used on
# them afterwards (.copy(), column assignment, to_csv) are available.
train_rows = df['partition'] == 'train'
lgbtrain = lgb.Dataset(df.loc[train_rows, features], label=df.loc[train_rows, 'target'])
test_X   = df.loc[~train_rows, features].copy()
submission = df.loc[~train_rows, ['case_id']].copy()

In [None]:
# Build the cross-validation folds once, up front, so every trial reuses
# them: stratified on the target and grouped by week.
k = 5
fold_maker = StratifiedGroupKFold(n_splits=k)
splits = list(
    fold_maker.split(
        np.arange(df_train.shape[0]),
        df_train['target'],
        groups=df_train['week_num'],
    )
)

In [None]:
# Drop both split frames and collect garbage before tuning starts.
del df_test, df_train
gc.collect()

In [None]:
# Tune with TPE. The objective is the trial function defined above, bound to
# the precomputed folds (it accepts only `params` and `splits`).
best_params = fmin(fn=partial(trial_fn_lgbm_auc, splits=splits),
                    space=search_space,
                    algo=tpe.suggest,
                    max_evals=100,
                    timeout=60*60*6 # seconds
                  )
# Parameters that must be handed to LightGBM as integers; this includes
# every parameter wrapped in scope.int in the search space.
int_params = ['max_depth','n_estimators','bagging_freq','num_leaves','min_data_in_leaf']
bestp = get_base_params()
for name, value in best_params.items():
    bestp[name] = int(value) if name in int_params else value
bestp

In [None]:
# bestp = {
#          'boosting_type': 'gbdt',
# #          'device_type': 'gpu',
#          'random_state': 117,
#          'objective': 'binary',
#          'metric': 'auc',
#          'extra_trees': True,
#          'verbose': -1,
#          'max_bin': 64,
#          'bagging_fraction': 0.6615111203742043,
#          'bagging_freq': 4,
#          'cat_l2': 0.4303012850161522,
#          'colsample_bynode': 0.30799275380454566,
#          'l1_regularization': 0.09818609605701412,
#          'l2_regularization': 45.88388390697673,
#          'learning_rate': 0.06583892942324936,
#          'max_depth': 15,
#          'n_estimators': 849,
#          'num_leaves': 100,
#          'verbose': 1
#         }

In [None]:
# Fit the final booster on the full training set with the selected parameters.
gbm = lgb.train(params=bestp, train_set=lgbtrain)

# submission

In [None]:
# Score the test features with the trained booster and store the predictions
# in the submission frame.
submission['score'] = gbm.predict(test_X)

In [None]:
# Write the submission file (without the index column) and preview it.
submission.to_csv('submission.csv', index=False)
submission.head()