In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc, warnings
from glob import glob
from pathlib import Path
from typing import Any
import seaborn as sns
from tqdm import tqdm
warnings.filterwarnings("ignore")

ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

# preprocessing

In [2]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("category")

        return df, cat_cols

In [3]:
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"{col}_MAX") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"{col}_MIN") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"{col}_MEAN") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"{col}_VAR") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"{col}_MODE") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

In [4]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            if depth in [1, 2]:
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()

        df = df.unique(subset=["case_id"]) 
        
        return df

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base


In [5]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if (col[-1] == 'D') or ('D_' in col):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

            
    df = df.drop("MONTH")
    df = df.drop("WEEK_NUM")

    df = df.with_columns(pl.col("date_decision").dt.year().alias("year").cast(pl.Int16))
    df = df.with_columns(pl.col("date_decision").dt.month().alias("month").cast(pl.Int16))
    df = df.with_columns(pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8))
    df = df.with_columns(pl.col("date_decision").dt.weekday().alias("weekday").cast(pl.UInt8))
    df = df.with_columns(((pl.col("date_decision") - pl.lit("2019-01-01").cast(pl.Date)).dt.total_days() / 7).floor().alias("week_num").cast(pl.Int32))

#     df = df.drop("date_decision")
    return df 

In [6]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

Memory usage of dataframe "df_train" is 4712.6755 MB.
Memory usage of dataframe "df_train" became 2668.5421 MB.
Train data shape: (1526659, 474)


case_id,date_decision,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,annuitynextmonth_57A,…,openingdate_313D_MAX,amount_416A_MEAN,openingdate_313D_MEAN,num_group1_MAX_11,openingdate_857D_MAX,openingdate_857D_MEAN,collater_typofvalofguarant_298M_MAX,collater_typofvalofguarant_407M_MAX,collater_valueofguarantee_1124L_MAX,collater_valueofguarantee_876L_MAX,collaterals_typeofguarante_359M_MAX,collaterals_typeofguarante_669M_MAX,num_group1_MAX_12,num_group2_MAX,pmts_dpd_1073P_MAX,pmts_dpd_303P_MAX,pmts_month_158T_MAX,pmts_month_706T_MAX,pmts_overdue_1140A_MAX,pmts_overdue_1152A_MAX,pmts_year_1139T_MAX,pmts_year_507T_MAX,subjectroles_name_541M_MAX,subjectroles_name_838M_MAX,pmts_dpd_1073P_MEAN,pmts_dpd_303P_MEAN,pmts_overdue_1140A_MEAN,pmts_overdue_1152A_MEAN,pmts_dpd_1073P_VAR,pmts_dpd_303P_VAR,pmts_overdue_1140A_VAR,pmts_overdue_1152A_VAR,year,month,day,weekday,week_num
u32,date,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,f32,…,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8,u8,u8,u8
1935960,2020-09-22,0,,,,149410.09375,-13720.0,0.0,2.0,0.0,7.0,0.0,"""2fc785b2""","""a55475b1""","""a55475b1""",0.0,5.0,"""a55475b1""","""a55475b1""",7.0,,,,,,,,,,14.0,2.0,0.0,0.0,55816.851562,2090.199951,7000.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",5.0,35.0,1.0,1326.0,12.0,12.0,1083.200073,41903.527344,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.047619,357.174408,51.580956,13127.273438,0.047619,146503.8125,55872.496094,229215216.0,2020,9,22,2,90
1524722,2019-09-07,0,14.0,14.0,,,-25270.0,1.0,1.0,0.0,1.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",1.0,7030.200195,7030.200195,6.0,6.0,,,"""PENSION_6""",14.0,14.0,,0.0,1.0,0.0,33869.0,2539.800049,3076.800049,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,9,7,6,35
909972,2019-12-24,0,,,,,-14542.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,,,4595.800293,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",,64832.0,"""c7a5ad39""","""a55475b1""",4.0,35.0,,1.0,,12.0,,0.0,,2018.0,"""ab3c25cf""","""a55475b1""",,0.027778,,0.0,,0.027778,,0.0,2019,12,24,2,51
111729,2019-02-18,0,,,-11036.0,,-11036.0,6.0,8.0,0.0,12.0,5.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,4.0,"""3439d993""","""a55475b1""",12.0,,,,,6.0,9743.600586,,14.0,,,3.0,5.0,0.0,,1218.0,944.799988,…,-1834.0,0.0,-1834.0,0.0,-1834.0,-1834.0,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",3.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,2,18,1,6
982642,2020-05-14,0,,,,,-14989.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",0.0,,,,,,,,,,14.0,0.0,0.0,,,1168.0,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2021.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2020,5,14,4,71
1303212,2019-03-08,0,,,-11389.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,0.0,0.0,,14.0,,,,,0.0,,666.799988,0.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,3,8,5,9
1689021,2019-12-15,0,,,,,-22993.0,0.0,0.0,0.0,1.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,2.0,"""3439d993""","""a55475b1""",1.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,2.0,0.0,0.0,9038.400391,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,17485.0,"""c7a5ad39""","""c7a5ad39""",9.0,35.0,21.0,42.0,12.0,12.0,9042.015625,11485.206055,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",2.285714,2.785047,2062.769287,1942.899658,30.835165,37.604301,12117522.0,14046650.0,2019,12,15,7,49
179504,2019-11-25,0,,,,,-13842.0,1.0,2.0,0.0,9.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",5.0,3.0,"""3439d993""","""a55475b1""",9.0,,,,,,,"""DEDUCTION_6""",,14.0,,8.0,10.0,0.0,139184.796875,5182.200195,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",13.0,35.0,1.0,1113.0,12.0,12.0,7972.26416,33613.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0625,163.082565,337.721802,6101.310059,0.060484,110877.039062,2191700.0,134184176.0,2019,11,25,1,46
581,2019-01-09,0,-278.0,,-23294.0,,-23294.0,0.0,1.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",1.0,15221.367188,,,6.0,,,,14.0,,,0.0,1.0,,,6173.399902,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",3120000.0,,"""a55475b1""","""a55475b1""",0.0,23.0,0.0,,12.0,,0.0,,2019.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,1,9,3,1
656319,2019-03-23,0,,,-8057.0,,-8057.0,2.0,4.0,0.0,6.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,2.0,"""a55475b1""","""a55475b1""",6.0,,,,,6.0,5238.0,,14.0,,,0.0,2.0,,,1083.400024,0.0,…,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,3,23,6,11


In [7]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

Memory usage of dataframe "df_test" is 0.0298 MB.
Memory usage of dataframe "df_test" became 0.0173 MB.
Test data shape: (10, 473)


In [8]:
if 'target' not in df_test.columns:
    df_test = df_test.with_columns(pl.lit(0).alias('target').cast(pl.Int8))

In [9]:
df, cat_cols = Utility.to_pandas(
                        pl.concat([
                                 df_train.with_columns(pl.lit('train').alias('partition')),
                                 df_test.select(df_train.columns).with_columns(pl.lit('test').alias('partition'))
                                    ],how='vertical_relaxed')
                                )

In [10]:
df['dob'] = df['dateofbirth_337D'].fillna(df['birth_259D_MAX'])
df['dob'] = df['dob'].fillna(df['birthdate_574D'])

In [11]:
# features = ['actualdpd_943P_MEAN',
#  'actualdpd_943P_VAR',
#  'actualdpdtolerance_344P',
#  'amount_416A_MEAN',
#  'amount_4527230A_MEAN',
#  'amount_4527230A_VAR',
#  'amount_4917619A_MEAN',
#  'amount_4917619A_VAR',
#  'amtinstpaidbefduel24m_4187115A',
#  'annualeffectiverate_199L_MAX',
#  'annualeffectiverate_63L_MAX',
#  'annuity_853A_MAX',
#  'annuity_853A_MEAN',
#  'annuity_853A_VAR',
#  'annuitynextmonth_57A',
#  'applicationcnt_361L',
#  'applications30d_658L',
#  'applicationscnt_1086L',
#  'applicationscnt_464L',
#  'applicationscnt_867L',
#  'approvaldate_319D_MAX',
#  'approvaldate_319D_MEAN',
#  'avgdbddpdlast3m_4187120P',
#  'avgdbdtollast24m_4525197P',
#  'avgdpdtolclosure24_3658938P',
#  'avginstallast24m_3658937A',
#  'avglnamtstart24m_4525187A',
#  'avgmaxdpdlast9m_3716943P',
#  'avgoutstandbalancel6m_4187114A',
#  'avgpmtlast12m_4525200A',
#  'bankacctype_710L',
#  'byoccupationinc_3656910L_MAX',
#  'cancelreason_3545846M_MAX',
#  'cardtype_51L',
#  'childnum_21L_MAX',
#  'classificationofcontr_13M_MAX',
#  'classificationofcontr_400M_MAX',
#  'clientscnt_100L',
#  'clientscnt_1022L',
#  'clientscnt_1071L',
#  'clientscnt_1130L',
#  'clientscnt_157L',
#  'clientscnt_257L',
#  'clientscnt_304L',
#  'clientscnt_360L',
#  'clientscnt_493L',
#  'clientscnt_533L',
#  'clientscnt_887L',
#  'clientscnt_946L',
#  'cntincpaycont9m_3716944L',
#  'cntpmts24_3658933L',
#  'collater_typofvalofguarant_298M_MAX',
#  'collater_typofvalofguarant_407M_MAX',
#  'collater_valueofguarantee_1124L_MAX',
#  'collater_valueofguarantee_876L_MAX',
#  'collaterals_typeofguarante_359M_MAX',
#  'collaterals_typeofguarante_669M_MAX',
#  'commnoinclast6m_3546845L',
#  'contaddr_matchlist_1032L_MAX',
#  'contaddr_smempladdr_334L_MAX',
#  'contractssum_5085716L',
#  'contractst_545M_MAX',
#  'contractst_964M_MAX',
#  'creationdate_885D_MAX',
#  'creationdate_885D_MEAN',
#  'credacc_actualbalance_314A_MEAN',
#  'credacc_credlmt_575A_MAX',
#  'credacc_credlmt_575A_MEAN',
#  'credacc_credlmt_575A_VAR',
#  'credacc_maxhisbal_375A_MEAN',
#  'credacc_minhisbal_90A_MEAN',
#  'credacc_status_367L_MAX',
#  'credacc_transactions_402L_MAX',
#  'credamount_590A_MAX',
#  'credamount_590A_MEAN',
#  'credamount_590A_VAR',
#  'credamount_770A',
#  'credlmt_230A_MEAN',
#  'credlmt_230A_VAR',
#  'credlmt_935A_MEAN',
#  'credlmt_935A_VAR',
#  'credtype_322L',
#  'credtype_587L_MAX',
#  'currdebt_22A',
#  'currdebt_94A_MAX',
#  'currdebt_94A_MEAN',
#  'currdebt_94A_VAR',
#  'currdebtcredtyperange_828A',
#  'dateactivated_425D_MAX',
#  'dateactivated_425D_MEAN',
#  'datefirstoffer_1144D',
#  'datelastinstal40dpd_247D',
#  'datelastunpaid_3546854D',
#  'dateofcredend_289D_MAX',
#  'dateofcredend_353D_MAX',
#  'dateofcredend_353D_MEAN',
#  'dateofcredstart_181D_MAX',
#  'dateofcredstart_739D_MEAN',
#  'dateofrealrepmt_138D_MAX',
#  'dateofrealrepmt_138D_MEAN',
#  'day',
#  'days180_256L',
#  'days30_165L',
#  'daysoverduetolerancedd_3976961L',
#  'debtoutstand_525A_MAX',
#  'debtoverdue_47A_MAX',
#  'deductiondate_4917603D_MAX',
#  'deductiondate_4917603D_MEAN',
#  'deferredmnthsnum_166L',
#  'description_351M_MAX',
#  'description_5085714M',
#  'disbursedcredamount_1113A',
#  'disbursementtype_67L',
#  'downpmt_116A',
#  'downpmt_134A_MAX',
#  'downpmt_134A_MEAN',
#  'downpmt_134A_VAR',
#  'dpdmax_139P_VAR',
#  'dpdmax_757P_MEAN',
#  'dpdmax_757P_VAR',
#  'dpdmaxdatemonth_442T_MAX',
#  'dpdmaxdatemonth_89T_MAX',
#  'dpdmaxdateyear_596T_MAX',
#  'dpdmaxdateyear_896T_MAX',
#  'dtlastpmt_581D_MAX',
#  'dtlastpmt_581D_MEAN',
#  'dtlastpmtallstes_3545839D_MAX',
#  'dtlastpmtallstes_3545839D_MEAN',
#  'dtlastpmtallstes_4499206D',
#  'education_1103M',
#  'education_1138M_MAX',
#  'education_88M',
#  'education_927M_MAX',
#  'eir_270L',
#  'empl_employedtotal_800L_MAX',
#  'empl_industry_691L_MAX',
#  'empladdr_district_926M_MAX',
#  'empladdr_zipcode_114M_MAX',
#  'employedfrom_700D_MEAN',
#  'equalitydataagreement_891L',
#  'familystate_447L_MAX',
#  'familystate_726L_MAX',
#  'financialinstitution_382M_MAX',
#  'financialinstitution_591M_MAX',
#  'firstclxcampaign_1125D',
#  'firstdatedue_489D',
#  'firstnonzeroinstldate_307D_MAX',
#  'firstnonzeroinstldate_307D_MEAN',
#  'firstquarter_103L',
#  'fourthquarter_440L',
#  'homephncnt_628L',
#  'housetype_905L_MAX',
#  'incometype_1044T_MAX',
#  'inittransactionamount_650A',
#  'inittransactioncode_186L',
#  'inittransactioncode_279L_MAX',
#  'instlamount_768A_MEAN',
#  'instlamount_768A_VAR',
#  'instlamount_852A_MEAN',
#  'instlamount_852A_VAR',
#  'isbidproduct_1095L',
#  'isbidproduct_390L_MAX',
#  'isdebitcard_527L_MAX',
#  'isdebitcard_729L',
#  'language1_981M_MAX',
#  'lastactivateddate_801D',
#  'lastapprcommoditycat_1041M',
#  'lastapprcredamount_781A',
#  'lastapprdate_640D',
#  'lastcancelreason_561M',
#  'lastdelinqdate_224D',
#  'lastrejectcommoditycat_161M',
#  'lastrejectcommodtypec_5251769M',
#  'lastrejectcredamount_222A',
#  'lastrejectdate_50D',
#  'lastrejectreason_759M',
#  'lastrejectreasonclient_4145040M',
#  'lastst_736L',
#  'lastupdate_1112D_MAX',
#  'lastupdate_1112D_MEAN',
#  'lastupdate_388D_MAX',
#  'lastupdate_388D_MEAN',
#  'maininc_215A',
#  'mainoccupationinc_384A_MAX',
#  'mainoccupationinc_437A_MAX',
#  'mainoccupationinc_437A_MEAN',
#  'mainoccupationinc_437A_VAR',
#  'maritalst_385M',
#  'maritalst_893M',
#  'mastercontrelectronic_519L',
#  'mastercontrexist_109L',
#  'maxannuity_159A',
#  'maxdbddpdlast1m_3658939P',
#  'maxdbddpdtollast12m_3658940P',
#  'maxdbddpdtollast6m_4187119P',
#  'maxdebt4_972A',
#  'maxdpdfrom6mto36m_3546853P',
#  'maxdpdinstldate_3546855D',
#  'maxdpdinstlnum_3546846P',
#  'maxdpdlast24m_143P',
#  'maxdpdlast3m_392P',
#  'maxdpdtolerance_374P',
#  'maxdpdtolerance_577P_MEAN',
#  'maxdpdtolerance_577P_VAR',
#  'maxinstallast24m_3658928A',
#  'maxlnamtstart6m_4525199A',
#  'maxoutstandbalancel12m_4187113A',
#  'maxpmtlast3m_4525190A',
#  'mindbddpdlast24m_3658935P',
#  'mobilephncnt_593L',
#  'month',
#  'monthlyinstlamount_332A_MEAN',
#  'monthlyinstlamount_332A_VAR',
#  'monthlyinstlamount_674A_MAX',
#  'monthlyinstlamount_674A_MEAN',
#  'monthlyinstlamount_674A_VAR',
#  'nominalrate_281L_MAX',
#  'nominalrate_498L_MAX',
#  'num_group1_MAX',
#  'num_group1_MAX_10',
#  'num_group1_MAX_11',
#  'num_group1_MAX_12',
#  'num_group1_MAX_3',
#  'num_group1_MAX_4',
#  'num_group1_MAX_5',
#  'num_group1_MAX_6',
#  'num_group1_MAX_9',
#  'num_group2_MAX',
#  'numactivecreds_622L',
#  'numactivecredschannel_414L',
#  'numactiverelcontr_750L',
#  'numberofcontrsvalue_258L_MAX',
#  'numberofcontrsvalue_358L_MAX',
#  'numberofinstls_229L_MAX',
#  'numberofinstls_320L_MAX',
#  'numberofoutstandinstls_520L_MAX',
#  'numberofoutstandinstls_59L_MAX',
#  'numberofoverdueinstlmax_1039L_MAX',
#  'numberofoverdueinstlmax_1151L_MAX',
#  'numberofoverdueinstlmaxdat_148D_MEAN',
#  'numberofoverdueinstlmaxdat_641D_MEAN',
#  'numberofoverdueinstls_725L_MAX',
#  'numberofoverdueinstls_834L_MAX',
#  'numcontrs3months_479L',
#  'numincomingpmts_3546848L',
#  'numinstls_657L',
#  'numinstlsallpaid_934L',
#  'numinstlswithdpd10_728L',
#  'numinstlswithdpd5_4187116L',
#  'numinstlswithoutdpd_562L',
#  'numinstpaid_4499208L',
#  'numinstpaidearly3d_3546850L',
#  'numinstpaidearly5d_1087L',
#  'numinstpaidearly5dest_4493211L',
#  'numinstpaidearly_338L',
#  'numinstpaidlastcontr_4325080L',
#  'numinstpaidlate1d_3546852L',
#  'numinstregularpaid_973L',
#  'numinstregularpaidest_4493210L',
#  'numinsttopaygr_769L',
#  'numnotactivated_1143L',
#  'numpmtchanneldd_318L',
#  'numrejects9m_859L',
#  'opencred_647L',
#  'openingdate_313D_MEAN',
#  'openingdate_857D_MEAN',
#  'outstandingamount_354A_MEAN',
#  'outstandingamount_354A_VAR',
#  'outstandingamount_362A_MEAN',
#  'outstandingamount_362A_VAR',
#  'outstandingdebt_522A_MAX',
#  'outstandingdebt_522A_MEAN',
#  'outstandingdebt_522A_VAR',
#  'overdueamount_31A_MEAN',
#  'overdueamount_31A_VAR',
#  'overdueamount_659A_MEAN',
#  'overdueamount_659A_VAR',
#  'overdueamountmax2_14A_MEAN',
#  'overdueamountmax2_14A_VAR',
#  'overdueamountmax2_398A_MEAN',
#  'overdueamountmax2_398A_VAR',
#  'overdueamountmax2date_1002D_MEAN',
#  'overdueamountmax2date_1142D_MEAN',
#  'overdueamountmax_155A_VAR',
#  'overdueamountmax_35A_MEAN',
#  'overdueamountmax_35A_VAR',
#  'overdueamountmaxdatemonth_284T_MAX',
#  'overdueamountmaxdatemonth_365T_MAX',
#  'overdueamountmaxdateyear_2T_MAX',
#  'overdueamountmaxdateyear_994T_MAX',
#  'paytype1st_925L',
#  'paytype_783L',
#  'pctinstlsallpaidearl3d_427L',
#  'pctinstlsallpaidlat10d_839L',
#  'pctinstlsallpaidlate1d_3546856L',
#  'periodicityofpmts_1102L_MAX',
#  'periodicityofpmts_837L_MAX',
#  'pmtamount_36A_MEAN',
#  'pmtamount_36A_VAR',
#  'pmtaverage_3A',
#  'pmtaverage_4527227A',
#  'pmtcount_4527229L',
#  'pmtcount_693L',
#  'pmtnum_254L',
#  'pmtnum_8L_MAX',
#  'pmts_dpd_1073P_MEAN',
#  'pmts_dpd_1073P_VAR',
#  'pmts_dpd_303P_MAX',
#  'pmts_dpd_303P_MEAN',
#  'pmts_dpd_303P_VAR',
#  'pmts_month_158T_MAX',
#  'pmts_month_706T_MAX',
#  'pmts_overdue_1140A_MEAN',
#  'pmts_overdue_1140A_VAR',
#  'pmts_overdue_1152A_MEAN',
#  'pmts_overdue_1152A_VAR',
#  'pmts_year_1139T_MAX',
#  'pmts_year_507T_MAX',
#  'pmtscount_423L',
#  'pmtssum_45A',
#  'posfpd10lastmonth_333P',
#  'posfpd30lastmonth_3976960P',
#  'posfstqpd30lastmonth_3976962P',
#  'postype_4733339M_MAX',
#  'price_1097A',
#  'processingdate_168D_MAX',
#  'processingdate_168D_MEAN',
#  'prolongationcount_1120L_MAX',
#  'purposeofcred_426M_MAX',
#  'purposeofcred_874M_MAX',
#  'recorddate_4527225D_MAX',
#  'refreshdate_3813885D_MAX',
#  'refreshdate_3813885D_MEAN',
#  'rejectreason_755M_MAX',
#  'rejectreasonclient_4145042M_MAX',
#  'relationshiptoclient_415T_MAX',
#  'relationshiptoclient_642T_MAX',
#  'remitter_829L_MAX',
#  'requesttype_4525192L',
#  'residualamount_488A_MAX',
#  'residualamount_488A_VAR',
#  'residualamount_856A_MEAN',
#  'residualamount_856A_VAR',
#  'responsedate_1012D',
#  'responsedate_4527233D',
#  'responsedate_4917613D',
#  'revolvingaccount_394A_MEAN',
#  'role_1084L_MAX',
#  'safeguarantyflag_411L_MAX',
#  'secondquarter_766L',
#  'sellerplacecnt_915L',
#  'sex_738L_MAX',
#  'status_219L_MAX',
#  'subjectrole_182M_MAX',
#  'subjectrole_93M_MAX',
#  'subjectroles_name_541M_MAX',
#  'subjectroles_name_838M_MAX',
#  'sumoutstandtotal_3546847A',
#  'sumoutstandtotalest_4493215A',
#  'thirdquarter_1082L',
#  'totalamount_6A_MAX',
#  'totalamount_6A_MEAN',
#  'totalamount_6A_VAR',
#  'totalamount_996A_MEAN',
#  'totalamount_996A_VAR',
#  'totaldebt_9A',
#  'totaldebtoverduevalue_718A_MEAN',
#  'totaloutstanddebtvalue_39A_MEAN',
#  'totaloutstanddebtvalue_668A_MEAN',
#  'totalsettled_863A',
#  'totinstallast1m_4525188A',
#  'twobodfilling_608L',
#  'type_25L_MAX',
#  'typesuite_864L',
#  'validfrom_1069D',
#  'weekday',
#  'year',
#  'dob'] + cat_cols

In [12]:
# top 40
features = ['pctinstlsallpaidlate1d_3546856L',
 'pctinstlsallpaidlat10d_839L',
 'numinstlswithdpd10_728L',
 'avgmaxdpdlast9m_3716943P',
 'pctinstlsallpaidearl3d_427L',
 'datelastinstal40dpd_247D',
 'numberofoverdueinstlmaxdat_148D_MEAN',
 'overdueamountmax2date_1002D_MEAN',
 'pmts_dpd_303P_MEAN',
 'numinstlswithdpd5_4187116L',
 'days180_256L',
 'numberofoverdueinstlmaxdat_641D_MEAN',
 'dpdmax_757P_MEAN',
 'numrejects9m_859L',
 'days30_165L',
 'overdueamountmax2date_1142D_MEAN',
 'firstclxcampaign_1125D',
 'mobilephncnt_593L',
 'pmtaverage_4527227A',
 'avgdbdtollast24m_4525197P',
 'numinstpaidearly3d_3546850L',
 'lastrejectdate_50D',
 'dob',
 'employedfrom_700D_MEAN',
 'numinstlsallpaid_934L',
 'daysoverduetolerancedd_3976961L',
 'maxdpdtolerance_577P_MEAN',
 'numcontrs3months_479L',
 'pmtaverage_3A',
 'numinstpaidearly_338L',
 'avgdpdtolclosure24_3658938P',
 'numinstpaidearly5dest_4493211L',
 'dpdmaxdateyear_596T_MAX',
 'numinstlswithoutdpd_562L',
 'lastupdate_388D_MEAN',
 'dateofrealrepmt_138D_MEAN',
 'maxdpdtolerance_374P',
 'sellerplacecnt_915L',
 'credacc_actualbalance_314A_MEAN',
 'maxdbddpdtollast12m_3658940P',
 'mindbddpdlast24m_3658935P'] + cat_cols

# partition

In [13]:
df_train = df[df['partition']=='train'].reset_index(drop=True)
df_test  = df[df['partition']=='test'].reset_index(drop=True)
n_train = df_train.shape[0]

In [14]:
del df
gc.collect()

0

In [15]:
df_train['score'] = 0.0
df_test['score'] = 0.0

# modeling

In [16]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, StratifiedKFold
import lightgbm as lgb 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

## train - lgbm

In [17]:
#### do splits ahead of time to improve trial speed - turn off for submission
k              = 5

# # split by week num
group_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
                      StratifiedGroupKFold(n_splits=k).split(np.arange(n_train),
                                                             df_train['target'],
                                                             groups = df_train['week_num'])]

In [23]:
bestp = {'random_state': 117,
'objective': 'binary',
'metric': 'auc',
'extra_trees': True,
'verbose': -1,
'max_bin': 200,
'device': 'gpu',
'gpu_use_dp': True,
'num_estimators': 10000,
'bagging_fraction': 0.868002269912819,
'bagging_freq': 2,
'feature_fraction': 0.41266236705896975,
'lambda_l1': 6.120758896260358e-06,
# 'lambda_l2': 0.8347087226281584,
'learning_rate': 0.05521805410758059,
'min_data_in_leaf': 1317,
'min_gain_to_split': 0.606904976068058,
'num_leaves': 4790}

In [24]:
# group stratify ensemble
for train_idx, valid_idx in group_splits:
    model = lgb.LGBMClassifier(**bestp)  
    model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
              eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(50)])
    df_test['score'] += model.predict_proba(df_test[features])[:,1] / k

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.809884
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.812385
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.816381
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.817047
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.811956


# submission

In [20]:
df_test[['case_id','score']].to_csv('submission.csv', index=False)
df_test[['case_id','score']].head()

Unnamed: 0,case_id,score
0,57543,0.016306
1,57552,0.023324
2,57630,0.006685
3,57634,0.012593
4,57549,0.030705
