In [1]:
import numpy as np
import pandas as pd
import polars as pl
import os, gc, warnings
from glob import glob
from pathlib import Path
from typing import Any
from itertools import combinations, permutations

warnings.filterwarnings("ignore")

ROOT = Path("/kaggle/input/home-credit-credit-risk-model-stability")
TRAIN_DIR = ROOT / "parquet_files" / "train"
TEST_DIR = ROOT / "parquet_files" / "test"

In [40]:
features = ['case_id', 'date_decision', 'target', 'credamount_770A', 'applicationcnt_361L', 'applications30d_658L', 'applicationscnt_1086L', 'applicationscnt_464L', 'applicationscnt_867L', 'clientscnt_1022L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_1130L', 'clientscnt_157L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'deferredmnthsnum_166L', 'disbursedcredamount_1113A', 'downpmt_116A', 'homephncnt_628L', 'isbidproduct_1095L', 'mobilephncnt_593L', 'numactivecreds_622L', 'numactivecredschannel_414L', 'numactiverelcontr_750L', 'numcontrs3months_479L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'numrejects9m_859L', 'sellerplacecnt_915L', 'birth_259D_MAX', 'mainoccupationinc_384A_MAX', 'num_group1_MAX_9', 'INDIV1_num_group1_MAX_9', 'INDIV1_personindex_1023L_MAX', 'INDIV1_persontype_1072L_MAX', 'INDIV1_persontype_792L_MAX', 'month', 'day', 'weekday', 'assignmentdate_238D', 'assignmentdate_4527235D', 'pmtaverage_4527227A', 'pmtcount_4527229L', 'birthdate_574D', 'contractssum_5085716L', 'INDIV1_contractsum_5085717L_MAX', 'dateofbirth_337D', 'days180_256L', 'days30_165L', 'days360_512L', 'firstquarter_103L', 'fourthquarter_440L', 'secondquarter_766L', 'thirdquarter_1082L', 'debtoutstand_525A_MAX', 'debtoverdue_47A_MAX', 'refreshdate_3813885D_MAX', 'refreshdate_3813885D_MEAN', 'pmtaverage_3A', 'pmtcount_693L', 'pmtscount_423L', 'pmtssum_45A', 'responsedate_1012D', 'responsedate_4527233D', 'responsedate_4917613D', 'actualdpdtolerance_344P', 'amtinstpaidbefduel24m_4187115A', 'numinstlswithdpd5_4187116L', 'annuitynextmonth_57A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'numinstls_657L', 'totalsettled_863A', 'mindbddpdlast24m_3658935P', 'avgdbddpdlast3m_4187120P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'maxinstallast24m_3658928A', 'avglnamtstart24m_4525187A', 'avgmaxdpdlast9m_3716943P', 'avgoutstandbalancel6m_4187114A', 'avgpmtlast12m_4525200A', 'cntincpaycont9m_3716944L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'maxdpdfrom6mto36m_3546853P', 'datefirstoffer_1144D', 'datelastinstal40dpd_247D', 'datelastunpaid_3546854D', 'daysoverduetolerancedd_3976961L', 'numinsttopaygr_769L', 'dtlastpmtallstes_4499206D', 'eir_270L', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'inittransactionamount_650A', 'lastactivateddate_801D', 'lastapplicationdate_877D', 'num_group1_MAX', 'creationdate_885D_MEAN', 'INDIV1_num_group1_MAX', 'lastapprcredamount_781A', 'lastapprdate_640D', 'lastdelinqdate_224D', 'lastrejectcredamount_222A', 'lastrejectdate_50D', 'maininc_215A', 'mastercontrelectronic_519L', 'mastercontrexist_109L', 'maxannuity_159A', 'maxdebt4_972A', 'maxdpdlast24m_143P', 'maxdpdlast3m_392P', 'maxdpdtolerance_374P', 'maxdbddpdlast1m_3658939P', 'maxdbddpdtollast12m_3658940P', 'maxdbddpdtollast6m_4187119P', 'maxdpdinstldate_3546855D', 'maxdpdinstlnum_3546846P', 'maxlnamtstart6m_4525199A', 'maxoutstandbalancel12m_4187113A', 'maxpmtlast3m_4525190A', 'numinstpaidearly_338L', 'numinstpaidearly5d_1087L', 'numinstpaidlate1d_3546852L', 'numincomingpmts_3546848L', 'numinstlsallpaid_934L', 'numinstlswithdpd10_728L', 'numinstlswithoutdpd_562L', 'numinstpaid_4499208L', 'numinstpaidearly3d_3546850L', 'numinstregularpaidest_4493210L', 'numinstpaidearly5dest_4493211L', 'sumoutstandtotalest_4493215A', 'numinstpaidlastcontr_4325080L', 'numinstregularpaid_973L', 'pctinstlsallpaidearl3d_427L', 'pctinstlsallpaidlate1d_3546856L', 'pctinstlsallpaidlat10d_839L', 'pctinstlsallpaidlate4d_3546849L', 'pctinstlsallpaidlate6d_3546844L', 'pmtnum_254L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'posfstqpd30lastmonth_3976962P', 'price_1097A', 'sumoutstandtotal_3546847A', 'totaldebt_9A', 'totinstallast1m_4525188A', 'validfrom_1069D', 'actualdpd_943P_MEAN', 'annuity_853A_MAX', 'annuity_853A_MEAN', 'approvaldate_319D_MAX', 'approvaldate_319D_MEAN', 'byoccupationinc_3656910L_MAX', 'childnum_21L_MAX', 'credacc_actualbalance_314A_MEAN', 'credacc_maxhisbal_375A_MEAN', 'credacc_minhisbal_90A_MEAN', 'credacc_transactions_402L_MAX', 'credacc_credlmt_575A_MAX', 'credamount_590A_MAX', 'downpmt_134A_MAX', 'credacc_credlmt_575A_MEAN', 'credamount_590A_MEAN', 'downpmt_134A_MEAN', 'currdebt_94A_MAX', 'currdebt_94A_MEAN', 'dateactivated_425D_MAX', 'dateactivated_425D_MEAN', 'dtlastpmt_581D_MAX', 'dtlastpmt_581D_MEAN', 'dtlastpmtallstes_3545839D_MAX', 'dtlastpmtallstes_3545839D_MEAN', 'employedfrom_700D_MEAN', 'firstnonzeroinstldate_307D_MAX', 'firstnonzeroinstldate_307D_MEAN', 'mainoccupationinc_437A_MAX', 'mainoccupationinc_437A_MEAN', 'maxdpdtolerance_577P_MEAN', 'outstandingdebt_522A_MAX', 'outstandingdebt_522A_MEAN', 'pmtnum_8L_MAX', 'revolvingaccount_394A_MEAN', 'actualdpd_943P_VAR', 'annuity_853A_VAR', 'credacc_credlmt_575A_VAR', 'credamount_590A_VAR', 'downpmt_134A_VAR', 'currdebt_94A_VAR', 'mainoccupationinc_437A_VAR', 'maxdpdtolerance_577P_VAR', 'outstandingdebt_522A_VAR', 'INDIV1_actualdpd_943P_MAX', 'INDIV1_annuity_853A_MAX', 'INDIV1_approvaldate_319D_MAX', 'INDIV1_childnum_21L_MAX', 'INDIV1_credacc_credlmt_575A_MAX', 'INDIV1_credamount_590A_MAX', 'INDIV1_downpmt_134A_MAX', 'INDIV1_currdebt_94A_MAX', 'INDIV1_dateactivated_425D_MAX', 'INDIV1_dtlastpmt_581D_MAX', 'INDIV1_dtlastpmtallstes_3545839D_MAX', 'INDIV1_employedfrom_700D_MAX', 'INDIV1_firstnonzeroinstldate_307D_MAX', 'INDIV1_mainoccupationinc_437A_MAX', 'INDIV1_maxdpdtolerance_577P_MAX', 'INDIV1_outstandingdebt_522A_MAX', 'INDIV1_pmtnum_8L_MAX', 'INDIV1_revolvingaccount_394A_MAX', 'amount_4527230A_MEAN', 'num_group1_MAX_3', 'recorddate_4527225D_MAX', 'INDIV1_num_group1_MAX_3', 'amount_4527230A_VAR', 'amount_4917619A_MEAN', 'deductiondate_4917603D_MAX', 'num_group1_MAX_4', 'deductiondate_4917603D_MEAN', 'INDIV1_amount_4917619A_MAX', 'INDIV1_deductiondate_4917603D_MAX', 'INDIV1_num_group1_MAX_4', 'amount_4917619A_VAR', 'num_group1_MAX_5', 'pmtamount_36A_MEAN', 'processingdate_168D_MAX', 'processingdate_168D_MEAN', 'INDIV1_num_group1_MAX_5', 'pmtamount_36A_VAR', 'annualeffectiverate_199L_MAX', 'annualeffectiverate_63L_MAX', 'credlmt_230A_MEAN', 'credlmt_935A_MEAN', 'dateofcredend_289D_MAX', 'dateofcredstart_739D_MEAN', 'lastupdate_1112D_MAX', 'numberofcontrsvalue_258L_MAX', 'numberofoverdueinstlmax_1039L_MAX', 'overdueamountmax2_14A_MEAN', 'overdueamountmaxdatemonth_365T_MAX', 'overdueamountmaxdateyear_2T_MAX', 'totaloutstanddebtvalue_39A_MEAN', 'lastupdate_1112D_MEAN', 'INDIV1_dateofcredend_289D_MAX', 'INDIV1_dateofcredstart_739D_MAX', 'INDIV1_lastupdate_1112D_MAX', 'INDIV1_overdueamountmax2_14A_MAX', 'INDIV1_overdueamountmaxdatemonth_365T_MAX', 'INDIV1_overdueamountmaxdateyear_2T_MAX', 'INDIV1_totaldebtoverduevalue_178A_MAX', 'INDIV1_totaloutstanddebtvalue_39A_MAX', 'pmts_month_158T_MAX', 'INDIV1_pmts_year_1139T_MAX', 'pmts_overdue_1140A_MEAN', 'INDIV1_pmts_month_158T_MAX', 'dateofcredend_353D_MAX', 'dateofcredstart_181D_MAX', 'numberofoverdueinstlmax_1151L_MAX', 'overdueamountmax2_398A_MEAN', 'dateofcredend_353D_MEAN', 'INDIV1_dateofcredend_353D_MAX', 'INDIV1_numberofoverdueinstlmax_1151L_MAX', 'INDIV1_overdueamountmax2_398A_MAX', 'dateofrealrepmt_138D_MAX', 'dateofrealrepmt_138D_MEAN', 'pmts_dpd_1073P_MEAN', 'dpdmaxdatemonth_89T_MAX', 'dpdmaxdateyear_596T_MAX', 'pmts_dpd_303P_MAX', 'dpdmaxdatemonth_442T_MAX', 'dpdmaxdateyear_896T_MAX', 'dpdmax_757P_MEAN', 'pmts_dpd_303P_MEAN', 'instlamount_768A_MEAN', 'instlamount_852A_MEAN', 'lastupdate_388D_MAX', 'lastupdate_388D_MEAN', 'monthlyinstlamount_332A_MEAN', 'monthlyinstlamount_674A_MAX', 'monthlyinstlamount_674A_MEAN', 'nominalrate_281L_MAX', 'nominalrate_498L_MAX', 'num_group1_MAX_6', 'INDIV1_num_group1_MAX_6', 'numberofcontrsvalue_358L_MAX', 'totaldebtoverduevalue_718A_MEAN', 'totaloutstanddebtvalue_668A_MEAN', 'INDIV1_totaloutstanddebtvalue_668A_MAX', 'numberofinstls_229L_MAX', 'numberofinstls_320L_MAX', 'numberofoutstandinstls_520L_MAX', 'numberofoutstandinstls_59L_MAX', 'numberofoverdueinstlmaxdat_148D_MEAN', 'numberofoverdueinstlmaxdat_641D_MEAN', 'numberofoverdueinstls_725L_MAX', 'overdueamount_659A_MEAN', 'numberofoverdueinstls_834L_MAX', 'outstandingamount_354A_MEAN', 'outstandingamount_362A_MEAN', 'overdueamount_31A_MEAN', 'overdueamountmax2date_1002D_MEAN', 'overdueamountmax2date_1142D_MEAN', 'overdueamountmax_35A_MEAN', 'overdueamountmaxdatemonth_284T_MAX', 'overdueamountmaxdateyear_994T_MAX', 'pmts_overdue_1152A_MEAN', 'periodicityofpmts_1102L_MAX', 'periodicityofpmts_837L_MAX', 'prolongationcount_1120L_MAX', 'residualamount_488A_MAX', 'residualamount_856A_MEAN', 'totalamount_6A_MAX', 'totalamount_6A_MEAN', 'totalamount_996A_MEAN', 'credlmt_230A_VAR', 'credlmt_935A_VAR', 'dpdmax_139P_VAR', 'dpdmax_757P_VAR', 'instlamount_768A_VAR', 'instlamount_852A_VAR', 'monthlyinstlamount_332A_VAR', 'monthlyinstlamount_674A_VAR', 'outstandingamount_354A_VAR', 'outstandingamount_362A_VAR', 'overdueamount_31A_VAR', 'overdueamount_659A_VAR', 'overdueamountmax2_14A_VAR', 'overdueamountmax2_398A_VAR', 'overdueamountmax_155A_VAR', 'overdueamountmax_35A_VAR', 'residualamount_488A_VAR', 'residualamount_856A_VAR', 'totalamount_6A_VAR', 'totalamount_996A_VAR', 'INDIV1_annualeffectiverate_63L_MAX', 'INDIV1_credlmt_230A_MAX', 'INDIV1_credlmt_935A_MAX', 'INDIV1_dateofrealrepmt_138D_MAX', 'INDIV1_pmts_dpd_1073P_MEAN', 'INDIV1_dpdmaxdatemonth_89T_MAX', 'INDIV1_dpdmaxdateyear_596T_MAX', 'INDIV1_dpdmax_757P_MAX', 'INDIV1_dpdmaxdatemonth_442T_MAX', 'INDIV1_dpdmaxdateyear_896T_MAX', 'INDIV1_pmts_dpd_303P_MEAN', 'INDIV1_instlamount_768A_MAX', 'INDIV1_instlamount_852A_MAX', 'INDIV1_lastupdate_388D_MAX', 'INDIV1_monthlyinstlamount_332A_MAX', 'INDIV1_monthlyinstlamount_674A_MAX', 'INDIV1_nominalrate_281L_MAX', 'INDIV1_nominalrate_498L_MAX', 'INDIV1_numberofinstls_229L_MAX', 'INDIV1_numberofinstls_320L_MAX', 'INDIV1_numberofoutstandinstls_520L_MAX', 'INDIV1_numberofoutstandinstls_59L_MAX', 'INDIV1_numberofoverdueinstlmaxdat_148D_MAX', 'INDIV1_numberofoverdueinstlmaxdat_641D_MAX', 'INDIV1_numberofoverdueinstls_725L_MAX', 'INDIV1_numberofoverdueinstls_834L_MAX', 'INDIV1_outstandingamount_354A_MAX', 'INDIV1_outstandingamount_362A_MAX', 'INDIV1_overdueamount_31A_MAX', 'INDIV1_overdueamount_659A_MAX', 'INDIV1_overdueamountmax2date_1002D_MAX', 'INDIV1_overdueamountmax2date_1142D_MAX', 'INDIV1_pmts_overdue_1152A_MEAN', 'INDIV1_overdueamountmaxdatemonth_284T_MAX', 'INDIV1_overdueamountmaxdateyear_994T_MAX', 'INDIV1_periodicityofpmts_1102L_MAX', 'INDIV1_periodicityofpmts_837L_MAX', 'INDIV1_refreshdate_3813885D_MAX', 'INDIV1_residualamount_488A_MAX', 'INDIV1_residualamount_856A_MAX', 'INDIV1_totalamount_6A_MAX', 'INDIV1_totalamount_996A_MAX', 'empl_employedfrom_271D_MAX', 'amount_416A_MEAN', 'num_group1_MAX_10', 'INDIV1_openingdate_313D_MAX', 'INDIV1_num_group1_MAX_10', 'num_group1_MAX_11', 'INDIV1_num_group1_MAX_11', 'openingdate_857D_MEAN', 'INDIV1_openingdate_857D_MAX', 'collater_valueofguarantee_1124L_MAX', 'collater_valueofguarantee_876L_MAX', 'num_group1_MAX_12', 'num_group2_MAX', 'INDIV1_num_group1_MAX_12', 'INDIV1_num_group2_MAX', 'pmts_month_706T_MAX', 'pmts_year_507T_MAX', 'pmts_dpd_1073P_VAR', 'pmts_dpd_303P_VAR', 'pmts_overdue_1140A_VAR', 'pmts_overdue_1152A_VAR', 'INDIV1_collater_valueofguarantee_1124L_MAX', 'INDIV1_collater_valueofguarantee_876L_MAX', 'INDIV1_pmts_month_706T_MAX', 'INDIV1_pmts_year_507T_MAX', 'INDIV1_pmts_dpd_1073P_VAR', 'INDIV1_pmts_dpd_303P_VAR', 'INDIV1_pmts_overdue_1140A_VAR', 'INDIV1_pmts_overdue_1152A_VAR']

# preprocessing

In [3]:
class Utility:
    @staticmethod
    def get_feat_defs(ending_with: str) -> None:
        """
        Retrieves feature definitions from a CSV file based on the specified ending.

        Args:
        - ending_with (str): Ending to filter feature definitions.

        Returns:
        - pl.DataFrame: Filtered feature definitions.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv")

        filtered_feats: pl.DataFrame = feat_defs.filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )

        with pl.Config(fmt_str_lengths=200, tbl_rows=-1):
            print(filtered_feats)

        filtered_feats = None
        feat_defs = None

    @staticmethod
    def find_index(lst: list[Any], item: Any) -> int | None:
        """
        Finds the index of an item in a list.

        Args:
        - lst (list): List to search.
        - item (Any): Item to find in the list.

        Returns:
        - int | None: Index of the item if found, otherwise None.
        """
        try:
            return lst.index(item)
        except ValueError:
            return None

    @staticmethod
    def dtype_to_str(dtype: pl.DataType) -> str:
        """
        Converts Polars data type to string representation.

        Args:
        - dtype (pl.DataType): Polars data type.

        Returns:
        - str: String representation of the data type.
        """
        dtype_map = {
            pl.Decimal: "Decimal",
            pl.Float32: "Float32",
            pl.Float64: "Float64",
            pl.UInt8: "UInt8",
            pl.UInt16: "UInt16",
            pl.UInt32: "UInt32",
            pl.UInt64: "UInt64",
            pl.Int8: "Int8",
            pl.Int16: "Int16",
            pl.Int32: "Int32",
            pl.Int64: "Int64",
            pl.Date: "Date",
            pl.Datetime: "Datetime",
            pl.Duration: "Duration",
            pl.Time: "Time",
            pl.Array: "Array",
            pl.List: "List",
            pl.Struct: "Struct",
            pl.String: "String",
            pl.Categorical: "Categorical",
            pl.Enum: "Enum",
            pl.Utf8: "Utf8",
            pl.Binary: "Binary",
            pl.Boolean: "Boolean",
            pl.Null: "Null",
            pl.Object: "Object",
            pl.Unknown: "Unknown",
        }

        return dtype_map.get(dtype)

    @staticmethod
    def find_feat_occur(regex_path: str, ending_with: str) -> pl.DataFrame:
        """
        Finds occurrences of features ending with a specific string in Parquet files.

        Args:
        - regex_path (str): Regular expression to match Parquet file paths.
        - ending_with (str): Ending to filter feature names.

        Returns:
        - pl.DataFrame: DataFrame containing feature definitions, data types, and file locations.
        """
        feat_defs: pl.DataFrame = pl.read_csv(ROOT / "feature_definitions.csv").filter(
            pl.col("Variable").apply(lambda var: var.endswith(ending_with))
        )
        feat_defs.sort(by=["Variable"])

        feats: list[pl.String] = feat_defs["Variable"].to_list()
        feats.sort()

        occurrences: list[list] = [[set(), set()] for _ in range(feat_defs.height)]

        for path in glob(str(regex_path)):
            df_schema: dict = pl.read_parquet_schema(path)

            for feat, dtype in df_schema.items():
                index: int = Utility.find_index(feats, feat)
                if index != None:
                    occurrences[index][0].add(Utility.dtype_to_str(dtype))
                    occurrences[index][1].add(Path(path).stem)

        data_types: list[str] = [None] * feat_defs.height
        file_locs: list[str] = [None] * feat_defs.height

        for i, feat in enumerate(feats):
            data_types[i] = list(occurrences[i][0])
            file_locs[i] = list(occurrences[i][1])

        feat_defs = feat_defs.with_columns(pl.Series(data_types).alias("Data_Type(s)"))
        feat_defs = feat_defs.with_columns(pl.Series(file_locs).alias("File_Loc(s)"))

        return feat_defs

    def reduce_memory_usage(df: pl.DataFrame, name) -> pl.DataFrame:
        """
        Reduces memory usage of a DataFrame by converting column types.

        Args:
        - df (pl.DataFrame): DataFrame to optimize.
        - name (str): Name of the DataFrame.

        Returns:
        - pl.DataFrame: Optimized DataFrame.
        """
        print(
            f"Memory usage of dataframe \"{name}\" is {round(df.estimated_size('mb'), 4)} MB."
        )

        int_types = [
            pl.Int8,
            pl.Int16,
            pl.Int32,
            pl.Int64,
            pl.UInt8,
            pl.UInt16,
            pl.UInt32,
            pl.UInt64,
        ]
        float_types = [pl.Float32, pl.Float64]

        for col in df.columns:
            col_type = df[col].dtype
            if col_type in int_types + float_types:
                c_min = df[col].min()
                c_max = df[col].max()

                if c_min is not None and c_max is not None:
                    if col_type in int_types:
                        if c_min >= 0:
                            if (
                                c_min >= np.iinfo(np.uint8).min
                                and c_max <= np.iinfo(np.uint8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt8))
                            elif (
                                c_min >= np.iinfo(np.uint16).min
                                and c_max <= np.iinfo(np.uint16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt16))
                            elif (
                                c_min >= np.iinfo(np.uint32).min
                                and c_max <= np.iinfo(np.uint32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt32))
                            elif (
                                c_min >= np.iinfo(np.uint64).min
                                and c_max <= np.iinfo(np.uint64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.UInt64))
                        else:
                            if (
                                c_min >= np.iinfo(np.int8).min
                                and c_max <= np.iinfo(np.int8).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int8))
                            elif (
                                c_min >= np.iinfo(np.int16).min
                                and c_max <= np.iinfo(np.int16).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int16))
                            elif (
                                c_min >= np.iinfo(np.int32).min
                                and c_max <= np.iinfo(np.int32).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int32))
                            elif (
                                c_min >= np.iinfo(np.int64).min
                                and c_max <= np.iinfo(np.int64).max
                            ):
                                df = df.with_columns(df[col].cast(pl.Int64))
                    elif col_type in float_types:
                        if (
                            c_min > np.finfo(np.float32).min
                            and c_max < np.finfo(np.float32).max
                        ):
                            df = df.with_columns(df[col].cast(pl.Float32))

        print(
            f"Memory usage of dataframe \"{name}\" became {round(df.estimated_size('mb'), 4)} MB."
        )

        return df

    def to_pandas(df: pl.DataFrame, cat_cols: list[str] = None) -> (pd.DataFrame, list[str]):  # type: ignore
        """
        Converts a Polars DataFrame to a Pandas DataFrame.

        Args:
        - df (pl.DataFrame): Polars DataFrame to convert.
        - cat_cols (list[str]): List of categorical columns. Default is None.

        Returns:
        - (pd.DataFrame, list[str]): Tuple containing the converted Pandas DataFrame and categorical columns.
        """
        df: pd.DataFrame = df.to_pandas()

        if cat_cols is None:
            cat_cols = list(df.select_dtypes("object").columns)

        df[cat_cols] = df[cat_cols].astype("category")

        return df, cat_cols

In [4]:
class Aggregator:
    @staticmethod
    def max_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating maximum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for maximum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_max: list[pl.Series] = [
            pl.col(col).max().alias(f"{col}_MAX") for col in cols
        ]

        return expr_max

    @staticmethod
    def min_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating minimum values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for minimum values.
        """
        cols: list[str] = [
            col
            for col in df.columns
            if (col[-1] in ("P", "M", "A", "D", "T", "L")) or ("num_group" in col)
        ]

        expr_min: list[pl.Series] = [
            pl.col(col).min().alias(f"{col}_MIN") for col in cols
        ]

        return expr_min

    @staticmethod
    def mean_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mean values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mean values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).mean().alias(f"{col}_MEAN") for col in cols
        ]

        return expr_mean

    @staticmethod
    def var_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating variance for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for variance.
        """
        cols: list[str] = [col for col in df.columns if col.endswith(("P", "A", "D"))]

        expr_mean: list[pl.Series] = [
            pl.col(col).var().alias(f"{col}_VAR") for col in cols
        ]

        return expr_mean

    @staticmethod
    def mode_expr(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Generates expressions for calculating mode values for specific columns.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of expressions for mode values.
        """
        cols: list[str] = [col for col in df.columns if col.endswith("M")]

        expr_mode: list[pl.Series] = [
            pl.col(col).drop_nulls().mode().first().alias(f"{col}_MODE") for col in cols
        ]

        return expr_mode

    @staticmethod
    def get_exprs(df: pl.LazyFrame) -> list[pl.Series]:
        """
        Combines expressions for maximum, mean, and variance calculations.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - list[pl.Series]: List of combined expressions.
        """
        exprs = (
            Aggregator.max_expr(df) + Aggregator.mean_expr(df) + Aggregator.var_expr(df)
        )

        return exprs

In [5]:
a = features+['case_id','target','MONTH','date_decision','WEEK_NUM']

In [6]:
class SchemaGen:
    @staticmethod
    def change_dtypes(df: pl.LazyFrame) -> pl.LazyFrame:
        """
        Changes the data types of columns in the DataFrame.

        Args:
        - df (pl.LazyFrame): Input LazyFrame.

        Returns:
        - pl.LazyFrame: LazyFrame with modified data types.
        """
        for col in df.columns:
            if col == "case_id":
                df = df.with_columns(pl.col(col).cast(pl.UInt32).alias(col))
            elif col in ["WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.UInt16).alias(col))
            elif col == "date_decision" or col[-1] == "D":
                df = df.with_columns(pl.col(col).cast(pl.Date).alias(col))
            elif col[-1] in ["P", "A"]:
                df = df.with_columns(pl.col(col).cast(pl.Float64).alias(col))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.String))
        return df

    @staticmethod
    def scan_files(glob_path: str, depth: int = None):
        chunks = []
        for path in glob(str(glob_path)):
            df = pl.read_parquet(path, low_memory=True, rechunk=True)
            df = df.pipe(SchemaGen.change_dtypes)
            
            if depth in [1, 2]:
                
                indiv = df.filter(pl.col("num_group1")==0).group_by("case_id").agg(Aggregator.get_exprs(df))
                indiv.columns = [f"INDIV1_{x}" if x!='case_id' else x for x in indiv.columns ]
                df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
                df = df.join(indiv,how='inner',on='case_id')
                del indiv
                    
            chunks.append(df)
        df = pl.concat(chunks, how="vertical_relaxed")
        del chunks
        gc.collect()

        df = df.unique(subset=["case_id"]) 
        
        return df[[x for x in df.columns if x in a]]

    @staticmethod
    def join_dataframes(df_base, depth_0, depth_1, depth_2):
        for i, df in enumerate(depth_0 + depth_1 + depth_2):
            df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
        return df_base


In [7]:
def filter_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Filters columns in the DataFrame based on null percentage and unique values for string columns.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with filtered columns.
    """
    for col in df.columns:
        if col not in ["case_id", "year", "month", "week_num", "target"]:
            null_pct = df[col].is_null().mean()

            if null_pct > 0.95:
                df = df.drop(col)

    for col in df.columns:
        if (col not in ["case_id", "year", "month", "week_num", "target"]) & (
            df[col].dtype == pl.String
        ):
            freq = df[col].n_unique()

            if (freq > 200) | (freq == 1):
                df = df.drop(col)

    return df


def transform_cols(df: pl.DataFrame) -> pl.DataFrame:
    """
    Transforms columns in the DataFrame according to predefined rules.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed columns.
    """
    if "riskassesment_302T" in df.columns:
        if df["riskassesment_302T"].dtype == pl.Null:
            df = df.with_columns(
                [
                    pl.Series(
                        "riskassesment_302T_rng", df["riskassesment_302T"], pl.UInt8
                    ),
                    pl.Series(
                        "riskassesment_302T_mean", df["riskassesment_302T"], pl.UInt8
                    ),
                ]
            )
        else:
            pct_low: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[0].replace("%", ""))
                .cast(pl.UInt8)
            )
            pct_high: pl.Series = (
                df["riskassesment_302T"]
                .str.split(" - ")
                .apply(lambda x: x[1].replace("%", ""))
                .cast(pl.UInt8)
            )

            diff: pl.Series = pct_high - pct_low
            avg: pl.Series = ((pct_low + pct_high) / 2).cast(pl.Float32)

            del pct_high, pct_low
            gc.collect()

            df = df.with_columns(
                [
                    diff.alias("riskassesment_302T_rng"),
                    avg.alias("riskassesment_302T_mean"),
                ]
            )

        df.drop("riskassesment_302T")

    return df


def handle_dates(df: pl.DataFrame) -> pl.DataFrame:
    """
    Handles date columns in the DataFrame.

    Args:
    - df (pl.DataFrame): Input DataFrame.

    Returns:
    - pl.DataFrame: DataFrame with transformed date columns.
    """
    for col in df.columns:
        if (col[-1] == 'D') or ('D_' in col):
            df = df.with_columns(pl.col(col) - pl.col("date_decision"))
            df = df.with_columns(pl.col(col).dt.total_days().cast(pl.Int32))

            
    df = df.drop("MONTH")
    df = df.drop("WEEK_NUM")

    df = df.with_columns(pl.col("date_decision").dt.month().alias("month").cast(pl.Int16))
    df = df.with_columns(pl.col("date_decision").dt.day().alias("day").cast(pl.UInt8))
    df = df.with_columns(pl.col("date_decision").dt.weekday().alias("weekday").cast(pl.UInt8))
    df = df.with_columns(((pl.col("date_decision") - pl.lit("2019-01-01").cast(pl.Date)).dt.total_days() / 7).floor().alias("week_num").cast(pl.Int32))

#     df = df.drop("date_decision")
    return df 

In [8]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TRAIN_DIR / "train_static_cb_0.parquet"),
        SchemaGen.scan_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_other_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_person_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_deposit_1.parquet", 1),
        SchemaGen.scan_files(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
    ],
}

df_train: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(filter_cols)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .pipe(Utility.reduce_memory_usage, "df_train")
)

del data_store
gc.collect()

print(f"Train data shape: {df_train.shape}")
display(df_train.head(10))

Memory usage of dataframe "df_train" is 4219.6652 MB.
Memory usage of dataframe "df_train" became 2158.0606 MB.
Train data shape: (1526659, 415)


case_id,date_decision,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days180_256L,days30_165L,days360_512L,firstquarter_103L,fourthquarter_440L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuitynextmonth_57A,applicationcnt_361L,applications30d_658L,applicationscnt_1086L,applicationscnt_464L,applicationscnt_867L,avgdbddpdlast3m_4187120P,avgdbdtollast24m_4525197P,avgdpdtolclosure24_3658938P,avginstallast24m_3658937A,avglnamtstart24m_4525187A,…,INDIV1_num_group1_MAX_11,INDIV1_openingdate_857D_MAX,collater_valueofguarantee_1124L_MAX,collater_valueofguarantee_876L_MAX,num_group1_MAX_12,num_group2_MAX,pmts_dpd_303P_MAX,pmts_month_158T_MAX,pmts_month_706T_MAX,pmts_year_507T_MAX,pmts_dpd_1073P_MEAN,pmts_dpd_303P_MEAN,pmts_overdue_1140A_MEAN,pmts_overdue_1152A_MEAN,pmts_dpd_1073P_VAR,pmts_dpd_303P_VAR,pmts_overdue_1140A_VAR,pmts_overdue_1152A_VAR,INDIV1_collater_valueofguarantee_1124L_MAX,INDIV1_collater_valueofguarantee_876L_MAX,INDIV1_num_group1_MAX_12,INDIV1_num_group2_MAX,INDIV1_pmts_month_158T_MAX,INDIV1_pmts_month_706T_MAX,INDIV1_pmts_year_1139T_MAX,INDIV1_pmts_year_507T_MAX,INDIV1_pmts_dpd_1073P_MEAN,INDIV1_pmts_dpd_303P_MEAN,INDIV1_pmts_overdue_1152A_MEAN,INDIV1_pmts_dpd_1073P_VAR,INDIV1_pmts_dpd_303P_VAR,INDIV1_pmts_overdue_1140A_VAR,INDIV1_pmts_overdue_1152A_VAR,month,day,weekday,week_num
u32,date,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i8,u8,i8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,…,u8,i16,f32,f32,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u8,u8,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,u8,u8,u8,u8
2666437,2020-02-12,0,,14.0,,,-24362.0,2.0,0.0,3.0,0.0,1.0,,7641.200195,6.0,,,,,14.0,,1.0,4.0,0.0,53994.320312,0.0,0.0,0.0,0.0,0.0,0.0,-16.0,-9.0,0.0,5399.399902,,…,0.0,,,0.0,3.0,35.0,9.0,,12.0,2020.0,,0.307692,,105.528961,,1.716346,,284882.21875,,0.0,0.0,23.0,,12.0,,2017.0,,0.631579,2.842105,,4.578948,,72.473686,2,12,3,58
1763328,2020-01-26,0,,,,,-20844.0,5.0,0.0,7.0,3.0,3.0,,,,,,,,14.0,,2.0,7.0,0.0,24582.599609,0.0,0.0,0.0,0.0,0.0,11.0,,3.0,4.0,3816.800049,,…,,,0.0,0.0,14.0,35.0,208.0,12.0,12.0,2020.0,0.933333,14.445714,1000.992188,2945.807617,6.891954,2051.89209,8833225.0,58566812.0,0.0,0.0,0.0,35.0,12.0,12.0,2021.0,2008.0,1.0,4.0,0.0,8.347826,,2664494.5,,1,26,7,55
2677629,2020-03-17,0,,14.0,,,-25066.0,4.0,0.0,5.0,1.0,5.0,,11742.0,13.0,,,,,14.0,,2.0,3.0,0.0,161073.765625,2584.800049,0.0,0.0,0.0,0.0,5.0,-10.0,-6.0,0.0,19165.400391,123812.804688,…,,,0.0,131000.0,11.0,35.0,1134.0,12.0,12.0,2021.0,0.9375,152.877045,272.430634,11747.194336,14.641129,111778.507812,2189714.5,566663360.0,0.0,131000.0,0.0,35.0,12.0,12.0,2021.0,2007.0,0.526316,0.0,0.0,4.263158,0.0,3681500.0,0.0,3,17,2,63
2595039,2019-07-11,0,,,-22320.0,,-22320.0,4.0,2.0,7.0,8.0,1.0,,,,,6.0,39004.113281,14.0,,,6.0,7.0,0.0,194304.671875,2282.199951,0.0,0.0,0.0,0.0,4.0,-5.0,,0.0,8142.0,,…,,,0.0,0.0,9.0,35.0,0.0,12.0,12.0,2020.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,12.0,12.0,2020.0,2007.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,11,4,27
1245849,2019-01-08,0,-3907.0,,-25119.0,,-25119.0,3.0,1.0,5.0,1.0,2.0,17632.367188,,,6.0,,,14.0,,,3.0,2.0,0.0,,5506.200195,0.0,0.0,0.0,0.0,2.0,,,1.0,4959.399902,,…,0.0,-1602.0,0.0,,2.0,23.0,,12.0,,,0.555556,,163.690002,,7.057348,,481596.96875,,0.0,,0.0,23.0,12.0,,2019.0,,1.291667,,,17.867754,,5856.276367,,1,8,2,1
1728777,2020-01-02,0,,,,,-12845.0,5.0,0.0,8.0,4.0,6.0,,,,,,,,14.0,,7.0,17.0,0.0,44697.152344,2488.400146,0.0,1.0,0.0,0.0,2.0,-28.0,-6.0,1.0,2348.0,34598.0,…,,,0.0,0.0,5.0,35.0,2478.0,12.0,12.0,2020.0,0.0,469.521118,0.0,738.780029,0.0,828220.1875,0.0,2207470.5,0.0,0.0,0.0,35.0,12.0,12.0,2020.0,2015.0,0.0,6.125,942.336792,0.0,183.244568,0.0,4903116.5,1,2,4,52
206082,2020-02-19,0,,,,,-18737.0,11.0,1.0,44.0,26.0,18.0,,,,,,,,12.0,,19.0,12.0,0.0,102083.695312,0.0,0.0,0.0,0.0,0.0,14.0,,-1.0,0.0,6624.200195,,…,,,0.0,4971001.0,47.0,35.0,155.0,12.0,12.0,2021.0,10.745763,4.35503,7826.001465,3161.945068,561.882507,327.944641,164799296.0,69433952.0,0.0,65000.0,0.0,35.0,12.0,12.0,2021.0,2006.0,2.25,0.0,0.0,56.804348,,525442.0625,,2,19,3,59
823244,2019-10-06,0,,,,,,,,,,,,,,,2.0,1085.0,14.0,14.0,,,,,,0.0,0.0,0.0,0.0,2.0,0.0,,,,,,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,6,7,39
1883813,2020-07-27,0,,,,0.0,-22276.0,0.0,0.0,1.0,0.0,0.0,,,,,,,,,14.0,0.0,1.0,0.0,11998.0,0.0,0.0,0.0,0.0,0.0,2.0,,-8.0,0.0,2999.600098,,…,,,0.0,0.0,1.0,35.0,1.0,12.0,12.0,2021.0,0.0,0.034483,0.0,112.524139,0.0,0.034483,0.0,367188.78125,0.0,0.0,0.0,35.0,12.0,12.0,2021.0,2021.0,0.0,0.041667,135.96666,0.0,0.041667,0.0,443686.4375,7,27,1,81
1900087,2020-08-22,0,,,,1066400.0,-10614.0,5.0,4.0,10.0,7.0,4.0,,,,,,,,,14.0,2.0,11.0,0.0,11683.839844,0.0,0.0,1.0,0.0,0.0,4.0,11.0,-4.0,0.0,1565.0,,…,,,0.0,0.0,4.0,35.0,6.0,12.0,12.0,2020.0,0.661765,0.484848,404.984772,682.593933,9.033143,1.820076,2515550.5,2691600.0,0.0,0.0,0.0,35.0,12.0,12.0,2021.0,2016.0,0.0,1.066667,1501.706665,0.0,3.495238,0.0,4834228.5,8,22,6,85


In [9]:
data_store: dict = {
    "df_base": SchemaGen.scan_files(TEST_DIR / "test_base.parquet"),
    "depth_0": [
        SchemaGen.scan_files(TEST_DIR / "test_static_cb_0.parquet"),
        SchemaGen.scan_files(TEST_DIR / "test_static_0_*.parquet"),
    ],
    "depth_1": [
        SchemaGen.scan_files(TEST_DIR / "test_applprev_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_a_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_tax_registry_c_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_1_*.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_other_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_person_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_deposit_1.parquet", 1),
        SchemaGen.scan_files(TEST_DIR / "test_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_a_2_*.parquet", 2),
        SchemaGen.scan_files(TEST_DIR / "test_credit_bureau_b_2.parquet", 2),
    ],
}

df_test: pl.DataFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(transform_cols)
    .pipe(handle_dates)
    .select([col for col in df_train.columns if col != "target"])
    .pipe(Utility.reduce_memory_usage, "df_test")
)

del data_store
gc.collect()

print(f"Test data shape: {df_test.shape}")

Memory usage of dataframe "df_test" is 0.0278 MB.
Memory usage of dataframe "df_test" became 0.0155 MB.
Test data shape: (10, 414)


In [10]:
if 'target' not in df_test.columns:
    df_test = df_test.with_columns(pl.lit(0).alias('target').cast(pl.Int8))

In [11]:
df, cat_cols = Utility.to_pandas(
                        pl.concat([
                                 df_train.with_columns(pl.lit('train').alias('partition')),
                                 df_test.select(df_train.columns).with_columns(pl.lit('test').alias('partition'))
                                    ],how='vertical_relaxed')
                                )

In [12]:
df_train = df[df['partition']=='train'].reset_index(drop=True)
n_train = df_train.shape[0]

In [13]:
df_test  = df.loc[df['partition']=='test',features].reset_index(drop=True)

In [14]:
del df
gc.collect()

0

In [15]:
df_train['score'] = 0.0
df_test['score'] = 0.0

# exploration

In [16]:
# placeholder for exploration code 
# im doing it on databricks so its faster

# modeling

In [17]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedGroupKFold, StratifiedKFold
import lightgbm as lgb 
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK
from hyperopt.pyll import scope
from functools import partial

In [18]:
def gini_stability(base, w_fallingrate=88.0, w_resstd=-0.5):
    gini_in_time = base.loc[:, ["week_num", "target", "score"]]\
        .sort_values("week_num")\
        .groupby("week_num")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()
    
    x = np.arange(len(gini_in_time))
    y = gini_in_time
    a, b = np.polyfit(x, y, 1)
    y_hat = a*x + b
    residuals = y - y_hat
    res_std = np.std(residuals)
    avg_gini = np.mean(gini_in_time)
    return avg_gini + w_fallingrate * min(0, a) + w_resstd * res_std

## tune - lgbm

In [19]:
# def get_lgbm_base_params():
#     base_params = {
#         'boosting_type':'gbdt',
#         'random_state': 117,
#         'objective': 'binary',
#         'metric': 'auc',
#         'extra_trees':True,
#         'verbose': -1,
#         'max_bin': 256,
# #         'device_type': 'gpu', 'gpu_use_dp':True,
        
#     }
#     return base_params

In [20]:
# # set up search space - turn off for submission
# lgbm_search_space_setup = {
#     'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
#     'max_depth': scope.int(hp.uniform('max_depth', 3, 25)),
#     'l1_regularization': hp.loguniform('l1_regularization', np.log(.001), np.log(1000)),
#     'l2_regularization':hp.loguniform('l2_regularization',np.log(.001), np.log(100)),
#     'cat_l2': hp.loguniform('cat_l2', np.log(.001), np.log(100)),
#     'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
#     'bagging_freq': scope.int(hp.uniform('bagging_freq', 2, 10)),
#     'learning_rate' : hp.loguniform('learning_rate', np.log(0.001), np.log(.1)),
#     'n_estimators':scope.int(hp.uniform('n_estimators', 500, 2000)),
#     'num_leaves': scope.int(hp.uniform('num_leaves', 50, 5000)),
# }
# lgbm_search_space = get_lgbm_base_params()
# for k,v in lgbm_search_space_setup.items():
#     lgbm_search_space[k] = v

In [21]:
#### do splits ahead of time to improve trial speed - turn off for submission
# k              = 5

# # split by week num
# group_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
#                       StratifiedGroupKFold(n_splits=k).split(np.arange(n_train),
#                                                              df_train['target'],
#                                                              groups = df_train['week_num'])]
# # split by target
# strat_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
#                       StratifiedKFold(n_splits=k).split(np.arange(n_train),
#                                                              df_train['target'])]


# # single split
# train_idx, test_idx, _, _ = train_test_split(np.arange(n_train),df_train['target'], test_size=0.1, random_state=117,stratify = df_train['target'])
# single_splits = [(train_idx, test_idx)]

In [22]:
# def trial_fn_lgbm_gini_stability(params,splits = None):
    

#     for train_idx, valid_idx in splits:
#         model = lgb.LGBMClassifier(**params)  
#         model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
#                   eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
#                   eval_metric='auc',
#                   callbacks=[lgb.early_stopping(50)])
#         df_train.loc[valid_idx,'score'] = model.predict_proba(df_train.loc[valid_idx,features])
    
    
#     score = gini_stability(df_train)
        
#     out = {"status": STATUS_OK, "loss": -score} # always minimizes
#     return out

In [23]:
# def trial_fn_lgbm_auc(params,splits = None):
    
#     scores = []
#     for train_idx, valid_idx in splits:
#         model = lgb.LGBMClassifier(**params)  
#         model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
#                   eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
#                   eval_metric='auc',
#                   callbacks=[lgb.early_stopping(50)])

#         score = roc_auc_score(df_train.loc[valid_idx,'target'],model.predict_proba(df_train.loc[valid_idx,features]))
#         scores.append(score)
    
#     score = np.mean(scores) - np.std(scores)
#     out = {"status": STATUS_OK, "loss": -score} # always minimizes
#     return out

In [24]:
# best_params = fmin(fn=partial(trial_fn_lgbm_auc, splits = group_splits),
#                     space=search_space,
#                     algo=tpe.suggest,
#                     max_evals=100,
#                     timeout=60*60*3 # seconds
#                   )
# int_params = ['max_depth','n_estimators','bagging_freq','num_leaves']
# bestp = get_base_params()
# for k,v in best_params.items():
#     if k in int_params:
#         bestp[k] = int(v)
#     else:
#         bestp[k] = v
# bestp

In [25]:
# best_params = fmin(fn=partial(trial_fn_lgbm_gini_stability, splits = group_splits),
#                     space=search_space,
#                     algo=tpe.suggest,
#                     max_evals=100,
#                     timeout=60*60*3 # seconds
#                   )
# int_params = ['max_depth','n_estimators','bagging_freq','num_leaves']
# bestp = get_base_params()
# for k,v in best_params.items():
#     if k in int_params:
#         bestp[k] = int(v)
#     else:
#         bestp[k] = v
# bestp

## train - lgbm

In [26]:
#### do splits ahead of time to improve trial speed - turn off for submission
k              = 8

# # split by week num
group_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
                      StratifiedGroupKFold(n_splits=k).split(np.arange(n_train),
                                                             df_train['target'],
                                                             groups = df_train['week_num'])]
# # split by target
# strat_splits   = [(train_idx,valid_idx) for train_idx,valid_idx in 
#                       StratifiedKFold(n_splits=k).split(np.arange(n_train),
#                                                              df_train['target'])]


# single split
# train_idx, test_idx, _, _ = train_test_split(np.arange(n_train),df_train['target'], test_size=0.1, random_state=117,stratify = df_train['target'])

In [27]:
bestp = {
'random_state': 117,
'objective': 'binary',
'metric': 'auc',
'extra_trees': True,
'verbose': -1,
'max_bin': 255,
'device': 'gpu',
'gpu_use_dp': True,
'num_estimators': 10000,
'cat_l2': 0.012031967105637895,
'feature_fraction': 0.7225434843853376,
'lambda_l1': 0.016,
'lambda_l2': 6.861494353196064e-06,
'learning_rate': 0.05,
'min_data_in_leaf': 3400,
'num_leaves': 2095
}

In [41]:
features = [x for x in features if x not in ['case_id','target','date_decision']]

In [43]:
# group stratify ensemble
for train_idx, valid_idx in group_splits:
    model = lgb.LGBMClassifier(**bestp)  
    model.fit(df_train.loc[train_idx,features], df_train.loc[train_idx,'target'],
              eval_set=[(df_train.loc[valid_idx,features], df_train.loc[valid_idx,'target'])],
              eval_metric='auc',
              callbacks=[lgb.early_stopping(50)])
    df_test['score'] += model.predict_proba(df_test[features])[:,1] / k

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.838455
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.825904
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.840434
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.838033
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.840878
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.842896
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[100]	valid_0's auc: 0.846211


KeyboardInterrupt: 

# submission

In [44]:
# why are rules not implemented to avoid people having to do this hack?
# hosts are ok with hacking... so I guess we hack. 
# https://www.kaggle.com/competitions/home-credit-credit-risk-model-stability/discussion/497337


# condition = df_test['week_num'] < (df_test['week_num'].max()-df_test['week_num'].min())/2 + df_test['week_num'].min() 

# offset = -.05

# df_test.loc[condition, 'score'] = (df_test.loc[condition, 'score'] - offset).clip(0)

In [45]:
df_test[['case_id','score']].to_csv('submission.csv', index=False)
df_test[['case_id','score']].head()

Unnamed: 0,case_id,score
0,57631,0.075115
1,57569,0.140563
2,57633,0.08281
3,57634,0.078603
4,57632,0.066602
