In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')


from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [4]:
class Pipeline:

    def set_table_dtypes(df):
        for col in df.columns:
            if col in ["case_id", "WEEK_NUM", "num_group1", "num_group2"]:
                df = df.with_columns(pl.col(col).cast(pl.Int64))
            elif col in ["date_decision"]:
                df = df.with_columns(pl.col(col).cast(pl.Date))
            elif col[-1] in ("P", "A"):
                df = df.with_columns(pl.col(col).cast(pl.Float64))
            elif col[-1] in ("M",):
                df = df.with_columns(pl.col(col).cast(pl.Utf8))
            elif col[-1] in ("D",):
                df = df.with_columns(pl.col(col).cast(pl.Date))
        return df

    def handle_dates(df):
        for col in df.columns:
            if col[-1] in ("D",):
                df = df.with_columns(pl.col(col) - pl.col("date_decision"))  #!!?
                df = df.with_columns(pl.col(col).dt.total_days()) # t - t-1
        df = df.drop("date_decision", "MONTH")
        return df

    def filter_cols(df):
        for col in df.columns:
            if col not in ["target", "case_id", "WEEK_NUM"]:
                isnull = df[col].is_null().mean()
                if isnull > 0.9:
                    df = df.drop(col)

        for col in df.columns:
            if (col not in ["target", "case_id", "WEEK_NUM"]) & (df[col].dtype == pl.Utf8):
                freq = df[col].n_unique()
                if (freq == 1) | (freq > 200):
                    df = df.drop(col)

        return df
class Aggregator:
    @staticmethod
    def num_expr(df):

        cols = [col for col in df.columns if col[-1] in ("P", "A")]

        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]


        return expr_max

    @staticmethod
    def date_expr(df):

        cols = [col for col in df.columns if col[-1] in ("D",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        return expr_max

    @staticmethod
    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)
        return exprs

'''
class Aggregator:
    #Please add or subtract features yourself, be aware that too many features will take up too much space.
    def num_expr(df):
        cols = [col for col in df.columns if col[-1] in ("P", "A")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]

        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return expr_max +expr_last+expr_mean

    def date_expr(df):
        cols = [col for col in df.columns if col[-1] in ("D")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        expr_mean = [pl.mean(col).alias(f"mean_{col}") for col in cols]
        return  expr_max +expr_last+expr_mean

    def str_expr(df):
        cols = [col for col in df.columns if col[-1] in ("M",)]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        #expr_count = [pl.count(col).alias(f"count_{col}") for col in cols]
        return  expr_max +expr_last#+expr_count

    def other_expr(df):
        cols = [col for col in df.columns if col[-1] in ("T", "L")]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        return  expr_max +expr_last

    def count_expr(df):
        cols = [col for col in df.columns if "num_group" in col]
        expr_max = [pl.max(col).alias(f"max_{col}") for col in cols]
        #expr_min = [pl.min(col).alias(f"min_{col}") for col in cols]
        expr_last = [pl.last(col).alias(f"last_{col}") for col in cols]
        #expr_first = [pl.first(col).alias(f"first_{col}") for col in cols]
        return  expr_max +expr_last

    def get_exprs(df):
        exprs = Aggregator.num_expr(df) + \
                Aggregator.date_expr(df) + \
                Aggregator.str_expr(df) + \
                Aggregator.other_expr(df) + \
                Aggregator.count_expr(df)

        return exprs
'''
def read_file(path, depth=None):
    df = pl.read_parquet(path)
    df = df.pipe(Pipeline.set_table_dtypes)
    if depth in [1,2]:
        df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
    return df

def read_files(regex_path, depth=None):
    chunks = []

    for path in glob(str(regex_path)):
        df = pl.read_parquet(path)
        df = df.pipe(Pipeline.set_table_dtypes)
        if depth in [1, 2]:
            df = df.group_by("case_id").agg(Aggregator.get_exprs(df))
        chunks.append(df)

    df = pl.concat(chunks, how="vertical_relaxed")
    df = df.unique(subset=["case_id"])
    return df

def feature_eng(df_base, depth_0, depth_1, depth_2):
    df_base = (
        df_base
        .with_columns(
            month_decision = pl.col("date_decision").dt.month(),
            weekday_decision = pl.col("date_decision").dt.weekday(),
        )
    )
    for i, df in enumerate(depth_0 + depth_1 + depth_2):
        df_base = df_base.join(df, how="left", on="case_id", suffix=f"_{i}")
    df_base = df_base.pipe(Pipeline.handle_dates)
    return df_base

def to_pandas(df_data, cat_cols=None):
    df_data = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = list(df_data.select_dtypes("object").columns)
    df_data[cat_cols] = df_data[cat_cols].astype("category")
    return df_data, cat_cols

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype
        if str(col_type)=="category":
            continue

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            continue
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [5]:
ROOT            = Path('/content/drive/MyDrive/home-credit-credit-risk-model-stability(1)')

TRAIN_DIR       = ROOT / "parquet_files" / "train"
TEST_DIR        = ROOT / "parquet_files" / "test"

In [6]:
data_store = {
    "df_base": read_file(TRAIN_DIR / "train_base.parquet"),
    "depth_0": [
        read_file(TRAIN_DIR / "train_static_cb_0.parquet"),
        read_files(TRAIN_DIR / "train_static_0_*.parquet"),
    ],
    "depth_1": [
        read_files(TRAIN_DIR / "train_applprev_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_a_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_tax_registry_c_1.parquet", 1),
        read_files(TRAIN_DIR / "train_credit_bureau_a_1_*.parquet", 1),
        read_file(TRAIN_DIR / "train_credit_bureau_b_1.parquet", 1),
        read_file(TRAIN_DIR / "train_other_1.parquet", 1),
        read_file(TRAIN_DIR / "train_person_1.parquet", 1),
        read_file(TRAIN_DIR / "train_deposit_1.parquet", 1),
        read_file(TRAIN_DIR / "train_debitcard_1.parquet", 1),
    ],
    "depth_2": [
        read_file(TRAIN_DIR / "train_credit_bureau_b_2.parquet", 2),
        read_files(TRAIN_DIR / "train_credit_bureau_a_2_*.parquet", 2),
        #read_file(TRAIN_DIR / "train_applprev_2.parquet", 2),
        #read_file(TRAIN_DIR / "train_person_2.parquet", 2)
    ]
}

In [7]:
# selected_features = ["target", "case_id", "WEEK_NUM",'month_decision', 'weekday_decision', 'birthdate_574D', 'dateofbirth_337D', 'days120_123L', 'days180_256L', 'days30_165L', 'days360_512L', 'days90_310L', 'firstquarter_103L', 'fourthquarter_440L', 'numberofqueries_373L', 'pmtscount_423L', 'responsedate_1012D', 'responsedate_4917613D', 'thirdquarter_1082L', 'annuitynextmonth_57A', 'applications30d_658L', 'applicationscnt_867L', 'avgdbddpdlast24m_3658932P', 'avgdbdtollast24m_4525197P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'clientscnt12m_3712952L', 'clientscnt3m_3712950L', 'clientscnt_100L', 'clientscnt_1071L', 'clientscnt_257L', 'clientscnt_304L', 'clientscnt_360L', 'clientscnt_493L', 'clientscnt_533L', 'clientscnt_887L', 'clientscnt_946L', 'cntpmts24_3658933L', 'commnoinclast6m_3546845L', 'credamount_770A', 'currdebt_22A', 'currdebtcredtyperange_828A', 'daysoverduetolerancedd_3976961L', 'deferredmnthsnum_166L', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'eir_270L', 'firstclxcampaign_1125D', 'homephncnt_628L', 'lastapprcredamount_781A', 'lastdelinqdate_224D', 'mastercontrelectronic_519L', 'maxdbddpdtollast12m_3658940P', 'maxdebt4_972A', 'maxdpdfrom6mto36m_3546853P', 'maxdpdinstldate_3546855D', 'maxdpdlast12m_727P', 'maxlnamtstart6m_4525199A', 'mindbddpdlast24m_3658935P', 'monthsannuity_845L', 'numactivecreds_622L', 'numactiverelcontr_750L', 'numinstls_657L', 'numinstlswithdpd5_4187116L', 'numinstpaidearly_338L', 'numinstpaidearlyest_4493214L', 'numinstregularpaid_973L', 'numinstregularpaidest_4493210L', 'numinsttopaygrest_4493213L', 'numinstunpaidmax_3546851L', 'numnotactivated_1143L', 'numpmtchanneldd_318L', 'pctinstlsallpaidearl3d_427L', 'pctinstlsallpaidlat10d_839L', 'pctinstlsallpaidlate4d_3546849L', 'pmtnum_254L', 'posfpd10lastmonth_333P', 'posfpd30lastmonth_3976960P', 'sumoutstandtotal_3546847A', 'totalsettled_863A', 'totinstallast1m_4525188A', 'max_actualdpd_943P', 'max_annuity_853A', 'max_credacc_credlmt_575A', 'max_credacc_minhisbal_90A', 'max_credamount_590A', 'max_currdebt_94A', 'max_downpmt_134A', 'max_maxdpdtolerance_577P', 'max_dtlastpmt_581D', 'max_dtlastpmtallstes_3545839D', 'max_firstnonzeroinstldate_307D', 'max_childnum_21L', 'max_credacc_transactions_402L', 'max_tenor_203L', 'max_amount_4527230A', 'max_num_group1_3', 'max_debtoutstand_525A', 'max_debtoverdue_47A', 'max_dpdmax_139P', 'max_dpdmax_757P', 'max_residualamount_856A', 'max_totalamount_996A', 'max_totaloutstanddebtvalue_39A', 'max_totaloutstanddebtvalue_668A', 'max_dateofcredend_289D', 'max_dateofcredend_353D', 'max_dateofcredstart_181D', 'max_dateofrealrepmt_138D', 'max_lastupdate_1112D', 'max_lastupdate_388D', 'max_numberofoverdueinstlmaxdat_148D', 'max_numberofoverdueinstlmaxdat_641D', 'max_refreshdate_3813885D', 'max_contractsum_5085717L', 'max_dpdmaxdatemonth_442T', 'max_dpdmaxdateyear_896T', 'max_nominalrate_281L', 'max_numberofcontrsvalue_258L', 'max_numberofinstls_320L', 'max_numberofoutstandinstls_520L', 'max_numberofoverdueinstlmax_1039L', 'max_numberofoverdueinstlmax_1151L', 'max_numberofoverdueinstls_725L', 'max_overdueamountmaxdatemonth_365T', 'max_periodicityofpmts_1102L', 'max_periodicityofpmts_837L', 'max_num_group1_6', 'max_persontype_1072L', 'max_pmts_overdue_1140A', 'max_collater_valueofguarantee_876L', 'max_pmts_year_1139T']


In [8]:
# data=data[:train_size]
# df_test_1=data[train_size:]
# y=data['target'][:train_size]
# data=data.drop(columns=["target", "case_id", "WEEK_NUM"])
# df_test_1=df_test_1.drop(columns=["target", "case_id", "WEEK_NUM"])
# y_test=data['target'][train_size:]

In [9]:

df_train = feature_eng(**data_store)
print("train data shape:\t", df_train.shape)


del data_store
gc.collect()
df_train = df_train.pipe(Pipeline.filter_cols)
df_train, cat_cols = to_pandas(df_train)
df_train = reduce_mem_usage(df_train)
print("train data shape:\t", df_train.shape)
nums=df_train.select_dtypes(exclude='category').columns

train data shape:	 (1526659, 472)
Memory usage of dataframe is 3243.87 MB
Memory usage after optimization is: 1126.94 MB
Decreased by 65.3%
train data shape:	 (1526659, 342)


In [10]:
data=df_train.copy()

In [11]:
!pip install optbinning

Collecting optbinning
  Downloading optbinning-0.19.0-py3-none-any.whl (213 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/213.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━[0m [32m133.1/213.5 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.5/213.5 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Collecting ortools>=9.4 (from optbinning)
  Downloading ortools-9.9.3963-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.8/24.8 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Collecting ropwr>=1.0.0 (from optbinning)
  Downloading ropwr-1.0.0-py3-none-any.whl (17 kB)
Collecting absl-py>=2.0.0 (from ortools>=9.4->optbinning)
  Downloading absl_py-2.1.0-py3-none-any.whl (133 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7

In [12]:
data=data.drop(columns=["target", "case_id", "WEEK_NUM"])
num = data.select_dtypes(include=[np.number]).columns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
X_train_continuous=data[num]
scaler = StandardScaler()
X_train_continuous.fillna(0, inplace=True)
pca = PCA(n_components=0.93) 
X_train_pca = pca.fit_transform(X_train_scaled)
X_train_pca_df = pd.DataFrame(X_train_pca, columns=[f"PCA_{i}" for i in range(1, pca.n_components_ + 1)])

In [13]:
data2=df_train.copy()
X_train_pca_df.index = data.index
data_bin_pca = pd.concat([data,X_train_pca_df], axis=1)
data_bin_pca_1=pd.concat([data2[["case_id", "WEEK_NUM",'target']],data_bin_pca],axis=1)

In [15]:
from optbinning import OptimalBinning
target = data_bin_pca_1["target"]

iv_threshold = 0.3
high_iv_features = []
numerical_features = data_bin_pca_1.select_dtypes(include=['int64', 'float64', 'float16', 'int32', 'int16']).columns
categorical_features = data_bin_pca_1.select_dtypes(include=['object', 'category']).columns

# Initialize storage for IV values
feature_iv = {}

# Iterate over features
for feature in numerical_features.union(categorical_features):
    data_type = "categorical" if feature in categorical_features else "numerical"
    optb = OptimalBinning(name=feature, dtype=data_type, solver="cp")

    # Fit the binning model
    optb.fit(data_bin_pca_1[feature], target)
    # Access the binning table and then build it for output if necessary
    binning_table = optb.binning_table
    binning_table_df = binning_table.build()  # Create DataFrame from binning_table for display or further analysis

    # Obtain the IV directly from the model object
    iv = binning_table.iv

    # Store the IV value
    feature_iv[feature] = iv

    # Decide based on IV
    if iv > iv_threshold:
        # Replace original data with binned data
        data[feature] = optb.transform(data_bin_pca_1[feature], metric="woe")
        high_iv_features.append(feature)
    else:
        print(f"Feature {feature} discarded due to low IV ({iv:.3f})")
# Output the stored IV values
for feature, iv in feature_iv.items():
    print(f"Feature: {feature}, IV: {iv:.3f}")

(CVXPY) Apr 16 10:24:25 PM: Encountered unexpected exception importing solver GLOP:
RuntimeError('Unrecognized new version of ortools (9.9.3963). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')
(CVXPY) Apr 16 10:24:25 PM: Encountered unexpected exception importing solver PDLP:
RuntimeError('Unrecognized new version of ortools (9.9.3963). Expected < 9.5.0.Please open a feature request on cvxpy to enable support for this version.')
Feature actualdpdtolerance_344P discarded due to low IV (0.004)
Feature applicationcnt_361L discarded due to low IV (0.000)
Feature applications30d_658L discarded due to low IV (0.039)
Feature applicationscnt_1086L discarded due to low IV (0.002)
Feature applicationscnt_464L discarded due to low IV (0.006)
Feature applicationscnt_629L discarded due to low IV (0.003)
Feature applicationscnt_867L discarded due to low IV (0.003)
Feature avgdbddpdlast24m_3658932P discarded due to low IV (0.238)
Feature avgdbddpdlast3m_

In [16]:
data_bin_pca_1

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,PCA_109,PCA_110,PCA_111,PCA_112,PCA_113,PCA_114,PCA_115,PCA_116,PCA_117,PCA_118
0,0,0,0,1,4,,,,,,...,-0.360322,-0.080518,-0.047737,0.097072,-0.092220,0.010310,-0.041199,-0.059553,0.082294,-0.012239
1,1,0,0,1,4,,,,,,...,-0.516335,0.441499,-0.000135,-0.506176,-0.546311,0.407975,-0.099867,0.107186,-0.612805,1.155439
2,2,0,0,1,5,,,,,,...,-0.473754,0.548363,0.000613,-0.760739,-0.098605,-0.075461,-0.298528,-0.204826,0.219881,-0.229477
3,3,0,0,1,4,,,,,,...,-0.373873,0.062674,-0.349415,0.297558,0.192643,0.103607,-0.212800,0.323663,0.018048,-0.016618
4,4,0,1,1,5,,,,,,...,1.043837,0.559213,-0.022044,-0.475748,0.346309,0.180578,-0.671622,0.002330,0.181657,0.099039
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1526654,2703450,91,0,10,1,,52863.589844,-22192.0,0.0,0.0,...,-0.469163,0.819469,-0.045101,0.441768,-2.141661,-1.410938,0.714786,0.163622,-0.575172,-0.048706
1526655,2703451,91,0,10,1,,324608.531250,-25536.0,0.0,0.0,...,-0.110427,1.050386,-0.227814,-1.041713,0.336786,-0.115306,0.569129,1.386091,-0.346377,0.269942
1526656,2703452,91,0,10,1,,102738.757812,-15768.0,2.0,2.0,...,-0.469743,-0.338853,0.314229,-0.076792,-0.272262,-0.037540,0.111307,0.457816,-0.276121,0.082929
1526657,2703453,91,0,10,1,,212683.296875,-25808.0,2.0,2.0,...,1.155886,-1.497193,-1.121247,-1.568334,-2.357792,0.272913,-0.816974,1.144660,-0.273432,-1.010465


In [17]:
df_train=data_bin_pca_1

In [18]:
sample = pd.read_csv("/content/drive/MyDrive/home-credit-credit-risk-model-stability(1)/sample_submission.csv")
device='gpu'
#n_samples=200000
n_est=6000
DRY_RUN = True if sample.shape[0] == 10 else False
if DRY_RUN:
    device='cpu'
    df_train = df_train.sample(frac=0.1, random_state=42)
    # df_train = df_train.iloc[:60000]
    #n_samples=10000
    n_est=600
print(device)

cpu


In [19]:

train_df_sorted = df_train.sort_values(by='WEEK_NUM')
train_size = int(len(train_df_sorted) * 0.8)
df_train = train_df_sorted[:train_size]
df_test = train_df_sorted[train_size:]

In [20]:
y = df_train["target"]
weeks = df_train["WEEK_NUM"]
df_train= df_train.drop(columns=["target", "case_id", "WEEK_NUM"])
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

In [22]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
df_test[cat_cols] = df_test[cat_cols].astype(str)

In [23]:
params = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 2000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "verbose": -1,
    "random_state": 42,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':64,
    "device": device,
    "verbose": -1,
}

In [24]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import IncrementalPCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer

In [25]:

numeric_features = df_train.select_dtypes(include=[np.number]).columns
categorical_features = df_train.select_dtypes(include=['object','category','string']).columns
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])


lasso = Lasso(alpha=0.0005, random_state=42)

train_preprocessed = preprocessor.fit_transform(df_train)

In [26]:

X = df_train
X_preprocessed = preprocessor.fit_transform(X)


fitted_models_logit = []


'''
clf = LogisticRegression(random_state = 42) #(penalty='l2', C=0.01, solver='saga', max_iter=1000, random_state=42)
clf.fit(X_preprocessed, y)
fitted_models_logit.append(clf)
X_test_preprocessed = preprocessor.transform(df_test.drop(columns=['target',"case_id", "WEEK_NUM"]))

predictions = clf.predict(X_test_preprocessed)

y_test = df_test["target"]




probabilities = clf.predict_proba(X_test_preprocessed)[:, 1] 

threshold = 0.5
predictions = (probabilities > threshold).astype(int)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, probabilities)  
print("AUC:", auc)
'''

lasso.fit(X_preprocessed, y)
lasso_coef = lasso.coef_
selected_feature_mask = lasso_coef != 0
feature_names = numeric_features.tolist() + categorical_features.tolist()
selected_feature_names = [name for name, selected in zip(feature_names, selected_feature_mask) if selected]
print("Selected features:", selected_feature_names)
print(len(selected_feature_names))

selected_features = X_preprocessed[:, selected_feature_mask]

clf = LogisticRegression(random_state=42)
clf.fit(selected_features, y)

X_test_preprocessed = preprocessor.transform(df_test.drop(columns=['target',"case_id", "WEEK_NUM"]))
X_test_selected = X_test_preprocessed[:, selected_feature_mask]
predictions = clf.predict(X_test_selected)

y_test = df_test["target"]


probabilities = clf.predict_proba(X_test_selected)[:, 1] 

threshold = 0.5
predictions = (probabilities > threshold).astype(int)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, probabilities) 
print("AUC:", auc)

Selected features: ['weekday_decision', 'birthdate_574D', 'dateofbirth_337D', 'days360_512L', 'days90_310L', 'numberofqueries_373L', 'pmtscount_423L', 'responsedate_1012D', 'responsedate_4917613D', 'thirdquarter_1082L', 'annuitynextmonth_57A', 'applicationscnt_629L', 'avgdbddpdlast24m_3658932P', 'avgdpdtolclosure24_3658938P', 'avginstallast24m_3658937A', 'avgmaxdpdlast9m_3716943P', 'clientscnt3m_3712950L', 'clientscnt_887L', 'clientscnt_946L', 'cntpmts24_3658933L', 'currdebt_22A', 'currdebtcredtyperange_828A', 'daysoverduetolerancedd_3976961L', 'deferredmnthsnum_166L', 'downpmt_116A', 'dtlastpmtallstes_4499206D', 'eir_270L', 'firstclxcampaign_1125D', 'firstdatedue_489D', 'homephncnt_628L', 'inittransactionamount_650A', 'lastapprcredamount_781A', 'lastdelinqdate_224D', 'maxannuity_159A', 'maxdebt4_972A', 'maxdpdfrom6mto36m_3546853P', 'maxdpdinstldate_3546855D', 'maxdpdlast24m_143P', 'maxlnamtstart6m_4525199A', 'mindbddpdlast24m_3658935P', 'numactivecreds_622L', 'numincomingpmts_3546848L

Neutral Network

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

X = df_train
X_preprocessed = preprocessor.fit_transform(X)


model = Sequential([
    Dense(64, activation='relu', input_shape=(X_preprocessed.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1, restore_best_weights=True)
model.fit(X_preprocessed, y, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping])
X_test_preprocessed = preprocessor.transform(df_test.drop(columns=['target', "case_id", "WEEK_NUM"]))
probabilities = model.predict(X_test_preprocessed).ravel()
y_test = df_test["target"]
threshold = 0.5
predictions = (probabilities > threshold).astype(int)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
auc = roc_auc_score(y_test, probabilities)
print("AUC:", auc)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: early stopping
Accuracy: 0.974487456605751
AUC: 0.8326282181418078


In [28]:
!pip install catboost==1.2.3

Collecting catboost==1.2.3
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [29]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


# 1. Light GBM + Catboost

In [30]:

from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

fitted_models_cat = []
fitted_models_lgb = []


cv_scores_cat = []
cv_scores_lgb = []

for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    X_train[cat_cols] = X_train[cat_cols].astype(str)
    X_valid[cat_cols] = X_valid[cat_cols].astype(str)

    # CatBoost
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(
        eval_metric='AUC',
        task_type='GPU',
        learning_rate=0.03,
        iterations=n_est,
        random_seed=3107
    )

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    fitted_models_cat.append(clf)

    y_pred_valid_cat = clf.predict_proba(X_valid)[:, 1]
    auc_score_cat = roc_auc_score(y_valid, y_pred_valid_cat)
    cv_scores_cat.append(auc_score_cat)

    # LightGBM
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)]
    )
    fitted_models_lgb.append(lgb_model)

    y_pred_valid_lgb = lgb_model.predict_proba(X_valid)[:, 1]
    auc_score_lgb = roc_auc_score(y_valid, y_pred_valid_lgb)
    cv_scores_lgb.append(auc_score_lgb)

print("CatBoost CV AUC scores:", cv_scores_cat)
print("CatBoost Maximum CV AUC score:", max(cv_scores_cat))

print("LightGBM CV AUC scores:", cv_scores_lgb)
print("LightGBM Maximum CV AUC score:", max(cv_scores_lgb))



Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5694678	best: 0.5694678 (0)	total: 116ms	remaining: 1m 9s
300:	test: 0.8047685	best: 0.8047685 (300)	total: 20.3s	remaining: 20.1s
599:	test: 0.8146171	best: 0.8146171 (599)	total: 39.7s	remaining: 0us
bestTest = 0.8146170974
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.826943
[400]	valid_0's auc: 0.830282
Early stopping, best iteration is:
[381]	valid_0's auc: 0.830749


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5741273	best: 0.5741273 (0)	total: 101ms	remaining: 1m
300:	test: 0.8124361	best: 0.8125582 (295)	total: 19.9s	remaining: 19.8s
599:	test: 0.8196908	best: 0.8196908 (599)	total: 39.7s	remaining: 0us
bestTest = 0.8196908236
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.836042
[400]	valid_0's auc: 0.837013
Early stopping, best iteration is:
[336]	valid_0's auc: 0.837726


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5797390	best: 0.5797390 (0)	total: 104ms	remaining: 1m 2s
300:	test: 0.8071592	best: 0.8071592 (300)	total: 20.3s	remaining: 20.1s
599:	test: 0.8151779	best: 0.8151844 (590)	total: 40.5s	remaining: 0us
bestTest = 0.8151843548
bestIteration = 590
Shrink model to first 591 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831292
Early stopping, best iteration is:
[196]	valid_0's auc: 0.831365


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6067315	best: 0.6067315 (0)	total: 100ms	remaining: 59.9s
300:	test: 0.8159937	best: 0.8159937 (300)	total: 20s	remaining: 19.9s
599:	test: 0.8234174	best: 0.8234444 (595)	total: 40.1s	remaining: 0us
bestTest = 0.8234443963
bestIteration = 595
Shrink model to first 596 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.834808
Early stopping, best iteration is:
[253]	valid_0's auc: 0.835604


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5540254	best: 0.5540254 (0)	total: 98.2ms	remaining: 58.8s
300:	test: 0.8163503	best: 0.8163503 (300)	total: 20.2s	remaining: 20s
599:	test: 0.8234762	best: 0.8234762 (599)	total: 40.2s	remaining: 0us
bestTest = 0.8234762251
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831248
[400]	valid_0's auc: 0.830673
Early stopping, best iteration is:
[355]	valid_0's auc: 0.831847
CatBoost CV AUC scores: [0.8146169908673957, 0.8196908093039952, 0.8151844664448819, 0.8234445278251608, 0.8234763507222546]
CatBoost Maximum CV AUC score: 0.8234763507222546
LightGBM CV AUC scores: [0.8307491913952926, 0.8377262852216751, 0.8313648841472686, 0.8356042429044621, 0.8318474912776164]
LightGBM Maximum CV AUC score: 0.8377262852216751


In [31]:
class VotingModel(BaseEstimator, RegressorMixin):
    def __init__(self, estimators):
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        return self

    def predict(self, X):
        y_preds = [estimator.predict(X) for estimator in self.estimators]
        return np.mean(y_preds, axis=0)

    def predict_proba(self, X):

        y_preds = [estimator.predict_proba(X) for estimator in self.estimators[:5]]

        X[cat_cols] = X[cat_cols].astype("category")
        y_preds += [estimator.predict_proba(X) for estimator in self.estimators[5:]]

        return np.mean(y_preds, axis=0)

model = VotingModel(fitted_models_cat+fitted_models_lgb)

In [32]:
df_test

Unnamed: 0,case_id,WEEK_NUM,target,month_decision,weekday_decision,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,...,PCA_109,PCA_110,PCA_111,PCA_112,PCA_113,PCA_114,PCA_115,PCA_116,PCA_117,PCA_118
580841,965374,60,0,2,6,,,-10224.0,3.0,4.0,...,-1.019287,0.453288,0.122662,0.915366,-0.075624,-0.814657,-0.764092,0.952997,0.972316,-1.834668
1205842,1799871,60,0,2,4,,,-14848.0,0.0,1.0,...,0.658284,-1.039972,-0.449138,0.711656,0.942276,0.989885,0.160870,-0.180846,-0.077194,0.841468
1495582,2672337,60,0,2,6,,,-18688.0,4.0,5.0,...,-0.805553,0.534499,0.257531,0.277240,-0.141655,0.439955,-0.223646,-1.376918,0.173184,0.246244
1204615,1798644,60,0,2,3,,,-14480.0,1.0,2.0,...,-1.642387,0.278315,-0.771641,0.521967,-0.594016,1.803949,-1.203521,-1.037414,1.885206,-1.124964
164748,210182,60,0,2,3,,,-10768.0,6.0,10.0,...,-0.987591,-1.383984,-1.605748,-0.524490,0.117542,0.623821,0.242940,1.470059,-1.034305,0.951854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1351900,1945929,91,0,10,1,,1.462630e+05,-12360.0,2.0,2.0,...,-0.213203,-0.333583,-0.355118,0.008363,-0.284639,0.227424,-0.014970,-0.364899,-0.155289,-0.007001
1352162,1946192,91,0,10,1,,1.400181e+05,-24256.0,1.0,2.0,...,0.483629,0.397482,-0.317055,-1.365234,0.615832,-0.209386,0.465737,-0.414058,0.150179,-0.181226
641228,1025761,91,0,10,4,,2.366870e+05,-9560.0,0.0,0.0,...,0.470106,-0.640245,-0.464545,0.289088,0.682794,0.027642,0.198531,-0.368077,-0.012707,0.354050
1349106,1943135,91,0,10,4,,6.768738e+06,-15824.0,4.0,4.0,...,-0.125583,1.098398,0.692651,-1.731309,0.600669,-1.943466,-0.726751,-0.876553,1.240031,-1.104611


In [33]:
df_test = df_test.drop(columns=["WEEK_NUM","target"])
df_test = df_test.set_index("case_id")


y_pred = pd.Series(model.predict_proba(df_test)[:, 1], index=df_test.index)



In [34]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred)  # 使用概率计算AUC
print("AUC:", auc)

AUC: 0.8505993017211692


In [35]:
df_train.shape

(122132, 457)

In [36]:
df_test[cat_cols] = df_test[cat_cols].astype(str)
cat_model = fitted_models_cat[0]
lgb_model = VotingModel(fitted_models_lgb)



y_pred_cat = pd.Series(cat_model.predict_proba(df_test)[:, 1])





In [37]:
df_test[cat_cols] = df_test[cat_cols].astype('category')
y_pred_lgb = pd.Series(lgb_model.predict_proba(df_test)[:, 1])

In [38]:
y_pred_lgb

0        0.003891
1        0.001636
2        0.032205
3        0.011640
4        0.256006
           ...   
30529    0.004151
30530    0.020828
30531    0.005781
30532    0.061512
30533    0.019974
Length: 30534, dtype: float64

In [39]:
df_train[cat_cols] = df_train[cat_cols].astype('category')
y_pred_lgb = pd.Series(lgb_model.predict_proba(df_test)[:, 1])
y_pred_lgb_train = pd.Series(lgb_model.predict_proba(df_train)[:, 1])



In [40]:
df_train[cat_cols] = df_train[cat_cols].astype(str)
y_pred_cat_train = pd.Series(cat_model.predict_proba(df_train)[:, 1])
df_pred_train = pd.DataFrame({'y_pred_cat': y_pred_cat_train, 'y_pred_lgb': y_pred_lgb_train})

In [41]:

df_pred = pd.DataFrame({'y_pred_cat': y_pred_cat, 'y_pred_lgb': y_pred_lgb})

Stacking+L1 penalty

In [42]:

clf = LogisticRegression(penalty='l1', C=0.01, solver='saga', max_iter=1000, random_state=42)
clf.fit(df_pred_train, y)


predictions = clf.predict(df_pred)


probabilities = clf.predict_proba(df_pred)[:, 1]  
threshold = 0.5
predictions = (probabilities > threshold).astype(int)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, probabilities)
print("AUC:", auc)

Accuracy: 0.9704919106569726
AUC: 0.8592974305445369


In [43]:

clf = LogisticRegression(penalty='l2', C=0.01, solver='saga', max_iter=1000, random_state=42)
clf.fit(df_pred_train, y)

predictions = clf.predict(df_pred)


probabilities = clf.predict_proba(df_pred)[:, 1] 


threshold = 0.5
predictions = (probabilities > threshold).astype(int)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, probabilities) 
print("AUC:", auc)

Accuracy: 0.9745857077356389
AUC: 0.8560269334140763


In [44]:
from xgboost import XGBClassifier

clf = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=3,
    scale_pos_weight=1,
    seed=42
)
clf.fit(df_pred_train, y)
predictions = clf.predict(df_pred)







probabilities = clf.predict_proba(df_pred)[:, 1] 
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, probabilities)  
print("AUC:", auc)

Accuracy: 0.9709176655531538
AUC: 0.8497206936642872


In [45]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42
)
clf.fit(df_pred_train, y)

predictions = clf.predict(df_pred)

probabilities = clf.predict_proba(df_pred)[:, 1] 
threshold = 0.9
predictions = (probabilities > threshold).astype(int)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

auc = roc_auc_score(y_test, probabilities)
print("AUC:", auc)

Accuracy: 0.9741272024628284
AUC: 0.8264979684922291


# XGBoost Trying

In [46]:
params2 = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 1000,
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "alpha": 0.1,
    "lambda": 10,
    "tree_method": 'gpu_hist' if device == 'gpu' else 'auto',
    "random_state": 42,
    "verbosity": 0,
    "enable_categorical":True,
}

In [47]:
%%time
from catboost import CatBoostClassifier, Pool
import xgboost as xgb

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []


for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):#
    X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]#
    X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]
    train_pool = Pool(X_train, y_train,cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid,cat_features=cat_cols)
    clf = CatBoostClassifier(
    eval_metric='AUC',
    task_type='GPU',
    learning_rate=0.03,
    iterations=n_est)
    random_seed=3107
    clf.fit(train_pool, eval_set=val_pool,verbose=300)
    fitted_models_cat.append(clf)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_cat.append(auc_score)


    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)] )

    fitted_models_lgb.append(model)
    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_lgb.append(auc_score)


    model2 = xgb.XGBClassifier(**params2)
    model2.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100, verbose=False)

    fitted_models_xgb.append(model2)

    y_pred_valid = model2.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_xgb.append(auc_score)

    del clf, model, model2
    gc.collect()


print("CV AUC scores: ", cv_scores_cat)
print("Maximum CV AUC score: ", max(cv_scores_cat))


print("CV AUC scores: ", cv_scores_lgb)
print("Maximum CV AUC score: ", max(cv_scores_lgb))

print("CV AUC scores: ", cv_scores_xgb)
print("Maximum CV AUC score: ", max(cv_scores_xgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5871351	best: 0.5871351 (0)	total: 92.8ms	remaining: 55.6s
300:	test: 0.8042719	best: 0.8042719 (300)	total: 19.4s	remaining: 19.2s
599:	test: 0.8135852	best: 0.8136263 (595)	total: 38.4s	remaining: 0us
bestTest = 0.813626349
bestIteration = 595
Shrink model to first 596 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.82686
[400]	valid_0's auc: 0.826991
Early stopping, best iteration is:
[343]	valid_0's auc: 0.828539


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5992769	best: 0.5992769 (0)	total: 94.6ms	remaining: 56.7s
300:	test: 0.8132239	best: 0.8132239 (300)	total: 19.5s	remaining: 19.4s
599:	test: 0.8211515	best: 0.8211515 (599)	total: 38.5s	remaining: 0us
bestTest = 0.8211515248
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.835708
Early stopping, best iteration is:
[247]	valid_0's auc: 0.836011


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5812440	best: 0.5812440 (0)	total: 95.2ms	remaining: 57s
300:	test: 0.8081409	best: 0.8081409 (300)	total: 19.4s	remaining: 19.2s
599:	test: 0.8158343	best: 0.8158343 (599)	total: 38.7s	remaining: 0us
bestTest = 0.8158343434
bestIteration = 599
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831474
Early stopping, best iteration is:
[233]	valid_0's auc: 0.832558


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5731635	best: 0.5731635 (0)	total: 94.7ms	remaining: 56.7s
300:	test: 0.8181075	best: 0.8181075 (300)	total: 19.9s	remaining: 19.8s
599:	test: 0.8246974	best: 0.8247140 (585)	total: 39s	remaining: 0us
bestTest = 0.8247140348
bestIteration = 585
Shrink model to first 586 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831278
Early stopping, best iteration is:
[222]	valid_0's auc: 0.832199


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5625451	best: 0.5625451 (0)	total: 93.9ms	remaining: 56.3s
300:	test: 0.8143854	best: 0.8143854 (300)	total: 19.3s	remaining: 19.2s
599:	test: 0.8220001	best: 0.8220157 (590)	total: 38.6s	remaining: 0us
bestTest = 0.8220156729
bestIteration = 590
Shrink model to first 591 iterations.
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831979
Early stopping, best iteration is:
[210]	valid_0's auc: 0.832293
CV AUC scores:  [0.8136263173026013, 0.8211513335468156, 0.815834117754598, 0.8247143500615144, 0.8220156749462837]
Maximum CV AUC score:  0.8247143500615144
CV AUC scores:  [0.8285387269511344, 0.8360107019054624, 0.8325578942732194, 0.8321993392119197, 0.8322930678193]
Maximum CV AUC score:  0.8360107019054624
CV AUC scores:  [0.8224000686430488, 0.833123635375636, 0.8248054310650298, 0.827375052194854, 0.8279033572172889]
Maximum CV AUC score:  0.833123635375636
CPU times: user 1h 18min 54s, sys: 18.6 s, total: 1h 19min 12s
Wall time: 15m

In [48]:
model2 = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb)

In [49]:
df_test[cat_cols] = df_test[cat_cols].astype(str)
y_pred2 = pd.Series(model2.predict_proba(df_test)[:, 1], index=df_test.index)

In [50]:
from sklearn.metrics import roc_auc_score
auc = roc_auc_score(y_test, y_pred2)  
print("AUC:", auc)

AUC: 0.8524675076545299


# Stack

In [51]:
from sklearn.model_selection import cross_val_predict

In [52]:
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
import pandas as pd

models = [
    ('CatBoost', CatBoostClassifier(eval_metric='AUC', task_type='GPU', learning_rate=0.03, iterations=n_est, random_seed=3107)),
    ('LightGBM', LGBMClassifier(**params)),
    ('XGBoost', XGBClassifier(**params2))
]


meta_model = RandomForestClassifier(n_estimators=100, random_state=42)


fitted_models_cb = []
fitted_models_lgb = []
fitted_models_xgb = []
cv_scores_cb = []
cv_scores_lgb = []
cv_scores_xgb = []

meta_features = pd.DataFrame(index=df_train.index, columns=['CatBoost', 'LightGBM', 'XGBoost'])

for name, model in models:
    for idx_train, idx_valid in cv.split(df_train, y, groups=weeks):
        X_train, y_train = df_train.iloc[idx_train], y.iloc[idx_train]
        X_valid, y_valid = df_train.iloc[idx_valid], y.iloc[idx_valid]

        if name == 'CatBoost':
            X_train[cat_cols] = X_train[cat_cols].astype(str)
            X_valid[cat_cols] = X_valid[cat_cols].astype(str)
            train_pool = Pool(X_train, y_train, cat_features=cat_cols)
            val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)
            model.fit(train_pool, eval_set=val_pool, verbose=False)
            y_pred_valid = model.predict_proba(val_pool)[:, 1]
            fitted_models_cb.append(model)
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_cb.append(auc_score)
        elif name == 'LightGBM':
            X_train[cat_cols] = X_train[cat_cols].astype('category')
            X_valid[cat_cols] = X_valid[cat_cols].astype('category')
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)])
            fitted_models_lgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_lgb.append(auc_score)
        else:
            X_train[cat_cols] = X_train[cat_cols].astype('category')
            X_valid[cat_cols] = X_valid[cat_cols].astype('category')
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=100, verbose=False)
            fitted_models_xgb.append(model)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            auc_score = roc_auc_score(y_valid, y_pred_valid)
            cv_scores_xgb.append(auc_score)

        meta_features.loc[X_valid.index, name] = y_pred_valid


meta_preds = cross_val_predict(meta_model, meta_features, y, cv=5, method='predict_proba', n_jobs=-1)[:, 1]
meta_auc_score = roc_auc_score(y, meta_preds)

print("Meta Model CV AUC score:", meta_auc_score)

Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU
Default metric period is 5 because AUC is/are not implemented for GPU


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.82686
[400]	valid_0's auc: 0.826991
Early stopping, best iteration is:
[343]	valid_0's auc: 0.828539
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.835708
Early stopping, best iteration is:
[247]	valid_0's auc: 0.836011
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831474
Early stopping, best iteration is:
[233]	valid_0's auc: 0.832558
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831278
Early stopping, best iteration is:
[222]	valid_0's auc: 0.832199
Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.831979
Early stopping, best iteration is:
[210]	valid_0's auc: 0.832293
Meta Model CV AUC score: 0.7732279946180888


In [53]:
meta_features

Unnamed: 0,CatBoost,LightGBM,XGBoost
644159,0.044749,0.047378,0.043446
647193,0.021716,0.011958,0.012715
644913,0.015593,0.006044,0.012217
647647,0.008737,0.001972,0.004061
57773,0.081639,0.056196,0.059141
...,...,...,...
1494021,0.012742,0.012299,0.005899
164061,0.0175,0.01055,0.005744
580747,0.010598,0.006118,0.007167
1494271,0.102687,0.247495,0.053844


In [54]:
meta_model.fit(meta_features, y)

In [55]:
test_meta_features = pd.DataFrame(index=df_test.index, columns=['CatBoost', 'LightGBM', 'XGBoost'])

In [56]:
# CatBoost


for model in fitted_models_cat:
    df_test[cat_cols] = df_test[cat_cols].astype(str)
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['CatBoost'] = test_meta_features['CatBoost'].add(y_pred_test, fill_value=0)

test_meta_features['CatBoost'] /= len(fitted_models_cat)

# LightGBM
for model in fitted_models_lgb:
    df_test[cat_cols] = df_test[cat_cols].astype("category")
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['LightGBM'] = test_meta_features['LightGBM'].add(y_pred_test, fill_value=0)

test_meta_features['LightGBM'] /= len(fitted_models_lgb)

# XGBoost
for model in fitted_models_xgb:
    df_test[cat_cols] = df_test[cat_cols].astype("category")
    y_pred_test = model.predict_proba(df_test)[:, 1]
    test_meta_features['XGBoost'] = test_meta_features['XGBoost'].add(y_pred_test, fill_value=0)

test_meta_features['XGBoost'] /= len(fitted_models_xgb)

In [57]:
y_pred_test = meta_model.predict_proba(test_meta_features)[:, 1]
meta_auc_score_test = roc_auc_score(y_test, y_pred_test)

print("Meta Model Test AUC score:", meta_auc_score_test)

Meta Model Test AUC score: 0.7896290385051097


In [58]:
meta_auc_score = roc_auc_score(y, meta_preds)

print("Meta Model CV AUC score:", meta_auc_score)

Meta Model CV AUC score: 0.7732279946180888


In [59]:
catauc_test = roc_auc_score(y_test, test_meta_features.iloc[:, 0])
lgbauc_test = roc_auc_score(y_test, test_meta_features.iloc[:, 1])
xgbauc_test = roc_auc_score(y_test, test_meta_features.iloc[:, 2])

In [60]:
print(catauc_test, lgbauc_test, xgbauc_test)

0.8345967176460561 0.8506048014118008 0.8431205669029138


In [61]:
catauc = roc_auc_score(y, meta_features.iloc[:, 0])
lgbauc = roc_auc_score(y, meta_features.iloc[:, 1])
xgbauc = roc_auc_score(y, meta_features.iloc[:, 2])

In [62]:
print(catauc, lgbauc, xgbauc)

0.8191587433475787 0.8323758444648146 0.8149105589418175


In [63]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

meta_features = meta_features.astype(float)
test_meta_features = test_meta_features.astype(float)

meta_models = [
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('XGBoost', XGBClassifier(n_estimators=100, random_state=42))
]

train_scores = []
cv_scores = []
for name, model in meta_models:
    print(f"Training {name}...")

    model.fit(meta_features, y)

    y_pred_train = model.predict_proba(meta_features)[:, 1]
    train_auc = roc_auc_score(y, y_pred_train)
    train_scores.append((name, train_auc))

    cv_auc = cross_val_score(model, meta_features, y, cv=5, scoring='roc_auc')
    cv_scores.append((name, cv_auc.mean(), cv_auc.std()))

print("\nTrain AUC Scores:")
for name, score in train_scores:
    print(f"{name}: {score:.4f}")

print("\nCross-Validation AUC Scores (Mean ± Std):")
for name, mean_score, std_score in cv_scores:
    print(f"{name}: {mean_score:.4f} ± {std_score:.4f}")

Training Logistic Regression...
Training Random Forest...
Training AdaBoost...
Training Gradient Boosting...
Training XGBoost...

Train AUC Scores:
Logistic Regression: 0.8337
Random Forest: 1.0000
AdaBoost: 0.8376
Gradient Boosting: 0.8439
XGBoost: 0.8980

Cross-Validation AUC Scores (Mean ± Std):
Logistic Regression: 0.8345 ± 0.0113
Random Forest: 0.7740 ± 0.0117
AdaBoost: 0.8323 ± 0.0106
Gradient Boosting: 0.8326 ± 0.0107
XGBoost: 0.8188 ± 0.0128


# Gradient Boosting AS second layer

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'n_estimators': 12,
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_split': 3,
    'min_samples_leaf': 1
}

meta_model = GradientBoostingClassifier(**params)

meta_model.fit(meta_features, y)
test_pred = meta_model.predict_proba(test_meta_features)[:, 1]
meta_auc_score = roc_auc_score(y_test, test_pred)

print("Meta Model Test AUC score:", meta_auc_score)

Meta Model Test AUC score: 0.8487928915242752


# logistic regression as second layer

l2

In [65]:
from sklearn.linear_model import LogisticRegression

params = {
    'C': 0.01,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'max_iter': 1000,
    'random_state': 42
}

meta_model = LogisticRegression(**params)
meta_model.fit(meta_features, y)

# 在测试集上进行预测
test_pred = meta_model.predict_proba(test_meta_features)[:, 1]

# 计算AUC得分
meta_auc_score = roc_auc_score(y_test, test_pred)
print("Meta Model Test AUC score:", meta_auc_score)

Meta Model Test AUC score: 0.8494419416281025


l1

In [66]:
from sklearn.linear_model import LogisticRegression


meta_model = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=1000, random_state=42)
meta_model.fit(meta_features, y)

# 在测试集上进行预测
test_pred = meta_model.predict_proba(test_meta_features)[:, 1]

# 计算AUC得分
meta_auc_score = roc_auc_score(y_test, test_pred)
print("Meta Model Test AUC score:", meta_auc_score)

Meta Model Test AUC score: 0.8505744881563513


# SVD AS second layer

In [67]:
from sklearn.svm import SVC

params = {
    'C': 0.1,
    'kernel': 'rbf',
    'gamma': 'scale',
    'probability': True,
    'random_state': 42
}

meta_model = SVC(**params)
meta_model.fit(meta_features, y)

test_pred = meta_model.predict_proba(test_meta_features)[:, 1]

meta_auc_score = roc_auc_score(y_test, test_pred)
print("Meta Model Test AUC score:", meta_auc_score)

Meta Model Test AUC score: 0.7167115851372896


# catboost as seccond layer

In [68]:
from catboost import CatBoostClassifier

params = {
    'iterations': 500, 
    'learning_rate': 0.1, 
    'depth': 6, 
    'l2_leaf_reg': 3, 
    'bootstrap_type': 'Bernoulli', 
    'subsample': 0.8, 
    'colsample_bylevel': 0.7, 
    'random_seed': 42
}

meta_model = CatBoostClassifier(**params)

meta_model.fit(meta_features, y, eval_set=(test_meta_features, y_test), use_best_model=True, verbose=100)

test_pred = meta_model.predict_proba(test_meta_features)[:, 1]

meta_auc_score = roc_auc_score(y_test, test_pred)
print("Meta Model Test AUC score:", meta_auc_score)

0:	learn: 0.5143398	test: 0.5246178	best: 0.5246178 (0)	total: 12.1ms	remaining: 6.03s
100:	learn: 0.1155771	test: 0.0990816	best: 0.0990816 (100)	total: 889ms	remaining: 3.51s
200:	learn: 0.1141189	test: 0.0992725	best: 0.0990816 (100)	total: 1.75s	remaining: 2.6s
300:	learn: 0.1130137	test: 0.0994612	best: 0.0990816 (100)	total: 2.62s	remaining: 1.73s
400:	learn: 0.1119673	test: 0.0996632	best: 0.0990816 (100)	total: 3.64s	remaining: 899ms
499:	learn: 0.1109206	test: 0.0998921	best: 0.0990816 (100)	total: 4.69s	remaining: 0us

bestTest = 0.09908159327
bestIteration = 100

Shrink model to first 101 iterations.
Meta Model Test AUC score: 0.8500684300089452


In [69]:
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier

xgb = XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05)

lr = LogisticRegression()
eclf = VotingClassifier(estimators=[('xgb', xgb), ('lr', lr)], voting='soft')
eclf.fit(meta_features, y)
eval_score = roc_auc_score(y_test, eclf.predict_proba(test_meta_features)[:, 1])
print('Ensemble score: {:.4f}'.format(eval_score))

Ensemble score: 0.8497


# Parameter Optimization

In [70]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

xgb_params = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}
lr_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'class_weight': ['balanced', None]
}
xgb = XGBClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
voting_estimator = VotingClassifier(estimators=[('xgb', xgb), ('lr', lr)], voting='soft')
xgb_grid = RandomizedSearchCV(xgb, param_distributions=xgb_params, n_iter=20, cv=5, scoring='roc_auc', random_state=42, n_jobs=-1, verbose=3)
xgb_grid.fit(meta_features, y)
print('Best XGBoost params:', xgb_grid.best_params_)
print('Best XGBoost score:', xgb_grid.best_score_)

lr_grid = GridSearchCV(lr, param_grid=lr_params, cv=5, scoring='roc_auc', n_jobs=-1, verbose=3)
lr_grid.fit(meta_features, y)
print('Best LR params:', lr_grid.best_params_)
print('Best LR score:', lr_grid.best_score_)


xgb = XGBClassifier(**xgb_grid.best_params_, random_state=42)
lr = LogisticRegression(**lr_grid.best_params_, random_state=42)
eclf = VotingClassifier(estimators=[('xgb', xgb), ('lr', lr)], voting='soft')


eclf.fit(meta_features, y)


eval_score = roc_auc_score(y_test, eclf.predict_proba(test_meta_features)[:, 1])
print('Ensemble score: {:.4f}'.format(eval_score))

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best XGBoost params: {'subsample': 0.6, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 1.0}
Best XGBoost score: 0.832721263565055
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best LR params: {'C': 100, 'class_weight': None, 'penalty': 'l2'}
Best LR score: 0.8345735501435405
Ensemble score: 0.8504
