# ICR Competition

In [145]:
# run_para = 'local'      # local用于本地跑；kaggle用于在kaggle上评测
run_para = 'kaggle'

Feature_Derivation = {
    "Non_feature": 0,  # 缺失值特征衍生
    "Demaxmin": 0,  # 最大最小值特征衍生
    "K_dist": 0,  # 到聚类中心的距离特征衍生
    "Maxmin_dist": 1,  # 到聚类中心dist max (min) mean特征衍生
    "BN_bin": 1,  # BN 分箱特征衍生
    "Poly": 1,  # 多项式特征衍生
}
Post_Process = {
    "Calibration": 'none',  # 校准类型(bc: Beat; iso: Isotonic)
    "Set_thres": False,  # 是否使用卡阈值
    "boost": 1.0,         # boost
    "optimal": False,      # from the discussion
    "trival_calib": True,  # 0.5 0.5 calibration
    "Ensemble_method": 'mean', # how to ensemble the models' predicted probabilities together.
}

thres_high_l = 1000.0
thres_high_r = 1000.0
thres_low_l = 0.05
thres_low_r = 0.20

## Enviroment Setup

先配置环境，在本地时候不需要执行下面这个代码块，在kaggle中需要。

In [146]:
if run_para == 'kaggle':
    !pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
    !mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
    !cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/
#     !pip install betacal --no-index --find-links=file:///kaggle/input/betacal-whl

Looking in links: file:///kaggle/input/pip-packages-icr/pip-packages


In [147]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneOut
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings
import torch

warnings.filterwarnings("ignore")
# warnings.filterwarnings(action='ignore', category=LightGBMWarning)

## Read Data & EDA
### 简单处理
先加载各个数据文件

In [148]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

if run_para == 'kaggle':
    train = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/train.csv')
    test = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/test.csv')
    sample = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/sample_submission.csv')
    greeks = pd.read_csv('/kaggle/input/icr-identify-age-related-conditions/greeks.csv')
elif run_para == 'local':
    train = pd.read_csv('./icr-identify-age-related-conditions/train.csv')
    test = pd.read_csv('./icr-identify-age-related-conditions/test.csv')
    sample = pd.read_csv('./icr-identify-age-related-conditions/sample_submission.csv')
    greeks = pd.read_csv('./icr-identify-age-related-conditions/greeks.csv')

# hard_samples = [102, 267, 313, 318, 434, 509]

# train = train.drop(hard_samples)
# greeks = greeks.drop(hard_samples)

# train.reset_index(drop=True, inplace=True)
# greeks.reset_index(drop=True, inplace=True)    

print(train.shape)

(617, 58)


然后将EJ属性二值化（0/1）。

In [149]:
first_category = train.EJ.unique()[0]
train.EJ = train.EJ.eq(first_category).astype('int')
test.EJ = test.EJ.eq(first_category).astype('int')

### 数据整数化
根据Disscussion Post，我们可以将原数据集中的某些属性转化成整数取值，并且猜测BN属性可能是指年龄，所以我们可以在BN上进行一些操作。

In [150]:
int_denominators = {
    'AB': 0.004273,
    'AF': 0.00242,
    'AH': 0.008709,
    'AM': 0.003097,
    'AR': 0.005244,
    'AX': 0.008859,
    'AY': 0.000609,
    'AZ': 0.006302,
    'BC': 0.007028,
    'BD ': 0.00799,
    'BN': 0.3531,
    'BP': 0.004239,
    'BQ': 0.002605,
    'BR': 0.006049,
    'BZ': 0.004267,
    'CB': 0.009191,
    'CC': 6.12e-06,
    'CD ': 0.007928,
    'CF': 0.003041,
    'CH': 0.000398,
    'CL': 0.006365,
    'CR': 7.5e-05,
    'CS': 0.003487,
    'CU': 0.005517,
    'CW ': 9.2e-05,
    'DA': 0.00388,
    'DE': 0.004435,
    'DF': 0.000351,
    'DH': 0.002733,
    'DI': 0.003765,
    'DL': 0.00212,
    'DN': 0.003412,
    'DU': 0.0013794,
    'DV': 0.00259,
    'DY': 0.004492,
    'EB': 0.007068,
    'EE': 0.004031,
    'EG': 0.006025,
    'EH': 0.006084,
    'EL': 0.000429,
    'EP': 0.009269,
    'EU': 0.005064,
    'FC': 0.005712,
    'FD ': 0.005937,
    'FE': 0.007486,
    'FI': 0.005513,
    'FR': 0.00058,
    'FS': 0.006773,
    'GB': 0.009302,
    'GE': 0.004417,
    'GF': 0.004374,
    'GH': 0.003721,
    'GI': 0.002572
}
for k, v in int_denominators.items():
    train[k] = np.round(train[k] / v, 1)
    test[k] = np.round(test[k] / v, 1)

对一些属性特征进行drop操作，尝试找到最佳的组合方式。

In [151]:
ft = ['AF','BQ','AB','DU','DI','FL','CR','DH','DA','EH','CD ','BP','BC','DL','EE','FD ','DE','GL','FR','FI','EB','CU','CS', 'BN']
# train = train[['Id'] + ft + ['Class']]
# test = test[['Id'] + ft]

pure_train = train.drop(['Id', 'Class'], axis=1)
pure_test = test.drop(['Id'], axis=1)

### 记录每条数据缺失值个数

在数据集中填充记录每条数据有多少缺失值。

In [152]:
'''
non_feature: 表示该位置的值是否为缺失值
non_feature_count: 表示该行（一条数据）缺失值的个数
'''
def creat_non_feature(df, columns):
    feature_isnull_col = [f'{f}_isnull' for f in columns]
    non_feature = df[columns].isnull().astype(int)
    non_feature.columns = feature_isnull_col
    
    non_feature_count = pd.DataFrame(non_feature.sum(axis=1))
    non_feature_count.columns = ['count_isnull']
    
    non_feature = non_feature.reset_index(drop=True)
    non_feature_count = non_feature_count.reset_index(drop=True)
    return non_feature, non_feature_count

columns = pure_train.columns.tolist()
non_feature, non_feature_count = creat_non_feature(pure_train, columns)
if Feature_Derivation['Non_feature']:
    train = pd.concat([train, non_feature_count], axis=1)

non_feature, non_feature_count = creat_non_feature(pure_test, columns)
if Feature_Derivation['Non_feature']:
    test = pd.concat([test, non_feature_count], axis=1)

### 填补缺失值

在这里我们使用每个特征数据的中位数填补缺失值。

In [153]:
Imp = SimpleImputer(missing_values=np.nan, strategy='median')

columns_to_select = [col for col in train.columns if col not in ['Class', 'Id']]

train_data = train[columns_to_select].copy()
test_data = test[columns_to_select].copy()

# 填充缺失值
train_data = pd.DataFrame(Imp.fit_transform(train_data), columns=columns_to_select)
test_data = pd.DataFrame(Imp.transform(test_data), columns=columns_to_select)

# 重新组合数据和原始列
train_filled = pd.concat([train['Id'], train_data, train['Class']], axis=1)
test_filled = pd.concat([test['Id'], test_data], axis=1)

train = train_filled.copy()
test = test_filled.copy()

pure_train = train[columns]
pure_test = test[columns]
# print(train.shape, test.shape)

### 聚类距离distance
我们对Class为0和Class为1的数据分别进行KMeans聚类，然后计算每个数据点到各个中心的距离。

In [154]:
from sklearn.cluster import KMeans

def feature_clustering(df, label, k, ref_label):
    kmeans = KMeans(n_clusters=k, init='random', n_init=30, max_iter=200, tol=0.001)
    kmeans.fit(df[label == ref_label])
    return kmeans.cluster_centers_

# 计算距离各聚类中心点的欧式距离
def dist_transform(df, centers, ref_label):
    final_data=pd.DataFrame()    
    for i in range(len(centers)):
        final_data[f"feature_distance_{ref_label}_{i}"] = np.linalg.norm(df - centers[i], axis=1)
    return final_data

train_label = train['Class']
if Feature_Derivation['K_dist']:
    # train数据集的聚类计算
    centers_0 = feature_clustering(pure_train, train_label, 20, 0)
    dist_feature_0 = dist_transform(pure_train, centers_0, 0)
    centers_1 = feature_clustering(pure_train, train_label, 15, 1)
    dist_feature_1 = dist_transform(pure_train, centers_1, 1)
    train = pd.concat([train, dist_feature_0, dist_feature_1], axis=1)

    # test数据集的聚类计算
    dist_feature_0 = dist_transform(pure_test, centers_0, 0)
    dist_feature_1 = dist_transform(pure_test, centers_1, 1)
    test = pd.concat([test, dist_feature_0, dist_feature_1], axis=1)

除了计算到每个聚类中心的距离，我们可以提取出每个数据点到Class0/1的聚类中心的距离的最大、最小、平均值。

In [155]:
def get_dis_feature(df, centers, i):
    dist_feature = dist_transform(df, centers, i)  # 所点距离正常类样本中心的距离（32个中心）
    min_dist_feature = dist_feature.min(axis=1)
    min_dist_feature.name = f"min_dist_feature_{i}"

    max_dist_feature = dist_feature.max(axis=1)
    max_dist_feature.name  = f"max_dist_feature_{i}"

    mean_dist_feature = dist_feature.mean(axis=1)
    mean_dist_feature.name  = f"mean_dist_feature_{i}"
    return min_dist_feature, max_dist_feature, mean_dist_feature

if Feature_Derivation['Maxmin_dist']:
    # train数据集
    train_label = train['Class']
    centers_0 = feature_clustering(pure_train, train_label, 32, 0)
    train_min_dist_0, train_max_dist_0, train_mean_dist_0 = get_dis_feature(pure_train, centers_0, 0)
    centers_1 = feature_clustering(pure_train, train_label, 32, 1)
    train_min_dist_1, train_max_dist_1, train_mean_dist_1 = get_dis_feature(pure_train, centers_1, 1)
    # train = pd.concat([train, train_min_dist_0, train_max_dist_0, train_mean_dist_0, train_min_dist_1, train_max_dist_1, train_mean_dist_1], axis=1)
    train = pd. concat([train, train_min_dist_0, train_min_dist_1], axis=1)

    # test数据集
    test_min_dist_0, test_max_dist_0, test_mean_dist_0 = get_dis_feature(pure_test, centers_0, 0)
    test_min_dist_1, test_max_dist_1, test_mean_dist_1 = get_dis_feature(pure_test, centers_1, 1)
    # test = pd.concat([test, test_min_dist_0, test_max_dist_0, test_mean_dist_0, test_min_dist_1, test_max_dist_1, test_mean_dist_1], axis=1)
    test = pd.concat([test, test_min_dist_0, test_min_dist_1], axis=1)

### 分箱（BN列）
通过绘制密度-年龄图我们可以看到，Class为1的样本对应的曲线比Class为0的样本对应的曲线要右移了一些，所以这体现出年龄较大时更可能得病。故我们可以在BN上进行一些分析和处理：对BN列进行聚类分箱，总共分为5-6类，增加新列BN_binning。

In [156]:
k = 7
if run_para == 'kaggle':
    BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
elif run_para == 'local':
    BNpd = train['BN']

BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
data = BNpd.values.reshape(-1, 1)
kmodel = KMeans(n_clusters=k)           # k为聚成几类
kmodel.fit(data)  # 训练模型
c = pd.DataFrame(kmodel.cluster_centers_, columns=['cc']) # 求聚类中心
c0 = pd.DataFrame({'cc': [0.0]})
c = pd.concat([c0, c], axis=0, ignore_index=True)
c = c.sort_values(by='cc').reset_index(drop=True)

# 求聚类中心之间的平均值作为分割点
for i in range(c.shape[0] - 1):
    c.iloc[i]['cc'] = (c.iloc[i]['cc'] + c.iloc[i+1]['cc']) / 2
c = c.drop(c.index[-1])

c0 = pd.DataFrame({'cc': [0.0]})
cn = pd.DataFrame({'cc': [max(train['BN'].max(), test['BN'].max()) * 5]})
c = pd.concat([c0, c, cn], axis=0, ignore_index=True)
c = c['cc'].round().astype(int)
c = c.unique()
range_num = c.shape[0] - 1
c = c.tolist()

# 保留旧BN，添加BN_binning
train_BN = train['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True).astype(int)
if Feature_Derivation['BN_bin']:
    train['BN_binning'] = train_binning

test_BN = test['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True).astype(int)
if Feature_Derivation['BN_bin']:
    test['BN_binning'] = test_binning

### 其他特征衍生

#### 多项式特征衍生
对DI、DU和Br、Bz进行二阶多项式特征衍生，

In [157]:
Multiply_features = [
#     ['DI', 'DU'],
#     ['DU', 'DU'],
    ['DU', 'FR'],
    ['DA', 'DE'],
    ['AB', 'GL'],
]

for j, columns_to_mul in enumerate(Multiply_features):
    mix_col = columns_to_mul[0] + '+' + columns_to_mul[1]
    train[mix_col] = train[columns_to_mul[0]] * train[columns_to_mul[1]]
    test[mix_col] = test[columns_to_mul[0]] * test[columns_to_mul[1]]

In [158]:
def change(X):
    X['out_GL'] = 0
    X.loc[X['GL']<1,'out_GL'] = X.loc[X['GL']<1,'GL'].map(lambda x : x-X.loc[X['GL']<1,'GL'].mean())
    X.loc[X['GL']>1.5,'out_GL'] = X.loc[X['GL']>1.5,'GL'].map(lambda x : x-X.loc[X['GL']>1.5,'GL'].mean())
    X.out_GL = X.out_GL.astype('float')
    X['DA*CS'] = np.log(X.DA*2 / X.CS**0.5)#0.2100892\
    return X

In [159]:
train = change(train)
test = change(test)
print(train.columns)

Index(['Id', 'AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class',
       'min_dist_feature_0', 'min_dist_feature_1', 'BN_binning', 'DU+FR',
       'DA+DE', 'AB+GL', 'out_GL', 'DA*CS'],
      dtype='object')


In [160]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

Poly_features = [
                 ['DI', 'DU'],
#                  ['BR', 'BZ'],
#                  ['CR', 'AB', 'FL',],
#                  ['CR'],
                ]

for j, columns_to_derive in enumerate(Poly_features):
    # 多项式特征衍生
    degree_dim = 2
    poly = PolynomialFeatures(degree=degree_dim, include_bias=False, interaction_only=False)
    # Z-Score标准化
    scaler = StandardScaler()
    
    poly_features = poly.fit_transform(train[columns_to_derive])
    scaled_features = scaler.fit_transform(poly_features)
    # 生成新的特征列名
    new_feature_names = [f"poly_{j}_{i}" for i in range(scaled_features.shape[1])]
    features_train_df = pd.DataFrame(scaled_features, columns=new_feature_names)
    if Feature_Derivation['Poly']:
        train = pd.concat([train, features_train_df], axis=1)
        train = train.drop(columns=columns_to_derive)
        for x in columns_to_derive:
            ft.remove(x)

    # 测试集
    poly_features_test = poly.transform(test[columns_to_derive])
    scaled_features_test = scaler.transform(poly_features_test)
    features_test_df = pd.DataFrame(scaled_features_test, columns=new_feature_names)
    if Feature_Derivation['Poly']:
        test = pd.concat([test, features_test_df], axis=1)
        test = test.drop(columns=columns_to_derive)

In [161]:
drop_col = [
    ['CL'],
    # ['EU', 'CW '],
    # ['count_isnull'],
    # ['feature_distance_0_0', 'feature_distance_0_1', 'feature_distance_0_2', 'feature_distance_0_3', 'feature_distance_0_4', 'feature_distance_0_5', 'feature_distance_0_6', 'feature_distance_0_7', 'feature_distance_0_8', 'feature_distance_0_9'],
    # ['feature_distance_1_0', 'feature_distance_1_1', 'feature_distance_1_2', 'feature_distance_1_3', 'feature_distance_1_4', 'feature_distance_1_5', 'feature_distance_1_6', 'feature_distance_1_7', 'feature_distance_1_8', 'feature_distance_1_9'],
    # ['BN_binning'],
#     ['CF', 'AF', 'FE', 'CR', 'BR', 'GH', 'EE']
    ['FD ','CS'],
    ['CW ', 'DV'],
    ['BD ', 'AR'],
]
for dc in drop_col:
    train = train.drop(columns=dc)
    test = test.drop(columns=dc)
    for x in dc:
        if x in ft:
            ft.remove(x)

need_to_add = ['min_dist_feature_0', 'min_dist_feature_1', 'BN_binning', 'DU+FR', 'DA+DE', 'AB+GL', 'out_GL', 'DA*CS']
for x in need_to_add:
    ft.append(x)
print(ft)
    
print(train.columns.tolist())
print(train.shape)

['AF', 'BQ', 'AB', 'FL', 'CR', 'DH', 'DA', 'EH', 'CD ', 'BP', 'BC', 'DL', 'EE', 'DE', 'GL', 'FR', 'FI', 'EB', 'CU', 'BN', 'min_dist_feature_0', 'min_dist_feature_1', 'BN_binning', 'DU+FR', 'DA+DE', 'AB+GL', 'out_GL', 'DA*CS']
['Id', 'AB', 'AF', 'AH', 'AM', 'AX', 'AY', 'AZ', 'BC', 'BN', 'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CR', 'CU', 'DA', 'DE', 'DF', 'DH', 'DL', 'DN', 'DY', 'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FE', 'FI', 'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL', 'Class', 'min_dist_feature_0', 'min_dist_feature_1', 'BN_binning', 'DU+FR', 'DA+DE', 'AB+GL', 'out_GL', 'DA*CS', 'poly_0_0', 'poly_0_1', 'poly_0_2', 'poly_0_3', 'poly_0_4']
(617, 62)


### 特征提取
predictor_columns中提取上面所有除了Class和Id的属性特征。

In [162]:
predictor_columns = [n for n in train.columns if n != 'Class' and n != 'Id']

我们可以将greeks.csv中的Epsilon，也就是时间，加入到train dataset中。

In [163]:
from datetime import datetime
times = greeks.Epsilon.copy()
times[greeks.Epsilon != 'Unknown'] = greeks.Epsilon[greeks.Epsilon != 'Unknown'].map(lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal())
times[greeks.Epsilon == 'Unknown'] = np.nan
times = times.astype(np.float64)

train_pred_and_time = pd.concat((train, times, greeks.Alpha), axis=1)
train_cate = train_pred_and_time.iloc[:, -1]        # A, B, D, G
train_pred_and_time = train_pred_and_time.drop(train_pred_and_time.columns[-1], axis=1)

test_predictors = test[predictor_columns]
test_time = np.zeros((len(test_predictors), 1)) + train_pred_and_time.Epsilon.max() + 1
test_pred_and_time = pd.concat((test_predictors, pd.DataFrame(test_time, columns=['Epsilon'])), axis=1)
# test_pred_and_time = train_pred_and_time.copy().drop(['Id', 'Class'], axis=1)

## Model & Evaluation

首先设置评判标准。在这次比赛中使用的评判标准是balanced log loss，公式如下：
$$
\text { Log Loss }=\frac{-\frac{1}{N_0} \sum_{i=1}^{N_0} y_{0 i} \log p_{0 i}-\frac{1}{N_1} \sum_{i=1}^{N_1} y_{1 i} \log p_{1 i}}{2}
$$
这样的目标是平衡两类的重要程度。

In [164]:
from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    nc = np.bincount(y_true)
    return log_loss(y_true, y_pred, sample_weight = 1/nc[y_true], eps=1e-15, labels=[0, 1])

def meta_to_bll(y_true, y_pred):
    y_pred = y_pred.reshape(-1, 4)
    y_true = np.array([0 if x == 0 else 1 for x in y_true])
    probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # 计算class=0的概率
    p1 = 1 - p0

    loss = balanced_log_loss(y_true, p1)
    return "bll", loss, False

def err(y_true, y_pred):
#     print(np.concatenate((y_true, y_pred), axis=1))
#     print(y_true, y_pred)
    return abs(y_true - y_pred)

In [165]:
y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.035, 0.035, 0.965, 0.965])
print(balanced_log_loss(y_true, y_pred))

0.03562717764315116


然后设计集成模型，这里使用了4个分类器，两个XGBoost，两个TabPFN。

In [166]:
XGB_params1 =  {'lambda': 0.0002570549301993319, 'alpha': 0.0017687267794924669, 'max_depth': 4, 'learning_rate': 0.09979383709722141, 'gamma': 0.07220786000357873, 'colsample_bytree': 0.939232633292054, 'min_child_weight': 2, 'subsample': 0.7443351925177553}
XGB_params2 =  {'lambda': 0.0003741769532691957, 'alpha': 0.00027386497304802976, 'max_depth': 9, 'learning_rate': 0.09571794639204703, 'gamma': 1.2542733834438399e-05, 'colsample_bytree': 0.5362641371247696, 'min_child_weight': 3, 'subsample': 0.11567163589480782}
# XGB_params2 =  {'lambda': 2.312149530300647e-06, 'alpha': 8.44800888247485e-06, 'max_depth': 9, 'learning_rate': 0.0606469701340082, 'gamma': 4.135034256351954e-08, 'colsample_bytree': 0.4998641476146233, 'min_child_weight': 3, 'subsample': 0.13052486707638683}
LGBM_params1 = {'lambda_l1': 0.013086657734924316, 'lambda_l2': 0.021818112097617245, 'num_leaves': 7, 'learning_rate': 0.17939656611410498, 'feature_fraction': 0.6470978553425908, 'bagging_fraction': 0.9259314688300568, 'bagging_freq': 1, 'min_child_samples': 98}
LGBM_params2 = {'lambda_l1': 2.125504936636207e-07, 'lambda_l2': 0.0004897340091318311, 'num_leaves': 8, 'learning_rate': 0.053376510932806945, 'feature_fraction': 0.4748425486495317, 'bagging_fraction': 0.38288586743438613, 'bagging_freq': 4, 'min_child_samples': 68}
LGBM_params3 = {'colsample_bytree': 0.41751822001010247,
                'learning_rate': 0.059510850084881564,
                'max_depth': 10,
                'min_child_samples': 14,
                'num_leaves': 255,
                'reg_alpha': 0.05762416020785592,
                'reg_lambda': 0.03489804289769388,
                'subsample': 0.6517804799892094,
                }
LGBM_params4 = {
        'objective': 'binary', 
        'metric': 'binary_logloss', 
        'boosting': 'goss',
        'learning_rate': 0.09110460114828077,
        'num_leaves': 8,
        'feature_fraction': 0.4989639912997521,
        'bagging_fraction': 0.54872439795985,
        'lambda_l1': 1.4522184914523175, 
        'lambda_l2': 1.7873553090132748e-08,
        'is_unbalance':True, 
        'seed': 42,
    }
LGBM_params_13 = {
    "max_depth": 4,
    "num_leaves": 9,
    "min_child_samples": 17,
    "n_estimators": 200,
    "learning_rate": 0.15,
    "colsample_bytree": 0.4,
    "min_split_gain": 1e-4,
    "reg_alpha": 1e-2,
    "reg_lambda": 5e-3,
}

In [167]:
xgb_optuna1 = {
    'n_estimators': 2000,
    'learning_rate': 0.09641232707445854,
    'booster': 'gbtree',
    'lambda': 4.666002223704784,
    'alpha': 3.708175990751336,
    'subsample': 0.6100174145229473,
    'colsample_bytree': 0.5506821152321051,
    'max_depth': 7,
    'min_child_weight': 3,
    'eta': 1.740374368661041,
    'gamma': 0.007427363662926455,
    'grow_policy': 'depthwise',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'verbosity': 0,
    'random_state': 666,
}

xgb_optuna2 = {
    'n_estimators': 2000,
    'learning_rate': 0.012208383405206188,
    'booster': 'gbtree',
    'lambda': 0.009968756668882757,
    'alpha': 0.02666266827121168,
    'subsample': 0.7097814108897231,
    'colsample_bytree': 0.7946945784285216,
    'max_depth': 3,
    'min_child_weight': 4,
    'eta': 0.5480204506554545,
    'gamma': 0.8788654128774149,
    'scale_pos_weight': 4.71,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'verbosity': 0,
    'random_state': 38,
}

xgb_params4 = {
    'n_estimators': 2000,
    'colsample_bytree': 0.8757972257439255,
    'gamma': 0.11135738771999848,
    'max_depth': 7,
    'min_child_weight': 3,
    'reg_alpha': 0.4833998914998038,
    'reg_lambda': 0.006223568555619563,
    'scale_pos_weight': 8,
    'subsample': 0.7056434340275685,
    'random_state': 424
}

lgbm_params_2_2 = { 'boosting_type': 'goss', 
                    'learning_rate': 0.06733232950390658, 
                    'n_estimators': 50000, 
                    'early_stopping_round': 300, 
                    'random_state': 810,
                    'subsample': 0.6970532011679706,
                    'colsample_bytree': 0.6055755840633003,
                    'class_weight': 'balanced',
                    'metric': 'none', 
                    'is_unbalance': True,
                    'max_depth': 8
}

In [168]:
from scipy.stats.mstats import gmean

class Ensemble():
    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        self.imputer4Tab = SimpleImputer(missing_values=np.nan, strategy='median')
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.classifiers = [
            # xgboost
            xgboost.XGBClassifier(**XGB_params1, n_estimators=2000),
            xgboost.XGBClassifier(**XGB_params2, n_estimators=2000),
#             xgboost.XGBClassifier(nestimators=2000),
#             xgboost.XGBClassifier(colsample_bytree=0.8374643164834153, gamma=0.1, learning_rate=0.16064608298859606, max_depth=7, min_child_weight=1, n_estimators=500, reg_alpha=0.5, subsample=0.6687393925308378, random_state=19),
            
            # lightGBM
            lgb.LGBMClassifier(**LGBM_params1,n_estimators=3000, early_stopping_rounds=100, verbose=-1),
#             lgb.LGBMClassifier(**LGBM_params2,n_estimators=3000, early_stopping_rounds=100, verbose=-1),
#             lgb.LGBMClassifier(**LGBM_params3,boosting_type='goss', random_state=42,class_weight='balanced',verbose=-1,n_estimators = 10000,early_stopping_rounds=300),
            lgb.LGBMClassifier(**LGBM_params_13),
#             lgb.LGBMClassifier(**LGBM_params_13),
            
            # TabPFN
            TabPFNClassifier(N_ensemble_configurations=64, device=self.device),
#             TabPFNClassifier(N_ensemble_configurations=128, device=self.device),
            TabPFNClassifier(N_ensemble_configurations=256, device=self.device),

        ]
        self.classifiers_2 = [
            lgb.LGBMClassifier(**LGBM_params4, n_estimators=3000, early_stopping_rounds=100, verbose=-1),
            lgb.LGBMClassifier(**lgbm_params_2_2),
            
            CatBoostClassifier(verbose=0,),
            
            xgboost.XGBClassifier(**xgb_optuna1),
            xgboost.XGBClassifier(**xgb_optuna2),
#             xgboost.XGBClassifier(**xgb_params4),
            
            TabPFNClassifier(N_ensemble_configurations=32, device=self.device),
        ]
    
    def fit(self, X, X4Tab, y, x_val, y_val_meta):
        y = y.values
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes
#         print(self.classes_)
        X = self.imputer.fit_transform(X)
        X4Tab = self.imputer4Tab.fit_transform(X4Tab)
        
        x_val = x_val.values
        y_val_meta = y_val_meta.values
        y_val_meta = np.searchsorted(self.classes_, y_val_meta)
        for classifier in self.classifiers:
            # if classifier != self.classifiers[0] and classifier != self.classifiers[1]:
            if isinstance(classifier, TabPFNClassifier):
                classifier.fit(X4Tab, y, overwrite_warning =True)
            elif isinstance(classifier, lgb.LGBMClassifier):
                classifier.fit(X, y, eval_set=[(x_val, y_val_meta)], 
                               eval_metric='multi_logloss',
                               verbose=10000,
#                                 eval_metric=meta_to_bll,
#                                callbacks=[empty_callback],
                               )
            elif isinstance(classifier, xgboost.XGBClassifier):
                classifier.fit(X, y, eval_set=[(x_val, y_val_meta)], early_stopping_rounds=100, verbose=False)
            else :
                classifier.fit(X, y)
        
        y_2 = y.copy()
        y_val_meta_2 = y_val_meta.copy()
        y_2[y_2 > 0] = 1
        y_val_meta_2[y_val_meta_2 > 0] = 1
        for classifier in self.classifiers_2:
            if isinstance(classifier, TabPFNClassifier):
                classifier.fit(X, y_2, overwrite_warning =True)
            elif isinstance(classifier, lgb.LGBMClassifier):
                classifier.fit(X, y_2, eval_set=[(x_val, y_val_meta_2)], 
                               eval_metric='logloss',
                               verbose=10000,
                               )
            elif isinstance(classifier, xgboost.XGBClassifier):
                classifier.fit(X, y_2, eval_set=[(x_val, y_val_meta_2)], early_stopping_rounds=100, verbose=False)
            else :
                classifier.fit(X, y_2)
    
    def predict_proba(self, x, x4Tab):
        x = self.imputer.transform(x)
        x4Tab = self.imputer4Tab.transform(x4Tab)
        
        # As for 4 categories, we squeeze into 2 firstly
        probabilities = np.stack([classifier.predict_proba(x4Tab) if isinstance(classifier, TabPFNClassifier) else classifier.predict_proba(x) for classifier in self.classifiers])
        probabilities[:, :, 1] = probabilities[:, :, 1:].sum(axis=2)   # calc sum to binary results
        probabilities = probabilities[:, :, :2].copy()
        
        probabilities_2 = np.stack([classifier.predict_proba(x4Tab) if isinstance(classifier, TabPFNClassifier) else classifier.predict_proba(x) for classifier in self.classifiers_2])
        probabilities = np.concatenate((probabilities, probabilities_2), axis=0)
        
        if Post_Process['Ensemble_method'] == 'mean':
            averaged_probabilities = np.mean(probabilities, axis=0)
        elif Post_Process['Ensemble_method'] == 'gmean':
            averaged_probabilities = gmean(probabilities, axis=0)
            
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()
        
        # Weighted probabilities based on class imbalance(default 1:1)
        new_probabilities = averaged_probabilities
        if Post_Process['trival_calib'] == True:
            new_probabilities = averaged_probabilities * np.array([[(0.58/class_0_est_instances if i==0 else 0.42/others_est_instances) for i in range(averaged_probabilities.shape[1])]])
    
        odds = Post_Process["boost"] * new_probabilities[:, 0] / (1 - new_probabilities[:, 0])
        new_probabilities[:, 0] = odds / (1 + odds)
        
        new_probabilities = new_probabilities / np.sum(new_probabilities, axis=1, keepdims=1) 
        
        return new_probabilities

这里我们设置了两个KFold，一个为outer，用于选80%的trainning dataset和20%的validation dataset；
一个为inner，用于对trainning dataset分成5折，分别训练出5个模型（5-折模型）。用5-折模型对outer分出的20%的validation dataset预测并计算balanced log loss。
最后选取效果最好的5-折模型对test预测（即分别用5个模型预测，取均值）

In [169]:
from sklearn.model_selection import KFold as KF, GridSearchCV
from sklearn.model_selection import StratifiedKFold as SKF


cv_outer = SKF(n_splits = 8, shuffle=True, random_state=20230806)
cv_inner = KF(n_splits = 5, shuffle=True, random_state=19)

In [170]:
# 计算预测准确率
def calc_acc(y_pred, y):
    probabilities = np.concatenate((y_pred[:, :1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # 计算class=0的概率
    p1 = 1 - p0
    
    y = y.values.astype(int)
    cnt = 0

    for i in range(len(p0)):
        if p0[i] >= p1[i]:
            lab = 0
        else :
            lab = 1

        if lab == y[i]:
            cnt += 1

    return cnt / len(p0)

# 计算balanced log loss
def calc_loss(y_pred, y):
    probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # 计算class=
    
    if Post_Process["optimal"] == True:
        N = p0.sum()
        M = probabilities.sum() - N
#         print(M, N, y.shape)
        p0 = (p0 * (M+1)) / (p0*(M-N) + N + 1)
        p_non = probabilities[:, :1]
    
    
    if Post_Process['Set_thres'] == True:
        p0[(p0 > thres_high_l) & (p0 < thres_high_r)] = thres_high_r
        p0[(p0 < thres_low_r) & (p0 > thres_low_l)] = thres_low_l

    p1 = 1 - p0
    
    y = y.values.astype(int)
    loss = balanced_log_loss(y, p1)

    return loss

In [171]:
from tqdm.notebook import tqdm

ros = RandomOverSampler(random_state=42)

def training(model, x, y, y_meta):
    low_loss = np.inf
    best_models = []
    best_loss = []

    for out_id, (train_idx, val_idx) in enumerate(cv_outer.split(x, y_meta), start=1):
        print(f'Now for outer fold {out_id}:')
        x_train_ori, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train_ori, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]

        x_train, y_train = ros.fit_resample(x_train_ori, y_train_ori)
        # x_train, y_train = x_train_ori, y_train_ori
        
        train_loss = np.zeros((x_train.shape[0], 4))
    
        out_X, out_y_meta = x_train, y_train
        out_y = out_y_meta.apply(lambda x: 0 if x == 'A' else 1)
        
        models = []
        losses = []

        for in_id, (train_idx1, val_idx1) in enumerate(cv_inner.split(out_X), start=1):
            in_x_train, in_x_val = out_X.iloc[train_idx1], out_X.iloc[val_idx1]
            in_y_train, in_y_val = out_y_meta.iloc[train_idx1], out_y.iloc[val_idx1]
            in_y_val_meta = out_y_meta.iloc[val_idx1]

            model.fit(in_x_train, in_y_train, in_x_val, in_y_val_meta)
            models.append(model) 

            y_pred = model.predict_proba(in_x_val)
            train_loss[val_idx1] = y_pred

            metric = calc_loss(y_pred, in_y_val)
            losses.append(metric)
            print('Inner_fold = %.1f, val_loss = %.5f' % (in_id, metric))
        
        # 分别用models中的模型计算x_val的loss
        val_y_pred = np.zeros((x_val.shape[0], 4))
        for los_idx, model in enumerate(models):
            y_pred = model.predict_proba(x_val)
            val_y_pred += y_pred
        val_y_pred /= len(models)

        metric_train = calc_loss(train_loss, out_y)
        acc_train = calc_acc(train_loss, out_y)
        print(f'80% Train Loss: {metric_train}; Train Acc: {acc_train}')
        metric_val = calc_loss(val_y_pred, y_val)
        acc_val = calc_acc(val_y_pred, y_val)
        print(f'20% Val Loss: {metric_val}; Val Acc: {acc_val}\n')

        if metric_val < low_loss:
            low_loss = metric_val
            best_models = models
            best_loss = losses

        # break       # 先只跑一次，节约时间    
        
    return best_models, best_loss

In [172]:
def training_kaggle(model, x, y, y_meta):
    models = []
    losses = []
    errors = np.zeros((x.shape[0], ), dtype=np.float64)
    train_preds = np.zeros((x.shape[0], 2), dtype=np.float64)

    for out_id, (train_idx, val_idx) in enumerate(cv_outer.split(x, y_meta), start=1):
        print(f'Now for fold {out_id}:')
        x_train_ori, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train_ori, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]
        y_val_meta = y_meta.iloc[val_idx]
        y_train_label = y.iloc[train_idx]

        cols_x_train_ori = len(x_train_ori.columns)
        y_train_ori_df = pd.DataFrame(y_train_ori, columns=['Alpha'])
        x_train_ori_comb = pd.concat((x_train_ori, y_train_ori_df), axis=1)

#         train_ros, y_nonsense = ros.fit_resample(x_train_ori_comb, y_train_label)    # 按 0/1 over sample
#         x_train = train_ros.iloc[:, :cols_x_train_ori].copy()
#         y_train = train_ros.iloc[:, cols_x_train_ori:].copy()
        x_train, y_train = ros.fit_resample(x_train_ori, y_train_ori)         # 按 A/B/D/G over sample
        # x_train, y_train = x_train_ori, y_train_ori

        out_X, out_y_meta = x_train, y_train
        # out_y = out_y_meta.apply(lambda x: 0 if x == 'A' else 1)
        out_X4Tab = out_X.copy()         # try to extract the useful features for TabPFN
        model.fit(out_X, out_X4Tab, out_y_meta, x_val, y_val_meta)
        models.append(model)
        
        # 用训练好的model计算x_val的loss
        x_val4Tab = x_val.copy()
        val_y_pred = model.predict_proba(x_val, x_val4Tab) # try to extract the useful features for TabPFN
        train_preds[val_idx] = val_y_pred
        errors[val_idx] = err(y_val, val_y_pred[:, 1:].sum(axis=1))

        metric_val = calc_loss(val_y_pred, y_val)
        losses.append(metric_val)
        acc_val = calc_acc(val_y_pred, y_val)
        print(f'20% Val Loss: {metric_val}; Val Acc: {acc_val}\n')

        # break       # 先只跑一次，节约时间    
        
    return models, losses, errors, train_preds

In [173]:
def training_loo(model, x, y, y_meta):
    loo = LeaveOneOut()
    
    models = []
    errors = []
    train_preds = np.zeros((x.shape[0], 4), dtype=np.float64)

    for train_idx, val_idx in tqdm(loo.split(x), total = x.shape[0]):
        x_train_ori, x_val = x.iloc[train_idx], x.iloc[val_idx]
        y_train_ori, y_val = y_meta.iloc[train_idx], y.iloc[val_idx]
        y_val_meta = y_meta.iloc[val_idx]

#         x_train, y_train = ros.fit_resample(x_train_ori, y_train_ori)
        x_train, y_train = x_train_ori, y_train_ori
    
        model.fit(x_train, y_train, x_val, y_val_meta)
        models.append(model)
        
        val_y_pred = model.predict_proba(x_val)
        train_preds[val_idx] = val_y_pred
        
        errors.append(err(y_val, val_y_pred[:,1:].sum()))
    
    return models, errors, train_preds

用模型对验证集在本地进行指标评估(balanced log loss).

In [174]:
x_ = train_pred_and_time.drop(['Class', 'Id'], axis=1)
y_ = train_pred_and_time['Class']
y_meta_ = train_cate

yt = Ensemble()

print(x_.shape, y_.shape)
# models, losses = training(yt, x_, y_, y_meta_)
models, losses, errors, train_preds = training_kaggle(yt, x_, y_, y_meta_)
# models, errors, train_preds = training_loo(yt, x_, y_, y_meta_)

Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
(617, 61) (617,)
Now for fold 1:
20% Val Loss: 0.10343073104028024; Val Acc: 0.9743589743589743

Now for fold 2:








20% Val Loss: 0.31943874994328153; Val Acc: 0.8831168831168831

Now for fold 3:








20% Val Loss: 0.185349739138346; Val Acc: 0.935064935064935

Now for fold 4:








20% Val Loss: 0.07878315922568181; Val Acc: 0.961038961038961

Now for fold 5:








20% Val Loss: 0.13817745545294338; Val Acc: 0.961038961038961

Now for fold 6:








20% Val Loss: 0.19042126322181466; Val Acc: 0.8961038961038961

Now for fold 7:








20% Val Loss: 0.11222276237234641; Val Acc: 0.961038961038961

Now for fold 8:








20% Val Loss: 0.13046371600959747; Val Acc: 0.935064935064935



计算一下对训练集的预测的概率的loss，以及使用probability calibration后的loss.
这里的probability calibration我们尝试了Platt Scale & Isotionic Regression

In [175]:
from sklearn.isotonic import IsotonicRegression
# from betacal import BetaCalibration

train_label = train_pred_and_time['Class']

# before calibration
print('Before calibration:')
print(f'Bll: {calc_loss(train_preds, train_label)}, Acc: {calc_acc(train_preds, train_label)}\n')

# after calibration
print('After calibration:')
train_preds2 = train_preds[:, 1:].sum(axis=1).reshape(-1, 1)
ros_pc = RandomOverSampler(random_state=38)
train_preds2_ros, train_label_ros = ros_pc.fit_resample(train_preds2, train_label)

if Post_Process['Calibration'] == 'bc':
    Calib = BetaCalibration(parameters="abm")
    Calib.fit(train_preds2_ros, train_label_ros)
    print(f'Bll: {balanced_log_loss(train_label, Calib.predict(train_preds2))}\n')
elif Post_Process['Calibration'] == 'iso':
    Calib = IsotonicRegression(out_of_bounds='clip')
    Calib.fit(train_preds2_ros, train_label_ros)
    print(f'Bll: {balanced_log_loss(train_label, Calib.predict(train_preds2))}\n')

# print(np.concatenate((train_preds2, train_label.values.reshape(-1, 1)), axis=1))

Before calibration:
Bll: 0.1567333645748055, Acc: 0.9384116693679092

After calibration:


In [176]:
errors = pd.DataFrame(errors)
errors.join(greeks.Alpha).sort_values(0).tail(30)

Unnamed: 0,0,Alpha
203,0.583941,A
503,0.592838,A
146,0.596212,B
61,0.610563,A
337,0.61238,A
521,0.612414,A
322,0.61423,A
55,0.619037,A
220,0.62244,A
313,0.636637,B


In [177]:
errors.join(greeks.Alpha).groupby('Alpha').agg(['mean', 'std', 'max'])

Unnamed: 0_level_0,0,0,0
Unnamed: 0_level_1,mean,std,max
Alpha,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,0.095126,0.183645,0.95011
B,0.106027,0.16832,0.741273
D,0.176148,0.254974,0.99367
G,0.061872,0.108515,0.509798


In [178]:
test_pred_and_time4Tab = test_pred_and_time.copy()
y_pred = np.zeros((test_pred_and_time.shape[0], 2))
# print(len(models))
for los_idx, model in enumerate(models):
    y_pred += model.predict_proba(test_pred_and_time, test_pred_and_time4Tab) / losses[los_idx]
y_pred = y_pred / y_pred[:, :].sum(axis=1, keepdims=True)

print(f"Weight: {[(1/x) for x in losses]}")

probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:, :1]
p1 = 1 - p0
p0 = p0.astype(np.float64)

if Post_Process["optimal"] == True:
    N = p0.sum()
    M = probabilities.sum() - N
#         print(M, N, y.shape)
    p0 = (p0 * (M+1)) / (p0*(M-N) + N + 1)
    p_non = probabilities[:, :1]

if Post_Process['Calibration'] != 'none':
    p1 = Calib.predict(p1)
    p0 = 1 - p1
    
if Post_Process['Set_thres'] == True:
    p0[p0 > thres_high_l & p0 < thres_high_r] = thres_high_r
    p0[p0 < thres_low_r & po > thres_low_l] = thres_low_l

KeyboardInterrupt: 

In [None]:
submission = pd.DataFrame(test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1.0 - p0
submission.to_csv('submission.csv', index=False)

In [None]:
# submission_df = pd.read_csv('submission.csv')
print(submission.head(20))