ICR Competition

In [1259]:
# !pip install tabpfn --no-index --find-links=file:///kaggle/input/pip-packages-icr/pip-packages
# !mkdir -p /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff
# !cp /kaggle/input/pip-packages-icr/pip-packages/prior_diff_real_checkpoint_n_0_epoch_100.cpkt /opt/conda/lib/python3.10/site-packages/tabpfn/models_diff/

In [1260]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

warnings.filterwarnings("ignore")

先加载各个数据文件

In [1261]:
# Directory holding the competition CSV files. When running on Kaggle, point this
# at '/kaggle/input/icr-identify-age-related-conditions' instead.
DATA_DIR = './icr-identify-age-related-conditions'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
greeks = pd.read_csv(f'{DATA_DIR}/greeks.csv')

然后将EJ属性二值化（0/1）。

In [1262]:
# Encode EJ as 1 where it equals the first category seen in train, 0 otherwise.
# The reference category comes from train only and is reused for test.
first_category = train.EJ.unique()[0]
for frame in (train, test):
    frame['EJ'] = (frame['EJ'] == first_category).astype('int')

# Snapshot of the test frame taken right after the EJ encoding.
origin_test = test.copy()

根据Discussion Post，我们可以将原数据集中的某些属性转化成整数取值，并且猜测BN属性可能是指年龄，所以我们可以在BN上进行一些操作。

In [1263]:
# Per-column divisor: each listed column is divided by its value and rounded to
# one decimal place, for both train and test.
# NOTE(review): the keys 'BD ', 'CD ', 'CW ' and 'FD ' carry a trailing space and
# are used verbatim as column labels below — presumably the CSV headers are
# spelled the same way; confirm before "cleaning" them.
int_denominators = {
    'AB': 0.004273,
    'AF': 0.00242,
    'AH': 0.008709,
    'AM': 0.003097,
    'AR': 0.005244,
    'AX': 0.008859,
    'AY': 0.000609,
    'AZ': 0.006302,
    'BC': 0.007028,
    'BD ': 0.00799,
    'BN': 0.3531,
    'BP': 0.004239,
    'BQ': 0.002605,
    'BR': 0.006049,
    'BZ': 0.004267,
    'CB': 0.009191,
    'CC': 6.12e-06,
    'CD ': 0.007928,
    'CF': 0.003041,
    'CH': 0.000398,
    'CL': 0.006365,
    'CR': 7.5e-05,
    'CS': 0.003487,
    'CU': 0.005517,
    'CW ': 9.2e-05,
    'DA': 0.00388,
    'DE': 0.004435,
    'DF': 0.000351,
    'DH': 0.002733,
    'DI': 0.003765,
    'DL': 0.00212,
    'DN': 0.003412,
    'DU': 0.0013794,
    'DV': 0.00259,
    'DY': 0.004492,
    'EB': 0.007068,
    'EE': 0.004031,
    'EG': 0.006025,
    'EH': 0.006084,
    'EL': 0.000429,
    'EP': 0.009269,
    'EU': 0.005064,
    'FC': 0.005712,
    'FD ': 0.005937,
    'FE': 0.007486,
    'FI': 0.005513,
    'FR': 0.00058,
    'FS': 0.006773,
    'GB': 0.009302,
    'GE': 0.004417,
    'GF': 0.004374,
    'GH': 0.003721,
    'GI': 0.002572
}
for frame in (train, test):
    for column, denominator in int_denominators.items():
        frame[column] = np.round(frame[column] / denominator, 1)

通过绘制密度-年龄图我们可以看到，Class为1的样本对应的曲线比Class为0的样本对应的曲线右移了一些，这体现出年龄较大时更可能得病。为了增强BN这一属性对预测结果的影响，我们尝试将BN属性复制多份加入到数据中（下面的代码对CR、CU、DE三个属性也做了同样的复制，每个属性各复制3份）。

predictor_columns中保存除Id和Class以外的全部特征列（56个原始属性特征，加上上面复制出的列）。

In [1264]:
# Columns to duplicate: each one gets three extra copies named <col>1, <col>2, <col>3
# in both train and test.
repeat_attr = ['BN', 'CR', 'CU', 'DE']
for frame in (train, test):
    for attr in repeat_attr:
        for copy_no in (1, 2, 3):
            frame[f'{attr}{copy_no}'] = frame[attr].copy()

# Every column except the target and the row identifier is used as a predictor.
predictor_columns = [col for col in train.columns if col not in ('Class', 'Id')]

设置评判标准。在这次比赛中使用的评判标准是balanced log loss，公式如下：
$$
\text { Log Loss }=\frac{-\frac{1}{N_0} \sum_{i=1}^{N_0} y_{0 i} \log p_{0 i}-\frac{1}{N_1} \sum_{i=1}^{N_1} y_{1 i} \log p_{1 i}}{2}
$$
这样的目标是平衡两类的重要程度。

In [1265]:
from sklearn.metrics import log_loss
def balanced_log_loss(y_true, y_pred):
    """Binary log loss in which both classes contribute equally.

    Each sample is weighted by the inverse of its class frequency in ``y_true``
    and the weighted mean of the per-sample log losses is returned, so the
    majority class cannot dominate the score.

    Implemented with numpy only so that it does not depend on the ``eps``
    argument of ``sklearn.metrics.log_loss`` (not accepted by every
    scikit-learn release).

    Parameters
    ----------
    y_true : array-like of shape (n,)
        Ground-truth labels; every value must be 0 or 1.
    y_pred : array-like of shape (n,), (n, 1) or (n, 2)
        Probability of class 1 per sample, or one column per class
        (class 0 first, class 1 second).

    Returns
    -------
    float
        The balanced log loss.

    Raises
    ------
    ValueError
        If the input is empty, a label is outside {0, 1}, or the number of
        predictions does not match the number of labels.
    """
    eps = 1e-15

    labels = np.asarray(y_true).astype(int).ravel()
    if labels.size == 0:
        raise ValueError("y_true must contain at least one sample")
    if labels.min() < 0 or labels.max() > 1:
        raise ValueError("y_true must only contain the labels 0 and 1")

    scores = np.asarray(y_pred, dtype=float)
    if scores.ndim == 2 and scores.shape[1] == 2:
        probs = np.clip(scores, eps, 1 - eps)
    else:
        # Clip before taking the complement so neither column can reach exactly
        # 0 (which would give an infinite loss).
        p1 = np.clip(scores.reshape(-1), eps, 1 - eps)
        probs = np.column_stack((1 - p1, p1))
    if probs.shape[0] != labels.shape[0]:
        raise ValueError("y_pred and y_true must describe the same number of samples")
    probs = probs / probs.sum(axis=1, keepdims=True)

    per_sample = -np.log(probs[np.arange(labels.size), labels])

    # Inverse class frequency; only counts of classes that actually occur are
    # ever indexed, so there is no division by zero.
    class_counts = np.bincount(labels, minlength=2)
    weights = 1.0 / class_counts[labels]
    return float(np.sum(weights * per_sample) / np.sum(weights))

In [1266]:
# Sanity check of the metric: score an "always class 1" prediction on a small
# imbalanced sample (3 positives, 4 negatives).
y_true = np.array([1, 1, 1, 0, 0, 0, 0], dtype='int')
y_pred = np.ones(len(y_true), dtype='float64')
bll = balanced_log_loss(y_true, y_pred)
print(bll)

17.26978799617044


In [1267]:
from datetime import datetime
# Turn the Epsilon date strings (month/day/year) into ordinal day numbers;
# entries marked 'Unknown' become NaN.
is_known = greeks.Epsilon != 'Unknown'
times = greeks.Epsilon.copy()
times[is_known] = greeks.Epsilon[is_known].map(
    lambda x: datetime.strptime(x,'%m/%d/%Y').toordinal()
)
times[~is_known] = np.nan

In [1268]:
# Training frame = train columns + Epsilon time + Alpha. Alpha is the last
# column; it is pulled out as the fine-grained label and removed from the frame.
train_with_alpha = pd.concat((train, times, greeks.Alpha), axis=1)
train_label = train_with_alpha['Class']
train_cate = train_with_alpha.iloc[:, -1]         # A, B, D, G
train_pred_and_time = train_with_alpha.drop(train_with_alpha.columns[-1], axis=1)

# Test rows get a constant Epsilon: the largest training value plus one.
test_predictors = test[predictor_columns]
latest_train_time = train_pred_and_time.Epsilon.max()
test_time = np.zeros((len(test_predictors), 1)) + latest_train_time + 1
test_pred_and_time = pd.concat(
    (test_predictors, pd.DataFrame(test_time, columns=['Epsilon'])),
    axis=1,
)

In [1269]:
# Additional adjustment to the train and test frames: remove the EH column.
train_pred_and_time = train_pred_and_time.drop(columns=['EH'])
test_pred_and_time = test_pred_and_time.drop(columns=['EH'])

设计集成模型，这里使用了4个分类器，两个XGBoost，两个TabPFN。

In [1270]:
class Ensemble():
    """Average of two XGBoost and two TabPFN classifiers behind one imputer.

    Missing values are filled with the per-column median learned in ``fit``.
    ``predict_proba`` averages the members' probabilities and then re-balances
    them between class 0 and all other classes.
    """

    def __init__(self):
        self.imputer = SimpleImputer(missing_values=np.nan, strategy='median')

        self.classifiers =[xgboost.XGBClassifier(n_estimators=100,max_depth=3,learning_rate=0.15,subsample=0.9,colsample_bytree=0.85),
                           xgboost.XGBClassifier(),
                           TabPFNClassifier(N_ensemble_configurations=24),
                           TabPFNClassifier(N_ensemble_configurations=64)]

    def fit(self, X, y):
        """Fit the imputer and every member classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features; may contain NaN.
        y : array-like of shape (n_samples,)
            Training labels (any hashable values). They are integer-encoded in
            sorted order; the original values are kept in ``self.classes_``.

        Returns
        -------
        Ensemble
            ``self``, so calls can be chained.
        """
        y = np.asarray(y)
        unique_classes, y = np.unique(y, return_inverse=True)
        self.classes_ = unique_classes

        X = self.imputer.fit_transform(X)

        for classifier in self.classifiers:
            # Pass ``overwrite_warning`` only to members whose ``fit`` accepts it,
            # instead of relying on their position in ``self.classifiers``.
            if 'overwrite_warning' in inspect.signature(classifier.fit).parameters:
                classifier.fit(X, y, overwrite_warning=True)
            else:
                classifier.fit(X, y)
        return self

    def predict_proba(self, x):
        """Return re-balanced, row-normalized class probabilities.

        Parameters
        ----------
        x : array-like of shape (n_samples, n_features)
            Features to score; may contain NaN.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)
            Column 0 is class 0; each row sums to 1.
        """
        x = self.imputer.transform(x)

        probabilities = np.stack([classifier.predict_proba(x) for classifier in self.classifiers])
        averaged_probabilities = np.mean(probabilities, axis=0)

        # Expected number of samples in class 0 vs. in all other classes,
        # estimated from the averaged probabilities of this batch.
        class_0_est_instances = averaged_probabilities[:, 0].sum()
        others_est_instances = averaged_probabilities[:, 1:].sum()

        # Divide column 0 by the class-0 estimate and every other column by the
        # "others" estimate, so the larger group is down-weighted.
        n_classes = averaged_probabilities.shape[1]
        column_weights = np.array(
            [[1 / (class_0_est_instances if i == 0 else others_est_instances) for i in range(n_classes)]]
        )
        new_probabilities = averaged_probabilities * column_weights

        return new_probabilities / np.sum(new_probabilities, axis=1, keepdims=True)

这里我们设置了两个KFold，一个为outer，用于选80%的training dataset和20%的validation dataset；
一个为inner，用于对training dataset分成5折，分别训练出5个模型（5-折模型）。用5-折模型对outer分出的20%的validation dataset预测并计算balanced log loss。
最后选取效果最好的5-折模型对test预测（即分别用5个模型预测，取均值）。
注意：下面的代码中inner KFold（cv_inner）目前被注释掉了，实际只运行了outer这一层5折划分。

In [1271]:
from sklearn.model_selection import StratifiedKFold as SKF, GridSearchCV

# Outer stratified 5-fold split; shuffling uses a fixed seed.
cv_outer = SKF(n_splits = 5, shuffle=True, random_state=42)
# Inner split, currently disabled:
# cv_inner = SKF(n_splits = 5, shuffle=True, random_state=19)

In [1272]:
def calc_acc(y_pred, y):
    """Return the fraction of samples whose binary prediction matches ``y``.

    ``y_pred`` is an (n, k) array of class probabilities whose first column is
    class 0. A sample is predicted as 0 when that probability is at least as
    large as its complement, otherwise as 1. ``y`` must expose ``.values``
    (e.g. a pandas Series) with 0/1 labels.
    """
    # Collapse to two columns: class 0 vs. the sum of all remaining classes.
    collapsed = np.concatenate(
        (y_pred[:, :1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1
    )
    prob_class0 = collapsed[:, 0]       # probability of class 0
    prob_class1 = 1 - prob_class0

    truth = y.values.astype(int)
    predicted = np.where(prob_class0 >= prob_class1, 0, 1)
    hits = int(np.count_nonzero(predicted == truth))

    return hits / len(prob_class0)

In [1273]:
def calc_loss(y_pred, y, upper=0.75, lower=0.20):
    """Balanced log loss of thresholded class-0 probabilities.

    Parameters
    ----------
    y_pred : numpy.ndarray of shape (n, k)
        Class probabilities; column 0 is class 0. Not modified.
    y : pandas.Series
        True 0/1 labels (anything exposing ``.values``).
    upper : float, default 0.75
        Class-0 probabilities above this value are snapped to 1.
    lower : float, default 0.20
        Class-0 probabilities below this value are snapped to 0.

    Returns
    -------
    float
        ``balanced_log_loss`` of the resulting class-1 probabilities.
    """
    # ``concatenate`` builds a fresh array, so the in-place snapping below
    # never touches the caller's ``y_pred``.
    probabilities = np.concatenate((y_pred[:, :1], np.sum(y_pred[:, 1:], 1, keepdims=True)), axis=1)
    p0 = probabilities[:, :1]       # probability of class 0
    p0[p0 > upper] = 1
    p0[p0 < lower] = 0

    p1 = 1 - p0

    y = y.values.astype(int)
    return balanced_log_loss(y, p1)

In [1274]:
from tqdm.notebook import tqdm

def training(model, x, y, y_meta):
    """Fit ``model`` on ``(x, y_meta)`` and score it on the same data.

    ``y_meta`` is the target the model is fitted on; ``y`` holds the labels
    used for scoring. Returns ``(model, loss, accuracy)``, where loss and
    accuracy come from ``calc_loss`` and ``calc_acc`` on the training
    predictions.
    """
    model.fit(x, y_meta)
    fitted_proba = model.predict_proba(x)

    train_metric = calc_loss(fitted_proba, y)
    train_accuracy = calc_acc(fitted_proba, y)
    return model, train_metric, train_accuracy
    

In [1275]:
ros = RandomOverSampler(random_state=42)

low_loss = np.inf
best_model = None
for out_id, (train_idx, val_idx) in enumerate(cv_outer.split(train_pred_and_time, train_label), start=1):
    # Training part of this fold. The split yields positions, so use .iloc throughout.
    x_train = train_pred_and_time.iloc[train_idx].drop(['Class', 'Id'], axis=1)
    y_meta_train = train_cate.iloc[train_idx]
    y_train = train_label.iloc[train_idx]

    # Oversample on the Alpha categories to balance them, then derive the
    # binary label from the resampled categories ('A' -> 0, anything else -> 1).
    x_train_ros, y_meta_train_ros = ros.fit_resample(x_train, y_meta_train)
    y_train_ros = y_meta_train_ros.apply(lambda x: 0 if x == 'A' else 1)

    # Validation part of this fold.
    x_val = train_pred_and_time.iloc[val_idx].drop(['Class', 'Id'], axis=1)
    y_val = train_label.iloc[val_idx]
    y_meta_val = train_cate.iloc[val_idx]

    # Train a fresh ensemble for every fold. Refitting one shared instance would
    # make `best_model` point at whatever was fitted last, so no fold would
    # actually be selected.
    yt = Ensemble()
    model, train_loss, train_acc = training(yt, x_train_ros, y_train_ros, y_meta_train_ros)

    # Validate.
    y_pred = model.predict_proba(x_val)
    val_loss = calc_loss(y_pred, y_val)
    val_acc = calc_acc(y_pred, y_val)

    # Report.
    print(f'第{out_id}折验证：')
    print(f'训练集loss: {train_loss}; 训练集acc: {train_acc}')
    print(f'验证集loss: {val_loss}; 验证集acc: {val_acc}\n')

    # Keep the model of the fold with the lowest validation loss.
    if val_loss < low_loss:
        low_loss = val_loss
        best_model = model
        print('Saved new best model.')


Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters
第1折验证：
训练集loss: 9.992007221626415e-16; 训练集acc: 1.0
验证集loss: 0.2525410147871475; 验证集acc: 0.9516129032258065

Saved new best model.
第2折验证：
训练集loss: 9.992007221626415e-16; 训练集acc: 1.0
验证集loss: 1.0762323246738559; 验证集acc: 0.9193548387096774

第3折验证：
训练集loss: 9.992007221626413e-16; 训练集acc: 1.0
验证集loss: 1.37848762318333; 验证集acc: 0.9186991869918699

第4折验证：
训练集loss: 9.992007221626415e-16; 训练集acc: 1.0
验证集loss: 3.4042875188636064; 验证集acc: 0.8617886178861789

第5折验证：
训练集loss: 9.992007221626415e-16; 训练集acc: 1.0
验证集loss: 0.6252899985967869; 验证集acc: 0.9105691056910569



将训练得到的好模型保存下来，存入pkl文件

In [1276]:
import pickle

# Persist the model chosen by cross-validation (`best_model`), not `model`,
# which is simply whatever the last fold produced.
filename = 'model.pkl'
with open(filename, 'wb') as f:
    pickle.dump(best_model, f)

用模型对验证集在本地进行指标评估(balanced log loss).

对测试集进行预测，并输出到submission.csv

In [1277]:
# Predict with the model selected during cross-validation.
y_pred = best_model.predict_proba(test_pred_and_time)

# Collapse to two columns (class 0 vs. everything else) and snap confident
# class-0 probabilities to exactly 1 or 0.
probabilities = np.concatenate((y_pred[:,:1], np.sum(y_pred[:,1:], 1, keepdims=True)), axis=1)
p0 = probabilities[:,:1]
p0[p0 > 0.75] = 1
p0[p0 < 0.20] = 0

# Per the competition discussion, rows with a missing BQ belong to class 0, so
# their class-0 probability is forced to 1.
# `Series == None` is False for every row, so missing values must be detected
# with isna().
bq_missing = test_pred_and_time['BQ'].isna().to_numpy()
p0[bq_missing] = 1

In [1278]:
# Submission layout: Id, class_0, class_1. p0 is a single-column array, so take
# its only column; class_1 is the complement of class_0.
class_0_proba = p0[:, 0]
submission = test[["Id"]].copy()
submission["class_0"] = class_0_proba
submission["class_1"] = 1 - class_0_proba
submission.to_csv('submission.csv', index=False)

In [1279]:
# Read the written file back and display it as a final check of its contents.
submission_df = pd.read_csv('submission.csv')
submission_df

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.5,0.5
1,010ebe33f668,0.5,0.5
2,02fa521e1838,0.5,0.5
3,040e15f562a2,0.5,0.5
4,046e85c7cc7f,0.5,0.5
