In [41]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

# Notebook-wide settings: ignore every warning and let pandas display frames
# without truncating columns or rows.
warnings.simplefilter("ignore")
for _option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_option, None)

In [42]:
# Read the four competition tables (train, test, sample submission, greeks).
train, test, sample, greeks = (
    pd.read_csv(csv_path)
    for csv_path in (
        './icr-identify-age-related-conditions/train.csv',
        './icr-identify-age-related-conditions/test.csv',
        './icr-identify-age-related-conditions/sample_submission.csv',
        './icr-identify-age-related-conditions/greeks.csv',
    )
)

# Binary-encode EJ: 1 where the value equals the first category encountered in
# the training rows, 0 otherwise. The same reference category is used for test.
first_category = train['EJ'].unique()[0]
train['EJ'] = (train['EJ'] == first_category).astype('int')
test['EJ'] = (test['EJ'] == first_category).astype('int')

In [43]:
# Per-column divisors, keyed by column label. Each listed column of train/test
# is divided by its value and rounded to one decimal place by the loop that
# follows this dict.
# NOTE(review): presumably these were chosen so the scaled values land on
# (near-)integers -- the name suggests it, but the provenance is not shown here.
# Some keys ('BD ', 'CD ', 'CW ', 'FD ') end with a space; they are used
# verbatim as column labels, so do not strip them.
int_denominators = {
    'AB': 0.004273,
    'AF': 0.00242,
    'AH': 0.008709,
    'AM': 0.003097,
    'AR': 0.005244,
    'AX': 0.008859,
    'AY': 0.000609,
    'AZ': 0.006302,
    'BC': 0.007028,
    'BD ': 0.00799,
    'BN': 0.3531,
    'BP': 0.004239,
    'BQ': 0.002605,
    'BR': 0.006049,
    'BZ': 0.004267,
    'CB': 0.009191,
    'CC': 6.12e-06,
    'CD ': 0.007928,
    'CF': 0.003041,
    'CH': 0.000398,
    'CL': 0.006365,
    'CR': 7.5e-05,
    'CS': 0.003487,
    'CU': 0.005517,
    'CW ': 9.2e-05,
    'DA': 0.00388,
    'DE': 0.004435,
    'DF': 0.000351,
    'DH': 0.002733,
    'DI': 0.003765,
    'DL': 0.00212,
    'DN': 0.003412,
    'DU': 0.0013794,
    'DV': 0.00259,
    'DY': 0.004492,
    'EB': 0.007068,
    'EE': 0.004031,
    'EG': 0.006025,
    'EH': 0.006084,
    'EL': 0.000429,
    'EP': 0.009269,
    'EU': 0.005064,
    'FC': 0.005712,
    'FD ': 0.005937,
    'FE': 0.007486,
    'FI': 0.005513,
    'FR': 0.00058,
    'FS': 0.006773,
    'GB': 0.009302,
    'GE': 0.004417,
    'GF': 0.004374,
    'GH': 0.003721,
    'GI': 0.002572
}
# Rescale every column listed in int_denominators by its divisor and keep one
# decimal place; train and test receive the identical transformation.
for column, divisor in int_denominators.items():
    for frame in (train, test):
        frame[column] = (frame[column] / divisor).round(1)

# Columns kept for modelling (identifier first, then the selected features).
# Labels with a trailing space ('CD ', 'FD ') are intentional and must match
# the frame's column labels exactly.
chose_col = [
    'Id',
    'AF', 'BQ', 'AB', 'DU', 'DI', 'FL', 'CR', 'DH', 'BN', 'DA', 'EH', 'CD ', 'BP',
    'DL', 'EE', 'FD ', 'DE', 'GL', 'FR', 'FI', 'EB', 'CU', 'CS',
    'BR', 'BZ', 'CC',
]
# train additionally keeps the target column; test has no target.
train = train.loc[:, chose_col + ['Class']]
test = test.loc[:, chose_col]

In [44]:
# Feature columns are everything except the identifier and the target.
columns_to_select = [name for name in train.columns if name != 'Class' and name != 'Id']

# Median imputer for NaN entries.
Imp = SimpleImputer(strategy='median', missing_values=np.nan)

# Fill missing values: the medians are fitted on the training features and the
# same fitted imputer is then applied to the test features.
train_data = pd.DataFrame(
    Imp.fit_transform(train[columns_to_select]),
    columns=columns_to_select,
)
test_data = pd.DataFrame(
    Imp.transform(test[columns_to_select]),
    columns=columns_to_select,
)

# Re-attach the untouched Id (and Class, for train) columns around the imputed
# features. The concat aligns on the row index.
train_filled = pd.concat([train['Id'], train_data, train['Class']], axis=1)
test_filled = pd.concat([test['Id'], test_data], axis=1)

train, test = train_filled.copy(), test_filled.copy()
# print(train.shape, test.shape)

In [45]:
from sklearn.cluster import KMeans

# Discretise BN into ordinal bins whose edges are derived from 1-D K-Means
# cluster centres. The original BN column is kept; BN_binning is added.

k = 7  # number of K-Means clusters

# Pool the BN values of train and test so both frames share one set of edges.
# NOTE(review): this lets the test distribution influence a training feature.
BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
data = BNpd.values.reshape(-1, 1)

# Seeded (and with an explicit n_init) so the bin edges are reproducible across
# runs and scikit-learn versions; without a seed the bins change on every run.
kmodel = KMeans(n_clusters=k, n_init=10, random_state=19)
kmodel.fit(data)

# Sorted cluster centres with 0.0 prepended as an extra anchor point.
centers = np.sort(np.concatenate(([0.0], kmodel.cluster_centers_.ravel())))

# Cut points are the midpoints between consecutive centres. Computed in one
# vectorised step: the previous element-wise `c.iloc[i]['cc'] = ...` was chained
# assignment, which under pandas Copy-on-Write writes to a temporary and leaves
# the centres unchanged.
midpoints = (centers[:-1] + centers[1:]) / 2

# Outer edges: 0 at the bottom, and 5x the largest observed BN at the top so
# every value falls inside the last bin.
upper_edge = max(train['BN'].max(), test['BN'].max()) * 5
edges = np.concatenate(([0.0], midpoints, [upper_edge]))

# Round edges to integers and drop duplicates created by the rounding
# (np.unique also guarantees the ascending order pd.cut requires).
c = np.unique(np.rint(edges).astype(int)).tolist()
range_num = len(c) - 1  # number of bins

# Keep the original BN column and add BN_binning (integer bin index).
train_BN = train['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True).astype(int)
train['BN_binning'] = train_binning
print(train['BN_binning'].value_counts())

test_BN = test['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True).astype(int)
test['BN_binning'] = test_binning

BN_binning
5    159
3    147
4    140
2     78
6     66
1     27
Name: count, dtype: int64


In [46]:
# Targets: the Alpha column of greeks (used as the multi-class training label)
# and the Class column of train (used for scoring).
# NOTE(review): assumes greeks rows are in the same order as train rows -- confirm.
y_meta = greeks.loc[:, 'Alpha']
y = train.loc[:, 'Class']

# Feature matrix: every train column except the identifier and the target.
X = train.drop(['Id', 'Class'], axis=1)

In [47]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y_meta, test_size=0.2, random_state=25)

# Encode the Alpha labels as consecutive integers starting at 0
# (LabelEncoder assigns codes in sorted label order).
label_encoder = LabelEncoder()
label_encoder.fit(y_meta)
y_meta = pd.DataFrame(label_encoder.transform(y_meta))

# Wrap features and binary target as DataFrames so later cells can use .iloc
# uniformly on all three objects.
X = pd.DataFrame(X)
y = pd.DataFrame(y)
# print(y_train.shape, y_test.shape)

In [48]:
from sklearn.metrics import log_loss

def balanced_log_loss(y_true, y_pred):
    """Binary log loss in which both classes carry equal total weight.

    Each sample is weighted by the inverse of its class count, so the result
    is the mean of the per-class average log losses.

    Parameters
    ----------
    y_true : array-like of int
        Binary labels (0/1). Flattened to 1-D before use.
    y_pred : array-like of float
        Predicted probabilities in any layout ``sklearn.metrics.log_loss``
        accepts for binary problems (e.g. P(class 1) as shape (n,) or (n, 1)).

    Returns
    -------
    float
        The class-balanced log loss.
    """
    y_true = np.asarray(y_true).ravel()
    nc = np.bincount(y_true)  # per-class sample counts, indexed by label

    # Clip explicitly instead of passing `eps=` to log_loss: that keyword was
    # deprecated in scikit-learn 1.3 and later removed, and the library default
    # clip differs between versions. Clipping here keeps the 1e-15 bound.
    y_pred = np.clip(np.asarray(y_pred, dtype=np.float64), 1e-15, 1 - 1e-15)

    return log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], labels=[0, 1])

def calc_loss(y_pred, y):
    """Score multi-class probabilities against a binary target.

    Column 0 of ``y_pred`` is taken as P(class 0); all remaining columns are
    collapsed into the positive class as ``1 - P(class 0)``. The labels are
    binarised the same way (0 stays 0, anything else becomes 1) and the pair is
    scored with ``balanced_log_loss``.

    Parameters
    ----------
    y_pred : array-like, shape (n_samples, n_classes)
        Predicted class probabilities; only the first column is used.
    y : pandas Series/DataFrame or array-like
        True labels, one per sample.

    Returns
    -------
    float
        Balanced log loss of the collapsed binary problem.
    """
    # Only column 0 is needed. The earlier version also summed the remaining
    # columns into a second column that was never read.
    p0 = np.asarray(y_pred)[:, :1]  # probability of class 0, kept as (n, 1)
    p1 = 1 - p0

    # Vectorised binarisation; also flattens a single-column DataFrame.
    y_binary = (np.asarray(y).ravel() != 0).astype(int)

    return balanced_log_loss(y_binary, p1)

In [49]:
from sklearn.model_selection import KFold as KF, GridSearchCV
from sklearn.model_selection import StratifiedKFold as SKF


# One shared seed for every splitter/sampler so fold assignment is repeatable.
_seed = 19

# Outer loop: 8 stratified, shuffled folds.
cv_outer = SKF(n_splits=8, random_state=_seed, shuffle=True)
# Inner loop: 5 plain shuffled folds.
cv_inner = KF(n_splits=5, random_state=_seed, shuffle=True)
# Random over-sampler with the same seed.
ros = RandomOverSampler(random_state=_seed)

In [50]:
import optuna
from catboost import CatBoostClassifier
import warnings
# Same blanket filter as the one installed in the first cell: every warning is
# ignored, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

# Define objective function
def objective(trial):
    """Optuna objective: cross-validated balanced log loss of a CatBoost model.

    Samples CatBoost hyper-parameters from ``trial``, trains a multi-class
    model on each ``cv_outer`` fold using the module-level ``X`` / ``y_meta``,
    collects out-of-fold class probabilities and scores them against the
    binary target ``y`` with ``calc_loss``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Balanced log loss over all out-of-fold predictions (lower is better).
    """
    params = {
        'depth': trial.suggest_int('depth', 3, 15),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10, log=True),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'loss_function': 'MultiClass',
    }

    # One probability column per distinct label instead of a hard-coded 4.
    n_classes = int(np.unique(y_meta).size)
    train_preds = np.zeros((X.shape[0], n_classes))

    for train_idx, val_idx in cv_outer.split(X, y_meta):
        x_train, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_meta.iloc[train_idx], y_meta.iloc[val_idx]

        # Fresh estimator per fold so no fitted state is carried between folds.
        model = CatBoostClassifier(**params)

        # NOTE(review): the validation fold drives early stopping AND is the
        # fold being scored, so the reported loss is optimistically biased.
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)

        # Store this fold's out-of-fold probabilities at their original rows.
        train_preds[val_idx] = model.predict_proba(x_val)

    return calc_loss(train_preds, y)

# Create the Optuna study and run the search.
# The sampler is seeded so the sequence of suggested hyper-parameters is
# repeatable; without a seed each run explores a different trajectory.
N_TRIALS = 1200  # trial budget
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=19),
)
study.optimize(objective, n_trials=N_TRIALS)

# Print tuning results
print('Best trial:', study.best_trial.params)
print('Best Bll:', study.best_value)

[I 2023-08-06 03:12:54,763] A new study created in memory with name: no-name-29147c96-bf17-4664-9f54-15dc5b45438e
[I 2023-08-06 03:12:55,500] Trial 0 finished with value: 0.8669401241137923 and parameters: {'depth': 10, 'learning_rate': 0.12015779403147335, 'l2_leaf_reg': 7.36651073598724e-08, 'border_count': 210, 'thread_count': 7, 'random_strength': 76, 'bagging_temperature': 61.670904404963395, 'od_wait': 43}. Best is trial 0 with value: 0.8669401241137923.
[I 2023-08-06 03:13:10,513] Trial 1 finished with value: 0.6026430803075559 and parameters: {'depth': 7, 'learning_rate': 0.294307405267538, 'l2_leaf_reg': 5.457844928941769e-06, 'border_count': 221, 'thread_count': 3, 'random_strength': 43, 'bagging_temperature': 0.06708295724299283, 'od_wait': 23}. Best is trial 1 with value: 0.6026430803075559.
[I 2023-08-06 03:13:21,558] Trial 2 finished with value: 0.5120284641101124 and parameters: {'depth': 6, 'learning_rate': 0.04405130776108185, 'l2_leaf_reg': 0.024293319079698936, 'bord

KeyboardInterrupt: 