In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
import xgboost
import inspect
from collections import defaultdict
from tabpfn import TabPFNClassifier
import warnings

# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

# Lift pandas' display truncation so wide/long frames print in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

In [2]:
# Read the four competition CSVs from the local data directory.
train = pd.read_csv('./icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('./icr-identify-age-related-conditions/test.csv')
sample = pd.read_csv('./icr-identify-age-related-conditions/sample_submission.csv')
greeks = pd.read_csv('./icr-identify-age-related-conditions/greeks.csv')

# Binarise EJ: 1 where the value equals the first category seen in train,
# 0 otherwise. The same reference category is applied to test.
first_category = train['EJ'].unique()[0]
train['EJ'] = (train['EJ'] == first_category).astype('int')
test['EJ'] = (test['EJ'] == first_category).astype('int')

In [3]:
# Per-column divisors. Each listed column of train/test is divided by its
# value here and rounded to one decimal place in the loop that follows.
# NOTE(review): the name suggests these constants turn the raw measurements
# into (near-)integer counts -- confirm where the values were derived.
# Some keys ('BD ', 'CD ', 'CW ', 'FD ') carry a trailing space and are used
# verbatim as DataFrame column labels, so the space must be kept.
int_denominators = {
    'AB': 0.004273,
    'AF': 0.00242,
    'AH': 0.008709,
    'AM': 0.003097,
    'AR': 0.005244,
    'AX': 0.008859,
    'AY': 0.000609,
    'AZ': 0.006302,
    'BC': 0.007028,
    'BD ': 0.00799,
    'BN': 0.3531,
    'BP': 0.004239,
    'BQ': 0.002605,
    'BR': 0.006049,
    'BZ': 0.004267,
    'CB': 0.009191,
    'CC': 6.12e-06,
    'CD ': 0.007928,
    'CF': 0.003041,
    'CH': 0.000398,
    'CL': 0.006365,
    'CR': 7.5e-05,
    'CS': 0.003487,
    'CU': 0.005517,
    'CW ': 9.2e-05,
    'DA': 0.00388,
    'DE': 0.004435,
    'DF': 0.000351,
    'DH': 0.002733,
    'DI': 0.003765,
    'DL': 0.00212,
    'DN': 0.003412,
    'DU': 0.0013794,
    'DV': 0.00259,
    'DY': 0.004492,
    'EB': 0.007068,
    'EE': 0.004031,
    'EG': 0.006025,
    'EH': 0.006084,
    'EL': 0.000429,
    'EP': 0.009269,
    'EU': 0.005064,
    'FC': 0.005712,
    'FD ': 0.005937,
    'FE': 0.007486,
    'FI': 0.005513,
    'FR': 0.00058,
    'FS': 0.006773,
    'GB': 0.009302,
    'GE': 0.004417,
    'GF': 0.004374,
    'GH': 0.003721,
    'GI': 0.002572
}
# Rescale every listed column by its divisor (one decimal place kept),
# applying the identical transform to train and test.
for column, denominator in int_denominators.items():
    for frame in (train, test):
        frame[column] = np.round(frame[column] / denominator, 1)

# Feature subset kept for modelling (plus the 'Id' key). Labels with a
# trailing space ('CD ', 'FD ') are the column names used as-is.
chose_col = [
    'Id', 'AF', 'BQ', 'AB', 'DU', 'DI', 'FL', 'CR', 'DH',
    'BN', 'DA', 'EH', 'CD ', 'BP', 'DL', 'EE', 'FD ', 'DE',
    'GL', 'FR', 'FI', 'EB', 'CU', 'CS', 'BR', 'BZ', 'CC',
]
# Train additionally keeps the binary target.
train = train[[*chose_col, 'Class']]
test = test[list(chose_col)]

In [4]:
# Median imputer for NaNs; fitted on train only and reused for test.
Imp = SimpleImputer(missing_values=np.nan, strategy='median')

# Every column except the key and the target is imputed.
columns_to_select = [col for col in train.columns if col not in ('Class', 'Id')]

# Fill missing values.
train_data = pd.DataFrame(
    Imp.fit_transform(train[columns_to_select].copy()),
    columns=columns_to_select,
)
test_data = pd.DataFrame(
    Imp.transform(test[columns_to_select].copy()),
    columns=columns_to_select,
)

# Re-attach the untouched 'Id' (and 'Class' for train) around the imputed block.
train_filled = pd.concat([train['Id'], train_data, train['Class']], axis=1)
test_filled = pd.concat([test['Id'], test_data], axis=1)

train = train_filled.copy()
test = test_filled.copy()
# print(train.shape, test.shape)

In [5]:
from sklearn.cluster import KMeans

k = 7  # number of clusters

# Cluster the BN values of train and test pooled together.
BNpd = pd.concat([train['BN'], test['BN']], axis=0, ignore_index=True)
data = BNpd.values.reshape(-1, 1)
# Seeded, with an explicit number of restarts, so the bin edges are the same
# on every run instead of depending on random initialisation / library defaults.
kmodel = KMeans(n_clusters=k, n_init=10, random_state=19)
kmodel.fit(data)

# Sorted cluster centres with an extra 0.0 prepended.
centers = np.sort(np.concatenate(([0.0], kmodel.cluster_centers_.ravel())))

# Cut points are the midpoints between consecutive centres. This is computed
# in one vectorised step; the previous per-row `c.iloc[i]['cc'] = ...` was a
# chained assignment that only took effect when pandas returned a view and
# is a silent no-op under copy-on-write.
midpoints = (centers[:-1] + centers[1:]) / 2

# Close the range with 0 below and 5x the largest observed BN above, then
# round to integers and drop duplicate edges (order is preserved, ascending).
upper_edge = max(train['BN'].max(), test['BN'].max()) * 5
edges = np.concatenate(([0.0], midpoints, [upper_edge]))
c = pd.Series(edges, name='cc').round().astype(int).unique()
range_num = c.shape[0] - 1
c = c.tolist()

# Keep the original BN column and add the bin index as BN_binning.
train_BN = train['BN'].values
train_binning = pd.cut(train_BN, c, labels=range(range_num), include_lowest=True).astype(int)
train['BN_binning'] = train_binning
print(train['BN_binning'].value_counts())

test_BN = test['BN'].values
test_binning = pd.cut(test_BN, c, labels=range(range_num), include_lowest=True).astype(int)
test['BN_binning'] = test_binning

BN_binning
5    159
3    147
4    140
2     78
6     66
1     27
Name: count, dtype: int64


In [6]:
# Column pairs whose element-wise product becomes a new feature.
# The new column is named '<left>+<right>' even though the operation is a
# multiplication; downstream cells rely on these exact labels.
Multiply_features = [
    ['DU', 'DU'],
    ['DU', 'FR'],
    ['DA', 'DE'],
    ['AB', 'GL'],
]

for left, right in Multiply_features:
    mix_col = left + '+' + right
    for frame in (train, test):
        frame[mix_col] = frame[left] * frame[right]
    

In [7]:
def change(X):
    """Add the 'out_GL' and 'DA*CS' features to ``X`` in place and return it.

    'out_GL' is GL centred separately within two groups of rows:
    rows with GL < 1 get ``GL - mean(GL of those rows)``, rows with
    GL > 1.5 get ``GL - mean(GL of those rows)``, and all remaining rows
    (1 <= GL <= 1.5) get 0. The means are computed from the frame passed in.

    'DA*CS' is ``log(2 * DA / sqrt(CS))``.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns 'GL', 'DA' and 'CS'. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the two columns added.
    """
    # Start as float: the values written below are floats, and assigning them
    # into an integer column relies on an implicit upcast that newer pandas
    # versions warn about or reject.
    X['out_GL'] = 0.0

    low = X['GL'] < 1
    high = X['GL'] > 1.5

    # Each group mean is computed once (it was previously re-evaluated for
    # every element inside a lambda) and subtracted in a vectorised step.
    X.loc[low, 'out_GL'] = X.loc[low, 'GL'] - X.loc[low, 'GL'].mean()
    X.loc[high, 'out_GL'] = X.loc[high, 'GL'] - X.loc[high, 'GL'].mean()
    X['out_GL'] = X['out_GL'].astype('float')

    X['DA*CS'] = np.log(X['DA'] * 2 / X['CS'] ** 0.5)
    return X

In [8]:
# Add the GL-centred and log-ratio features to both frames, then list the
# resulting training columns.
train, test = change(train), change(test)
print(train.columns)

Index(['Id', 'AF', 'BQ', 'AB', 'DU', 'DI', 'FL', 'CR', 'DH', 'BN', 'DA', 'EH',
       'CD ', 'BP', 'DL', 'EE', 'FD ', 'DE', 'GL', 'FR', 'FI', 'EB', 'CU',
       'CS', 'BR', 'BZ', 'CC', 'Class', 'BN_binning', 'DU+DU', 'DU+FR',
       'DA+DE', 'AB+GL', 'out_GL', 'DA*CS'],
      dtype='object')


In [9]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Column groups to expand into interaction features. Only one group is
# active; the source columns of each group are dropped after expansion.
Poly_features = [
    ['DI', 'DU'],
]

for j, columns_to_derive in enumerate(Poly_features):
    # Interaction-only polynomial expansion: degree 3 for the first group,
    # degree 2 for any further group.
    degree_dim = 3 if j == 0 else 2
    poly = PolynomialFeatures(degree=degree_dim, include_bias=False, interaction_only=True)
    # Z-score standardisation, fitted on train and reused for test.
    scaler = StandardScaler()

    scaled_features = scaler.fit_transform(poly.fit_transform(train[columns_to_derive]))
    # Names for the derived columns: poly_<group>_<position>.
    new_feature_names = [f"poly_{j}_{i}" for i in range(scaled_features.shape[1])]
    # Reuse the frame's own index so the column-wise concat aligns row for
    # row even if the index is not the default 0..n-1 range.
    features_train_df = pd.DataFrame(scaled_features, columns=new_feature_names, index=train.index)
    train = pd.concat([train, features_train_df], axis=1).drop(columns=columns_to_derive)

    # Test set: transform only, with the objects fitted on train.
    scaled_features_test = scaler.transform(poly.transform(test[columns_to_derive]))
    features_test_df = pd.DataFrame(scaled_features_test, columns=new_feature_names, index=test.index)
    test = pd.concat([test, features_test_df], axis=1).drop(columns=columns_to_derive)

In [10]:
# Groups of columns to remove from both frames before modelling.
# Only one group is active here.
drop_col = [
    ['FD ', 'CS'],
]
for dc in drop_col:
    train, test = train.drop(columns=dc), test.drop(columns=dc)

# Show the final training schema and shape.
print(train.columns.tolist())
print(train.shape)

['Id', 'AF', 'BQ', 'AB', 'FL', 'CR', 'DH', 'BN', 'DA', 'EH', 'CD ', 'BP', 'DL', 'EE', 'DE', 'GL', 'FR', 'FI', 'EB', 'CU', 'BR', 'BZ', 'CC', 'Class', 'BN_binning', 'DU+DU', 'DU+FR', 'DA+DE', 'AB+GL', 'out_GL', 'DA*CS', 'poly_0_0', 'poly_0_1', 'poly_0_2']
(617, 34)


In [11]:
# Targets: 'Alpha' from greeks (multi-class) and binary 'Class' from train.
# NOTE(review): assumes greeks rows are in the same order as train rows -- confirm.
y = train['Class']
y_meta = greeks['Alpha']
# Features: everything except the key and the binary target.
X = train.drop(columns=['Id', 'Class'])

In [12]:
from sklearn.model_selection import train_test_split

# Encode the 'Alpha' string labels as integer codes (0, 1, 2, ...), then
# wrap features and both targets as DataFrames so they can be row-indexed
# with .iloc in the cross-validation loop.
label_encoder = LabelEncoder()
y_meta = pd.DataFrame(label_encoder.fit_transform(y_meta))
X = pd.DataFrame(X)
y = pd.DataFrame(y)

In [13]:
from sklearn.metrics import log_loss

def balanced_log_loss(y_true, y_pred):
    """Binary log loss in which both classes contribute equally.

    Each sample is weighted by the inverse of its class count, so the
    result is the average of the per-class mean log losses.

    Parameters
    ----------
    y_true : array of non-negative ints (0 or 1)
        True binary labels.
    y_pred : array-like
        Predicted probabilities, passed to ``sklearn.metrics.log_loss``.

    Returns
    -------
    float
        The class-balanced log loss.
    """
    nc = np.bincount(y_true)
    # Clip explicitly instead of passing `eps=` to log_loss: that keyword is
    # deprecated/removed in newer scikit-learn releases, while clipping to
    # [1e-15, 1 - 1e-15] here gives the same bound on every version.
    y_pred = np.clip(np.asarray(y_pred, dtype=float), 1e-15, 1 - 1e-15)
    return log_loss(y_true, y_pred, sample_weight=1 / nc[y_true], labels=[0, 1])

def calc_loss(y_pred, y):
    """Collapse multi-class probabilities to binary and score them.

    Column 0 of ``y_pred`` is treated as the probability of class 0; the
    probability of class 1 is taken as ``1 - y_pred[:, 0]``.

    Parameters
    ----------
    y_pred : numpy.ndarray of shape (n_samples, n_classes)
        Predicted class probabilities.
    y : DataFrame, Series or array-like of length n_samples
        True labels; 0 stays 0 and every other value is mapped to 1.

    Returns
    -------
    float
        ``balanced_log_loss`` of the binarised labels and probabilities.
    """
    # Only column 0 is needed; the earlier concatenation that summed the
    # remaining columns was never read, so it is no longer built.
    p1 = 1 - y_pred[:, :1]

    # Flatten (handles a single-column DataFrame as well as 1-D input) and
    # binarise in one vectorised step.
    labels = (np.asarray(y).ravel() != 0).astype(int)
    return balanced_log_loss(labels, p1)

In [14]:
from sklearn.model_selection import KFold as KF, GridSearchCV
from sklearn.model_selection import StratifiedKFold as SKF


# Shared settings for both splitters: shuffled, fixed seed.
_split_kwargs = dict(shuffle=True, random_state=19)

# Stratified 8-fold splitter and plain 5-fold splitter.
cv_outer = SKF(n_splits=8, **_split_kwargs)
cv_inner = KF(n_splits=5, **_split_kwargs)

# Random over-sampler with the same seed.
ros = RandomOverSampler(random_state=19)

In [15]:
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings("ignore")

# 定义目标函数
def objective(trial):
    """Optuna objective: out-of-fold balanced log loss of an XGBoost classifier.

    Samples one hyper-parameter set from ``trial``, trains an
    ``XGBClassifier`` on the encoded ``y_meta`` target in each fold of
    ``cv_outer`` (training rows are over-sampled with ``ros``), collects the
    out-of-fold class probabilities and scores them against the binary
    target ``y`` with ``calc_loss``.

    Reads the notebook globals ``X``, ``y``, ``y_meta``, ``cv_outer`` and
    ``ros``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameters.

    Returns
    -------
    float
        Balanced log loss over all out-of-fold predictions (lower is better).
    """
    params = {
        # Multi-class XGBClassifier settings.
        'num_class': 4,
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'booster': 'gbtree',
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; names, ranges and sampling order are unchanged.
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
    }

    model = XGBClassifier(**params)
    # One row of 4 class probabilities per sample, filled fold by fold.
    train_preds = np.zeros((X.shape[0], 4))

    for train_idx, val_idx in cv_outer.split(X, y_meta):
        x_train_ori, x_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train_ori, y_val_meta = y_meta.iloc[train_idx], y_meta.iloc[val_idx]

        # Over-sample the training fold on the multi-class target.
        x_train, y_train = ros.fit_resample(x_train_ori, y_train_ori)

        # NOTE(review): the validation fold drives early stopping and is also
        # the fold that gets scored, so the returned loss may be optimistic.
        model.fit(x_train, y_train, eval_set=[(x_val, y_val_meta)], early_stopping_rounds=100, verbose=False)

        # Store the out-of-fold probabilities for this fold.
        train_preds[val_idx] = model.predict_proba(x_val)

    return calc_loss(train_preds, y)

# 创建 Optuna 优化器并运行优化
# Create the Optuna study and run the search. The TPE sampler (Optuna's
# default algorithm) is given a fixed seed so the sequence of sampled
# hyper-parameters is repeatable from run to run.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=19),
)
study.optimize(objective, n_trials=800)

# Report the best hyper-parameters and their score.
print('Best trial:', study.best_trial.params)
print('Best Bll:', study.best_value)

[I 2023-08-06 21:56:59,486] A new study created in memory with name: no-name-654b2a94-4c07-41f7-8531-c21c193524b1
[I 2023-08-06 21:57:04,735] Trial 0 finished with value: 0.2731414468026305 and parameters: {'lambda': 1.3982613466897604e-08, 'alpha': 0.023995060080478196, 'max_depth': 4, 'learning_rate': 0.07724597665443066, 'gamma': 8.81536891680552e-05, 'colsample_bytree': 0.4470793255494294, 'min_child_weight': 3, 'subsample': 0.7203651939640793}. Best is trial 0 with value: 0.2731414468026305.
[I 2023-08-06 21:57:14,171] Trial 1 finished with value: 0.3935942673644262 and parameters: {'lambda': 0.00011637727484503234, 'alpha': 4.3763290397408136e-07, 'max_depth': 5, 'learning_rate': 0.012115963815083123, 'gamma': 0.04571692289554108, 'colsample_bytree': 0.7694808384852144, 'min_child_weight': 2, 'subsample': 0.43735587218744254}. Best is trial 0 with value: 0.2731414468026305.
[I 2023-08-06 21:57:19,664] Trial 2 finished with value: 0.235522396207249 and parameters: {'lambda': 0.243

KeyboardInterrupt: 