In [1]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.metrics import roc_curve, auc, roc_auc_score

from pathlib import Path

In [2]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    """Partial AUC of the ROC curve restricted to ``TPR >= min_tpr``.

    Both the labels and the predictions are flipped (``1 - x``) so that the
    high-TPR region of the original problem becomes the low-FPR region
    ``[0, 1 - min_tpr]`` of the flipped problem, which is the region
    ``roc_auc_score(..., max_fpr=...)`` integrates over.

    Parameters:
        solution: binary ground-truth labels (0/1); any array-like.
        submission: predicted scores, higher meaning "more likely 1";
            any array-like.
        min_tpr: lower bound of the true-positive-rate range to score.

    Returns:
        float: un-standardized partial AUC, in ``[0.5 * max_fpr**2, max_fpr]``
        with ``max_fpr = 1 - min_tpr``.

    Raises:
        ValueError: propagated from ``roc_auc_score`` (e.g. an invalid
            ``max_fpr`` or labels containing a single class).
    """
    # Vectorized flips: this function is called once per optimization trial,
    # so avoid a per-element Python loop. np.asarray also lets plain lists
    # and pandas Series through.
    v_gt = np.abs(np.asarray(solution) - 1)
    v_pred = 1.0 - np.asarray(submission)
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # roc_auc_score returns the standardized partial AUC in [0.5, 1.0];
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    partial_auc = 0.5 * max_fpr**2 + (max_fpr - 0.5 * max_fpr**2) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [3]:
# Input locations, expressed relative to the current working directory.
base_path = Path('./')
data_path = base_path.joinpath('data')
def read_img_oof():
    """Collect the image-model OOF predictions into one pandas DataFrame.

    Reads two parquet prediction files and four sets of five per-fold CSV
    files below ``data_path``, keeps only ``isic_id`` and ``pred`` from each
    source, renames ``pred`` to a source-specific column name and left-joins
    every source onto the rows of ``preds_eva_nes.parquet`` by ``isic_id``.

    Returns:
        pd.DataFrame: ``isic_id`` plus the columns ``pred_tsuma_eva_nes``,
        ``pred_tsuma_conv_nes``, ``pred_sub_71``, ``pred_sub_73``,
        ``pred_sub_75`` and ``pred_sub_77``.
    """
    keep = ['isic_id', 'pred']

    def _from_parquet(file_name, pred_name):
        # One parquet file -> two-column frame with a renamed prediction column.
        return pl.read_parquet(data_path / file_name, columns=keep).rename({'pred': pred_name})

    def _from_fold_csvs(csv_paths, pred_name):
        # The per-fold CSVs are concatenated first, then trimmed and renamed.
        stacked = pl.concat([pl.read_csv(csv_path) for csv_path in csv_paths])
        return stacked.select(keep).rename({'pred': pred_name})

    folds = range(5)
    # Sources are loaded in the same order in which they are joined below.
    frames = [
        _from_parquet('preds_eva_nes.parquet', 'pred_tsuma_eva_nes'),
        _from_parquet('preds_conv_nes.parquet', 'pred_tsuma_conv_nes'),
        _from_fold_csvs([data_path / f'sub71/test_results_fold_{fold}.csv' for fold in folds], 'pred_sub_71'),
        _from_fold_csvs([data_path / f'sub73/test_results_fold_{fold}.csv' for fold in folds], 'pred_sub_73'),
        _from_fold_csvs([data_path / f'sub75/test_results_fold_{fold}.csv' for fold in folds], 'pred_sub_75'),
        _from_fold_csvs([data_path / f'sub77/test_results_fold_{fold}.csv' for fold in folds], 'pred_sub_77'),
    ]

    # The first frame (eva_nes) is the base; every other source is left-joined onto it.
    merged, *others = frames
    for other in others:
        merged = merged.join(other, on='isic_id', how='left')

    return merged.to_pandas()
    

In [4]:
# Tables that already carry uniquely named prediction columns.
oof_df1 = pd.read_csv('ensemble_oof_df_20240904.csv')
oof_df2 = pd.read_csv('ensemble_oof_df_20240905.csv')

# These files expose a generic `pred` column; rename it on load so the
# merged table keeps one distinct column per source.
oof_kanna_attr = pd.read_csv('ensemble_oof_df_with_att.csv').rename(columns={'pred': 'pred_kanna_attr'})
oof_kanna_no_attr = pd.read_csv('ensemble_oof_df_without_att.csv').rename(columns={'pred': 'pred_kanna_no_attr'})
oof_tsuma_attr = pd.read_parquet('preds_tsuma_plain.parquet').rename(columns={'pred': 'pred_tsuma_attr'})
oof_tsuma_no_attr = pd.read_parquet('preds_tsuma_no_att.parquet').rename(columns={'pred': 'pred_tsuma_no_attr'})
img_oof_df = read_img_oof()

# Left-join everything onto oof_df1, so its rows define the final table.
# Some sources are keyed on (isic_id, target), others on isic_id only.
oof_df = (
    oof_df1
    .merge(oof_df2, on=['isic_id', 'target'], how='left')
    .merge(img_oof_df, on=['isic_id'], how='left')
    .merge(oof_kanna_attr, on=['isic_id', 'target'], how='left')
    .merge(oof_kanna_no_attr, on=['isic_id', 'target'], how='left')
    .merge(oof_tsuma_attr, on=['isic_id'], how='left')
    .merge(oof_tsuma_no_attr, on=['isic_id'], how='left')
)


In [5]:
# Preview the first five rows of the merged OOF table.
oof_df.head(5)

Unnamed: 0,isic_id,target,pred_xgb_exp68,pred_lgb_exp68,pred_xgb_exp69,pred_lgb_exp69,pred_xgb_exp70,pred_lgb_exp70,pred_xgb_exp71,pred_lgb_exp71,...,pred_tsuma_eva_nes,pred_tsuma_conv_nes,pred_sub_71,pred_sub_73,pred_sub_75,pred_sub_77,pred_kanna_attr,pred_kanna_no_attr,pred_tsuma_attr,pred_tsuma_no_attr
0,ISIC_0015670,0,1.5e-05,4.8e-05,1.2e-05,4.9e-05,1e-05,5e-05,1.2e-05,4.7e-05,...,0.017248,0.023749,3e-06,1.127719e-06,1e-05,1.729415e-07,1.7e-05,1.5e-05,0.000575,0.000756
1,ISIC_0015845,0,0.041703,0.139432,0.026189,0.139875,0.032432,0.100835,0.032224,0.101122,...,0.576921,0.142342,0.001004,0.0001837846,0.003136,0.0002251682,0.045575,0.040704,0.584633,0.660702
2,ISIC_0015864,0,5e-06,1.7e-05,5e-06,1.2e-05,4e-06,1.3e-05,3e-06,1.1e-05,...,0.004238,0.015459,2e-06,7.042271e-08,6e-06,6.3488e-09,9e-06,6e-06,0.000195,0.000253
3,ISIC_0015902,0,2.6e-05,9.2e-05,2.3e-05,9.8e-05,2.3e-05,8.4e-05,3.6e-05,8.6e-05,...,0.004798,0.016119,1e-06,3.237762e-06,1.4e-05,1.834289e-05,7.7e-05,8.3e-05,0.000148,0.000121
4,ISIC_0024200,0,3.7e-05,0.000171,7.9e-05,0.00018,4.3e-05,0.000136,6e-05,0.000135,...,0.073753,0.23749,6e-06,0.0001609308,2.2e-05,2.54135e-06,3.3e-05,3.1e-05,3.7e-05,6.2e-05


In [6]:
# First group of prediction columns used for the blend.
feature_cols1 = [
    # xgb / lgb experiment columns
    'pred_xgb_exp69', 'pred_xgb_exp70',
    'pred_xgb_exp78', 'pred_lgb_exp78',
    'pred_xgb_exp79',
    'pred_xgb_exp85', 'pred_xgb_exp86',
    'pred_xgb_exp92',
    'pred_xgb_exp94', 'pred_lgb_exp94',
    'pred_xgb_exp100',
    'pred_xgb_exp104', 'pred_lgb_exp104',
    'pred_xgb_exp109',
    'pred_xgb_exp116', 'pred_lgb_exp116',
    'pred_xgb_exp120',
    # columns produced by read_img_oof()
    'pred_tsuma_eva_nes', 'pred_tsuma_conv_nes',
    'pred_sub_71', 'pred_sub_73', 'pred_sub_75', 'pred_sub_77',
    # 'pred_kanna_attr' is currently left out of the blend
    'pred_tsuma_attr',
]

# Second group of prediction columns used for the blend.
feature_cols2 = [
    # xgb / lgb experiment columns
    'pred_xgb_exp73', 'pred_xgb_exp74',
    'pred_xgb_exp82', 'pred_lgb_exp82',
    'pred_xgb_exp83',
    'pred_xgb_exp89', 'pred_xgb_exp90',
    'pred_xgb_exp96',
    'pred_xgb_exp98', 'pred_lgb_exp98',
    'pred_xgb_exp102',
    'pred_xgb_exp106', 'pred_lgb_exp106',
    'pred_xgb_exp111',
    'pred_xgb_exp118', 'pred_lgb_exp118',
    'pred_xgb_exp122',
    # The read_img_oof() columns are already listed in feature_cols1 and are
    # not repeated here; 'pred_kanna_no_attr' is currently left out as well.
    'pred_tsuma_no_attr',
]

# De-duplicated union of both groups in lexicographic order; this order
# defines which weight index belongs to which column downstream.
feature_cols = sorted({*feature_cols1, *feature_cols2})

In [7]:
def rank_averaging_normalized(df, column):
    """
    Rank the values of one column and scale the ranks so the maximum is 1.

    Ties receive the average of the ranks they span (``method='average'``).

    Parameters:
    df (pd.DataFrame): DataFrame holding the column; it is not modified
    column (str): name of the column to rank

    Returns:
    pd.Series: ranks divided by the largest rank
    """
    # Tie-averaged ranks of the requested column
    tie_averaged = df[column].rank(method='average')

    # Divide by the top rank so the largest value maps to 1
    top_rank = tie_averaged.max()
    return tie_averaged / top_rank
    

In [8]:
# Overwrite every blend input with its normalized rank (modifies oof_df in place).
for feature_name in feature_cols:
    oof_df[feature_name] = rank_averaging_normalized(oof_df, feature_name)

In [9]:
import optuna
import numpy as np

# Column of oof_df that is passed to the scoring function as the ground truth.
target_col = 'target'

class OptunaOptimizer:
    """Search blend weights for OOF prediction columns with Optuna.

    Every trial draws one raw weight in ``[0, 1]`` per feature column,
    rescales the weights to sum to one, blends the columns with ``np.dot``
    and scores the blend against the target column with ``score_func``.
    The study minimizes the negated score, i.e. it maximizes ``score_func``.

    Parameters:
    oof_df (pd.DataFrame): table holding the prediction and target columns
    feature_cols (list[str]): names of the columns to blend
    score_func (callable): ``score_func(y_true, y_pred) -> float``, higher is better
    target_col (str): name of the ground-truth column in ``oof_df``
    n_trials (int): number of Optuna trials to run
    seed (int | None): seed for the TPE sampler; ``None`` (default) keeps
        Optuna's default, non-seeded sampler
    """

    def __init__(self, oof_df, feature_cols, score_func, target_col, n_trials=1000, seed=None):
        self.oof_df = oof_df
        self.feature_cols = feature_cols
        self.score_func = score_func
        self.target_col = target_col
        self.n_trials = n_trials
        self.seed = seed
        # Results; all stay None until optimize() has run.
        self.optimized_weights = None
        self.normalized_optimized_weights = None
        self.optimized_score = None

        # NOTE: this is a process-wide Optuna setting, not a per-instance one.
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)

    @staticmethod
    def _normalize(weights):
        """Rescale ``weights`` to sum to 1; an all-zero draw becomes equal weights."""
        weights = list(weights)
        total_weight = sum(weights)
        if total_weight <= 0:
            # Weights are sampled from [0, 1], so this only happens when every
            # weight is exactly 0; avoid a ZeroDivisionError aborting the study.
            return [1.0 / len(weights)] * len(weights) if weights else []
        return [w / total_weight for w in weights]

    def ensemble_auc(self, weights):
        """Return the negated score of the blend obtained with ``weights``."""
        weighted_preds = np.dot(self.oof_df[self.feature_cols], weights)
        return -self.score_func(self.oof_df[self.target_col], weighted_preds)

    def objective(self, trial):
        """Optuna objective: sample raw weights, normalize them, score the blend."""
        weights = [
            trial.suggest_float(f"weight_{i}", 0, 1)
            for i in range(len(self.feature_cols))
        ]
        return self.ensemble_auc(self._normalize(weights))

    def optimize(self):
        """Run the study and store raw weights, normalized weights and best score."""
        # Only build an explicit sampler when a seed was requested, so the
        # default call path is unchanged.
        sampler = None if self.seed is None else optuna.samplers.TPESampler(seed=self.seed)
        study = optuna.create_study(direction='minimize', sampler=sampler)
        study.optimize(self.objective, n_trials=self.n_trials)
        self.optimized_weights = [
            study.best_trial.params[f"weight_{i}"]
            for i in range(len(self.feature_cols))
        ]
        self.normalized_optimized_weights = self._normalize(self.optimized_weights)
        # The study minimized the negated score; flip the sign back.
        self.optimized_score = -study.best_trial.value

    def get_optimized_results(self):
        """Return ``(normalized_weights, best_score)``.

        Raises:
        RuntimeError: if ``optimize()`` has not been run yet
        """
        if self.normalized_optimized_weights is None:
            raise RuntimeError("optimize() must be called before get_optimized_results()")
        return self.normalized_optimized_weights, self.optimized_score

# Run a 500-trial weight search over feature_cols, scored with score().
optimizer = OptunaOptimizer(oof_df, feature_cols, score, target_col, n_trials=500)
optimizer.optimize()
# The weights returned here are already normalized to sum to one.
optimized_weights, optimized_score = optimizer.get_optimized_results()
print("Optimized Weights:", optimized_weights)
print("Optimized Score:", optimized_score)


Optimized Weights: [0.011141279298150472, 0.012948352857082769, 0.06309438467459093, 0.00847961517420425, 0.028367146377348534, 0.022954160417212304, 0.008983897893506829, 0.02314340509220314, 0.0034093820387068366, 0.005751003990415199, 0.009023426985534336, 0.009109821160730988, 0.02117024616012781, 0.00010027828886669354, 0.00591776874550354, 0.0003611172954374397, 0.01839344486137358, 0.043665402492417724, 0.056907781091371894, 0.03335673997979208, 0.02247510120259927, 0.016649638347636452, 0.004259078143123905, 0.04298629535064806, 0.04059734073034188, 0.0005575167581126155, 0.07784627051935086, 0.019104993237088187, 0.054781472986169345, 0.047684591915857284, 0.012336271224327885, 0.08258410642091343, 0.0027044844911221404, 0.004920520579862821, 0.005026390696763918, 0.015031863911078221, 0.0013726596140122174, 0.00501938180325421, 0.0034539733516995973, 0.01780310065347222, 0.06443530703710885, 0.07209098615087918]
Optimized Score: 0.1838782142745052


In [10]:
# One line per blend input: weight first, then the column it applies to.
for weight, feature_name in zip(optimized_weights, feature_cols):
    print(weight, feature_name)


0.011141279298150472 pred_lgb_exp104
0.012948352857082769 pred_lgb_exp106
0.06309438467459093 pred_lgb_exp116
0.00847961517420425 pred_lgb_exp118
0.028367146377348534 pred_lgb_exp78
0.022954160417212304 pred_lgb_exp82
0.008983897893506829 pred_lgb_exp94
0.02314340509220314 pred_lgb_exp98
0.0034093820387068366 pred_sub_71
0.005751003990415199 pred_sub_73
0.009023426985534336 pred_sub_75
0.009109821160730988 pred_sub_77
0.02117024616012781 pred_tsuma_attr
0.00010027828886669354 pred_tsuma_conv_nes
0.00591776874550354 pred_tsuma_eva_nes
0.0003611172954374397 pred_tsuma_no_attr
0.01839344486137358 pred_xgb_exp100
0.043665402492417724 pred_xgb_exp102
0.056907781091371894 pred_xgb_exp104
0.03335673997979208 pred_xgb_exp106
0.02247510120259927 pred_xgb_exp109
0.016649638347636452 pred_xgb_exp111
0.004259078143123905 pred_xgb_exp116
0.04298629535064806 pred_xgb_exp118
0.04059734073034188 pred_xgb_exp120
0.0005575167581126155 pred_xgb_exp122
0.07784627051935086 pred_xgb_exp69
0.0191049932370881

In [11]:
import optuna
import numpy as np

# Silence Optuna's logging (only CRITICAL messages get through)
optuna.logging.set_verbosity(optuna.logging.CRITICAL)

def ensemble_auc(weights):
    """Return the negated pAUC of the weighted blend of ``feature_cols``.

    Reads the notebook-level ``oof_df`` and ``feature_cols``; the sign is
    flipped so that a minimizing study maximizes the score.
    """
    blended = np.dot(oof_df[feature_cols], weights)
    return -score(oof_df['target'], blended)

def objective(trial):
    """Optuna objective: sample one weight per feature column and score the blend."""
    # Draw one raw weight in [0, 1] for every feature column
    raw_weights = []
    for i in range(len(feature_cols)):
        raw_weights.append(trial.suggest_float(f"weight_{i}", 0, 1))

    # Divide by the total so the weights passed on sum to 1
    weight_sum = sum(raw_weights)
    return ensemble_auc([raw / weight_sum for raw in raw_weights])

# The objective returns the negated pAUC, so minimizing it maximizes the score.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=1000)

# Fetch the best trial's raw weights, then rescale them to sum to 1.
# Note: `optimized_weights` holds the raw (un-normalized) values here.
best_trial = study.best_trial
optimized_weights = [best_trial.params[f"weight_{i}"] for i in range(len(feature_cols))]
total_weight = sum(optimized_weights)
normalized_optimized_weights = [w / total_weight for w in optimized_weights]

# Undo the sign flip of the objective to report the actual score.
optimized_score = -best_trial.value

print("Optimized Weights:", normalized_optimized_weights)
print("Optimized Score:", optimized_score)


Optimized Weights: [0.0040552336160284185, 0.002747253799181361, 0.031252431541137354, 0.0024021045230937688, 0.004337447259323742, 0.013752357573348264, 0.029042081023879132, 0.049364389204644574, 0.024332286734565754, 0.006629893554152348, 0.0028749305273533263, 0.004302640732283625, 0.025124191087370306, 5.7279948038925966e-05, 0.038597117035996026, 0.006078900548438588, 0.01649303314832602, 0.01397683656536985, 0.04511115965376593, 0.01918244197054116, 0.05071371132213749, 0.028897022157311218, 0.05275668993871741, 0.024899609139464665, 0.019587841042758163, 0.05419860658501224, 0.03413690907468788, 0.019764770798083798, 0.0052026295254847775, 0.050625374869708355, 0.04324145637834624, 0.04666976379272596, 8.285273682720308e-05, 0.008602520332993923, 0.0011187842017636603, 0.031231090162230676, 0.001144884906183836, 0.0012548532556769753, 0.0424751789200783, 0.04448229692263733, 0.05322944048350521, 0.04596970340682601]
Optimized Score: 0.18376637504153542


In [12]:
# One line per blend input: normalized weight first, then its column name.
for weight, feature_name in zip(normalized_optimized_weights, feature_cols):
    print(weight, feature_name)


0.0040552336160284185 pred_lgb_exp104
0.002747253799181361 pred_lgb_exp106
0.031252431541137354 pred_lgb_exp116
0.0024021045230937688 pred_lgb_exp118
0.004337447259323742 pred_lgb_exp78
0.013752357573348264 pred_lgb_exp82
0.029042081023879132 pred_lgb_exp94
0.049364389204644574 pred_lgb_exp98
0.024332286734565754 pred_sub_71
0.006629893554152348 pred_sub_73
0.0028749305273533263 pred_sub_75
0.004302640732283625 pred_sub_77
0.025124191087370306 pred_tsuma_attr
5.7279948038925966e-05 pred_tsuma_conv_nes
0.038597117035996026 pred_tsuma_eva_nes
0.006078900548438588 pred_tsuma_no_attr
0.01649303314832602 pred_xgb_exp100
0.01397683656536985 pred_xgb_exp102
0.04511115965376593 pred_xgb_exp104
0.01918244197054116 pred_xgb_exp106
0.05071371132213749 pred_xgb_exp109
0.028897022157311218 pred_xgb_exp111
0.05275668993871741 pred_xgb_exp116
0.024899609139464665 pred_xgb_exp118
0.019587841042758163 pred_xgb_exp120
0.05419860658501224 pred_xgb_exp122
0.03413690907468788 pred_xgb_exp69
0.0197647707980

In [13]:
import optuna
import numpy as np

class OptunaOptimizer:
    """Search blend weights for OOF prediction columns with Optuna.

    Every trial draws one raw weight in ``[0, 1]`` per feature column,
    rescales the weights to sum to one, blends the columns with ``np.dot``
    and scores the blend against the target column with ``score``.
    The study minimizes the negated score, i.e. it maximizes ``score``.

    Parameters:
    oof_df (pd.DataFrame): table holding the prediction and target columns
    feature_cols (list[str]): names of the columns to blend
    score (callable): ``score(y_true, y_pred) -> float``, higher is better
    target_col (str): name of the ground-truth column in ``oof_df``
    n_trials (int): number of Optuna trials to run
    seed (int | None): seed for the TPE sampler; ``None`` (default) keeps
        Optuna's default, non-seeded sampler
    """

    def __init__(self, oof_df, feature_cols, score, target_col, n_trials=1000, seed=None):
        self.oof_df = oof_df
        self.feature_cols = feature_cols
        self.score = score
        self.target_col = target_col
        self.n_trials = n_trials
        self.seed = seed
        # Results; all stay None until optimize() has run.
        self.optimized_weights = None
        self.normalized_optimized_weights = None
        self.optimized_score = None

        # Silence Optuna's logging.
        # NOTE: this is a process-wide Optuna setting, not a per-instance one.
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)

    @staticmethod
    def _normalize(weights):
        """Rescale ``weights`` to sum to 1; an all-zero draw becomes equal weights."""
        weights = list(weights)
        total_weight = sum(weights)
        if total_weight <= 0:
            # Weights are sampled from [0, 1], so this only happens when every
            # weight is exactly 0; avoid a ZeroDivisionError aborting the study.
            return [1.0 / len(weights)] * len(weights) if weights else []
        return [w / total_weight for w in weights]

    def ensemble_auc(self, weights):
        """Return the negated score of the blend obtained with ``weights``."""
        weighted_preds = np.dot(self.oof_df[self.feature_cols], weights)
        return -self.score(self.oof_df[self.target_col], weighted_preds)

    def objective(self, trial):
        """Optuna objective: sample raw weights, normalize them, score the blend."""
        weights = [trial.suggest_float(f"weight_{i}", 0, 1) for i in range(len(self.feature_cols))]
        return self.ensemble_auc(self._normalize(weights))

    def optimize(self):
        """Run the study and store raw weights, normalized weights and best score."""
        # Only build an explicit sampler when a seed was requested, so the
        # default call path is unchanged.
        sampler = None if self.seed is None else optuna.samplers.TPESampler(seed=self.seed)
        study = optuna.create_study(direction='minimize', sampler=sampler)
        study.optimize(self.objective, n_trials=self.n_trials)
        self.optimized_weights = [study.best_trial.params[f"weight_{i}"] for i in range(len(self.feature_cols))]
        self.normalized_optimized_weights = self._normalize(self.optimized_weights)
        # The study minimized the negated score; flip the sign back.
        self.optimized_score = -study.best_trial.value

    def get_optimized_results(self):
        """Return ``(normalized_weights, best_score)``.

        Raises:
        RuntimeError: if ``optimize()`` has not been run yet
        """
        if self.normalized_optimized_weights is None:
            raise RuntimeError("optimize() must be called before get_optimized_results()")
        return self.normalized_optimized_weights, self.optimized_score

# Usage example:
# optimizer = OptunaOptimizer(oof_df, feature_cols, score, target_col, n_trials=1000)
# optimizer.optimize()
# optimized_weights, optimized_score = optimizer.get_optimized_results()
# print("Optimized Weights:", optimized_weights)
# print("Optimized Score:", optimized_score)