In [1]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.metrics import roc_curve, auc, roc_auc_score

from pathlib import Path

In [2]:
def score(solution: np.ndarray, submission: np.ndarray, min_tpr: float=0.80) -> float:
    """Return a partial ROC AUC controlled by ``min_tpr``.

    Labels and scores are both flipped (``|solution - 1|`` and
    ``1 - submission``) and handed to ``roc_auc_score`` with
    ``max_fpr = |1 - min_tpr|``. The value that comes back is then mapped
    linearly from ``[0.5, 1.0]`` onto ``[0.5 * max_fpr**2, max_fpr]``.

    Args:
        solution: Array-like of binary ground-truth labels (0/1).
        submission: Array-like of predicted scores, same length as
            ``solution``.
        min_tpr: Threshold used to derive ``max_fpr`` (default 0.80).

    Returns:
        The rescaled partial AUC as a float.

    Raises:
        Whatever ``roc_auc_score`` raises for the given inputs
        (presumably a ``ValueError`` when ``max_fpr`` is not in (0, 1] --
        confirm against the scikit-learn documentation).
    """
    # Flip labels (0 <-> 1) and scores in one vectorized step; np.asarray lets
    # lists, pandas Series and ndarrays all take the same path.
    v_gt = np.abs(np.asarray(solution) - 1)
    v_pred = 1.0 - np.asarray(submission)
    max_fpr = abs(1 - min_tpr)
    partial_auc_scaled = roc_auc_score(v_gt, v_pred, max_fpr=max_fpr)
    # change scale from [0.5, 1.0] to [0.5 * max_fpr**2, max_fpr]
    # https://math.stackexchange.com/questions/914823/shift-numbers-into-a-different-range
    lower_bound = 0.5 * max_fpr**2
    partial_auc = lower_bound + (max_fpr - lower_bound) / (1.0 - 0.5) * (partial_auc_scaled - 0.5)

    return partial_auc

In [3]:
# Load the prediction table blended in the rest of the notebook; the preview
# below shows an `isic_id`, a `target` label and several `pred_*` score columns.
oof_df = pd.read_parquet('data/ensemble_oof_df_20240914.parquet')

In [4]:
# Quick look at the first rows to check which columns were loaded.
oof_df.head()

Unnamed: 0,isic_id,target,pred_xgb_exp1,pred_lgb_exp1,pred_xgb_exp2,pred_lgb_exp2,pred_xgb_exp3,pred_lgb_exp3,pred_xgb_exp4,pred_lgb_exp4,pred_xgb_exp5,pred_lgb_exp5,pred_xgb_exp6,pred_lgb_exp6,pred_xgb_exp7,pred_lgb_exp7,pred_xgb_exp8,pred_lgb_exp8
0,ISIC_0015670,0,2.9e-05,0.00013,3.4e-05,0.000116,6.4e-05,0.000113,5e-05,0.000126,6.6e-05,0.000136,4.8e-05,0.000136,5.1e-05,0.000109,6.2e-05,0.00012
1,ISIC_0015845,0,0.1808,0.302143,0.190036,0.29541,0.143326,0.437202,0.158994,0.468928,0.215653,0.254087,0.216539,0.23866,0.185073,0.241739,0.163325,0.202038
2,ISIC_0015864,0,1.4e-05,2.1e-05,8e-06,1.9e-05,9e-06,2e-05,9e-06,2.2e-05,7e-06,2.3e-05,7e-06,2.2e-05,1e-05,2.2e-05,8e-06,2e-05
3,ISIC_0015902,0,7.8e-05,0.000257,7.2e-05,0.000268,4.2e-05,0.000232,5.9e-05,0.000245,5e-05,0.000151,7.2e-05,0.000168,6.1e-05,0.000149,5.2e-05,0.00016
4,ISIC_0024200,0,0.000112,0.000184,8.5e-05,0.000197,7.1e-05,0.00017,7.5e-05,0.000204,9.6e-05,0.000165,9.7e-05,0.000165,6.6e-05,0.00023,5.5e-05,0.000217


In [5]:
# Prediction columns for experiments 1-4 (one xgb / lgb pair per experiment).
feature_cols1 = [
    'pred_xgb_exp1', 'pred_lgb_exp1',
    'pred_xgb_exp2', 'pred_lgb_exp2',
    'pred_xgb_exp3', 'pred_lgb_exp3',
    'pred_xgb_exp4', 'pred_lgb_exp4',
]

# Prediction columns for experiments 5-8.
feature_cols2 = [
    'pred_xgb_exp5', 'pred_lgb_exp5',
    'pred_xgb_exp6', 'pred_lgb_exp6',
    'pred_xgb_exp7', 'pred_lgb_exp7',
    'pred_xgb_exp8', 'pred_lgb_exp8',
]

# Columns fed to the blend: the de-duplicated union of both groups, in
# alphabetical order (all `pred_lgb_*` first, then all `pred_xgb_*`).
feature_cols = sorted({*feature_cols1, *feature_cols2})
# Alternative selections (uncomment one to restrict the blend):
# feature_cols = sorted(list(set(feature_cols1)))
# feature_cols = [col for col in feature_cols if 'xgb' in col]

In [6]:
def rank_averaging_normalized(df, column):
    """Rank one column and scale the ranks so the largest rank equals 1.

    Ties receive the average of the ranks they span (``method='average'``).
    The ranks are divided by the largest rank found, not by the row count.

    Args:
        df: DataFrame holding the column.
        column: Label of the column to rank.

    Returns:
        A Series of scaled ranks carrying the same index as ``df[column]``.
    """
    average_ranks = df[column].rank(method='average')
    top_rank = average_ranks.max()
    return average_ranks.div(top_rank)


In [7]:
# Replace every blended column with its scaled rank. Each column is ranked
# only against itself, so all replacements can be computed first and written
# back to `oof_df` afterwards.
normalized_columns = {
    name: rank_averaging_normalized(oof_df, name) for name in feature_cols
}
for name, normalized in normalized_columns.items():
    oof_df[name] = normalized
    

In [10]:
import optuna
import numpy as np

# Column of `oof_df` passed to the scoring function as the ground truth.
target_col = 'target'

class OptunaOptimizer:
    """Search for blending weights over prediction columns with Optuna.

    Each trial suggests one weight in ``[0, 1]`` per feature column, rescales
    the weights to sum to 1, blends the columns with those weights and scores
    the blend with ``score_func``. The study minimizes the negated score, so
    the best trial is the one with the highest score.

    Args:
        oof_df: DataFrame containing the feature columns and the target column.
        feature_cols: Labels of the prediction columns to blend.
        score_func: Callable ``score_func(y_true, y_pred) -> float`` where a
            larger value is better.
        target_col: Label of the ground-truth column in ``oof_df``.
        n_trials: Number of Optuna trials to run in ``optimize``.
        seed: Optional seed for the study's sampler. ``None`` (the default)
            leaves Optuna's default, unseeded sampler in place.

    Attributes:
        optimized_weights: Raw best-trial weights (not rescaled), or ``None``
            before ``optimize`` has run.
        normalized_optimized_weights: Best-trial weights rescaled to sum to 1,
            or ``None`` before ``optimize`` has run.
        optimized_score: Score of the best trial, or ``None`` before
            ``optimize`` has run.
    """

    def __init__(self, oof_df, feature_cols, score_func, target_col, n_trials=1000, seed=None):
        self.oof_df = oof_df
        self.feature_cols = feature_cols
        self.score_func = score_func
        self.target_col = target_col
        self.n_trials = n_trials
        self.seed = seed
        self.optimized_weights = None
        # Initialized here so get_optimized_results() is safe before optimize().
        self.normalized_optimized_weights = None
        self.optimized_score = None

        # Library-wide Optuna setting: only CRITICAL messages are logged.
        optuna.logging.set_verbosity(optuna.logging.CRITICAL)

    @staticmethod
    def _normalize_weights(weights):
        """Rescale ``weights`` to sum to 1; fall back to equal weights if the sum is 0."""
        weights = list(weights)
        total_weight = sum(weights)
        if total_weight == 0:
            # Every weight is zero (or there are none): dividing would raise,
            # so use an equal blend instead.
            return [1.0 / len(weights)] * len(weights) if weights else []
        return [w / total_weight for w in weights]

    def ensemble_auc(self, weights):
        """Return the negated score of the blend built with ``weights``.

        Args:
            weights: One weight per entry of ``feature_cols``, used as given
                (no rescaling happens here).

        Returns:
            ``-score_func(target, blended_predictions)``.
        """
        weighted_preds = np.dot(self.oof_df[self.feature_cols], weights)
        return -self.score_func(self.oof_df[self.target_col], weighted_preds)

    def objective(self, trial):
        """Optuna objective: suggest weights, rescale them and score the blend."""
        weights = [
            trial.suggest_float(f"weight_{i}", 0, 1)
            for i in range(len(self.feature_cols))
        ]
        return self.ensemble_auc(self._normalize_weights(weights))

    def optimize(self):
        """Run the study and store the best weights and score on the instance."""
        sampler = None
        if self.seed is not None:
            # Seeding the sampler makes the sequence of suggested weights repeatable.
            sampler = optuna.samplers.TPESampler(seed=self.seed)
        study = optuna.create_study(direction='minimize', sampler=sampler)
        study.optimize(self.objective, n_trials=self.n_trials)

        best_trial = study.best_trial
        self.optimized_weights = [
            best_trial.params[f"weight_{i}"]
            for i in range(len(self.feature_cols))
        ]
        self.normalized_optimized_weights = self._normalize_weights(self.optimized_weights)
        # The study minimized the negated score; flip the sign back.
        self.optimized_score = -best_trial.value

    def get_optimized_results(self):
        """Return ``(normalized_weights, score)``; ``(None, None)`` before ``optimize``."""
        return self.normalized_optimized_weights, self.optimized_score

# Tune the blending weights with 500 trials, then report the weights
# (rescaled to sum to 1) and the score of the best trial.
optimizer = OptunaOptimizer(
    oof_df=oof_df,
    feature_cols=feature_cols,
    score_func=score,
    target_col=target_col,
    n_trials=500,
)
optimizer.optimize()
optimized_weights, optimized_score = optimizer.get_optimized_results()
print("Optimized Weights:", optimized_weights)
print("Optimized Score:", optimized_score)


Optimized Weights: [0.029772532252581717, 0.011213062069398216, 0.009281631159123224, 0.00019545820696807752, 0.27277665977703763, 0.17939193795119193, 0.29731768134570824, 0.200051037237991]
Optimized Score: 0.1751947917658574


In [11]:
# Show each tuned weight next to the column it applies to (weights are in
# the same order as `feature_cols`).
for col, w in zip(feature_cols, optimized_weights):
    print(w, col)


0.029772532252581717 pred_lgb_exp1
0.011213062069398216 pred_lgb_exp2
0.009281631159123224 pred_lgb_exp3
0.00019545820696807752 pred_lgb_exp4
0.27277665977703763 pred_xgb_exp1
0.17939193795119193 pred_xgb_exp2
0.29731768134570824 pred_xgb_exp3
0.200051037237991 pred_xgb_exp4
