In [1]:
import pandas as pd
# from carps.analysis.gather_data import normalize_logs, get_interpolated_performance_df
from numpy import trapz
import importlib
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px 
import itertools
import pandas as pd

from carps.analysis.generate_report import *


# importlib.reload(carps)
# importlib.reload(carps.analysis)
# importlib.reload(carps.analysis.performance_over_time)
from carps.analysis.performance_over_time import plot_performance_over_time
from carps.experimenter.database.process_logs import process_logs_from_database
import os


In [3]:
all_experiments = []
all_trajectories = []
current_max_id = 0

for subdir in os.listdir('../experimenter_loose/'):
    experiment_config = pd.read_parquet(f"../experimenter_loose/{subdir}/experiment_config.parquet")
    trials = pd.read_parquet(f"../experimenter_loose/{subdir}/trials.parquet")

    experiment_config = experiment_config[~experiment_config['optimizer_id'].str.startswith("hypershap")]
    experiment_config = experiment_config[~experiment_config['optimizer_id'].str.startswith("random")]
    valid_ids = experiment_config['ID'].unique()
    trials = trials[trials['experiment_id'].isin(valid_ids)]

    id_mapping = {old_id: new_id for old_id, new_id in zip(
        experiment_config['ID'], 
        range(current_max_id, current_max_id + len(experiment_config))
    )}

    experiment_config['ID'] = experiment_config['ID'].map(id_mapping)
    trials['experiment_id'] = trials['experiment_id'].map(id_mapping)

    all_experiments.append(experiment_config)
    all_trajectories.append(trials)

    current_max_id += len(experiment_config)

# Concatenate all
experiment_config_all = pd.concat(all_experiments, ignore_index=True)
trials_all = pd.concat(all_trajectories, ignore_index=True)
trials_all.to_parquet('../experimenter/results/trials.parquet')    
experiment_config_all.to_parquet('../experimenter/results/experiment_config.parquet') 


In [23]:
trials_all = pd.read_parquet('../experimenter/results/trials.parquet')    
experiment_config_all = pd.read_parquet('../experimenter/results/experiment_config.parquet') 


In [6]:
exp_test = experiment_config_all[experiment_config_all['subset_id']=='test']
trials_test = trials_all[trials_all['experiment_id'].isin(exp_test['ID'].unique())]
trials_test.to_parquet('../experimenter/test/trials.parquet')    
exp_test.to_parquet('../experimenter/test/experiment_config.parquet') 

In [7]:
exp_train = experiment_config_all[experiment_config_all['subset_id']=='dev']
trials_train = trials_all[trials_all['experiment_id'].isin(exp_train['ID'].unique())]
trials_train.to_parquet('../experimenter/train/trials.parquet')    
exp_train.to_parquet('../experimenter/train/experiment_config.parquet') 

In [25]:
exp_train = experiment_config_all[experiment_config_all['subset_id']=='dev']
exp_train = exp_train[exp_train['optimizer_id'].isin(['SMAC3-MO-RF', 'fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_random_thresh_0_lin_0', 'fanova_adjust_cs_incumbent_cs_proba_hpi_adjust_prev_cfgs_set_to_random_thresh_0_lin_0','fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0_lin_0', 'fanova_adjust_cs_incumbent_cs_proba_hpi_thresh_0.5', 'fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0.75'])]
trials_train = trials_all[trials_all['experiment_id'].isin(exp_train['ID'].unique())]
trials_train.to_parquet('../experimenter/mine/trials.parquet')    
exp_train.to_parquet('../experimenter/mine/experiment_config.parquet') 

In [26]:
df = pd.read_parquet('../experimenter/mine/processed_logs.parquet')
df = normalize_logs(df)
if 'set' not in df.columns:
    df["set"] = '0'
df = get_interpolated_performance_df(df, n_points=50)


In [None]:
# do prepocessing and create subsets: train/test etc.
df_train = df[df['subset_id']=='dev']
df_test = df[df['subset_id']=='test']

In [9]:
def expand_optimizer_id(df: pd.DataFrame, optimizer_id_col: str = 'optimizer_id') -> pd.DataFrame:
    def parse_optimizer_id(opt_id: str) -> dict:
        info = {
            'hpi_method': None,
            'adjust_cs': 'False',
            'cs_method': 'False',
            'pc_method': 'False',
            'thresh': None,
            'adjust_method': None
        }
        parts = opt_id.split('_')
        info['hpi_method'] = parts[0]

        
        if 'adjust_cs_' in opt_id:
            info['adjust_cs'] = 'distribution'
            info['cs_method'] = parts[parts.index('cs')+1]
            info['adjust_method'] = 'adjust_cs_only'
        if 'constant' in opt_id:
            info['adjust_cs'] = 'constant'
        if 'cs_proba_hpi' in opt_id:
            info['adjust_cs'] = 'cs_proba_hpi'
        if 'set_to' in opt_id:
            info['pc_method'] = parts[parts.index('to')+1]
            if info['adjust_method'] == 'adjust_cs_only':
                info['adjust_method'] = 'both'
            else:
                info['adjust_method'] = 'adjust_pc_only'
        if 'down' in opt_id:
            info['thresh'] = 'down'
        elif 'up' in opt_id:
            info['thresh'] = 'up'
        else: 
            info['thresh'] = parts[parts.index('thresh')+1]
        
        return info

    # Apply the parser to each row
    parsed = df[optimizer_id_col].apply(parse_optimizer_id)
    parsed_df = pd.DataFrame(parsed.tolist())

    # Merge with the original DataFrame
    return pd.concat([df, parsed_df], axis=1)

def check_if_complete_for_max_trials(in_df):
    max_trials = in_df['n_trials_norm'].max()
    filtered_df = in_df[in_df['n_trials_norm'] == max_trials]
    miss_dict = {}
    seeds = filtered_df['seed'].unique()
    expected_combinations = set(itertools.product(tasks[in_df['benchmark_id'].iloc[0]], [max_trials], seeds))
    incomplete = False
    for optimizer_id, group in filtered_df.groupby('optimizer_id'):
        actual_combinations = set(group[['task_id', 'n_trials_norm', 'seed']].itertuples(index=False, name=None))
        missing = expected_combinations - actual_combinations
        if missing:
            print(f"Optimizer {optimizer_id} is not complete for max n_trials_norm = {max_trials}: {len(missing)} combinations missing.")
            miss_dict[optimizer_id] = missing
            incomplete = True
    if not incomplete:
        print('All data complete!')
        return None
    return miss_dict

def read_data(path):
    df = pd.read_parquet(path)
    df = normalize_logs(df)
    df['n_trials_norm'] = df.groupby('experiment_id')['time'].rank(method='dense', pct=True).round(2)
    if 'set' not in df.columns:
        df["set"] = '0'
    df_random = df[df['optimizer_id'].str.startswith('random_')]
    df = df[~df['optimizer_id'].str.startswith('random_')]
    # check_if_complete_for_max_trials(df)
    return df, df_random

def get_perf(df):
    perf = get_interpolated_performance_df(df, n_points=50)
    missing = check_if_complete_for_max_trials(df)
    perf = expand_optimizer_id(perf)
    perf = perf[perf['adjust_method'].notna()]
    return perf, missing

In [15]:
def plot_one_thing(thing, df_plot=None):
    if df_plot is None:
        df_plot = perf_all.copy(deep=True)
        df_plot['optimizer_id'] = df_plot[thing]
        # df_plot = pd.concat([df_plot, base], ignore_index=True)
        plot_performance_per_task(df_plot, output_dir=f"figure_dir/ablation/{thing}", replot=True)
        if len(df_plot['optimizer_id'].unique()) > 2:
            plot_critical_difference(df_plot, output_dir=f"figure_dir/ablation/{thing}", replot=True)
            plot_finalperfbarplot(df_plot, output_dir=f"figure_dir/ablation/{thing}", replot=True)
            plot_ranks_over_time(df_plot, output_dir=f"figure_dir/ablation/{thing}", replot=True)

    # Average over tasks
    avg_task = df_plot.groupby(["optimizer_id", "seed", "n_trials_norm"]).agg(
        mean_perf=("trial_value__cost_inc_norm", "mean")
    ).reset_index()

    # Average over seeds
    avg_seed = avg_task.groupby(["optimizer_id", "n_trials_norm"]).agg(
        mean=("mean_perf", "mean"),
        sem=("mean_perf", "sem")
    ).reset_index()

    auc_per_seed = (
        df_plot.groupby(["optimizer_id", "seed"])
        .apply(lambda group: trapz(group.sort_values("n_trials_norm")["trial_value__cost_inc_norm"],
                                   group.sort_values("n_trials_norm")["n_trials_norm"]))
        .reset_index(name="auc")
    )

    # Compute mean and sem AUC per optimizer
    auc_values = auc_per_seed.groupby("optimizer_id").agg(
        mean=('auc', 'mean'),
        sem=('auc', 'sem')
    ).sort_values("mean", ascending=False).reset_index()


    fig = make_subplots(
        rows=1, cols=2,
        # subplot_titles=("Performance over Time (Normalized)", "AUC per Optimizer"),
        column_widths=[0.6, 0.4],
        horizontal_spacing=0.15
    )
    colors = px.colors.qualitative.Set1
    # Line plot with shaded error (left)
    print(avg_seed["optimizer_id"].unique())
    for i, optimizer in enumerate(avg_seed["optimizer_id"].unique()):
        df_opt = avg_seed[avg_seed["optimizer_id"] == optimizer]
        
        line_color = colors[i % len(colors)]

        # Mean line
        fig.add_trace(go.Scatter(
            x=df_opt["n_trials_norm"],
            y=df_opt["mean"],
            mode="lines",
            name=optimizer,
            line=dict(width=2, color=line_color),
        ), row=1, col=1)

        # Shaded error (sem)
        fig.add_trace(go.Scatter(
            x=pd.concat([df_opt["n_trials_norm"], df_opt["n_trials_norm"][::-1]]),
            y=pd.concat([
                df_opt["mean"] - df_opt["sem"],
                (df_opt["mean"] + df_opt["sem"])[::-1]
            ]),
            fill='toself',
            fillcolor=line_color.replace(')', ',0.1)').replace('rgb', 'rgba'),
            line=dict(color='rgba(255,255,255,0)'),
            hoverinfo="skip",
            showlegend=False
        ), row=1, col=1)

    # Bar chart with error bars (right)
    fig.add_trace(go.Bar(
        x=auc_values["mean"],
        y=auc_values["optimizer_id"],
        orientation="h",
        error_x=dict(type='data', array=auc_values["sem"]),
        marker=dict(color="skyblue", line=dict(color="black", width=1)),
        showlegend=False
    ), row=1, col=2)

    # -------------------------
    # 3. Layout & Font
    # -------------------------

    fig.update_layout(
        height=400,
        width=1200,
        template="plotly_white",
        font=dict(size=16),
        legend=dict(title="Optimizers", font=dict(size=16)),
        margin=dict(t=60, l=50, r=50, b=50)
    )

    fig.update_xaxes(title_text="Normalized Trials", row=1, col=1)
    fig.update_yaxes(title_text="Normalized Inc-Performance", row=1, col=1)
    fig.update_xaxes(title_text="AUC", row=1, col=2)
    fig.update_yaxes(title_text="Optimizer", row=1, col=2)

    fig.show()


In [69]:
# Which HPI method
plot_one_thing('hpi_method')

['fanova']






In [70]:
# Which thresholding?
plot_one_thing('thresh')


Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Resul

['0.5' '0.75' 'down' 'up']






In [71]:
plot_one_thing('adjust_cs')


Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!



['False' 'constant' 'cs_proba_hpi' 'distribution']






In [72]:
plot_one_thing('cs_method')


Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Resul

['False' 'default' 'incumbent' 'random']






In [73]:
plot_one_thing('pc_method')


Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!



['False' 'default' 'incumbent' 'random']






In [74]:
plot_one_thing('adjust_method')


Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!



['adjust_cs_only' 'adjust_pc_only' 'both']






In [27]:
auc_per_seed = (
        df.groupby(["optimizer_id", "seed"])
        .apply(lambda group: trapz(group.sort_values("n_trials_norm")["trial_value__cost_inc_norm"],
                                   group.sort_values("n_trials_norm")["n_trials_norm"]))
        .reset_index(name="auc")
    )

# Compute mean and sem AUC per optimizer
auc_values = auc_per_seed.groupby("optimizer_id").agg(
    auc_mean=('auc', 'mean'),
    auc_sem=('auc', 'sem')
).sort_values("auc_mean", ascending=False).reset_index()






In [28]:
merge = pd.merge(auc_values, df, on='optimizer_id', how='left')
merge = merge.drop_duplicates(subset=['optimizer_id'])
merge = merge[['optimizer_id', 'auc_mean', 'auc_sem']]
# merge = merge[['optimizer_id', 'hpi_method', 'adjust_cs', 'cs_method', 'pc_method', 'thresh', 'adjust_method', 'auc_mean', 'auc_sem']]
final_perf = df[df['n_trials_norm'] == 1].groupby(['optimizer_id','seed']).agg({'trial_value__cost_inc_norm':'mean'}).groupby('optimizer_id').agg(
    final_perf_mean=('trial_value__cost_inc_norm', 'mean'),
    final_perf_sem=('trial_value__cost_inc_norm', 'sem')
).sort_values("final_perf_mean", ascending=False).reset_index()

merge = pd.merge(final_perf, merge, on='optimizer_id', how='right')
merge = merge[['optimizer_id', 'auc_mean','auc_sem', 'final_perf_mean', 'final_perf_sem']].sort_values(by='final_perf_mean', ascending=True)
merge = merge.apply(lambda col: col.round(3) if col.name in ['auc_mean', 'auc_sem', 'final_perf_mean', 'final_perf_sem'] else col)
merge.head(20).to_latex()

'\\begin{tabular}{llrrrr}\n\\toprule\n & optimizer_id & auc_mean & auc_sem & final_perf_mean & final_perf_sem \\\\\n\\midrule\n5 & SMAC3-MO-RF & 0.089000 & 0.003000 & 0.055000 & 0.004000 \\\\\n3 & fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0_lin_0 & 0.095000 & 0.006000 & 0.057000 & 0.006000 \\\\\n4 & fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_random_thresh_0_lin_0 & 0.091000 & 0.006000 & 0.058000 & 0.006000 \\\\\n2 & fanova_adjust_cs_incumbent_cs_proba_hpi_adjust_prev_cfgs_set_to_random_thresh_0_lin_0 & 0.095000 & 0.007000 & 0.059000 & 0.009000 \\\\\n1 & fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0.75 & 0.097000 & 0.005000 & 0.070000 & 0.005000 \\\\\n0 & fanova_adjust_cs_incumbent_cs_proba_hpi_thresh_0.5 & 0.101000 & 0.004000 & 0.070000 & 0.005000 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [31]:
merge.sort_values(by='auc_mean', ascending=True)

Unnamed: 0,optimizer_id,auc_mean,auc_sem,final_perf_mean,final_perf_sem
5,SMAC3-MO-RF,0.089,0.003,0.055,0.004
4,fanova_adjust_cs_default_constant_adjust_prev_...,0.091,0.006,0.058,0.006
3,fanova_adjust_cs_default_constant_adjust_prev_...,0.095,0.006,0.057,0.006
2,fanova_adjust_cs_incumbent_cs_proba_hpi_adjust...,0.095,0.007,0.059,0.009
1,fanova_adjust_cs_default_constant_adjust_prev_...,0.097,0.005,0.07,0.005
0,fanova_adjust_cs_incumbent_cs_proba_hpi_thresh...,0.101,0.004,0.07,0.005


In [39]:
df = df[~df['optimizer_id'].isin(['fANOVA_AUC', 'fANOVA_final'])]

In [33]:
df['optimizer_id'] = df['optimizer_id'].replace({'fanova_adjust_cs_incumbent_cs_proba_hpi_thresh_0.5': 'fANOVA_AUC', 'fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0.75':'fANOVA_final','SMAC3-MO-RF':'ParEGO', 'fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0_lin_0':'fanova_fin_0lin0', 'fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_random_thresh_0_lin_0': 'fanova_fin_rnd_0lin0','fanova_adjust_cs_incumbent_cs_proba_hpi_adjust_prev_cfgs_set_to_random_thresh_0_lin_0':'fanova_auc_0lin0'})

In [36]:
df = df[~df['task_id'].str.contains('ManyO')]

In [40]:
# plot best fanova vs best hypershap vs ParEGO
# best_final = merge[merge['final_perf_mean'] == merge['final_perf_mean'].min()]['optimizer_id'].values[0]
# fanova_final = df[df['optimizer_id']==best_final]
# fanova_final['optimizer_id'] = 'fANOVA-ParEGO-final'
# best_auc = merge[merge['auc_mean'] == merge['auc_mean'].min()]['optimizer_id'].values[0]
# fanova_auc = df[df['optimizer_id']==best_auc]
# fanova_auc['optimizer_id'] = 'fANOVA-ParEGO-auc'
# hypershap_final = perf_all[perf_all['optimizer_id']=='hypershap_adjust_cs_random_adjust_prev_cfgs_set_to_random_thresh_0.75']
# hypershap_final['optimizer_id'] = 'HyperSHAP-ParEGO-final'
# hypershap_auc = perf_all[perf_all['optimizer_id']=='hypershap_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_dynamic_decay_down']
# hypershap_auc['optimizer_id'] = 'HyperSHAP-ParEGO'
# df_plot = pd.concat([base_test, fanova_auc, fanova_final], ignore_index=True)

plot_one_thing('optimizer_id', df_plot=df)

['ParEGO' 'fanova_auc_0lin0' 'fanova_fin_0lin0' 'fanova_fin_rnd_0lin0']






In [41]:
plot_performance_over_time(df, y='trial_value__cost_inc_norm')
plot_critical_difference(df, output_dir='figure_dir', replot=True)
plot_performance_per_task(df, output_dir='figure_dir', replot=True)
plot_finalperfbarplot(df, output_dir='figure_dir', replot=True)
plot_ranks_over_time(df, output_dir='figure_dir', replot=True)



Result is not significant and results of the plot may be misleading!


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


FigureCanvasAgg is non-interactive, and thus cannot be shown


Result is not significant and results of the plot may be misleading!




invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


invalid value encountered in scalar divide


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be misleading!


Result is not significant and results of the plot may be mislead

[{'task_type': 'blackbox',
  'set': '0',
  'task_id': None,
  'filename': 'figure_dir/rank_blackbox_0',
  'plot_type': 'rank_over_time',
  'plot_type_pretty': 'Rank over Time',
  'explanation': 'The rank of each optimizer over time compares which optimizer performs better, the lower the rank the better. For each optimizer and task, the performance is averaged over seeds to obtain an estimate of the performance. The rank is then calculated per step and task with the same approach as for the critical difference diagram.'}]

In [22]:
df['multi-objective']

KeyError: 'multi-objective'

# Random

In [186]:
fanova = perf_all[perf_all['optimizer_id']=='fanova_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0.75']
fanova_rnd = rnd_all[rnd_all['optimizer_id']=='random_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_thresh_0.75']
fanova = pd.merge(fanova, fanova_rnd, on=['task_id', 'seed', 'n_trials_norm'], suffixes=('_final', '_rnd'))
fanova['trial_value__cost_inc_norm_diff'] = fanova['trial_value__cost_inc_norm_rnd'] - fanova['trial_value__cost_inc_norm_final']
fanova['optimizer_id'] = 'fANOVA-ParEGO'


hypershap = perf_all[perf_all['optimizer_id']=='hypershap_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_dynamic_decay_down']
hypershap_rnd = rnd_all[rnd_all['optimizer_id']=='random_adjust_cs_default_constant_adjust_prev_cfgs_set_to_incumbent_dynamic_decay_down']
hypershap = pd.merge(hypershap, hypershap_rnd, on=['task_id', 'seed', 'n_trials_norm'], suffixes=('_final', '_rnd'))
hypershap['trial_value__cost_inc_norm_diff'] = hypershap['trial_value__cost_inc_norm_rnd'] - hypershap['trial_value__cost_inc_norm_final']
hypershap['optimizer_id'] = 'HyperSHAP-ParEGO'

df_plot = pd.concat([hypershap, fanova], ignore_index=True)

# Average over tasks
avg_task = df_plot.groupby(["optimizer_id", "seed", "n_trials_norm"]).agg(
    mean_perf=("trial_value__cost_inc_norm_diff", "mean")
).reset_index()

# Average over seeds
avg_seed = avg_task.groupby(["optimizer_id", "n_trials_norm"]).agg(
    mean=("mean_perf", "mean"),
    sem=("mean_perf", "sem")
).reset_index()

auc_per_seed = (
    df_plot.groupby(["optimizer_id", "seed"])
    .apply(lambda group: trapz(group.sort_values("n_trials_norm")["trial_value__cost_inc_norm_diff"],
                                group.sort_values("n_trials_norm")["n_trials_norm"]))
    .reset_index(name="auc")
)

colors = px.colors.qualitative.Set1
fig = go.Figure()
for i, optimizer in enumerate(avg_seed["optimizer_id"].unique()):
    df_opt = avg_seed[avg_seed["optimizer_id"] == optimizer]
    line_color = colors[i % len(colors)]

    # Mean line
    fig.add_trace(go.Scatter(
        x=df_opt["n_trials_norm"],
        y=df_opt["mean"],
        mode="lines",
        name=optimizer,
        line=dict(width=2, color=line_color),
    ))

    # Shaded error (sem)
    fig.add_trace(go.Scatter(
        x=pd.concat([df_opt["n_trials_norm"], df_opt["n_trials_norm"][::-1]]),
        y=pd.concat([
            df_opt["mean"] - df_opt["sem"],
            (df_opt["mean"] + df_opt["sem"])[::-1]
        ]),
        fill='toself',
        fillcolor=line_color.replace(')', ',0.1)').replace('rgb', 'rgba'),
        line=dict(color='rgba(255,255,255,0)'),
        hoverinfo="skip",
        showlegend=False
    ))


fig.update_layout(
    height=400,
    width=1100,
    template="plotly_white",
    font=dict(size=16),
    legend=dict(title="Optimizers", font=dict(size=16)),
    margin=dict(t=60, l=50, r=50, b=50)
)

fig.update_xaxes(title_text="Normalized Trials")
fig.update_yaxes(title_text="Difference in Performance HPI and Random")
fig.show()





Unnamed: 0,task_id,seed,n_trials_norm,diff_trial_value__cost_inc_norm
0,multi-objective/50/dev/yahpo/mo/iaml_glmnet/14...,0,0.00,0.000000
1,multi-objective/50/dev/yahpo/mo/iaml_glmnet/14...,0,0.02,0.000000
2,multi-objective/50/dev/yahpo/mo/iaml_glmnet/14...,0,0.04,0.000000
3,multi-objective/50/dev/yahpo/mo/iaml_glmnet/14...,0,0.06,0.000000
4,multi-objective/50/dev/yahpo/mo/iaml_glmnet/14...,0,0.08,0.000000
...,...,...,...,...
6013,multi-objective/50/dev/hpobench/multiobjective...,4,0.92,-0.005848
6014,multi-objective/50/dev/hpobench/multiobjective...,4,0.94,-0.005809
6015,multi-objective/50/dev/hpobench/multiobjective...,4,0.96,-0.005769
6016,multi-objective/50/dev/hpobench/multiobjective...,4,0.98,-0.005730
