In [1]:
import os
import numpy as np
from plotting_utils import read_data_from_logs, clean_results, check_if_minimized
from performance_profile import BASE_WORKLOADS, HELDOUT_WORKLOADS



In [12]:
# Flag selecting which log subdirectory is analysed:
# True -> "self_tuning", False -> "external_tuning".
SELF_TUNING = False
# Backward-compatible alias for the original (misspelled) name, in case other
# cells of the notebook still read it.
SELF_TUING = SELF_TUNING

# Root directory of the scoring logs, relative to this notebook.
LOG_ROOT = "../../submissions_algorithms_v0.5/logs/algoperf_scoring_v05/"
log_path = os.path.join(LOG_ROOT, "self_tuning" if SELF_TUNING else "external_tuning")

# Load the raw logs, then clean them; the tuning flag is forwarded to `clean_results`.
full_results = read_data_from_logs(log_path)
clean_results_dict = clean_results(full_results, SELF_TUNING)

Reading data for submission: shampoo_submission
['ogbg_pytorch', 'librispeech_conformer_pytorch', 'fastmri_tanh_pytorch', 'ogbg_model_size_pytorch', 'criteo1tb_pytorch', 'librispeech_conformer_layernorm_pytorch', 'imagenet_vit_pytorch', 'wmt_pytorch', 'librispeech_deepspeech_pytorch', 'imagenet_resnet_pytorch', 'criteo1tb_embed_init_pytorch', 'fastmri_pytorch', 'imagenet_resnet_large_bn_init_pytorch', 'wmt_glu_tanh_pytorch']
['ogbg_pytorch', 'librispeech_conformer_pytorch', 'fastmri_tanh_pytorch', 'ogbg_model_size_pytorch', 'criteo1tb_pytorch', 'librispeech_conformer_layernorm_pytorch', 'imagenet_vit_pytorch', 'wmt_pytorch', 'librispeech_deepspeech_pytorch', 'imagenet_resnet_pytorch', 'criteo1tb_embed_init_pytorch', 'fastmri_pytorch', 'imagenet_resnet_large_bn_init_pytorch', 'wmt_glu_tanh_pytorch']
['ogbg_pytorch', 'librispeech_conformer_pytorch', 'fastmri_tanh_pytorch', 'ogbg_model_size_pytorch', 'criteo1tb_pytorch', 'librispeech_conformer_layernorm_pytorch', 'imagenet_vit_pytorch', '

In [13]:
# Drop runs from non-selected workloads.
# The list of selected workloads is loop-invariant, so build it once.
selected_workloads = BASE_WORKLOADS + HELDOUT_WORKLOADS
for submission_name, submission_results in clean_results_dict.items():
    keep_mask = submission_results["workload"].isin(selected_workloads)
    # .copy() gives an independent frame: later cells add columns to these
    # frames in place, which would otherwise raise SettingWithCopyWarning on
    # the filtered slice.
    submission_results = submission_results[keep_mask].copy()
    # Re-assigning an existing key while iterating over .items() is safe
    # (the dict does not change size).
    clean_results_dict[submission_name] = submission_results

In [14]:
def identify_bugged_runs(df, submission_name):
    """Flag the runs of one submission that stopped at the step limit and should be rerun.

    The frame is modified in place. The following columns are added or overwritten:
    ``stopped_early``, ``rerun``, ``perf_is_minimized``, ``reached_target``,
    ``fastest_in_study`` and ``median_workload_score``.

    Args:
        df: DataFrame with one row per run. The columns read here are
            ``workload``, ``study``, ``time_to_target``, ``performance``
            (a per-run sequence of metric values), ``validation_metric``,
            ``performance_target``, ``last_global_step``, ``max_global_step``
            and ``last_submission_time``.
        submission_name: Name of the submission; only used in the printed summary.

    Returns:
        Tuple ``(num_stopped_early, num_rerun, num_total_runs)`` for this frame.
    """
    # Number of studies compared against in the final filter below.
    # NOTE(review): presumably the number of studies that must produce a score
    # for a workload -- confirm against the scoring rules.
    min_scored_studies = 3

    # A run counts as "stopped early" when its last global step is exactly
    # equal to the max global step.
    df["stopped_early"] = df["last_global_step"] == df["max_global_step"]
    # Provisional value; refined right below. Kept so the column order of the
    # resulting frame stays the same as before.
    df["rerun"] = df["stopped_early"]

    # A run that achieved the validation(!) target never needs a rerun.
    # "perf_is_minimized" decides the comparison direction: minimized metrics
    # must drop below the target, all others must exceed it.
    df["perf_is_minimized"] = df["validation_metric"].apply(check_if_minimized)
    df["reached_target"] = np.where(
        df["perf_is_minimized"],
        df["performance"].apply(min) < df["performance_target"],
        df["performance"].apply(max) > df["performance_target"],
    )
    df["rerun"] = df["stopped_early"] & ~df["reached_target"]

    # Don't rerun runs that have a faster tuning trial in their study.
    # NOTE: group by `df` (the argument), not by a notebook-level variable, so
    # the function works for any caller.
    for _, workload_runs in df.groupby("workload"):
        for _, study_runs in workload_runs.groupby("study"):
            # Time of the fastest run in this study (inf if none hit the target).
            fastest_run = study_runs["time_to_target"].min()
            df.loc[study_runs.index, "fastest_in_study"] = fastest_run
            # A run that stopped early at or above this time cannot improve the study.
            df.loc[study_runs.index, "rerun"] = study_runs["rerun"] & (
                study_runs["last_submission_time"] < fastest_run
            )

    # Don't rerun runs that are slower than the workload's median study score.
    # NOTE(review): the original comment says "slower than 3 other studies";
    # the median matches that only for a particular number of studies -- confirm.
    for _, workload_runs in df.groupby("workload"):
        median_workload_score = workload_runs["fastest_in_study"].median()
        df.loc[workload_runs.index, "median_workload_score"] = median_workload_score
        # Any run in that workload that exceeds this time does not need a rerun.
        df.loc[workload_runs.index, "rerun"] = workload_runs["rerun"] & (
            workload_runs["last_submission_time"] < median_workload_score
        )

    # Don't rerun runs if too many studies have already failed.
    for _, workload_runs in df.groupby("workload"):
        runs_by_study = workload_runs.groupby("study")
        # Studies with at least one run still marked for rerun.
        num_rerun_studies = runs_by_study["rerun"].any().sum()
        # Studies that already have a finite (non-inf) time to target.
        num_non_inf_studies = (
            runs_by_study["time_to_target"].apply(lambda x: (~np.isinf(x)).any()).sum()
        )
        # Reruns are only kept if rerun studies plus already-scored studies
        # can reach the required count; otherwise every flag is cleared.
        rerun_is_worthwhile = bool(
            num_rerun_studies + num_non_inf_studies >= min_scored_studies
        )
        df.loc[workload_runs.index, "rerun"] = workload_runs["rerun"] & rerun_is_worthwhile

    # Summary counts for this submission.
    num_stopped_early = df["stopped_early"].sum()
    num_rerun = df["rerun"].sum()
    num_total_runs = len(df)
    print(f"Number of runs that need to be rerun for {submission_name}: {num_rerun}")
    return num_stopped_early, num_rerun, num_total_runs

# Accumulate the per-submission counts over every submission.
num_stopped_early_runs = 0
num_reruns = 0
num_total_runs = 0
for submission_name, submission_results in clean_results_dict.items():
    stopped_sub, reruns_sub, total_sub = identify_bugged_runs(
        submission_results, submission_name
    )
    num_stopped_early_runs = num_stopped_early_runs + stopped_sub
    num_reruns = num_reruns + reruns_sub
    num_total_runs = num_total_runs + total_sub

# Report both counts as a share of all runs.
pct_stopped_early = num_stopped_early_runs / num_total_runs * 100
pct_reruns = num_reruns / num_total_runs * 100
print(f"Percentage of runs that stopped early: {pct_stopped_early:.2f}% ({num_stopped_early_runs})")
print(f"Percentage of runs that need to be rerun: {pct_reruns:.2f}% ({num_reruns})")

Number of runs that need to be rerun for shampoo_submission: 0
Number of runs that need to be rerun for prize_qualification_baseline: 6
Number of runs that need to be rerun for caspr_adaptive: 0
Number of runs that need to be rerun for schedule_free_adamw: 0
Number of runs that need to be rerun for schedule_free_prodigy: 0
Number of runs that need to be rerun for amos: 0
Number of runs that need to be rerun for lawa_ema: 0
Number of runs that need to be rerun for lawa_queue: 0
Number of runs that need to be rerun for cyclic_lr: 0
Number of runs that need to be rerun for generalized_adam: 0
Number of runs that need to be rerun for nadamp: 7
Percentage of runs that stopped early: 13.55% (495)
Percentage of runs that need to be rerun: 0.36% (13)


In [5]:
# Show the runs of one submission that are flagged for rerun.
# NOTE(review): the saved output of this cell lists flagged rows for "cyclic_lr",
# while the identification cell reports 0 reruns for it, and the execution
# counts are out of order -- re-run the notebook top to bottom to refresh it.
df = clean_results_dict["cyclic_lr"]
# "rerun" is already a boolean column, so it can be used directly as the mask.
df[df["rerun"]]

Unnamed: 0,workload,study,trial,time_to_target,runtime,performance,validation_metric,performance_target,runtime_budget,last_global_step,last_submission_time,last_performance,max_global_step,stopped_early,rerun,perf_is_minimized,reached_target,fastest_in_study,median_workload_score
100,imagenet_resnet_large_bn_init,study_0,trial_1,inf,"[108.67666983604433, 616.4995930194855, 1124.3...","[0.001, 0.0682, 0.18028, 0.15948, 0.30378, 0.3...",validation/accuracy,0.76526,63008,186666,40511.67766,0.75634,186666,True,True,False,False,inf,inf
101,imagenet_resnet_large_bn_init,study_0,trial_2,inf,"[108.09170889854433, 615.9908201694489, 1123.8...","[0.001, 0.04144, 0.15404, 0.2591, 0.35324, 0.3...",validation/accuracy,0.76526,63008,186666,40327.383247,0.75858,186666,True,True,False,False,inf,inf
102,imagenet_resnet_large_bn_init,study_0,trial_3,inf,"[108.26602697372437, 616.0899803638458, 1124.0...","[0.001, 0.09756, 0.26284, 0.40116, 0.46478, 0....",validation/accuracy,0.76526,63008,186666,40409.70954,0.76068,186666,True,True,False,False,inf,inf
103,imagenet_resnet_large_bn_init,study_0,trial_4,inf,"[106.49785113334656, 614.4587249755859, 1122.4...","[0.001, 0.04238, 0.17066, 0.27692, 0.35268, 0....",validation/accuracy,0.76526,63008,186666,40281.963847,0.75116,186666,True,True,False,False,inf,inf
104,imagenet_resnet_large_bn_init,study_0,trial_5,inf,"[107.14069032669067, 615.0062892436981, 1122.8...","[0.001, 0.05962, 0.27406, 0.33236, 0.45294, 0....",validation/accuracy,0.76526,63008,186666,40518.136041,0.75352,186666,True,True,False,False,inf,inf
105,imagenet_resnet_large_bn_init,study_1,trial_1,inf,"[108.123605966568, 615.9082245826721, 1123.916...","[0.001, 0.05774, 0.05504, 0.22124, 0.24342, 0....",validation/accuracy,0.76526,63008,186666,40701.223892,0.75884,186666,True,True,False,False,inf,inf
106,imagenet_resnet_large_bn_init,study_1,trial_2,inf,"[107.73529601097108, 615.5508902072906, 1123.5...","[0.001, 0.04108, 0.16094, 0.27576, 0.3267, 0.2...",validation/accuracy,0.76526,63008,186666,40538.64494,0.75758,186666,True,True,False,False,inf,inf
107,imagenet_resnet_large_bn_init,study_1,trial_3,inf,"[108.13523364067078, 615.8964159488678, 1123.7...","[0.001, 0.0653, 0.2257, 0.37492, 0.45372, 0.20...",validation/accuracy,0.76526,63008,186666,40623.108612,0.75766,186666,True,True,False,False,inf,inf
108,imagenet_resnet_large_bn_init,study_1,trial_4,inf,"[107.40123319625854, 615.284832239151, 1123.29...","[0.001, 0.04144, 0.15976, 0.2096, 0.33774, 0.3...",validation/accuracy,0.76526,63008,186666,40231.963471,0.74908,186666,True,True,False,False,inf,inf
109,imagenet_resnet_large_bn_init,study_1,trial_5,inf,"[108.1041738986969, 615.7369914054871, 1123.51...","[0.001, 0.0794, 0.23414, 0.37104, 0.4414, 0.46...",validation/accuracy,0.76526,63008,186666,40475.431898,0.75648,186666,True,True,False,False,inf,inf


In [None]:
# Show every row of `df` that belongs to the librispeech_deepspeech workload.
is_deepspeech = df["workload"] == "librispeech_deepspeech"
df[is_deepspeech]