In [1]:
import os
import numpy as np
from plotting_utils import read_data_from_logs, clean_results, check_if_minimized
from performance_profile import BASE_WORKLOADS, HELDOUT_WORKLOADS



In [21]:
# Ruleset switch: picks which log subdirectory is read and is forwarded to
# clean_results.
# NOTE(review): the name is a misspelling of "SELF_TUNING"; it is kept unchanged
# because other cells may reference it.
SELF_TUING = True

# Both rulesets live under the same scoring-log root; only the leaf differs.
ruleset_dir = "self_tuning" if SELF_TUING else "external_tuning"
log_path = os.path.join(
    "../../submissions_algorithms_v0.5/logs/algoperf_scoring_v05/", ruleset_dir
)

# Load the raw logs, then post-process them (helpers come from plotting_utils).
full_results = read_data_from_logs(log_path)
clean_results_dict = clean_results(full_results, SELF_TUING)

Reading data for submission: prize_qualification_baseline
['wmt_glu_tanh_jax', 'fastmri_jax', 'criteo1tb_jax', 'librispeech_conformer_layernorm_jax', 'fastmri_tanh_jax', 'ogbg_jax', 'ogbg_model_size_jax', 'imagenet_vit_jax', 'criteo1tb_embed_init_jax', 'librispeech_conformer_jax', 'librispeech_deepspeech_jax', 'wmt_jax', 'imagenet_resnet_large_bn_init_jax', 'imagenet_resnet_jax']
['wmt_glu_tanh_jax', 'fastmri_jax', 'criteo1tb_jax', 'librispeech_conformer_layernorm_jax', 'fastmri_tanh_jax', 'ogbg_jax', 'ogbg_model_size_jax', 'imagenet_vit_jax', 'criteo1tb_embed_init_jax', 'librispeech_conformer_jax', 'librispeech_deepspeech_jax', 'wmt_jax', 'imagenet_resnet_large_bn_init_jax', 'imagenet_resnet_jax']
['wmt_glu_tanh_jax', 'fastmri_jax', 'criteo1tb_jax', 'librispeech_conformer_layernorm_jax', 'fastmri_tanh_jax', 'ogbg_jax', 'ogbg_model_size_jax', 'imagenet_vit_jax', 'criteo1tb_embed_init_jax', 'librispeech_conformer_jax', 'librispeech_deepspeech_jax', 'wmt_jax', 'imagenet_resnet_large_bn_i

In [22]:
# Keep only runs whose workload is one of the base or held-out workloads.
selected_workloads = BASE_WORKLOADS + HELDOUT_WORKLOADS
for submission_name, submission_results in clean_results_dict.items():
    # Row mask: True where the run belongs to a selected workload.
    keep_mask = submission_results["workload"].isin(selected_workloads)
    submission_results = submission_results[keep_mask]
    # Re-assigning an existing key is safe while iterating over .items().
    clean_results_dict[submission_name] = submission_results

In [23]:
def identify_bugged_runs(df, submission_name, min_scoring_studies=3):
    """Flag the runs of one submission that should be rerun and report counts.

    The frame is modified in place. The following columns are added or
    overwritten: ``stopped_early``, ``rerun``, ``perf_is_minimized``,
    ``reached_target``, ``fastest_in_study`` and ``median_workload_score``.

    A run starts out as a rerun candidate when its last global step equals the
    maximum global step and its validation target was never reached. Candidates
    are then discarded when a rerun could not change the outcome:

    1. the run already took at least as long as the fastest ``time_to_target``
       in its (workload, study);
    2. the run already took at least as long as the workload's median of
       ``fastest_in_study``;
    3. on its workload, the number of studies that still contain a candidate is
       smaller than ``min_scoring_studies`` minus the number of studies that
       have a non-inf ``time_to_target``.

    Args:
        df: DataFrame with one row per run. Columns read: ``workload``,
            ``study``, ``time_to_target``, ``performance`` (an iterable of
            values per row), ``validation_metric``, ``performance_target``,
            ``last_global_step``, ``max_global_step``, ``last_submission_time``.
        submission_name: Name used only in the printed summary line.
        min_scoring_studies: Threshold used in check 3 above. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(num_stopped_early, num_rerun, num_total_runs)`` for ``df``.
    """
    # True when the last logged step is exactly the maximum global step.
    df["stopped_early"] = df["last_global_step"] == df["max_global_step"]
    # Provisional value (refined right below); assigned here so the column
    # keeps its position next to "stopped_early".
    df["rerun"] = df["stopped_early"]

    # Whether the validation(!) target was achieved. The comparison direction
    # depends on whether the metric is minimized or maximized.
    df["perf_is_minimized"] = df["validation_metric"].apply(check_if_minimized)
    df["reached_target"] = np.where(
        df["perf_is_minimized"],
        df["performance"].apply(min) < df["performance_target"],
        df["performance"].apply(max) > df["performance_target"],
    )
    df["rerun"] = df["stopped_early"] & ~df["reached_target"]

    # NOTE: every grouping below is done on `df` itself, not on a notebook
    # global, and is materialised with list() so that `df` is never written to
    # while one of its own groupby iterators is still live.

    # 1. Don't rerun runs that have a faster tuning trial in their study.
    for _, study_rows in list(df.groupby(["workload", "study"])):
        fastest_run = study_rows["time_to_target"].min()
        df.loc[study_rows.index, "fastest_in_study"] = fastest_run
        # A candidate that already ran at least this long cannot beat it.
        df.loc[study_rows.index, "rerun"] = study_rows["rerun"] & (
            study_rows["last_submission_time"] < fastest_run
        )

    # 2. Don't rerun runs that are already slower than the workload median.
    # NOTE(review): the median is taken over rows, so a study with more trials
    # weighs more than one with fewer - confirm this is intended.
    for _, workload_rows in list(df.groupby("workload")):
        median_workload_score = workload_rows["fastest_in_study"].median()
        df.loc[workload_rows.index, "median_workload_score"] = median_workload_score
        df.loc[workload_rows.index, "rerun"] = workload_rows["rerun"] & (
            workload_rows["last_submission_time"] < median_workload_score
        )

    # 3. Don't rerun runs if too many studies have already failed.
    for _, workload_rows in list(df.groupby("workload")):
        per_study = workload_rows.groupby("study")
        # Studies that still contain at least one rerun candidate.
        num_rerun_studies = per_study["rerun"].any().sum()
        # Studies that have at least one non-inf time_to_target.
        num_non_inf_studies = (
            per_study["time_to_target"].apply(lambda x: (~np.isinf(x)).any()).sum()
        )
        # Plain Python bool: avoids bitwise `~` on a scalar comparison result.
        enough_studies = bool(
            num_rerun_studies >= min_scoring_studies - num_non_inf_studies
        )
        df.loc[workload_rows.index, "rerun"] = workload_rows["rerun"] & enough_studies

    num_stopped_early = df["stopped_early"].sum()
    num_rerun = df["rerun"].sum()
    num_total_runs = len(df)
    print(f"Number of runs that need to be rerun for {submission_name}: {num_rerun}")
    return num_stopped_early, num_rerun, num_total_runs

# Running totals across all submissions.
num_stopped_early_runs = 0
num_reruns = 0
num_total_runs = 0

for submission_name, submission_results in clean_results_dict.items():
    # Each call also annotates the submission's frame in place.
    per_submission_counts = identify_bugged_runs(submission_results, submission_name)
    stopped_early_count, rerun_count, run_count = per_submission_counts
    num_stopped_early_runs += stopped_early_count
    num_reruns += rerun_count
    num_total_runs += run_count

# Report both totals as a share of all runs.
stopped_early_pct = num_stopped_early_runs / num_total_runs * 100
print(f"Percentage of runs that stopped early: {stopped_early_pct:.2f}% ({num_stopped_early_runs})")
rerun_pct = num_reruns / num_total_runs * 100
print(f"Percentage of runs that need to be rerun: {rerun_pct:.2f}% ({num_reruns})")

Number of runs that need to be rerun for prize_qualification_baseline: 3
Number of runs that need to be rerun for schedule_free_adamw: 0
Number of runs that need to be rerun for AdamG: 5
Number of runs that need to be rerun for sinv6_75: 3
Number of runs that need to be rerun for sinv6: 5
Number of runs that need to be rerun for nadamw_sequential: 0
Percentage of runs that stopped early: 5.01% (20)
Percentage of runs that need to be rerun: 4.01% (16)


In [29]:
# Show the runs of the "sinv6" submission that are still flagged for a rerun.
# Relies on the "rerun" column written by identify_bugged_runs.
df = clean_results_dict["sinv6"]
df.loc[df["rerun"].eq(True)]

Unnamed: 0,workload,study,trial,time_to_target,runtime,performance,validation_metric,performance_target,runtime_budget,last_global_step,last_submission_time,last_performance,max_global_step,stopped_early,rerun,perf_is_minimized,reached_target,fastest_in_study,median_workload_score
35,librispeech_deepspeech,study_0,trial_1,inf,"[53.73023533821106, 1493.896743774414, 2934.41...","[2.321268235225967, 0.8966179750330672, 0.8135...",validation/wer,0.119936,166518,144000,122494.487013,0.126321,144000,True,True,True,False,inf,inf
36,librispeech_deepspeech,study_1,trial_1,inf,"[53.06836247444153, 1493.2048256397247, 2933.8...","[2.149154735124593, 0.8966179750330672, 0.8118...",validation/wer,0.119936,166518,144000,112921.996969,0.121581,144000,True,True,True,False,inf,inf
37,librispeech_deepspeech,study_2,trial_1,inf,"[52.72397613525391, 1493.2717809677124, 2933.3...","[2.824459098062311, 0.8966179750330672, 0.8964...",validation/wer,0.119936,166518,144000,120824.194654,0.451181,144000,True,True,True,False,inf,inf
38,librispeech_deepspeech,study_3,trial_1,inf,"[53.00154423713684, 1493.5241582393646, 2933.4...","[3.349807389671452, 0.8966179750330672, 0.8966...",validation/wer,0.119936,166518,144000,121094.140213,0.127528,144000,True,True,True,False,inf,inf
39,librispeech_deepspeech,study_4,trial_1,inf,"[53.85474514961243, 1494.3304085731506, 2934.3...","[1.350212885099974, 0.8966179750330672, 0.8657...",validation/wer,0.119936,166518,144000,120510.44421,0.125182,144000,True,True,True,False,inf,inf


In [27]:
# Show every run of the currently selected submission for one workload.
df.loc[df["workload"].eq("librispeech_deepspeech")]

Unnamed: 0,workload,study,trial,time_to_target,runtime,performance,validation_metric,performance_target,runtime_budget,last_global_step,last_submission_time,last_performance,max_global_step,stopped_early,rerun,perf_is_minimized,reached_target,fastest_in_study,median_workload_score
44,librispeech_deepspeech,study_0,trial_1,inf,"[130.25048851966858, 1569.1348843574524, 3008....","[2.199275817119683, 0.8967218654950997, 0.8967...",validation/wer,0.119936,166518,144000,131831.472473,0.211616,144000,True,True,True,False,inf,inf
45,librispeech_deepspeech,study_1,trial_1,inf,"[130.5739231109619, 1569.3709280490875, 3008.2...","[3.687539226572684, 0.8967218654950997, 0.8967...",validation/wer,0.119936,166518,144000,131120.979603,0.209936,144000,True,True,True,False,inf,inf
46,librispeech_deepspeech,study_2,trial_1,inf,"[130.6338291168213, 1569.9424760341644, 3009.1...","[3.829324578766958, 0.8844976584753537, 0.8741...",validation/wer,0.119936,166518,144000,132390.194381,0.210023,144000,True,True,True,False,inf,inf
47,librispeech_deepspeech,study_3,trial_1,inf,"[130.56874418258667, 1569.7711989879608, 3009....","[1.8632549606527304, 0.8966832424081495, 0.896...",validation/wer,0.119936,166518,144000,132295.951864,0.211268,144000,True,True,True,False,inf,inf
48,librispeech_deepspeech,study_4,trial_1,inf,"[131.6194725036621, 1570.722844362259, 3009.60...","[1.8258388451697003, 0.8967218654950997, 0.891...",validation/wer,0.119936,166518,144000,131667.567907,0.210312,144000,True,True,True,False,inf,inf
