In [1]:
import asyncio
import itertools
import sys
import os
import pathlib
base_path_str = '.'
base_path = pathlib.Path(base_path_str)
sys.path.append(base_path_str)

import huggingface_hub as hfh
import pandas as pd
import timm
from determined.experimental import client
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Dark-grid seaborn theme with square 10x10-inch figures by default.
sns.set_theme(style="darkgrid", rc={"figure.figsize": (10, 10)})


import workspaces

# Let the long/wide result tables below render without truncation.
pd.options.display.max_rows = 5000
pd.options.display.max_columns = 500

All relevant `timm` models:

In [2]:
# Load the table of selected timm models and display it.
selected_models_path = base_path / 'models' / 'selected_timm_models.feather'
all_timm_models_df = pd.read_feather(selected_models_path)
all_timm_models_df

Unnamed: 0,model,top1,top1_err,top5,top5_err,param_count,img_size,crop_pct,interpolation,state_dict_path
0,ig_resnext101_32x48d,85.436,14.564,97.576,2.424,828.41,224,0.875,bilinear,ig_resnext101_32x48-3e41cc8a.pth
1,beit_large_patch16_512,88.602,11.398,98.656,1.344,305.67,512,1.0,bicubic,beit_large_patch16_512_pt22k_ft22kto1k.pth
2,resnet152,82.818,17.182,96.132,3.868,60.19,224,0.95,bicubic,resnet152_a1h-dc400468.pth
3,efficientnet_b0,77.7,22.3,93.532,6.468,5.29,224,0.875,bicubic,efficientnet_b0_ra-3dd342df.pth


Get results from cluster

In [3]:
# Read the master address directly from the environment rather than shelling
# out with `! echo`: the cell stays plain Python, and an unset variable raises
# a KeyError here instead of silently attempting a login against ''.
master = os.environ['GCP_T4X4_MASTER']
client.login(master=master, user='determined', password='')

In [4]:
# Open the 'DeepSpeed' workspace on the master address read above.
workspace_kwargs = dict(
    workspace_name='DeepSpeed',
    master_url=master,
    username='determined',
    password='',
)
w = workspaces.Workspace(**workspace_kwargs)

Separating autotuning results from those found with the flops profiler:

In [5]:
def _names_with_suffix(names, suffix):
    """Return the names whose last dot-separated component equals `suffix`."""
    return [name for name in names if name.rsplit('.', 1)[-1] == suffix]


all_project_names = w.get_all_project_names()
autotuning_project_names = _names_with_suffix(all_project_names, 'autotuning')
flops_profiler_project_names = _names_with_suffix(all_project_names, 'flops_profiler')

In [6]:
# Results for the flops-profiler projects; further down, these rows serve as
# the non-autotuned baselines that the autotuning trials are compared against.
flops_profiler_results_df = w.get_trial_best_val_results_df(flops_profiler_project_names)
flops_profiler_results_df

Getting Experiments from resnet152.flops_profiler: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  5.22it/s]
Getting Trials from resnet152.flops_profiler: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 20.19it/s]
Getting Experiments from beit_large_patch16_512.flops_profiler: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  3.72it/s]
Getting Trials from beit_large_patch16_512.flops_profiler: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 20.66it/s]
Getting Experiments from ig_resnext101_32x48d.flops_profiler: 100%|██████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  9.32it/s]
Getting Trials from ig_resnext101_32x48d

Unnamed: 0,FLOPS_per_gpu,checkpoint_path_prefix,dataset_name,ds_config_flops_profiler_detailed,ds_config_flops_profiler_enabled,ds_config_flops_profiler_module_depth,ds_config_flops_profiler_output_file,ds_config_flops_profiler_profile_step,ds_config_flops_profiler_top_modules,ds_config_fp16_enabled,ds_config_fp16_initial_scale_power,ds_config_gradient_accumulation_steps,ds_config_optimizer_TYPE,ds_config_optimizer_params_lr,ds_config_train_micro_batch_size_per_gpu,ds_config_zero_optimization_stage,exp_name,experiment_id,latency,model_name,slots_per_trial,throughput,wall_clock_time
39389,6290000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,2GPU.resnet152.mini_imagenet.fp16,24714,1.55,resnet152,2,181.07,440.81661
39462,6400000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,3GPU.resnet152.mini_imagenet.fp16,24715,1.4,resnet152,3,276.21,431.041122
40396,5900000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,4GPU.resnet152.mini_imagenet.fp16,25045,1.65,resnet152,4,339.65,441.413784
39387,3340000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,2GPU.beit_large_patch16_512.mini_imagenet.fp16,24711,1.68,beit_large_patch16_512,2,3.58,213.293522
39455,3170000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,3GPU.beit_large_patch16_512.mini_imagenet.fp16,24712,1.76,beit_large_patch16_512,3,5.1,211.025083
40395,3160000000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,4GPU.beit_large_patch16_512.mini_imagenet.fp16,25044,1.77,beit_large_patch16_512,4,6.78,206.187416
39408,544920000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,2GPU.efficientnet_b0.mini_imagenet.fp16,24717,1.49,efficientnet_b0,2,416.07,525.559363
39459,571710000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,3GPU.efficientnet_b0.mini_imagenet.fp16,24718,1.36,efficientnet_b0,3,659.44,513.235885
40812,499570000000.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,True,-1,/run/determined/workdir/flops_profiler_output.txt,5,1,True,8,1,Adam,1e-10,128,0,4GPU.efficientnet_b0.mini_imagenet.fp16,26137,1.63,efficientnet_b0,4,762.88,515.46058


Due to DS bugs, for all *autotuning* trials the `latency` is reported in $\mu{\rm s}$, the `throughput` in records/${\rm ms}$, and `FLOPS_per_gpu` is also mis-scaled (the cell below corrects it by a factor of $10^6$).  Rescale these columns in place so that all time units are in seconds.

In [8]:
# Scale factors that undo the units DeepSpeed autotuning reports in.
MICROSECONDS_PER_SECOND = 10 ** 6
MILLISECONDS_PER_SECOND = 10 ** 3
FLOPS_PER_GPU_SCALE = 10 ** 6

autotuning_results_df = w.get_trial_best_val_results_df(autotuning_project_names)
autotuning_results_df['latency'] = autotuning_results_df['latency'] / MICROSECONDS_PER_SECOND
autotuning_results_df['throughput'] = autotuning_results_df['throughput'] * MILLISECONDS_PER_SECOND
autotuning_results_df['FLOPS_per_gpu'] = autotuning_results_df['FLOPS_per_gpu'] * FLOPS_PER_GPU_SCALE
autotuning_results_df

Getting Experiments from efficientnet_b0.autotuning: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.03s/it]
Getting Trials from efficientnet_b0.autotuning: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 324/324 [00:05<00:00, 62.44it/s]
Getting Experiments from beit_large_patch16_512.autotuning: 100%|████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.19s/it]
Getting Trials from beit_large_patch16_512.autotuning: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 324/324 [00:05<00:00, 59.58it/s]


Unnamed: 0,FLOPS_per_gpu,autotuning_enabled,autotuning_end_profile_step,autotuning_fast,autotuning_metric,autotuning_metric_path,autotuning_start_profile_step,autotuning_tuner_early_stopping,autotuning_tuner_num_trials,autotuning_tuner_type,backward,checkpoint_path_prefix,dataset_name,ds_config_autotuning_enabled,ds_config_autotuning_end_profile_step,ds_config_autotuning_fast,ds_config_autotuning_metric,ds_config_autotuning_start_profile_step,ds_config_autotuning_tuner_early_stopping,ds_config_autotuning_tuner_num_trials,ds_config_autotuning_tuner_type,ds_config_fp16_enabled,ds_config_fp16_initial_scale_power,ds_config_gradient_accumulation_steps,ds_config_optimizer_TYPE,ds_config_optimizer_params_lr,ds_config_train_micro_batch_size_per_gpu,ds_config_zero_optimization_stage,exp_name,experiment_id,forward,fp16_enabled,fp16_initial_scale_power,gradient_accumulation_steps,latency,model_name,optimizer_TYPE,optimizer_params_lr,rank,slots_per_trial,step,throughput,train_batch_size,train_micro_batch_size_per_gpu,wall_clock_time,zero_optimization_allgather_bucket_size,zero_optimization_allgather_partitions,zero_optimization_contiguous_gradients,zero_optimization_overlap_comm,zero_optimization_reduce_bucket_size,zero_optimization_reduce_scatter,zero_optimization_stage,zero_optimization_stage3_gather_16bit_weights_on_model_save,zero_optimization_stage3_max_live_parameters,zero_optimization_stage3_max_reuse_distance,zero_optimization_stage3_param_persistence_threshold,zero_optimization_stage3_prefetch_bucket_size,zero_optimization_sub_group_size
39452,6401901000000.0,True,5,False,FLOPS_per_gpu,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,1016066.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,FLOPS_per_gpu,3,5,50,random,True,8,1,Adam,1e-10,128,[0],2GPU.resnet152.mini_imagenet.fp16.FLOPS_per_gp...,24774,468832.275391,True,8,1,1.530407,resnet152,Adam,1e-10,0,2,45508.609772,184.264681,282,141,494.628466,,,,,,,0,,,,,,
39509,6906425000000.0,True,5,False,FLOPS_per_gpu,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,815159.3,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,FLOPS_per_gpu,3,5,50,random,True,8,1,Adam,1e-10,128,[0],3GPU.resnet152.mini_imagenet.fp16.FLOPS_per_gp...,24775,426584.075928,True,8,1,1.287815,resnet152,Adam,1e-10,0,3,46071.807861,298.179432,384,128,885.781448,,,,,,,0,,,,,,
39552,5862476000000.0,True,5,False,FLOPS_per_gpu,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,1044859.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,FLOPS_per_gpu,3,5,50,random,True,8,1,Adam,1e-10,128,[0],4GPU.resnet152.mini_imagenet.fp16.FLOPS_per_gp...,24776,428036.682129,True,8,1,1.51714,resnet152,Adam,1e-10,0,4,44244.224548,337.477032,512,128,1072.700743,,,,,,,0,,,,,,
39398,5994374000000.0,True,5,False,latency,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,1134109.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,latency,3,5,50,random,True,8,1,Adam,1e-10,128,[0],2GPU.resnet152.mini_imagenet.fp16.latency.random,24777,478546.75293,True,8,1,1.657636,resnet152,Adam,1e-10,0,2,44980.190277,172.534906,286,143,718.473799,,,,,,,0,,,,,,
39482,6489634000000.0,True,5,False,latency,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,941512.5,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,latency,3,5,50,random,True,8,1,Adam,1e-10,128,[0],3GPU.resnet152.mini_imagenet.fp16.latency.random,24778,542468.139648,True,8,1,1.531132,resnet152,Adam,1e-10,0,3,47151.584625,280.184825,429,143,561.872715,,,,,,,0,,,,,,
39412,6288334000000.0,True,5,False,throughput,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,939265.7,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,throughput,3,5,50,random,True,8,1,Adam,1e-10,128,[0],2GPU.resnet152.mini_imagenet.fp16.throughput.r...,24780,427063.62915,True,8,1,1.414397,resnet152,Adam,1e-10,0,2,48067.264557,180.995905,256,128,488.581508,,,,,,,0,,,,,,
39487,6920391000000.0,True,5,False,throughput,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,805281.8,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,throughput,3,5,50,random,True,8,1,Adam,1e-10,128,[0],3GPU.resnet152.mini_imagenet.fp16.throughput.r...,24781,432009.490967,True,8,1,1.285216,resnet152,Adam,1e-10,0,3,47924.991608,298.782396,384,128,877.59801,,,,,,,0,,,,,,
39554,5816515000000.0,True,5,False,throughput,/run/determined/workdir/autotuning_results/z0_...,3,5,50,random,1052772.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,throughput,3,5,50,random,True,8,1,Adam,1e-10,128,[0],4GPU.resnet152.mini_imagenet.fp16.throughput.r...,24782,429756.04248,True,8,1,1.529129,resnet152,Adam,1e-10,0,4,46600.193024,334.831229,512,128,1075.501194,,,,,,,0,,,,,,
39399,6211372000000.0,True,5,False,FLOPS_per_gpu,/run/determined/workdir/autotuning_results/z0_...,3,5,50,gridsearch,1066085.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,FLOPS_per_gpu,3,5,50,gridsearch,True,8,1,Adam,1e-10,128,[0],2GPU.resnet152.mini_imagenet.fp16.FLOPS_per_gp...,24783,466435.333252,True,8,1,1.577351,resnet152,Adam,1e-10,0,2,44830.718994,178.78071,282,141,800.577337,,,,,,,0,,,,,,
39556,6000917000000.0,True,5,False,FLOPS_per_gpu,/run/determined/workdir/autotuning_results/z0_...,3,5,50,gridsearch,1111323.0,shared_fs/ensembles/state_dicts/,mini_imagenet,True,5,False,FLOPS_per_gpu,3,5,50,gridsearch,True,8,1,Adam,1e-10,128,[0],4GPU.resnet152.mini_imagenet.fp16.FLOPS_per_gp...,24785,474460.266113,True,8,1,1.63267,resnet152,Adam,1e-10,0,4,46886.9133,345.446456,564,141,1082.222815,,,,,,,0,,,,,,


There is some weird issue going on w/ reported tuner types.  The `ds_config_autotuning_tuner_type` field is the tuner type specified in the `yaml` config and should be the type used during autotuning.  The `autotuning_tuner_type` field, however, is read out of the actual DS autotuner output and does not always match. Very weird. Similar inconsistencies also happen for the autotuning metric.

See [this Trial](http://104.196.135.13:8080/det/experiments/25858/logs?searchText=tuner_type) for an example.  The conflict can be seen to be within DS by comparing the logs and the checkpoint info, e.g.

In [9]:
# For each autotuning argument, keep the rows whose reported value differs
# from the value in the ds_config. Rows where either side is NaN also count
# as differing, because NaN never compares equal.
conflicts = {
    arg: autotuning_results_df[
        autotuning_results_df[f'autotuning_{arg}']
        != autotuning_results_df[f'ds_config_autotuning_{arg}']
    ]
    for arg in ('tuner_type', 'metric', 'fast')
}

# Check the number of conflicts:
{arg: len(rows) for arg, rows in conflicts.items()}



{'tuner_type': 145, 'metric': 114, 'fast': 145}

Visualizing one example:

In [10]:
# Configured vs. reported tuner type, side by side, for the conflicting rows.
tuner_type_columns = ['ds_config_autotuning_tuner_type', 'autotuning_tuner_type']
conflicts['tuner_type'][tuner_type_columns]

Unnamed: 0,ds_config_autotuning_tuner_type,autotuning_tuner_type
39860,gridsearch,random
40609,gridsearch,model_based
39841,model_based,gridsearch
39828,model_based,gridsearch
39831,gridsearch,model_based
40637,model_based,gridsearch
40620,gridsearch,model_based
40236,model_based,random
40223,model_based,random
40219,random,model_based


They are not all subsets of each other:

In [11]:
# Compare every ordered pair of distinct conflict sets and report, by name,
# whether all rows of the first set also appear in the second. The labels make
# each True/False line attributable to a specific pair.
for (name_a, rows_a), (name_b, rows_b) in itertools.permutations(conflicts.items(), 2):
    is_subset = rows_a.index.isin(rows_b.index).all()
    print(f'{name_a} subset of {name_b}: {is_subset}')

False
True
True
True
True
False


Use the reported (non-`ds_config`) values everywhere and continue:

In [12]:
# Stack both result sets row-wise. Columns that exist only in the autotuning
# frame (e.g. `autotuning_enabled`) are NaN on the flops-profiler rows.
all_results_df = pd.concat([autotuning_results_df, flops_profiler_results_df])

Create a helper autotuning column

In [13]:
# True for rows that carry autotuning info, False for the baseline rows.
all_results_df['autotuning'] = all_results_df['autotuning_enabled'].notna()

In [14]:
# Root directory under which the per-metric plot folders are created.
plots_dir = base_path / 'plots'

def create_masks(df, metric, model_name, tuner_type, fast):
    """Build the boolean row masks that select the data for one plot.

    Returns a tuple ``(baseline_mask, mask_without_baseline, mask_with_baseline)``:

    * baseline_mask: rows of `model_name` whose `autotuning_metric` is null.
    * mask_without_baseline: rows of `model_name` whose `autotuning_metric`,
      `autotuning_fast` and `autotuning_tuner_type` equal the given arguments.
    * mask_with_baseline: the union of the two masks above.
    """
    is_model = df['model_name'] == model_name
    is_baseline = is_model & df['autotuning_metric'].isnull()
    is_selected = (
        is_model
        & (df['autotuning_metric'] == metric)
        & (df['autotuning_fast'] == fast)
        & (df['autotuning_tuner_type'] == tuner_type)
    )
    return is_baseline, is_selected, is_selected | is_baseline

def get_param_count(model_name):
    """Look up `param_count` for `model_name` in `all_timm_models_df`.

    The first matching row is used; an IndexError is raised when no row matches.
    """
    matching_counts = all_timm_models_df.loc[all_timm_models_df['model'] == model_name, 'param_count']
    return matching_counts.iloc[0]

def create_and_save_plot(df, metric, model_name, tuner_type, fast):
    """Plot `metric` against `slots_per_trial` for one configuration and save it.

    The figure shows the autotuning trials selected by `create_masks` together
    with the baseline rows of the same model (separate hues via the
    `autotuning` column). Every per-slot mean is annotated; autotuning means
    additionally show their ratio to the baseline mean when a baseline exists
    for that slot count.

    Args:
        df: results frame containing `slots_per_trial`, `autotuning`, the
            `metric` column and the columns read by `create_masks`.
        metric: name of the column to plot; also the name of the output
            sub-directory under `plots_dir`.
        model_name: model whose rows are plotted.
        tuner_type: value of `autotuning_tuner_type` to select.
        fast: value of `autotuning_fast` to select.

    Side effects:
        Creates `plots_dir/<metric>/` if needed and writes a PNG there.
    """
    baseline_mask, mask_without_baseline, mask_with_baseline = create_masks(
        df, metric, model_name, tuner_type, fast)
    masked_df = df[mask_with_baseline]

    fig, ax = plt.subplots(1, 1)
    exp_desc = f"{'fast ' if fast else ''}{tuner_type} {model_name}"
    # The title carries the parameter count; this string was previously built
    # but never passed to the plot.
    title_str = exp_desc + f' ({get_param_count(model_name)}M. params.)'
    sns.lineplot(x='slots_per_trial',
                 y=metric,
                 hue='autotuning',
                 data=masked_df,
                 markers=True,
                 ci=68,
                 ax=ax).set(title=title_str)

    # Average only the autotuning rows (the baseline rows must not dilute the
    # mean), and select the metric column *before* aggregating so that
    # non-numeric columns are never averaged.
    autotuning_means = df[mask_without_baseline].groupby('slots_per_trial')[metric].mean()
    baselines = df[baseline_mask].groupby('slots_per_trial')[metric].mean()
    # Align the baselines to the autotuning slot counts so each mean is paired
    # with its own ratio; slot counts without a baseline give NaN.
    ratios = autotuning_means / baselines.reindex(autotuning_means.index)
    for (slots, mean), ratio in zip(autotuning_means.items(), ratios):
        desc = f'{mean:.2e}'
        if pd.notna(ratio):
            desc += f'  ({ratio:.2f}x)'
        ax.text(slots, mean, desc)
    # Iterating an empty series is a no-op, so no emptiness guard is needed.
    for slots, mean in baselines.items():
        ax.text(slots, mean, f'{mean:.2e}')

    plot_subdir = plots_dir.joinpath(metric)
    plot_subdir.mkdir(parents=True, exist_ok=True)
    file_str = '_'.join(exp_desc.split()) + f'_{metric}.png'
    save_path = plot_subdir.joinpath(file_str)
    fig.savefig(save_path, dpi=512)

In [15]:
def no_nans(x):
    """Return True when no element of `x` is unequal to itself (NaN is)."""
    return not any(item != item for item in x)

def non_nans_iter(x):
    """Lazily yield the elements of `x`, skipping any that is unequal to itself (NaN)."""
    yield from (item for item in x if not item != item)
        
def non_nans_product(*args):
    """Cartesian product of the given iterables, each filtered through `non_nans_iter`."""
    return itertools.product(*map(non_nans_iter, args))

In [None]:
# One plot per (metric, model, tuner type, fast flag) combination. NaN entries
# in the unique values are dropped by non_nans_product.
plot_axes = (
    all_results_df.autotuning_metric.unique(),
    all_results_df.model_name.unique(),
    all_results_df.autotuning_tuner_type.unique(),
    all_results_df.autotuning_fast.unique(),
)
for metric, model_name, tuner_type, fast in non_nans_product(*plot_axes):
    create_and_save_plot(all_results_df, metric, model_name, tuner_type, fast)
    

Tables:

Isolate all trials which have a baseline to compare to, then compute the relative improvement:

In [23]:
# Models for which a flops-profiler (baseline) run exists.
baseline_model_names = flops_profiler_results_df.model_name.unique()

In [24]:
# Keep only autotuning trials whose model has a baseline. The copy lets new
# columns be added without touching autotuning_results_df.
has_baseline = autotuning_results_df['model_name'].isin(baseline_model_names)
results_with_baseline_df = autotuning_results_df.loc[has_baseline].copy()

In [25]:
def get_baseline_and_ratio(df, baseline_df):
    """Pair every row of `df` with its baseline metric and the ratio to it.

    For each row, the metric named in the row's `autotuning_metric` is looked
    up in the rows of `baseline_df` that share the row's `model_name` and
    `slots_per_trial`.

    Args:
        df: results frame with `model_name`, `slots_per_trial` and
            `autotuning_metric` columns, plus one column per metric named in
            `autotuning_metric`.
        baseline_df: baseline frame with `model_name`, `slots_per_trial` and
            the same metric columns.

    Returns:
        ``(baseline, ratio)``: two float arrays with exactly ``len(df)``
        entries, in the row order of `df`. ``ratio`` is the row's metric
        divided by its baseline. When several baseline rows match, their mean
        is used; when none matches, both entries are NaN. Returning one entry
        per row keeps the arrays assignable as columns of `df` even when
        baselines are missing or duplicated.
    """
    # NOTE(review): the ratio is value / baseline for every metric, so for
    # `latency` a ratio above 1 means slower than baseline; confirm intended.
    baseline = np.full(len(df), np.nan)
    ratio = np.full(len(df), np.nan)
    for position, (_, row) in enumerate(df.iterrows()):
        metric = row.autotuning_metric
        matches = (
            (baseline_df.model_name == row.model_name)
            & (baseline_df.slots_per_trial == row.slots_per_trial)
        )
        baseline_values = baseline_df.loc[matches, metric]
        if baseline_values.empty:
            # No baseline for this (model, slots) pair: leave NaN placeholders.
            continue
        baseline[position] = baseline_values.mean()
        ratio[position] = row[metric] / baseline[position]
    return baseline, ratio

In [26]:
# Attach each trial's baseline value and its ratio to that baseline.
baseline_values, ratio_values = get_baseline_and_ratio(results_with_baseline_df, flops_profiler_results_df)
results_with_baseline_df['baseline'] = baseline_values
results_with_baseline_df['ratio'] = ratio_values

Create a multi-index dataframe summarizing results.

In [28]:
# Mean ratio per (metric, model, tuner type, fast flag, slot count).
summary_levels = [
    'autotuning_metric',
    'model_name',
    'autotuning_tuner_type',
    'autotuning_fast',
    'slots_per_trial',
]
ratio_summary = results_with_baseline_df.groupby(summary_levels)['ratio'].mean()

Slice in various ways with `groupby`:

Global:

In [30]:
# Average of the summary means for each autotuning metric (index level 0).
ratio_summary.groupby(level='autotuning_metric').mean()

autotuning_metric
FLOPS_per_gpu    1.628147
latency          1.060911
throughput       1.663369
Name: ratio, dtype: float64

One more level of detail:

In [32]:
# Break the per-metric averages down by each remaining index level in turn.
separator = '-' * 80
for level in range(1, 5):
    print(ratio_summary.groupby(level=[0, level]).mean(), separator, sep='\n')

autotuning_metric  model_name            
FLOPS_per_gpu      beit_large_patch16_512    2.448826
                   efficientnet_b0           1.421646
                   resnet152                 1.013970
latency            beit_large_patch16_512    1.067912
                   efficientnet_b0           1.024686
                   resnet152                 1.090135
throughput         beit_large_patch16_512    2.489719
                   efficientnet_b0           1.479497
                   resnet152                 1.020890
Name: ratio, dtype: float64
--------------------------------------------------------------------------------
autotuning_metric  autotuning_tuner_type
FLOPS_per_gpu      gridsearch               1.607662
                   model_based              1.622163
                   random                   1.654617
latency            gridsearch               1.062146
                   model_based              1.060433
                   random                   1.060155
thro

Fast mode isn't much faster:

In [33]:
# Mean wall-clock time of the trials run in fast mode.
results_with_baseline_df.loc[results_with_baseline_df.autotuning_fast, 'wall_clock_time'].mean()

859.5790279813278

In [34]:
# Mean wall-clock time of the trials not run in fast mode.
results_with_baseline_df.loc[~results_with_baseline_df.autotuning_fast, 'wall_clock_time'].mean()

887.0996187408165