In [None]:
from pathlib import Path
import importlib

import numpy as np
import pandas as pd

# Benchmark pickles are loaded from DATA_DIR; every plotting cell writes its
# PDF under FIGURES_DIR.
DATA_DIR = Path("out")
FIGURES_DIR = Path("figures")
# Make sure the figure folder exists before any plot is saved, so a fresh
# checkout does not fail on a missing directory (no-op when it already exists).
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# (Re)load the project-local ``utils`` module so edits to it are picked up
# without restarting the kernel.
utils = importlib.import_module('utils')
importlib.reload(utils)

# Benchmark selection: these pieces are combined into the pickle file stem.
rsys = 'mpp'
accl = 'h100'
devices_per_worker = 1
date = '15_07_25'
rel = ''

bench = f'tb_sscaling_stencil_5s_{rsys}_{accl}_{date}{rel}'

dashes = False  # forwarded as ``dashes=`` to the plotting helpers in later cells
df, metadata = utils.load_data(DATA_DIR / f"{bench}.pkl")
# df.drop(columns=['name', 'kernel', 'version', 'hosts', 'time'], inplace=True)

# ---- Per-run metrics -------------------------------------------------------
# Plain float literals (1e-12, 1e3) are used for unit scaling instead of
# ``np.pow``: ``np.pow`` was only added in NumPy 2.0 and raises AttributeError
# on NumPy 1.x.
# NOTE(review): the TFLOP/s and millisecond readings below assume ``wall_time``
# is in seconds -- confirm against the benchmark harness.
df['devices'] = df['workers'] * devices_per_worker
# df['devices'] = devices_per_worker
df['tflops_per_sec'] = df['total_flops'] * 1e-12 / df['wall_time']
df['tflops_per_task'] = df['total_flops'] * 1e-12 / df['total_tasks']

# Best throughput seen for each device count; efficiency is expressed as a
# percentage of that maximum.
df_max_flops_per_worker = (
    df.groupby(by='devices', as_index=False)['tflops_per_sec']
    .max()
    .rename(columns={'tflops_per_sec': 'max_tflops_per_sec'})
)
df = df.merge(df_max_flops_per_worker, on='devices')
df['efficiency'] = df['tflops_per_sec'] * 100 / df['max_tflops_per_sec']
df['task_granularity'] = df['wall_time'] * df['devices'] * 1e3 / df['total_tasks']

# ---- Same metrics on the mean runtime of each configuration ----------------
agg_df = df.groupby(
    by=['devices', 'radix', 'type', 'output', 'size', 'total_flops', 'total_tasks']
).agg(
    runtime_mean=('wall_time', 'mean'),
    # runtime_std=('wall_time', 'std')
).reset_index()

agg_df['tflops_per_sec'] = agg_df['total_flops'] * 1e-12 / agg_df['runtime_mean']
agg_df['flops_per_task'] = agg_df['total_flops'] / agg_df['total_tasks']
agg_df_max_flops_per_worker = (
    agg_df.groupby(by=['devices'], as_index=False)['tflops_per_sec']
    .max()
    .rename(columns={'tflops_per_sec': 'max_tflops_per_sec'})
)
agg_df = agg_df.merge(agg_df_max_flops_per_worker, on=['devices'])
agg_df['efficiency'] = agg_df['tflops_per_sec'] * 100 / agg_df['max_tflops_per_sec']
agg_df['task_granularity'] = (
    agg_df['runtime_mean'] * agg_df['devices'] * 1e3 / agg_df['total_tasks']
)
# agg_df.query('output == 16')


def metg(x):
    """Run ``utils.find_metg`` on one group, using flops per task vs. efficiency."""
    return utils.find_metg(x, x_data='flops_per_task', y_data='efficiency', method=1)


# Get METG(50%)
metg_df = (
    agg_df[['devices', 'flops_per_task', 'efficiency']]
    .groupby(['devices'])
    .apply(metg, include_groups=False)
    .reset_index()
)

# metg_df

In [None]:
# Benchmarks combined into one frame for cross-system comparison.
benchs = [
    # 'tb_wscaling_stencil_5s_mpp_h100_15_07_25',
    'tb_flops_5s_mpp_h100_15_07_25',
    'tb_wscaling_stencil_5s_mpp_mi300a_15_07_25',
    'tb_wscaling_stencil_5s_mpi_h100_15_07_25',
    'tb_wscaling_stencil_5s_mpi_mi300a_15_07_25',
]

devices_per_worker = 1

# ``utils.load_data`` returns a pair; only its first element (the frame) is kept.
all_df = pd.concat([utils.load_data(DATA_DIR / f"{bench}.pkl")[0] for bench in benchs])
# Derive runtime system and accelerator labels from the ``version`` string:
# anything without 'mpi' is labelled 'mpp', anything without 'h100' is 'mi300a'.
all_df['sys'] = all_df['version'].apply(lambda x: 'mpi' if 'mpi' in x else 'mpp')
all_df['accl'] = all_df['version'].apply(lambda x: 'h100' if 'h100' in x else 'mi300a')
all_df['devices'] = all_df['workers'] * devices_per_worker
all_df = all_df.drop(columns=['name', 'kernel', 'version', 'hosts', 'time'])


agg_all_df = all_df.groupby(
    by=['sys', 'accl', 'devices', 'type', 'radix', 'output', 'size', 'total_flops', 'total_tasks']
).agg(
    runtime_mean=('wall_time', 'mean'),
    runtime_std=('wall_time', 'std')
).reset_index()

# Plain float literals (1e-12, 1e3) are used for unit scaling instead of
# ``np.pow``: ``np.pow`` was only added in NumPy 2.0 and raises AttributeError
# on NumPy 1.x.
# NOTE(review): the TFLOP/s and millisecond readings assume ``runtime_mean`` is
# in seconds -- confirm against the benchmark harness.
agg_all_df['tflops_per_sec'] = agg_all_df['total_flops'] * 1e-12 / agg_all_df['runtime_mean']
agg_all_df['flops_per_task'] = agg_all_df['total_flops'] / agg_all_df['total_tasks']

# The reference maximum is grouped WITHOUT 'sys', so efficiency is measured
# against the best throughput of either runtime system on the same accelerator,
# device count, type, radix and output.
agg_all_df_max_flops_per_device = (
    agg_all_df.groupby(
        by=['accl', 'devices', 'type', 'radix', 'output', ], as_index=False
    )['tflops_per_sec']
    .max()
    .rename(columns={'tflops_per_sec': 'max_tflops_per_sec'})
)
agg_all_df = agg_all_df.merge(
    agg_all_df_max_flops_per_device, on=['accl', 'devices', 'type', 'radix', 'output']
)
agg_all_df['efficiency'] = agg_all_df['tflops_per_sec'] * 100 / agg_all_df['max_tflops_per_sec']
agg_all_df['task_granularity'] = (
    agg_all_df['runtime_mean'] * agg_all_df['devices'] * 1e3 / agg_all_df['total_tasks']
)

# Label used as hue/style in the plots, e.g. 'MPP-H100'.
agg_all_df['config'] = agg_all_df['sys'].str.upper() + "-" + agg_all_df['accl'].str.upper()

# Column that METG is evaluated on.
metg_target = 'task_granularity'


def metg(x):
    """Run ``utils.find_metg`` on one group, using ``metg_target`` vs. efficiency."""
    return utils.find_metg(x, x_data=metg_target, y_data='efficiency', method=2)


# Get METG(50%)
metg_df = (
    agg_all_df[['config', 'sys', 'accl', 'devices', metg_target, 'efficiency']]
    .groupby(['config', 'sys', 'accl', 'devices'])
    .apply(metg, include_groups=False)
    .reset_index()
)

# Cell output: single-device H100 rows for a quick visual check.
agg_all_df.query('accl == "h100" & devices == 1')

In [None]:
# Efficiency against task granularity for one fixed device count, with one
# line per ``config`` value.
num_devices = 10

plot_meta = utils.PlotMeta(
    x_axis='task_granularity', y_axis='efficiency',
    hue='config', style='config',
    xlabel='Task Granularity (ms)', ylabel='Efficiency',
    xticks=[], yticks=[],
    legend_title='Env', legend_labels=[],
)

# Rows of the aggregated frame for the selected device count, and the target PDF.
granularity_rows = agg_all_df.query('devices == @num_devices')
granularity_pdf = FIGURES_DIR / f'granularity_metg_devices_{num_devices}.pdf'

utils.generic_line_plot_err_bar(
    dataset=granularity_rows,
    title='',
    output_file=granularity_pdf,
    plot_meta=plot_meta,
    dashes=dashes,
    fifty_percent_line=True,
    log_scale=True,
)

In [None]:
# Time per task against flops per task for one fixed device count, drawn with
# both axes on a log scale.
num_devices = 10

plot_meta = utils.PlotMeta(
    x_axis='flops_per_task', y_axis='task_granularity',
    hue='config', style='config',
    xlabel='Flops per Task', ylabel='Time per Task (ms)',
    xticks=[], yticks=[],
    legend_title='Env', legend_labels=[],
)

# Rows of the aggregated frame for the selected device count, and the target PDF.
flops_time_rows = agg_all_df.query('devices == @num_devices')
flops_time_pdf = FIGURES_DIR / f'flops_time_devices_{num_devices}.pdf'

utils.generic_line_plot_err_bar(
    dataset=flops_time_rows,
    title='',
    output_file=flops_time_pdf,
    plot_meta=plot_meta,
    dashes=dashes,
    fifty_percent_line=False,
    log_scale=True,
    log_scale_y=True,
)

In [None]:
# METG(50%) per device count, one line per ``config``.
#
# The y-axis unit follows ``metg_target`` (the column METG was evaluated on)
# instead of being hard-coded to "Flops": with metg_target='task_granularity'
# the same ``metg`` values are drawn on a millisecond axis elsewhere in this
# notebook, so a fixed "(Flops)" label would mislabel the figure.
# NOTE(review): assumes ``utils.find_metg`` reports ``metg`` in the units of its
# ``x_data`` column -- confirm in utils.
metg_units = {'task_granularity': 'ms', 'flops_per_task': 'Flops'}
metg_unit = metg_units.get(metg_target, metg_target)

plot_meta = utils.PlotMeta(
    x_axis='devices',
    y_axis='metg',
    hue='config',
    style='config',
    xlabel='Num Devices',
    ylabel=f'METG(50%) ({metg_unit})',
    # One tick label per distinct device count present in the METG table.
    xticks=metg_df['devices'].astype('str').unique(),
    yticks=[],
    legend_title='Env',
    legend_labels=[],
)

utils.generic_line_plot_err_bar(
    dataset=metg_df,
    title='',
    # File name kept unchanged so existing references to the figure still resolve.
    output_file=FIGURES_DIR / 'flops_metg_line.pdf',
    plot_meta=plot_meta,
    dashes=dashes,
    fifty_percent_line=False,
    log_scale_y=True,
    outside_legend=True,
)

In [None]:
# Time per task against flops per task for one device count, with the METG
# value(s) of a single chosen configuration passed as horizontal lines.
num_devices = 10
config = 'MPP-H100'

plot_meta = utils.PlotMeta(
    x_axis='flops_per_task', y_axis='task_granularity',
    hue='config', style='config',
    xlabel='Flops per Task', ylabel='Time per Task (ms)',
    xticks=[], yticks=[],
    legend_title='Env', legend_labels=[],
)


# Disabled alternative panel configuration, kept for reference:
# plot_meta_metg = utils.PlotMeta(
#     x_axis='devices',
#     y_axis='metg',
#     hue='config',
#     style='config',
#     xlabel='Output Size ($2^{x}$)',
#     ylabel='METG(50%) (ms)',
#     xticks=metg_df['devices'].astype('str').unique(),
#     yticks=[],
#     legend_title='Env',
#     legend_labels=[],
# )

# The multi-plot helper takes parallel lists; only one panel is drawn here.
plots_meta = [plot_meta]
datasets = [agg_all_df.query('devices == @num_devices')]

# METG value(s) for the selected device count and configuration.
metg_hlines = metg_df.query('devices == @num_devices & config == @config')['metg'].values

utils.generic_multi_line_plot_err_bar(
    datasets=datasets,
    title='',
    output_file=FIGURES_DIR / 'flops_metg.pdf',
    plots_meta=plots_meta,
    dashes=dashes,
    fifty_percent_line=False,
    hlines=metg_hlines,
    log_scale=True,
    log_scale_y=True,
    outside_legend=True,
)