In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from pathlib import Path

import pandas as pd

from utils import query, job_config, render, register_filter, register_global

<IPython.core.display.Javascript object>

In [4]:
# Every method was run twice: a learning-rate sweep ("tuning") and repetitions
# with several seeds ("reps"). The experiment names follow one naming scheme.
_cifar_methods = [
    "centralized",
    "gossip",
    "relaysum-grad",
    "relaysum-model",
    "d2",
    "quasi-global",
    "push-sum",
]
relevant_experiments = [
    f"neurips21-cifar-{method}-{phase}"
    for phase in ("tuning", "reps")
    for method in _cifar_methods
]

In [5]:
# GraphQL query fetching the finished jobs of one experiment together with
# their config, annotations and logged time series.
_jobs_query = """query($experiment: String!) {
  jobs(experiment: $experiment, status: FINISHED) {
    job
    id
    status
    config {
      key
      value
    }
    annotations {
      key
      value
    }
    timeseries {
      measurement
      tags
      values
    }
  }
}"""

# One list of jobs per entry of `relevant_experiments`, in the same order.
data = []
for experiment in relevant_experiments:
    response = query(_jobs_query, {"experiment": experiment})
    data.append(response["jobs"])

In [6]:
def optimizer_name(job):
    """Unused stub: reads the job's config and implicitly returns ``None``."""
    config = job_config(job)


# Only these measurements are turned into rows of the results frame.
measurements_of_interest = {"accuracy", "cross_entropy"}

# Flatten the per-experiment job lists into one record per logged value.
results = []
for experiment in data:
    for job in experiment:
        if job["status"] != "FINISHED":
            continue
        config = job_config(job)
        # Build the annotation lookup once per job instead of once per logged value.
        annotations = {x["key"]: x["value"] for x in job["annotations"]}
        # Fields that are repeated on every row belonging to this job.
        job_data = {
            "job": job["job"],
            "algorithm": config["algorithm"],
            "non_iid_alpha": config["non_iid_alpha"],
            "num_workers": config["distributed_world_size"],
            "learning_rate": config["learning_rate"],
            "momentum": config["momentum"],
            "topology": config["topology"],
            "seed": config["seed"],
        }
        for series in job["timeseries"]:
            if series["measurement"] not in measurements_of_interest:
                continue
            # The series tags become columns of their own.
            series_data = {"measurement": series["measurement"], **series["tags"]}
            for entry in series["values"]:
                results.append({
                    **job_data,
                    **series_data,
                    "step": entry["epoch"],
                    "warm_start": config.get("optimizer_warm_start", "baseline"),
                    "starred": annotations.get("star", False),
                    "value": entry["value"],
                    "mb": entry["mb"],
                    "task": config["task"],
                })

# Build the frame once, after all experiments have been collected; doing this
# inside the loop rebuilt it per experiment and failed when the first
# experiment contributed no rows.
df = pd.DataFrame(results)

# Jobs without a `non_iid_alpha` value get the placeholder -1.
df["non_iid_alpha"] = df.non_iid_alpha.fillna(-1)
df.head()

In [7]:
# Columns that identify a single training run (one seed of one configuration).
_run_keys = ["task", "non_iid_alpha", "topology", "algorithm", "learning_rate", "momentum", "seed"]

# Test-accuracy measurements logged after step 195 on Cifar.
last_iterations = df.task.eq("Cifar") & (df.step > 195) & df.measurement.eq("accuracy")
last_values = df[last_iterations & df.split.eq("test")].copy()

# First average the selected steps per worker, then keep the worst worker of each run.
# `numeric_only=True` is what older pandas did implicitly for `.agg("mean")`;
# pandas >= 2.0 raises on the remaining text columns without it.
_per_worker = last_values.groupby(
    ["task", "non_iid_alpha", "topology", "algorithm", "learning_rate", "momentum", "worker", "seed"]
).mean(numeric_only=True)
achieved_accuracies = _per_worker.groupby(_run_keys).min().reset_index()

# Drop runs whose aggregated accuracy is not positive.
achieved_accuracies = achieved_accuracies[achieved_accuracies.value > 0]
achieved_accuracies

Unnamed: 0,task,non_iid_alpha,topology,algorithm,learning_rate,momentum,seed,mb,num_workers,starred,step,value
0,Cifar,-1.00,fully-connected,all-reduce,0.050,0.9,1,7.144020e+05,16,0.0,198,0.89765
1,Cifar,-1.00,fully-connected,all-reduce,0.100,0.0,1,7.144020e+05,16,0.0,198,0.87260
2,Cifar,-1.00,fully-connected,all-reduce,0.100,0.0,2,7.144020e+05,16,0.0,198,0.86860
3,Cifar,-1.00,fully-connected,all-reduce,0.100,0.0,3,7.144020e+05,16,0.0,198,0.86880
4,Cifar,-1.00,fully-connected,all-reduce,0.100,0.9,1,7.144020e+05,16,1.0,198,0.90390
5,Cifar,-1.00,fully-connected,all-reduce,0.100,0.9,2,7.144020e+05,16,0.0,198,0.89815
6,Cifar,-1.00,fully-connected,all-reduce,0.100,0.9,3,7.144020e+05,16,0.0,198,0.90305
7,Cifar,-1.00,fully-connected,all-reduce,0.200,0.0,1,7.144020e+05,16,1.0,198,0.87930
8,Cifar,-1.00,fully-connected,all-reduce,0.200,0.0,2,7.144020e+05,16,0.0,198,0.87370
9,Cifar,0.01,chain,relaysum-grad,0.025,0.9,1,1.428804e+06,16,0.0,198,0.83050


In [8]:
# One row per (task, topology, algorithm, momentum, non_iid_alpha): the run
# with the highest aggregated accuracy, which determines the tuned learning rate.
idx = ["task", "topology", "algorithm", "momentum", "non_iid_alpha"]
_best_row_labels = achieved_accuracies.groupby(idx)["value"].idxmax()
best_lrs_results = achieved_accuracies.loc[_best_row_labels].set_index(idx)

# Manually chosen learning rates that take precedence over the best learning
# rate found in the sweep. Keys are (topology, algorithm, momentum, non_iid_alpha).
#
# The relaysum-grad / momentum 0.9 keys used to be listed twice; Python keeps the
# last occurrence, so only those effective values remain here.
overrides = {
    ("fully-connected", "all-reduce", 0.0, -1): 0.1,
    ("double-binary-trees", "relaysum-grad", 0.0, 1): 0.05,
    ("double-binary-trees", "relaysum-grad", 0.0, 0.1): 0.05,
    ("double-binary-trees", "relaysum-grad", 0.0, 0.01): 0.025,
    ("double-binary-trees", "relaysum-grad", 0.9, 1): 0.05 / 2,
    ("double-binary-trees", "relaysum-grad", 0.9, 0.1): 0.05 / 2,
    ("double-binary-trees", "relaysum-grad", 0.9, 0.01): 0.025 / 2,
    ("double-binary-trees", "relaysum-model", 0.9, 0.1): 0.1 * 3,
    ("double-binary-trees", "relaysum-model", 0.9, 0.01): 0.1 * 3 / 2,
    ("exponential", "push-sum", 0.0, 1): 0.4,
    ("exponential", "push-sum", 0.0, 0.1): 0.2,
    ("exponential", "push-sum", 0.0, 0.01): 0.2,
    ("exponential", "push-sum", 0.9, 1): 0.1,
    ("exponential", "push-sum", 0.9, 0.1): 0.1,
    ("exponential", "push-sum", 0.9, 0.01): 0.025,
    ("chain", "relaysum-model", 0.9, 1): 0.1 * 5,
    ("chain", "relaysum-model", 0.9, 0.01): 0.025 * 5,
    ("ring", "gossip", 0.9, 0.1): 0.05,
    ("ring", "gossip", 0.9, 0.01): 0.0125,
    ("ring", "d2", 0.0, 0.01): 0.1,
    # NOTE(review): 0.0126 stands out next to the 0.0125 used elsewhere —
    # confirm it matches the learning rate of the actual runs.
    ("ring", "d2", 0.9, 0.01): 0.0126,
}

def get_tuned_lr(topology, algorithm, momentum, non_iid_alpha):
    """Return the learning rate to report for one Cifar configuration.

    A manual entry in ``overrides`` wins; otherwise the learning rate of the
    best run recorded in ``best_lrs_results`` is used.

    Returns:
        The learning rate as a Python number, or ``None`` when the
        configuration is neither overridden nor present in the sweep results.
    """
    key = (topology, algorithm, momentum, non_iid_alpha)
    if key in overrides:
        return overrides[key]
    try:
        best_run = best_lrs_results.loc[("Cifar", *key)]
    except (KeyError, pd.core.indexing.IndexingError):
        # A label combination that is absent from the MultiIndex surfaces as
        # KeyError; IndexingError alone does not cover that case.
        return None
    return best_run["learning_rate"].item()

register_global("tuned_lr", get_tuned_lr)

In [9]:
def get_tuned_results(topology, algorithm, momentum, non_iid_alpha):
    """Return the accuracies of all Cifar runs that used the tuned learning rate.

    The learning rate comes from ``get_tuned_lr``; every row of
    ``achieved_accuracies`` matching the configuration and that learning rate
    contributes one value.

    Returns:
        A pandas array of accuracies in frame order, or an empty list when
        there is no tuned learning rate or no matching run.
    """
    lr = get_tuned_lr(topology, algorithm, momentum, non_iid_alpha)
    if lr is None:
        return []

    runs = achieved_accuracies
    # A boolean mask always yields an array, even for a single matching run,
    # and avoids rebuilding an index over the whole frame on every call.
    selected = (
        runs["task"].eq("Cifar")
        & runs["topology"].eq(topology)
        & runs["algorithm"].eq(algorithm)
        & runs["momentum"].eq(momentum)
        & runs["non_iid_alpha"].eq(non_iid_alpha)
        & runs["learning_rate"].eq(lr)
    )
    if not selected.any():
        return []
    return runs.loc[selected, "value"].array

register_global("tuned_results", get_tuned_results)

In [10]:
# Spot-check: accuracies of the all-reduce baseline without momentum.
# non_iid_alpha=-1 is the placeholder that replaces missing values when df is built.
get_tuned_results("fully-connected", "all-reduce", 0.0, -1)

  """


<PandasArray>
[0.8726000387221575, 0.8686000406742096, 0.8688000425696372]
Length: 3, dtype: float64

In [11]:
@register_filter
def percentage(value):
    r"""Format a fraction as a LaTeX percentage with one decimal, e.g. 0.87 -> ``87.0\%``.

    ``None`` renders as an empty string, like the other number filters.
    """
    if value is None:
        return ""
    val = value * 100
    # Raw f-string: "\%" is not a valid Python escape sequence, but the
    # backslash must reach LaTeX unchanged.
    return rf"{val:.1f}\%"
@register_filter
def three_digits(value):
    """Format ``value`` with three decimals; ``None`` renders as an empty string."""
    return "" if value is None else format(value, ".3f")
@register_filter
def two_digits(value):
    """Format ``value`` with two decimals; ``None`` renders as an empty string."""
    return "" if value is None else format(value, ".2f")

In [12]:
# Display names for the topology identifiers, looked up by the templates
# as `topo_names[...]`.
register_global(
    "topo_names",
    dict(
        [
            ("ring", "ring"),
            ("chain", "chain"),
            ("fully-connected", "fully connected"),
            ("double-binary-trees", "binary trees"),
            ("exponential", "time-varying exponential"),
        ]
    ),
)

In [13]:
def _clamp(x, a, b):
    """Raise ``x`` to at least ``a``, then cap the result at ``b``."""
    return min(max(x, a), b)


register_global("clamp", _clamp)

In [670]:
%%template table_momentum
%%-set linewidth=1.2

%%-macro results_line(task, topology, algorithm, momentum)
%%-if (task, topology, algorithm, momentum) in data.index
\Var{results(task, topology, algorithm, momentum, 1, 0.87, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.1, 0.8, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.01, 0.6, 0.91)}
%%-endif
%%-endmacro

%%-macro results_line_centralized(task, topology, algorithm, momentum)
\Var{results(task, topology, algorithm, momentum, -1, 0.87, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.6, 0.91)}
%%-endmacro

%%-macro results(task, topology, algorithm, momentum, non_iid_alpha, minval, maxval)
%%-set results=tuned_results(topology, algorithm, momentum, non_iid_alpha)
%%-if results
\Var{results.mean() | percentage} 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (\Var{linewidth},0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (\Var{linewidth},0);
    %% set lpos = (clamp(results.min(), minval, maxval) - minval) / (maxval - minval) * linewidth
    %% set rpos = (clamp(results.max(), minval, maxval) - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{lpos}, 0pt) -- (\Var{rpos}, 0pt);
    %% for res in results
    %% if res > minval
    %% set pos = (res - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{pos}, -2pt) -- (\Var{pos}, 2pt);
    %% endif
    %% endfor
}
%%-endif
%%-endmacro

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce {\color{gray}(baseline)} & \Var{topo_names["fully-connected"]} & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.0)} \\
$\quad+$momentum &  & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.9)} \\[1mm]
\textbf{\RelaySumModel}  & \Var{topo_names[topo_us]} & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.0)} \\
\textbf{$\quad+$local momentum}  &  & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.9)} \\[1mm]
\dpsgd~\citep{lian2017dpsgd}  & \Var{topo_names[topo_gossip]} & \Var{results_line("Cifar", topo_gossip, "gossip", 0.0)} \\
$\quad+$quasi-global mom.~\citep{lin2021quasiglobal}   & & \Var{results_line("Cifar", topo_gossip, "quasi-global-momentum", 0.9)} \\[1mm]
\dsquare~\citep{tang2018d2}  & \Var{topo_names[topo_gossip]} & \Var{results_line("Cifar", topo_gossip, "d2", 0.0)} \\
$\quad+$local momentum  & & \Var{results_line("Cifar", topo_gossip, "d2", 0.9)} \\[1mm]
%%-if include_push_sum
Stochastic gradient push~\citep{assran2019sgp}  & \Var{topo_names["exponential"]}~\citep{assran2019sgp}     & \Var{results_line("Cifar", "exponential", "push-sum", 0)} \\
$\quad+$local momentum  &     & \Var{results_line("Cifar", "exponential", "push-sum", 0.9)} \\[1mm]
%%-endif
    \bottomrule
\end{tabularx}

In [671]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_momentum", {"data": best_lrs_results, "topo_us": "double-binary-trees", "topo_gossip": "ring", "include_push_sum": True})

out_path = Path("generated") / "cifar10-trees.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce {\color{gray}(baseline)} & fully connected & 87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.0, 0pt) -- (0.07800116166472575, 0pt);
    \draw[line width=.6pt] (0.07800116166472575, -2pt) -- (0.07800116166472575, 2pt);
} &
87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.748364080082286, 0pt) -- (0.7920004224235362, 0pt);
    \draw[line width=.6pt] (0.7920004224235362, -2pt) -- (0.7920004224235362, 2pt);
    \draw[line width=.6pt] (0.748364080082286, -2pt) -- (0.748364080082286, 2pt);
    \dr

  """


In [672]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_momentum", {"data": best_lrs_results, "topo_us": "chain", "topo_gossip": "ring", "include_push_sum": False})

out_path = Path("generated") / "cifar10-rings.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce {\color{gray}(baseline)} & fully connected & 87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.0, 0pt) -- (0.07800116166472575, 0pt);
    \draw[line width=.6pt] (0.07800116166472575, -2pt) -- (0.07800116166472575, 2pt);
} &
87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.748364080082286, 0pt) -- (0.7920004224235362, 0pt);
    \draw[line width=.6pt] (0.7920004224235362, -2pt) -- (0.7920004224235362, 2pt);
    \draw[line width=.6pt] (0.748364080082286, -2pt) -- (0.748364080082286, 2pt);
    \dr

  """


# Table for learning rates used

In [665]:
%%template table_lr
%%-set linewidth=1.2

%%-macro results(task, topology, algorithm, momentum, non_iid_alpha, minval, maxval)
%%-set lr=tuned_lr(topology, algorithm, momentum, non_iid_alpha)
%%-set results=tuned_results(topology, algorithm, momentum, non_iid_alpha)
\Var{lr | three_digits} (\Var{results | length})
%%-endmacro

%%-macro results_line(task, topology, algorithm, momentum)
%%-if (task, topology, algorithm, momentum) in data.index
\Var{results(task, topology, algorithm, momentum, 1, 0.8, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.1, 0.8, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.01, 0.8, 0.91)}
%%-endif
%%-endmacro

%%-macro results_line_centralized(task, topology, algorithm, momentum)
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)}
%%-endmacro

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce & \Var{topo_names["fully-connected"]} & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.0)} \\
$\quad+$momentum &  & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.9)} \\[1mm]
\textbf{\RelaySumModel}  & \Var{topo_names[topo_us]} & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.0)} \\
\textbf{$\quad+$local momentum}  &  & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.9)} \\[1mm]
\dpsgd~\citep{lian2017dpsgd}  & \Var{topo_names[topo_gossip]} & \Var{results_line("Cifar", topo_gossip, "gossip", 0.0)} \\
$\quad+$quasi-global mom.~\citep{lin2021quasiglobal}   & & \Var{results_line("Cifar", topo_gossip, "quasi-global-momentum", 0.9)} \\[1mm]
\dsquare~\citep{tang2018d2}  & \Var{topo_names[topo_gossip]} & \Var{results_line("Cifar", topo_gossip, "d2", 0.0)} \\
$\quad+$local momentum  & & \Var{results_line("Cifar", topo_gossip, "d2", 0.9)} \\[1mm]
%%-if include_push_sum
Stochastic gradient push~\citep{assran2019sgp}  & \Var{topo_names["exponential"]}~\citep{assran2019sgp}     & \Var{results_line("Cifar", "exponential", "push-sum", 0)} \\
$\quad+$local momentum  &     & \Var{results_line("Cifar", "exponential", "push-sum", 0.9)} \\[1mm]
%%-endif
    \bottomrule
\end{tabularx}

In [666]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_lr", {"data": best_lrs_results, "topo_us": "double-binary-trees", "topo_gossip": "ring", "include_push_sum": True})

out_path = Path("generated") / "cifar10-learning-rates-trees.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce & fully connected & 0.100 (3) &
0.100 (3) &
0.100 (3) \\
$\quad+$momentum &  & 0.100 (3) &
0.100 (3) &
0.100 (3) \\[1mm]
\textbf{\RelaySumModel}  & binary trees & 1.200 (3) & 
0.600 (3) & 
0.300 (3) \\
\textbf{$\quad+$local momentum}  &  & 0.600 (3) & 
0.300 (3) & 
0.150 (3) \\[1mm]
\dpsgd~\citep{lian2017dpsgd}  & ring & 0.400 (3) & 
0.100 (3) & 
0.200 (3) \\
$\quad+$quasi-global mom.~\citep{lin2021quasiglobal}   & & 0.100 (3) & 
0.025 (3) & 
0.050 (3) \\[1mm]
\dsquare~\citep{tang2018d2}  & ring & 0.200 (3) & 
0.200 (3) & 
0.100 (3) \\
$\quad+$local momentum  & & 0.050 (3) & 
0.050 (3) & 
0.013 (3) \\[1mm]Stochastic gradient push~\citep{assran2019sgp}  & time-varying exponential~\citep{assran2019sgp}     & 0.400 (3) & 
0.200 (3) & 
0.200 (3)

  """


In [667]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_lr", {"data": best_lrs_results, "topo_us": "chain", "topo_gossip": "ring", "include_push_sum": False})

out_path = Path("generated") / "cifar10-learning-rates-rings.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce & fully connected & 0.100 (3) &
0.100 (3) &
0.100 (3) \\
$\quad+$momentum &  & 0.100 (3) &
0.100 (3) &
0.100 (3) \\[1mm]
\textbf{\RelaySumModel}  & chain & 2.000 (3) & 
2.000 (3) & 
1.000 (3) \\
\textbf{$\quad+$local momentum}  &  & 0.500 (3) & 
0.500 (3) & 
0.125 (3) \\[1mm]
\dpsgd~\citep{lian2017dpsgd}  & ring & 0.400 (3) & 
0.100 (3) & 
0.200 (3) \\
$\quad+$quasi-global mom.~\citep{lin2021quasiglobal}   & & 0.100 (3) & 
0.025 (3) & 
0.050 (3) \\[1mm]
\dsquare~\citep{tang2018d2}  & ring & 0.200 (3) & 
0.200 (3) & 
0.100 (3) \\
$\quad+$local momentum  & & 0.050 (3) & 
0.050 (3) & 
0.013 (3) \\[1mm]    \bottomrule
\end{tabularx}


  """


In [668]:
# Inspect the push-sum runs with momentum 0.9 at learning rate 0.05, one row per seed.
# The aggregated frame is filtered directly rather than rebinding `df`: `df` holds
# the raw measurements that the aggregation cell reads, so overwriting it breaks
# re-running that cell.
_push_sum_runs = achieved_accuracies[
    achieved_accuracies.topology.eq("exponential")
    & achieved_accuracies.algorithm.eq("push-sum")
    & achieved_accuracies.momentum.eq(0.9)
    & achieved_accuracies.learning_rate.eq(.05)
]
_push_sum_runs.set_index(["task", "topology", "algorithm", "momentum", "non_iid_alpha", "learning_rate", "seed"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,mb,num_workers,starred,step,value
task,topology,algorithm,momentum,non_iid_alpha,learning_rate,seed,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Cifar,exponential,push-sum,0.9,0.01,0.05,1,1428804.0,16,0.0,198,0.88285
Cifar,exponential,push-sum,0.9,0.01,0.05,3,1428804.0,16,0.0,198,0.8842
Cifar,exponential,push-sum,0.9,0.1,0.05,1,1428804.0,16,0.0,198,0.8809
Cifar,exponential,push-sum,0.9,1.0,0.05,1,1428804.0,16,0.0,198,0.8911


In [669]:
%%template table_compare_topology
%%-set linewidth=1.2
%%-set task="Cifar"
%%-macro results_line(ringname, algorithm, momentum)
\Var{results(task, ringname, algorithm, momentum, 0.01, 0.8, 0.91)} & 
\Var{results(task, "double-binary-trees", algorithm, momentum, 0.01, 0.8, 0.91)}
%%-endmacro

%%-macro results_line_centralized(algorithm, momentum)
\Var{results(task, "fully-connected", algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, "fully-connected", algorithm, momentum, -1, 0.8, 0.91)}
%%-endmacro

%%-macro results(task, topology, algorithm, momentum, non_iid_alpha, minval, maxval)
%%-set results=tuned_results(topology, algorithm, momentum, non_iid_alpha)
%%-if results
\Var{results.mean() | percentage} 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (\Var{linewidth},0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (\Var{linewidth},0);
    %% set lpos = (clamp(results.min(), minval, maxval) - minval) / (maxval - minval) * linewidth
    %% set rpos = (clamp(results.max(), minval, maxval) - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{lpos}, 0pt) -- (\Var{rpos}, 0pt);
    %% for res in results
    %% if res > minval
    %% set pos = (res - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{pos}, -2pt) -- (\Var{pos}, 2pt);
    %% endif
    %% endfor
}
%%-else
{\color{gray}did not converge}
%%-endif
%%-endmacro

\tablefontsize
\begin{tabularx}{.7\textwidth}{X l l}
    \toprule
    Algorithm                                              & Ring (Chain for \RelaySum) & Double binary trees \\
    \cmidrule(lr){1-1} \cmidrule(lr){2-3}
    \textbf{\RelaySumModel}                                & \Var{results_line("chain", "relaysum-model", 0.0)} \\
    \textbf{$\quad+$local momentum}                        & \Var{results_line("chain", "relaysum-model", 0.9)} \\[1mm]
    \dpsgd~\citep{lian2017dpsgd}                           & \Var{results_line("ring", "gossip", 0.0)} \\
    $\quad+$quasi-global mom.~\citep{lin2021quasiglobal}   & \Var{results_line("ring", "quasi-global-momentum", 0.9)} \\[1mm]
    \dsquare~\citep{tang2018d2}                            & \Var{results_line("ring", "d2", 0.0)} \\
    $\quad+$local momentum                                 & \Var{results_line("ring", "d2", 0.9)} \\[1mm]
    \bottomrule
\end{tabularx}

In [661]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_compare_topology", {"data": best_lrs_results})

out_path = Path("generated") / "cifar10-ring-vs-tree.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{.7\textwidth}{X l l}
    \toprule
    Algorithm                                              & Ring (Chain for \RelaySum) & Double binary trees \\
    \cmidrule(lr){1-1} \cmidrule(lr){2-3}
    \textbf{\RelaySumModel}                                & 86.5\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.6867277459664789, 0pt) -- (0.7178186611695717, 0pt);
    \draw[line width=.6pt] (0.7178186611695717, -2pt) -- (0.7178186611695717, 2pt);
    \draw[line width=.6pt] (0.6867277459664789, -2pt) -- (0.6867277459664789, 2pt);
    \draw[line width=.6pt] (0.7140004716136235, -2pt) -- (0.7140004716136235, 2pt);
} & 
84.6\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.2192731553857972, 0pt) -- (0.6840004920959468, 0pt);
   

  """


In [24]:
# Spot-check the learning rate reported for relaysum-grad on double binary trees
# (momentum 0.0, non_iid_alpha=0.01); this combination has an entry in `overrides`.
get_tuned_lr("double-binary-trees", "relaysum-grad", 0.0, 0.01)

0.025

# Compare RelaySGD against RelaySGD/Grad

In [28]:
%%template table_model_vs_grad
%%-set linewidth=1.2

%%-macro results_line(task, topology, algorithm, momentum)
%%-if (task, topology, algorithm, momentum) in data.index
\Var{results(task, topology, algorithm, momentum, 1, 0.8, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.1, 0.8, 0.91)} & 
\Var{results(task, topology, algorithm, momentum, 0.01, 0.8, 0.91)}
%%-endif
%%-endmacro

%%-macro results_line_centralized(task, topology, algorithm, momentum)
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)} &
\Var{results(task, topology, algorithm, momentum, -1, 0.8, 0.91)}
%%-endmacro

%%-macro results(task, topology, algorithm, momentum, non_iid_alpha, minval, maxval)
%%-set results=tuned_results(topology, algorithm, momentum, non_iid_alpha)
%%-if results
\Var{results.mean() | percentage} 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (\Var{linewidth},0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (\Var{linewidth},0);
    %% set lpos = (clamp(results.min(), minval, maxval) - minval) / (maxval - minval) * linewidth
    %% set rpos = (clamp(results.max(), minval, maxval) - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{lpos}, 0pt) -- (\Var{rpos}, 0pt);
    %% for res in results
    %% if res > minval
    %% set pos = (res - minval) / (maxval - minval) * linewidth
    \draw[line width=.6pt] (\Var{pos}, -2pt) -- (\Var{pos}, 2pt);
    %% endif
    %% endfor
}
%%-endif
%%-endmacro

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce {\color{gray}(baseline)} & \Var{topo_names["fully-connected"]} & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.0)} \\
$\quad+$momentum &  & \Var{results_line_centralized("Cifar", "fully-connected", "all-reduce", 0.9)} \\[1mm]
\RelaySumModel  & \Var{topo_names[topo_us]} & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.0)} \\
$\quad+$local momentum  &  & \Var{results_line("Cifar", topo_us, "relaysum-model", 0.9)} \\[1mm]
\RelaySumGrad  & \Var{topo_names[topo_us]} & \Var{results_line("Cifar", topo_us, "relaysum-grad", 0.0)} \\
$\quad+$local momentum  &  & \Var{results_line("Cifar", topo_us, "relaysum-grad", 0.9)} \\[1mm]
    \bottomrule
\end{tabularx}

In [29]:
# Render before touching the file: opening it with "w" truncates it, so a
# template error would otherwise leave an empty .tex file behind.
res = render("table_model_vs_grad", {"data": best_lrs_results, "topo_us": "chain"})

out_path = Path("generated") / "cifar10-model-vs-grad.tex"
out_path.parent.mkdir(parents=True, exist_ok=True)
out_path.write_text(res, encoding="utf-8")
print(res)

\tablefontsize
\begin{tabularx}{\textwidth}{l X l l l}
    \toprule
    Algorithm & Topology & $\alpha=1.00$ & $\alpha=0.1$ & $\alpha=.01$ \\
    && (most homogeneous) & & (most heterogeneous) \\
    \cmidrule(lr){1-2} \cmidrule(lr){3-5}
All-reduce {\color{gray}(baseline)} & fully connected & 87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.748364080082286, 0pt) -- (0.7920004224235362, 0pt);
    \draw[line width=.6pt] (0.7920004224235362, -2pt) -- (0.7920004224235362, 2pt);
    \draw[line width=.6pt] (0.748364080082286, -2pt) -- (0.748364080082286, 2pt);
    \draw[line width=.6pt] (0.7505459189414959, -2pt) -- (0.7505459189414959, 2pt);
} &
87.0\% 
\tikz{
    % spacer
    \draw[white,line width=5pt] (0,0) -- (1.2,0);
    % baseline
    \draw[gray,line width=.3pt,->] (0,0) -- (1.2,0);
    \draw[line width=.6pt] (0.748364080082286, 0pt) -- (0.7920004224235362, 0p

  """
