In [None]:
pwd

In [None]:
from pathlib import Path
import json
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

In [None]:
def get_train_dirs(glob: str, base_dir="/mnt/ceph/users/ewulff/particleflow/experiments/"):
    """Return the experiment directories that match a glob pattern.

    Args:
        glob: Pattern handed to ``Path.glob`` and resolved relative to
            ``base_dir`` (for example ``"scale_testV3*"``).
        base_dir: Directory to search, as a string or ``Path``. Defaults to the
            experiments directory this notebook was written against.

    Returns:
        A list of matching ``Path`` objects, sorted so that the order is the
        same on every run (``Path.glob`` does not guarantee any order).
    """
    return sorted(Path(base_dir).glob(glob))

In [None]:
def get_results(train_dirs):
    """Read the ``result.json`` file of every training directory.

    Args:
        train_dirs: Iterable of ``Path`` objects, one per training run.

    Returns:
        A list with the parsed JSON content of each ``result.json``, in the
        same order as ``train_dirs``.
    """
    def _load_result(run_dir):
        # Each run directory is expected to hold its summary in result.json.
        with open(run_dir / "result.json", 'r') as handle:
            return json.load(handle)

    return [_load_result(run_dir) for run_dir in train_dirs]

In [None]:
def get_stats_as_arrays(results):
    """Arrange per-run throughput and epoch time in arrays indexed by GPU count.

    Args:
        results: Sequence of parsed ``result.json`` dicts. Each entry must
            provide ``["wl-stats"]["GPU"]`` (the GPU count of the run, >= 1),
            ``["wl-scores"]["mean_throughput"]`` and
            ``["wl-scores"]["mean_epoch_time"]``.

    Returns:
        Tuple ``(throughput, mean_epoch_times)`` of 1-D float arrays in which
        index ``n - 1`` holds the values of the run that used ``n`` GPUs.
        Slots without a matching run stay 0; if several runs report the same
        GPU count, the last one wins.

    Raises:
        ValueError: If a run reports a GPU count below 1, which would
            otherwise wrap around and overwrite the last slot.
    """
    results = list(results)
    gpu_counts = [result["wl-stats"]["GPU"] for result in results]
    for ngpus in gpu_counts:
        if ngpus < 1:
            raise ValueError(f"GPU count must be >= 1, got {ngpus!r}")

    # Size by the largest GPU count as well as by the number of runs: with
    # sparse counts (e.g. 1, 2, 4, 8) `len(results)` alone is too small and
    # indexing by `ngpus - 1` would run past the end of the arrays.
    n_slots = max([len(results)] + gpu_counts)
    throughput = np.zeros(shape=(n_slots,))
    mean_epoch_times = np.zeros(shape=(n_slots,))
    for ngpus, result in zip(gpu_counts, results):
        scores = result["wl-scores"]
        throughput[ngpus - 1] = scores["mean_throughput"]
        mean_epoch_times[ngpus - 1] = scores["mean_epoch_time"]
    return throughput, mean_epoch_times

In [None]:
# Apply the plot styling defined in a local matplotlib rc file. The path is
# relative, so it is resolved against the current working directory.
mpl.rc_file("my_matplotlib_rcparams.txt")

In [None]:
# Load every run matching "scale_testV3*" and arrange its throughput and mean
# epoch time by GPU count. `throughput` from this cell is reused by the
# comparison plots further down.
# NOTE(review): later cells describe this series both as "GNN-based" (plot
# titles) and as "Transformer" / "TF MirroredStrategy" (legend labels) —
# confirm which model these runs actually trained.
train_dirs = get_train_dirs("scale_testV3*")
results = get_results(train_dirs)
throughput, mean_epoch_times = get_stats_as_arrays(results)

In [None]:
# Speedup relative to the 1-GPU run (index 0 of `throughput`), plotted against
# ideal linear scaling.
plt.plot(throughput / throughput[0], label="Measured")
gpu1 = throughput[0] / throughput[0]  # normalised 1-GPU baseline (1.0 for a non-zero baseline)
# Derive the ideal line from the data length instead of a hard-coded 8 points,
# so it always lines up with the measured curve and the tick labels below.
linear = np.arange(1, len(throughput) + 1) * gpu1
plt.plot(linear, color="black", linestyle="--", label="Optimal")
# Points sit at x = 0..N-1; relabel them with the GPU counts 1..N.
plt.xticks(ticks=range(0,len(throughput)), labels=range(1,len(throughput)+1))
plt.xlabel("NVIDIA H100 GPUs")
plt.ylabel("Speedup over 1 GPU")
plt.title("Single machine scale test, GNN-based MLPF training")
# The curve labels above are only displayed once a legend is drawn.
plt.legend()
plt.savefig("single_machine_speedup_gnn_mlpf.pdf")
plt.show()

In [None]:
# Load the "scale_test_data*" runs; the next cell plots them as the "Horovod"
# series.
# Note: this rebinds `train_dirs`, `results` and `mean_epoch_times` from the
# earlier "scale_testV3*" cell; only `throughput` still refers to that series.
train_dirs = get_train_dirs("scale_test_data*")
results = get_results(train_dirs)
throughput_hvd, mean_epoch_times = get_stats_as_arrays(results)

In [None]:
# Compare the speedup of both series, each normalised to its own 1-GPU run.
# The Horovod throughput is recomputed here from the mean epoch time, replacing
# the `mean_throughput` values read from result.json.
# NOTE(review): 50000 is presumably the number of samples per epoch for these
# runs (a previous variant of this line used 1596131) — confirm.
samples_per_epoch = 50000.
throughput_hvd = samples_per_epoch / mean_epoch_times
plt.plot(throughput / throughput[0], label="TF MirroredStrategy")
plt.plot(throughput_hvd / throughput_hvd[0], label="Horovod")
gpu1 = throughput_hvd[0] / throughput_hvd[0]  # normalised 1-GPU baseline (1.0 for a non-zero baseline)
# Size the ideal line and the ticks from the data (the longer of the two
# series) rather than from a hard-coded 8 points.
n_points = max(len(throughput), len(throughput_hvd))
linear = np.arange(1, n_points + 1) * gpu1
plt.plot(linear, color="black", linestyle="--", label="Linear")
plt.legend()
# Points sit at x = 0..N-1; relabel them with the GPU counts 1..N.
plt.xticks(ticks=range(0, n_points), labels=range(1, n_points + 1))
plt.xlabel("NVIDIA H100 GPUs")
plt.ylabel("Speedup over 1 GPU")
plt.title("Single machine scale test, GNN-based MLPF training")
plt.savefig("single_machine_speedup_gnn_mlpf_TF_and_HVD.pdf")
plt.show()

In [None]:
# Print the rounded throughput of both series side by side, one row per GPU
# count (first column: `throughput`, second column: `throughput_hvd`).
# Pairing the arrays with zip avoids an IndexError when the second series is
# shorter than the first.
for tp_value, hvd_value in zip(throughput, throughput_hvd):
    print(round(tp_value), round(hvd_value))

# GNN scale test on a single machine, 1 to 8 NVIDIA H100 GPUs

In [None]:
# Locate the tuned-GNN scale-test runs through the shared helper instead of
# repeating the experiments path in this cell.
train_dirs = get_train_dirs("scale_test_tuned_gnn_*")
# Display the matched directories as a sanity check.
train_dirs

In [None]:
# Read each run's result.json through the shared helper rather than repeating
# the loading loop here.
results = get_results(train_dirs)
# Eight slots, one per GPU count 1..8: index n - 1 receives the throughput of
# the n-GPU run, and slots without a run stay 0.
throughput_gnn = np.zeros(shape=(8))
for result in results:
    ngpus = result["wl-stats"]["GPU"]
    throughput_gnn[ngpus-1] = result["wl-scores"]["mean_throughput"]

In [None]:
# Display the GNN throughput array (index n - 1 holds the n-GPU run).
throughput_gnn

In [None]:
# GNN speedup relative to the 1-GPU run, drawn on an explicit Axes together
# with the ideal linear-scaling reference.
fig, ax = plt.subplots()
ax.plot(throughput_gnn / throughput_gnn[0], label="Measured")
gpu1 = throughput_gnn[0] / throughput_gnn[0]  # normalised 1-GPU baseline
linear = gpu1 * (np.arange(8) + 1)  # ideal speedup for 1..8 GPUs
ax.plot(linear, color="black", linestyle="--", label="Optimal")
# Points sit at x = 0..N-1; relabel them with the GPU counts 1..N.
n_slots = len(throughput_gnn)
ax.set_xticks(range(n_slots))
ax.set_xticklabels(range(1, n_slots + 1))
ax.set_xlabel("NVIDIA H100 GPUs")
ax.set_ylabel("Speedup over single device")
ax.set_title("Single machine scale test, GNN-based MLPF training")
ax.legend()
fig.savefig("single_machine_scale_test_gnn_mlpf.pdf")
plt.show()

In [None]:
# Absolute GNN throughput per GPU count, against ideal linear scaling of the
# 1-GPU throughput.
plt.plot(throughput_gnn, label="Measured")
gpu1 = throughput_gnn[0]  # throughput of the 1-GPU run
# Derive the ideal line from the data length instead of a hard-coded 8 points.
linear = np.arange(1, len(throughput_gnn) + 1) * gpu1
plt.plot(linear, color="black", linestyle="--", label="Optimal")
# Points sit at x = 0..N-1; relabel them with the GPU counts 1..N.
plt.xticks(ticks=range(0,len(throughput_gnn)), labels=range(1,len(throughput_gnn)+1))
plt.xlabel("NVIDIA H100 GPUs")
plt.ylabel("Throughput (samples / second)")
plt.title("Single machine scale test, GNN-based MLPF training")
plt.legend()
# Use a file name of its own: the speedup plot in the previous cell is saved as
# "single_machine_scale_test_gnn_mlpf.pdf", and reusing that name here would
# overwrite it.
plt.savefig("single_machine_scale_test_gnn_mlpf_throughput.pdf")
plt.show()

In [None]:
plt.plot(throughput_gnn, label="GNN")
plt.plot(throughput, label="Transformer")
gpu1_gnn = throughput_gnn[0]
gpu1 = throughput[0]
linear = np.arange(1, 9) * gpu1
linear_gnn = np.arange(1, 9) * gpu1_gnn
plt.plot(linear, color="black", linestyle="--", label="Linear scaling")
plt.plot(linear_gnn, color="black", linestyle="--")
plt.xticks(ticks=range(0,len(throughput)), labels=range(1,len(throughput)+1))
plt.xlabel("NVIDIA H100 GPUs")
plt.ylabel("Througput (samples / second)")
plt.title("Single machine scale test, MLPF training")
plt.legend()
plt.savefig("single_machine_scale_test_mlpf.pdf")
plt.show()