In [1]:
# this notebook has logic to preprocess the logs created from our scaling experiments

In [33]:
# imports
import json
import re
import csv

In [39]:
# Regex for the per-iteration stats line of a training log. The three capture
# groups are, in order: elapsed time per iteration (ms), samples per second,
# and TFLOPs. The adjacent raw strings below concatenate into a single pattern.
info_pattern = (
    r"^.*iteration.*"
    r"elapsed time per iteration \(ms\): (.+) \| "
    r"learning rate:.* "
    r"samples per second: (.+) \| "
    r"TFLOPs: (.+) \|$"
)

In [40]:
# Load the experiment-name -> job-id mapping from its JSON file
with open('scaling_logs/scaling_exp_det.json', 'r') as map_fh:
    exp_job_map = json.loads(map_fh.read())

In [41]:
def get_stats_obj(exp_name, oom, match):
    """Build the stats record (a dict) for a single experiment.

    The configuration fields are sliced out of the underscore-separated
    experiment name, e.g. ``GPT_0.35B_MoE128_1BATCH_1GPU_1Node``; the values
    are the raw substrings, not converted to numbers.

    Args:
        exp_name: experiment name laid out like the sample above.
        oom: truthy when the run hit an out-of-memory error. The record then
            gets ``'oom': 'true'`` and no throughput fields.
        match: regex match whose groups 1-3 hold the elapsed time per
            iteration (ms), samples per second and TFLOPs. Only consulted
            when ``oom`` is falsy; may be None when no stats line was found.

    Returns:
        dict with the five configuration fields plus either the ``oom`` flag,
        the three throughput fields, or nothing extra.
    """
    name_parts = exp_name.split('_')

    record = {}
    record['model_param'] = name_parts[1][:-1]    # drop 1-char suffix ('B' in the sample)
    record['model_experts'] = name_parts[2][3:]   # drop 3-char prefix ('MoE' in the sample)
    record['batch_size'] = name_parts[3][:-5]     # drop 5-char suffix ('BATCH' in the sample)
    record['tot_gpus'] = name_parts[4][:-3]       # drop 3-char suffix ('GPU' in the sample)
    record['tot_nodes'] = name_parts[5][:-4]      # drop 4-char suffix ('Node' in the sample)

    # An OOM run carries only the flag, even if a stats match was supplied.
    if oom:
        record['oom'] = 'true'
        return record

    if match:
        record.update({
            'elapsed time per iteration (ms)': match.group(1),
            'samples per second': match.group(2),
            'TFLOPs': match.group(3),
        })
    return record

In [42]:
log_file_tmpl = "scaling_logs/slurm-%s.out"

# One record per experiment, in the order the experiments appear in exp_job_map.
scaling_stats = []
for exp_name, job_id in exp_job_map.items():
    log_file_path = log_file_tmpl % job_id
    oom = False
    stats_line_match = None
    # Stream the log line by line instead of materialising it with readlines():
    # only one line is needed at a time, and the scan stops at the first OOM.
    with open(log_file_path, "r") as log_file:
        for line in log_file:
            # Check for out-of-memory exception
            if 'CUDA out of memory' in line:
                # get_stats_obj ignores any stats match once oom is set,
                # so there is no point in scanning further.
                oom = True
                break
            match = re.search(info_pattern, line)
            if match:
                # Keep overwriting so the last matching stats line in the log wins.
                stats_line_match = match
    scaling_stats.append(get_stats_obj(exp_name, oom, stats_line_match))

In [43]:
# Print the collected per-experiment records for a quick visual check.
print(scaling_stats)

[{'model_param': '0.35', 'model_experts': '128', 'batch_size': '1', 'tot_gpus': '1', 'tot_nodes': '1', 'elapsed time per iteration (ms)': '157.0', 'samples per second': '6.369', 'TFLOPs': '15.83'}, {'model_param': '0.35', 'model_experts': '128', 'batch_size': '2', 'tot_gpus': '1', 'tot_nodes': '1', 'elapsed time per iteration (ms)': '194.3', 'samples per second': '10.292', 'TFLOPs': '25.57'}, {'model_param': '0.35', 'model_experts': '128', 'batch_size': '4', 'tot_gpus': '1', 'tot_nodes': '1', 'elapsed time per iteration (ms)': '373.9', 'samples per second': '10.698', 'TFLOPs': '26.58'}, {'model_param': '0.35', 'model_experts': '128', 'batch_size': '8', 'tot_gpus': '1', 'tot_nodes': '1', 'elapsed time per iteration (ms)': '416.9', 'samples per second': '19.190', 'TFLOPs': '47.68'}, {'model_param': '0.35', 'model_experts': '128', 'batch_size': '16', 'tot_gpus': '1', 'tot_nodes': '1', 'elapsed time per iteration (ms)': '923.6', 'samples per second': '17.324', 'TFLOPs': '43.05'}, {'model_p

In [44]:
# Collect every key used by any record, de-duplicated in first-seen order.
# A list (rather than a set) keeps the TSV column order stable from run to run;
# iteration order of a set of strings varies with hash randomisation.
all_keys = list(dict.fromkeys(key for record in scaling_stats for key in record))

# Write data to TSV file. Keys missing from a record are written as empty
# cells (csv.DictWriter's default restval).
with open('scaling_stats.tsv', "w", newline="", encoding="utf-8") as tsv_file:
    writer = csv.DictWriter(tsv_file, fieldnames=all_keys, delimiter="\t")
    writer.writeheader()
    writer.writerows(scaling_stats)