
Update bundled qthreads to 1.20 #25210

Open · wants to merge 7 commits into base: main

Conversation

@jabraham17 (Member) commented Jun 11, 2024

Updates the bundled qthreads to 1.20

Testing:

  • validated that ARM performance did not regress with the task microbenchmarks (tested on Mac M1)
    • test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl
    • test/parallel/taskCompare/elliot/chpl-taskyield.chpl
  • validated that x86 performance did not regress with the same task microbenchmarks (tested on AMD)
  • start_test of test/release/examples on ARM
  • full linux64 paratest, with and without comm

Mac M1 results (n=5)

| Benchmark | Qthreads 1.19 | Qthreads 1.20 | percent diff |
| --- | --- | --- | --- |
| empty-coforall | 1.664 | 1.543 | 7.55% |
| empty-forBegin | 1.659 | 1.728 | 4.07% |
| empty-forall | 1.648 | 1.703 | 3.28% |
| empty-serialCoforall | 0.106 | 0.108 | 1.87% |
| empty-serialForBegin | 0.177 | 0.177 | 0.0% |
| empty-serialForall | 0.033 | 0.033 | 0.0% |
| taskyield Elapsed time 1: | 0.255 | 0.223 | 13.29% |
| taskyield Elapsed time 4: | 1.029 | 1.060 | 2.97% |
| taskyield Elapsed time 16: | 2.822 | 2.969 | 5.08% |

Linux x86 AMD results (n=5)

| Benchmark | Qthreads 1.19 | Qthreads 1.20 | percent diff |
| --- | --- | --- | --- |
| empty-coforall | 5.159 | 5.336 | 3.37% |
| empty-forBegin | 7.805 | 7.961 | 1.98% |
| empty-forall | 5.087 | 5.246 | 3.08% |
| empty-serialCoforall | 0.304 | 0.308 | 1.31% |
| empty-serialForBegin | 0.548 | 0.546 | 0.37% |
| empty-serialForall | 0.029 | 0.027 | 7.14% |
| taskyield Elapsed time 1: | 0.323 | 0.324 | 0.31% |
| taskyield Elapsed time 4: | 1.189 | 1.199 | 0.84% |
| taskyield Elapsed time 16: | 2.856 | 2.896 | 1.39% |
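For reference, the "percent diff" column in both tables appears to be the symmetric percent difference of the two (rounded) means; a minimal sketch under that assumption:

```python
def percent_diff(a: float, b: float) -> float:
    # Symmetric percent difference: |a - b| relative to the average of a and b.
    return abs(a - b) / ((a + b) / 2) * 100

# Example: empty-coforall on the M1, Qthreads 1.19 vs 1.20.
print(f"{percent_diff(1.664, 1.543):.2f}%")  # 7.55%
```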

Script to collect data on various machines:

```python
#!/usr/bin/env python3

import subprocess
import re
import os
import argparse
import math
import statistics


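# Compile one benchmark with `chpl --fast`; `cmd` holds the source path plus any
# extra flags (config settings, output name).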
def compile(cmd):
    full_cmd = "chpl --fast {}".format(cmd)
    print("Running {}".format(full_cmd))
    subprocess.run(full_cmd, shell=True, check=True)


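# Run a compiled benchmark, pull each perf key's value out of stdout with a
# regex, and append the values as one CSV row to <out_dir>/<cmd>.dat (writing
# the header on first use).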
def run(cmd, keys, out_dir, tpn, args):
    full_cmd = "./{} {}".format(cmd, args)

    print("Running {}".format(full_cmd))
    out = subprocess.run(
        full_cmd, shell=True, check=True, capture_output=True, encoding="utf-8"
    )
    print(out.stdout)
    data = []
    for key in keys:
        out_key = re.search(r"{} *(\S+)".format(key), out.stdout)
        d = out_key.group(1) if out_key else "No perf key found"
        data.append(d)

    out_file = os.path.join(out_dir, "{}.dat".format(cmd))
    if not os.path.exists(out_file):
        with open(out_file, "w") as f:
            f.write("threads-per-node,{}\n".format(",".join(keys)))
    with open(out_file, "a") as f:
        f.write("{},{}\n".format(tpn, ",".join(data)))

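# Summary statistics for one column of timings.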
def stats(data):
    n = len(data)
    mean = statistics.mean(data)
    std = statistics.stdev(data) if n > 1 else 0.0  # stdev needs at least 2 samples
    minv = min(data)
    maxv = max(data)
    return n, mean, std, minv, maxv

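# Plain-text summary: one block per benchmark, one stats line per perf key.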
def print_summary(benchmarks):
    for benchmark, (header, data) in benchmarks.items():
        print(f"{benchmark}")
        for h in header:
            n, mean, std, minv, maxv = stats(data[h])
            print(f"  {h:20} n={n}, mean={mean:6.3f}, std={std:6.3f}, min={minv:6.3f}, max={maxv:6.3f}")

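# Markdown table of mean time and standard deviation per benchmark/perf key.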
def print_md(benchmarks):
    order = sorted(benchmarks.keys())
    print("| Benchmark | Mean Time | Std Dev |")
    print("| --- | --- | --- |")
    for benchmark in order:
        header, data = benchmarks[benchmark]
        for h in header:
            n, mean, std, minv, maxv = stats(data[h])
            benchname = benchmark if len(header) == 1 else f"{benchmark} {h}"
            print(f"| {benchname} | {mean:6.3f} | {std:6.3f} |")


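# Load every .dat file in out_dir and print per-key statistics, either as
# markdown or as the plain-text summary.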
def summarize(out_dir, fmt=None):
    benchmarks = {}
    for file in os.listdir(out_dir):
        if not file.endswith(".dat"):
            continue
        with open(os.path.join(out_dir, file), "r") as fp:
            lines = fp.readlines()
        benchmark = file.removesuffix(".dat")
        def get_csv(line, skip=0):
            return line.strip().split(",")[skip:]
        header = get_csv(lines[0], skip=1)
        data = {h: [] for h in header}
        for line in lines[1:]:
            parts = get_csv(line, skip=1)
            for i, h in enumerate(header):
                data[h].append(float(parts[i]))

        benchmarks[benchmark] = (header, data)

    if fmt == 'md':
        print_md(benchmarks)
    else:
        print_summary(benchmarks)


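# Driver: either summarize existing results, or compile the benchmarks (unless
# --skip-compile) and run them once per requested threads-per-node setting.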
def main(args):

    CHPL_HOME = os.environ.get("CHPL_HOME", None)
    if CHPL_HOME is None:
        print("Missing CHPL_HOME environment variable")
        return
    print("CHPL_HOME={}".format(CHPL_HOME))


    if args.summarize:
        print("Summarizing results")
        if not os.path.exists(args.out_dir):
            print("No output directory found")
            return
        summarize(args.out_dir, fmt=('md' if args.markdown else None))

        return


    if not args.skip_compile:
        print(
            subprocess.check_output("which chpl", shell=True, encoding="utf-8")
        )
        print(
            subprocess.check_output(
                f"{CHPL_HOME}/util/printchplenv --anon",
                shell=True,
                encoding="utf-8",
            )
        )

        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=forBeginT -o empty-forBegin"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=coforallT -o empty-coforall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=forallT -o empty-forall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialForBeginT -o empty-serialForBegin"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialCoforallT -o empty-serialCoforall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialForallT -o empty-serialForall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/chpl-taskyield.chpl -o taskyield"
        )

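    # Save the environment so changes to CHPL_RT_NUM_THREADS_PER_LOCALE can be
    # undone once the runs finish.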
    oldenv = os.environ.copy()

    os.makedirs(args.out_dir, exist_ok=True)

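    # One pass over all benchmarks per threads-per-node value; 0 means leave
    # CHPL_RT_NUM_THREADS_PER_LOCALE unset.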
    for t in args.tpns:
        if t != 0:
            os.environ["CHPL_RT_NUM_THREADS_PER_LOCALE"] = str(t)
        else:
            os.environ.pop("CHPL_RT_NUM_THREADS_PER_LOCALE", None)

        execopts = "--numTrials=500000 --printTimings=true"
        run("empty-forBegin", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-coforall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-forall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialForBegin", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialCoforall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialForall", ["Elapsed time:"], args.out_dir, t, execopts)
        run(
            "taskyield",
            ["Elapsed time 1:", "Elapsed time 4:", "Elapsed time 16:"],
            args.out_dir,
            t,
            execopts,
        )

    # Restore the saved environment (assigning a plain dict to os.environ would
    # not keep putenv in sync).
    os.environ.clear()
    os.environ.update(oldenv)


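# Parse and echo the command-line options used above.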
def get_args():
    a = argparse.ArgumentParser()
    a.add_argument("--out-dir", default="out")
    a.add_argument("--skip-compile", default=False, action="store_true")
    a.add_argument(
        "--threads-per-node",
        dest="tpns",
        default=(0,),
        help="a value of zero indicates that CHPL_RT_NUM_THREADS_PER_LOCALE is unset",
        nargs="+",
        type=int,
    )
    a.add_argument("--summarize", default=False, action="store_true")
    a.add_argument("--markdown", default=False, action="store_true")
    args = a.parse_args()

    print("Running with:")
    print("  out_dir: {}".format(args.out_dir))
    print("  skip_compile: {}".format(args.skip_compile))
    print("  threads-per-node: {}".format(args.tpns))

    return args


if __name__ == "__main__":
    main(get_args())
```

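The script is driven by the flags defined in `get_args()`. As a hypothetical invocation (the file name `collect.py` is just a placeholder): `python3 collect.py --threads-per-node 0` compiles the benchmarks and appends one row per benchmark to `out/*.dat`; repeating the run accumulates samples (n=5 above); `python3 collect.py --summarize --markdown` then prints a mean/std-dev markdown table per benchmark from those files.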
Signed-off-by: Jade Abraham <jade.abraham@hpe.com>

@jabraham17 marked this pull request as ready for review July 16, 2024 21:25