
Update bundled qthreads to 1.20 #25210

Open · wants to merge 7 commits into base: main

Conversation

@jabraham17 (Member) commented Jun 11, 2024

Updates the bundled qthreads to 1.20

Testing:

  • validated that ARM performance did not regress with the task microbenchmarks (tested on Mac M1)
    • test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl
    • test/parallel/taskCompare/elliot/chpl-taskyield.chpl
  • validated that x86 performance did not regress with the same task microbenchmarks (tested on AMD)
  • start_test of test/release/examples on ARM
  • full linux64 paratest, with and without comm

Mac M1 results (n=5)

| Benchmark | Qthreads 1.19 | Qthreads 1.20 | percent diff |
| --- | --- | --- | --- |
| empty-coforall | 1.664 | 1.543 | 7.55% |
| empty-forBegin | 1.659 | 1.728 | 4.07% |
| empty-forall | 1.648 | 1.703 | 3.28% |
| empty-serialCoforall | 0.106 | 0.108 | 1.87% |
| empty-serialForBegin | 0.177 | 0.177 | 0.0% |
| empty-serialForall | 0.033 | 0.033 | 0.0% |
| taskyield Elapsed time 1: | 0.255 | 0.223 | 13.29% |
| taskyield Elapsed time 4: | 1.029 | 1.060 | 2.97% |
| taskyield Elapsed time 16: | 2.822 | 2.969 | 5.08% |

Linux x86 AMD results (n=5)

| Benchmark | Qthreads 1.19 | Qthreads 1.20 | percent diff |
| --- | --- | --- | --- |
| empty-coforall | 5.159 | 5.336 | 3.37% |
| empty-forBegin | 7.805 | 7.961 | 1.98% |
| empty-forall | 5.087 | 5.246 | 3.08% |
| empty-serialCoforall | 0.304 | 0.308 | 1.31% |
| empty-serialForBegin | 0.548 | 0.546 | 0.37% |
| empty-serialForall | 0.029 | 0.027 | 7.14% |
| taskyield Elapsed time 1: | 0.323 | 0.324 | 0.31% |
| taskyield Elapsed time 4: | 1.189 | 1.199 | 0.84% |
| taskyield Elapsed time 16: | 2.856 | 2.896 | 1.39% |
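For reference, the "percent diff" column in both tables appears to be the symmetric percent difference of the two (rounded) means; a minimal sketch under that assumption:

```python
def percent_diff(a: float, b: float) -> float:
    # Symmetric percent difference: |a - b| relative to the average of a and b.
    return abs(a - b) / ((a + b) / 2) * 100

# Example: empty-coforall on the M1, Qthreads 1.19 vs 1.20.
print(f"{percent_diff(1.664, 1.543):.2f}%")  # 7.55%
```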

Script to collect data on various machines:

```python
#!/usr/bin/env python3

import subprocess
import re
import os
import argparse
import math
import statistics


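# Compile one benchmark with `chpl --fast`; `cmd` holds the source path plus any
# extra flags (config settings, output name).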
def compile(cmd):
    full_cmd = "chpl --fast {}".format(cmd)
    print("Running {}".format(full_cmd))
    subprocess.run(full_cmd, shell=True, check=True)


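# Run a compiled benchmark, pull each perf key's value out of stdout with a
# regex, and append the values as one CSV row to <out_dir>/<cmd>.dat (writing
# the header on first use).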
def run(cmd, keys, out_dir, tpn, args):
    full_cmd = "./{} {}".format(cmd, args)

    print("Running {}".format(full_cmd))
    out = subprocess.run(
        full_cmd, shell=True, check=True, capture_output=True, encoding="utf-8"
    )
    print(out.stdout)
    data = []
    for key in keys:
        out_key = re.search(r"{} *(\S+)".format(key), out.stdout)
        d = out_key.group(1) if out_key else "No perf key found"
        data.append(d)

    out_file = os.path.join(out_dir, "{}.dat".format(cmd))
    if not os.path.exists(out_file):
        with open(out_file, "w") as f:
            f.write("threads-per-node,{}\n".format(",".join(keys)))
    with open(out_file, "a") as f:
        f.write("{},{}\n".format(tpn, ",".join(data)))

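# Summary statistics for one column of timings.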
def stats(data):
    n = len(data)
    mean = statistics.mean(data)
    std = statistics.stdev(data) if n > 1 else 0.0  # stdev needs at least 2 samples
    minv = min(data)
    maxv = max(data)
    return n, mean, std, minv, maxv

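# Plain-text summary: one block per benchmark, one stats line per perf key.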
def print_summary(benchmarks):
    for benchmark, (header, data) in benchmarks.items():
        print(f"{benchmark}")
        for h in header:
            n, mean, std, minv, maxv = stats(data[h])
            print(f"  {h:20} n={n}, mean={mean:6.3f}, std={std:6.3f}, min={minv:6.3f}, max={maxv:6.3f}")

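# Markdown table of mean time and standard deviation per benchmark/perf key.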
def print_md(benchmarks):
    order = sorted(benchmarks.keys())
    print("| Benchmark | Mean Time | Std Dev |")
    print("| --- | --- | --- |")
    for benchmark in order:
        header, data = benchmarks[benchmark]
        for h in header:
            n, mean, std, minv, maxv = stats(data[h])
            benchname = benchmark if len(header) == 1 else f"{benchmark} {h}"
            print(f"| {benchname} | {mean:6.3f} | {std:6.3f} |")


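# Load every .dat file in out_dir and print per-key statistics, either as
# markdown or as the plain-text summary.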
def summarize(out_dir, fmt=None):
    benchmarks = {}
    for file in os.listdir(out_dir):
        if not file.endswith(".dat"):
            continue
        with open(os.path.join(out_dir, file), "r") as fp:
            lines = fp.readlines()
        benchmark = file.removesuffix(".dat")
        def get_csv(line, skip=0):
            return line.strip().split(",")[skip:]
        header = get_csv(lines[0], skip=1)
        data = {h: [] for h in header}
        for line in lines[1:]:
            parts = get_csv(line, skip=1)
            for i, h in enumerate(header):
                data[h].append(float(parts[i]))

        benchmarks[benchmark] = (header, data)

    if fmt == 'md':
        print_md(benchmarks)
    else:
        print_summary(benchmarks)


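# Driver: either summarize existing results, or compile the benchmarks (unless
# --skip-compile) and run them once per requested threads-per-node setting.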
def main(args):

    CHPL_HOME = os.environ.get("CHPL_HOME", None)
    if CHPL_HOME is None:
        print("Missing CHPL_HOME environment variable")
        return
    print("CHPL_HOME={}".format(CHPL_HOME))


    if args.summarize:
        print("Summarizing results")
        if not os.path.exists(args.out_dir):
            print("No output directory found")
            return
        summarize(args.out_dir, fmt=('md' if args.markdown else None))

        return


    if not args.skip_compile:
        print(
            subprocess.check_output("which chpl", shell=True, encoding="utf-8")
        )
        print(
            subprocess.check_output(
                f"{CHPL_HOME}/util/printchplenv --anon",
                shell=True,
                encoding="utf-8",
            )
        )

        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=forBeginT -o empty-forBegin"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=coforallT -o empty-coforall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=forallT -o empty-forall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialForBeginT -o empty-serialForBegin"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialCoforallT -o empty-serialCoforall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/empty-chpl-taskspawn.chpl -staskingMode=serialForallT -o empty-serialForall"
        )
        compile(
            f"{CHPL_HOME}/test/parallel/taskCompare/elliot/chpl-taskyield.chpl -o taskyield"
        )

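    # Save the environment so changes to CHPL_RT_NUM_THREADS_PER_LOCALE can be
    # undone once the runs finish.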
    oldenv = os.environ.copy()

    os.makedirs(args.out_dir, exist_ok=True)

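    # One pass over all benchmarks per threads-per-node value; 0 means leave
    # CHPL_RT_NUM_THREADS_PER_LOCALE unset.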
    for t in args.tpns:
        if t != 0:
            os.environ["CHPL_RT_NUM_THREADS_PER_LOCALE"] = str(t)
        else:
            os.environ.pop("CHPL_RT_NUM_THREADS_PER_LOCALE", None)

        execopts = "--numTrials=500000 --printTimings=true"
        run("empty-forBegin", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-coforall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-forall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialForBegin", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialCoforall", ["Elapsed time:"], args.out_dir, t, execopts)
        run("empty-serialForall", ["Elapsed time:"], args.out_dir, t, execopts)
        run(
            "taskyield",
            ["Elapsed time 1:", "Elapsed time 4:", "Elapsed time 16:"],
            args.out_dir,
            t,
            execopts,
        )

    # Restore the saved environment (assigning a plain dict to os.environ would
    # not keep putenv in sync).
    os.environ.clear()
    os.environ.update(oldenv)


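# Parse and echo the command-line options used above.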
def get_args():
    a = argparse.ArgumentParser()
    a.add_argument("--out-dir", default="out")
    a.add_argument("--skip-compile", default=False, action="store_true")
    a.add_argument(
        "--threads-per-node",
        dest="tpns",
        default=(0,),
        help="a value of zero indicates that CHPL_RT_NUM_THREADS_PER_LOCALE is unset",
        nargs="+",
        type=int,
    )
    a.add_argument("--summarize", default=False, action="store_true")
    a.add_argument("--markdown", default=False, action="store_true")
    args = a.parse_args()

    print("Running with:")
    print("  out_dir: {}".format(args.out_dir))
    print("  skip_compile: {}".format(args.skip_compile))
    print("  threads-per-node: {}".format(args.tpns))

    return args


if __name__ == "__main__":
    main(get_args())
```

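The script is driven by the flags defined in `get_args()`. As a hypothetical invocation (the file name `collect.py` is just a placeholder): `python3 collect.py --threads-per-node 0` compiles the benchmarks and appends one row per benchmark to `out/*.dat`; repeating the run accumulates samples (n=5 above); `python3 collect.py --summarize --markdown` then prints a mean/std-dev markdown table per benchmark from those files.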
Signed-off-by: Jade Abraham <jade.abraham@hpe.com>

@jabraham17 marked this pull request as ready for review July 16, 2024 21:25