compiler: Refresh advisor profiling
georgebisbas committed Jun 7, 2022
1 parent 5e0f6d0 commit e9777f6
Showing 3 changed files with 28 additions and 23 deletions.
34 changes: 21 additions & 13 deletions benchmarks/user/advisor/run_advisor.py
@@ -31,7 +31,7 @@
 def run_with_advisor(path, output, name, exec_args):
     path = Path(path)
     check(path.is_file(), '%s not found' % path)
-    check(path.suffix == '.py', '%s not a regular Python file' % path)
+    check(path.suffix == '.py', '%s not a Python file' % path)

     # Create a directory to store the profiling report
     if name is None:
@@ -49,15 +49,20 @@ def run_with_advisor(path, output, name, exec_args):
     output = Path(output).joinpath(name)
     output.mkdir(parents=True, exist_ok=True)

-    # Intel Advisor must be available through either Intel Parallel Studio
-    # or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
+    # Intel Advisor and Intel compilers must be available through either Intel Parallel
+    # Studio or Intel oneAPI (currently tested versions include IPS 2020 Update 2 and
     # oneAPI 2021 beta08)
     try:
         ret = check_output(['advixe-cl', '--version']).decode("utf-8")
     except FileNotFoundError:
         check(False, "Error: Couldn't detect `advixe-cl` to run Intel Advisor.")

-    # If Advisor is available, so is the Intel compiler
+    try:
+        ret = check_output(['icc', '--version']).decode("utf-8")
+    except FileNotFoundError:
+        check(False, "Error: Couldn't detect Intel Compiler (icc).")
+
+    # All good, Intel compiler and advisor are available
     os.environ['DEVITO_ARCH'] = 'intel'

     # Tell Devito to instrument the generated code for Advisor
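Both probes above share the same shape and could be distilled into a standalone helper; a minimal sketch, not part of the commit (`require_tool` is a hypothetical name, and `SystemExit` stands in for the script's `check` helper):

    from subprocess import check_output

    def require_tool(cmd, message):
        # Probe `cmd --version`; abort with a clear message if the tool is missing
        try:
            return check_output([cmd, '--version']).decode("utf-8")
        except FileNotFoundError:
            raise SystemExit(message)

    require_tool('advixe-cl', "Error: Couldn't detect `advixe-cl` to run Intel Advisor.")
    require_tool('icc', "Error: Couldn't detect Intel Compiler (icc).")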
@@ -68,12 +73,13 @@ def run_with_advisor(path, output, name, exec_args):
     if devito_logging is None:
         os.environ['DEVITO_LOGGING'] = 'WARNING'

-    with progress('Set up multi-threading environment'):
-        # Roofline analyses only make sense with threading enabled
+    with progress('Setting up multi-threading environment'):
+        # Roofline analyses are recommended with threading enabled
         os.environ['DEVITO_LANGUAGE'] = 'openmp'

-        # We must be able to do thread pinning, otherwise any results would be
-        # meaningless. Currently, we only support doing that via numactl
+        # Thread pinning is strongly recommended for reliable results.
+        # This script uses numactl for this purpose. Users may want to set their
+        # own pinning: https://hpc-wiki.info/hpc/Binding/Pinning
         try:
             ret = check_output(['numactl', '--show']).decode("utf-8")
             ret = dict(i.split(':') for i in ret.split('\n') if i)
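For the "own pinning" route mentioned in the new comment, an equivalent setup via standard OpenMP environment variables might look as follows (a minimal sketch; the script itself relies on numactl, and the values shown are illustrative):

    import os

    # Bind each thread near its parent and pin one thread per physical core
    os.environ['OMP_PROC_BIND'] = 'close'
    os.environ['OMP_PLACES'] = 'cores'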
@@ -94,6 +100,9 @@ def run_with_advisor(path, output, name, exec_args):
     # `stackoverflow.com/questions/17053671/python-how-do-you-stop-numpy-from-multithreading`  # noqa
     os.environ['NUMEXPR_NUM_THREADS'] = '1'

+    # To build a roofline with Advisor, we need to run two analyses back to
+    # back, `survey` and `tripcounts`.
+
     numactl_cmd = [
         'numactl',
         '--cpunodebind=0'
@@ -109,21 +118,20 @@ def run_with_advisor(path, output, name, exec_args):
         '-run-pass-thru=--no-altstack',  # Avoids `https://software.intel.com/en-us/vtune-amplifier-help-error-message-stack-size-is-too-small`  # noqa
         '-run-pass-thru=-timestamp=sys',  # Avoids 'VTune Amplifier may detect which timer source to use incorrectly on Intel® Xeon® processor E5-XXXX processors (200287361)'  # noqa
         '-strategy ldconfig:notrace:notrace',  # Avoids `https://software.intel.com/en-us/forums/intel-vtune-amplifier-xe/topic/779309`  # noqa
-        '-start-paused',  # The generated code will enable/disable Advisor on a loop basis
+        '-start-paused',  # The generated code will enable/disable Advisor on a loop basis according to the decorated pragmas  # noqa
     ]
     advisor_flops = [
         '--collect=tripcounts',
-        '--enable-cache-simulation',  # Switch to '-enable-cache-simulation' for a CARM roofline `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html`  # noqa
+        '--enable-cache-simulation',  # Switch to '-enable-cache-simulation' for a CARM roofline model `https://software.intel.com/content/www/us/en/develop/articles/integrated-roofline-model-with-intel-advisor.html`  # noqa
         '--flop',
         '--stacks',
         '--collect=map',
         '-start-paused',
     ]
     py_cmd = [sys.executable, str(path)] + exec_args.split()

-    # To build a roofline with Advisor, we need to run two analyses back to
-    # back, `survey` and `tripcounts`. These are preceded by a "pure" python
-    # run to warmup the jit cache
+    # Before collecting `survey` and `tripcounts`, a "pure" Python run is
+    # performed first to warm up the JIT cache

     log('Starting Intel Advisor\'s `roofline` analysis for `%s`' % name)
     dt = datetime.datetime.now()
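Within run_with_advisor, these pieces compose into a warm-up run followed by the two collections; roughly as below (a sketch, not the commit's code — `advisor_survey` is an assumed name for the `--collect=survey` flag list analogous to `advisor_flops`, and `output` is the report directory created earlier):

    from subprocess import check_call

    advixe_cmd = ['advixe-cl', '--project-dir=%s' % str(output)]
    check_call(py_cmd)  # "pure" Python run to warm up the JIT cache
    check_call(numactl_cmd + advixe_cmd + advisor_survey + ['--'] + py_cmd)  # survey
    check_call(numactl_cmd + advixe_cmd + advisor_flops + ['--'] + py_cmd)   # tripcounts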
15 changes: 6 additions & 9 deletions devito/core/autotuning.py
@@ -75,16 +75,13 @@ def autotune(operator, args, level, mode):
     # Detect the time-stepping Iteration; shrink its iteration range so that
     # each autotuning run only takes a few iterations
     steppers = {i for i in flatten(trees) if i.dim.is_Time}
-    if len(steppers) == 0:
-        stepper = None
-        timesteps = 1
-    elif len(steppers) == 1:
+    if len(steppers) == 1:
         stepper = steppers.pop()
         timesteps = init_time_bounds(stepper, at_args, args)
         if timesteps is None:
             return args, {}
     else:
-        warning("cannot perform autotuning unless there is one time loop; skipping")
+        warning("cannot perform autotuning with %d time loops; skipping" % len(steppers))
         return args, {}

     # Use a fresh Timer for auto-tuning
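The guard now requires exactly one time loop and reports the actual count otherwise; a self-contained toy mirror of the pattern (plain Python with stand-in values, not Devito internals):

    loops = ['time', 'x', 'y']                    # stand-ins for Iteration dimensions
    steppers = {i for i in loops if i == 'time'}  # mimics `i.dim.is_Time`
    if len(steppers) == 1:
        stepper = steppers.pop()                  # proceed with autotuning
    else:
        print("cannot perform autotuning with %d time loops; skipping" % len(steppers))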
@@ -220,7 +217,7 @@ def init_time_bounds(stepper, at_args, args):


 def check_time_bounds(stepper, at_args, args, mode):
-    if mode != 'runtime' or stepper is None:
+    if mode != 'runtime':
         return True
     dim = stepper.dim.root
     if stepper.direction is Backward:
@@ -319,13 +316,13 @@ def generate_block_shapes(blockable, args, level):
     for bs in list(ret):
         handle = []
         for v in options['blocksize-l1']:
-            # To be a valid blocksize, it must be smaller than and divide evenly
-            # the parent's block size
+            # To be a valid block size, it must be smaller than
+            # and divide evenly the parent's block size
             if all(v <= i and i % v == 0 for _, i in bs):
                 ret.append(bs + tuple((d.step, v) for d in level_1))
         ret.remove(bs)

-    # Generate level-n (n > 1) block shapes
+    # Generate level-n (n > 2) block shapes
     # TODO -- currently, there's no Operator producing depth>2 hierarchical blocking,
     # so for simplicity we ignore this for the time being
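The validity test is plain divisibility; a toy run outside Devito shows which level-1 candidates survive for a 32-point parent block:

    parent = (('x', 32), ('y', 32))  # parent block shape as (dimension, size) pairs
    candidates = [4, 8, 16, 24, 33]
    valid = [v for v in candidates
             if all(v <= i and i % v == 0 for _, i in parent)]
    print(valid)  # [4, 8, 16] -- 24 does not divide 32 evenly, 33 exceeds it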
2 changes: 1 addition & 1 deletion examples/misc/linalg.py
@@ -24,7 +24,7 @@ def callback_shape(ctx, param, value):

 def callback_opts(ctx, param, value):
     if value is True:
-        return ('blocking', 'simd', 'openmp', {'blockinner': True})
+        return ('advanced', {'blockinner': True, 'blockrelax': True})
     else:
         return 'noop'
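The new tuple is the `opt` mode consumed by a Devito Operator; a usage sketch under typical Devito assumptions (`u` and `eq` are placeholders, not part of linalg.py):

    from devito import Grid, TimeFunction, Eq, Operator

    grid = Grid(shape=(64, 64))
    u = TimeFunction(name='u', grid=grid)
    eq = Eq(u.forward, u + 1)
    op = Operator(eq, opt=('advanced', {'blockinner': True, 'blockrelax': True}))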
