autotuning.py
from __future__ import absolute_import

from collections import OrderedDict
from itertools import combinations
from functools import reduce
from operator import mul
import resource

from devito.ir.iet import Iteration, FindNodes, FindSymbols
from devito.logger import perf, warning
from devito.parameters import configuration

__all__ = ['autotune']


def autotune(operator, arguments, parameters, tunable):
    """
    Acting as a higher-order function, take an Operator and a list of its
    arguments and perform empirical autotuning; the arguments appearing in
    ``tunable`` are the ones whose values get tuned.
    """
    # We get passed all the arguments, but the cfunction only requires a subset
    at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters])

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of the time-stepping dimension so that
    # auto-tuner runs will finish quickly
    steppers = [i for i in iterations if i.dim.is_Time]
    if len(steppers) == 0:
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers[0]
        start = at_arguments[stepper.dim.min_name]
        timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps
            perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
        at_arguments[stepper.dim.min_name] = start
        at_arguments[stepper.dim.max_name] = timesteps
        if stepper.dim.is_Stepping:
            at_arguments[stepper.dim.parent.min_name] = start
            at_arguments[stepper.dim.parent.max_name] = timesteps
    else:
        warning("AutoTuner: Couldn't understand loop structure; giving up")
        return arguments
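
    # A worked example of the squeeze above (a sketch; it assumes
    # extent(start, finish) == finish - start + 1): with the default
    # at_squeezer=4 and start=0, timesteps = extent(0, 4) - 1 = 4, so each
    # trial run executes only a handful of timesteps. If start exceeds
    # at_squeezer, timesteps comes out negative and is remapped to a small
    # positive count before the trial runs begin.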

    # Attempted block sizes ...
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']]
    # ... Always try the entire iteration space (degenerate block)
    itershape = [mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper]
    blocksizes.append(OrderedDict([(i, mapper[i].iteration.extent(0, j-1))
                                   for i, j in zip(mapper, itershape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration['autotuning'].level == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)
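
    # Illustration (hypothetical dimension names): with two tunable block
    # dimensions x_block_size and y_block_size, the basic-mode candidates are
    #   [{x_block_size: 8,   y_block_size: 8},
    #    {x_block_size: 16,  y_block_size: 16},
    #    ...,
    #    {x_block_size: 128, y_block_size: 128}]
    # plus the degenerate entry spanning the whole iteration space; aggressive
    # mode derives further, including non-square, shapes from these.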

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [i.symbolic_shape for i in functions if i.is_Array and i._mem_stack]
    stack_space = sum(reduce(mul, i, 1) for i in stack_shapes)*operator._dtype().itemsize
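
    # For instance (a sketch, not tied to any particular Operator): one
    # stack-allocated Array of symbolic shape (x_size, y_size) with a float32
    # dtype contributes x_size*y_size*4 bytes, so stack_space is a symbolic
    # expression in the dimension sizes, resolved per block shape below via
    # xreplace.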

    # Note: a single loop over 'blocksizes' suffices, as each candidate
    # already specifies a complete block shape
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                start = mapper[k].original_dim.symbolic_start.subs(arguments)
                end = mapper[k].original_dim.symbolic_end.subs(arguments)
                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            warning("AutoTuner: Couldn't determine stack size; skipping block shape %s"
                    % str(bs))
            continue

        # Use AutoTuner-specific profiler structs
        timer = operator.profiler.timer.reset()
        at_arguments[operator.profiler.name] = timer

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
        timings[tuple(bs.items())] = elapsed
        perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" %
             (','.join('%d' % i for i in bs.values()), elapsed, timesteps))
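
    # At this point, timings maps each legal block shape to its runtime,
    # e.g. (hypothetical values):
    #   {((x_block_size, 8), (y_block_size, 8)): 0.31,
    #    ((x_block_size, 16), (y_block_size, 16)): 0.24, ...}
    # The keys are tuples of items because the OrderedDicts themselves are
    # unhashable; min() below then picks the fastest shape.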

    try:
        best = dict(min(timings, key=timings.get))
        perf("AutoTuner: Selected block shape %s" % best)
    except ValueError:
        warning("AutoTuner: Couldn't find legal block shapes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.name in tuned
    tuned[operator.profiler.name] = operator.profiler.timer.reset()

    return tuned


def more_heuristic_attempts(blocksizes):
    # Ramp up to higher block sizes
    handle = OrderedDict([(i, options['at_blocksize'][-1]) for i in blocksizes[0]])
    for i in range(3):
        new_bs = OrderedDict([(k, v*2) for k, v in handle.items()])
        blocksizes.insert(blocksizes.index(handle) + 1, new_bs)
        handle = new_bs

    handle = []
    # Extended shuffling for the smaller block sizes
    for bs in blocksizes[:4]:
        for i in blocksizes:
            handle.append(OrderedDict(list(bs.items())[:-1] + [list(i.items())[-1]]))
    # Some more shuffling for all block sizes
    for bs in list(blocksizes):
        ncombs = len(bs)
        for i in range(ncombs):
            for j in combinations(bs, i+1):
                item = [(k, bs[k]*2 if k in j else v) for k, v in bs.items()]
                handle.append(OrderedDict(item))

    unique = []
    for i in blocksizes + handle:
        if i not in unique:
            unique.append(i)

    return unique
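
# A sketch of what more_heuristic_attempts produces (a single hypothetical
# tunable dimension d, for brevity): starting from the defaults
# [{d: 8}, ..., {d: 128}, <degenerate block>], the ramp-up inserts {d: 256},
# {d: 512} and {d: 1024}; the shuffling passes then recombine entries and
# double individual dimensions, and duplicates are dropped at the end.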


options = {
    'at_squeezer': 4,
    'at_blocksize': sorted({8, 16, 24, 32, 40, 64, 128}),
    'at_stack_limit': resource.getrlimit(resource.RLIMIT_STACK)[0] / 4
}
"""Autotuning options."""