-
Notifications
You must be signed in to change notification settings - Fork 225
/
misc.py
141 lines (110 loc) · 5.3 KB
/
misc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import cgen
from devito.ir import (Forward, List, Prodder, FindNodes, Transformer,
filter_iterations, retrieve_iteration_tree)
from devito.logger import warning
from devito.passes.iet.engine import iet_pass
from devito.symbolics import MIN, MAX, evalrel
from devito.tools import is_integer, split
__all__ = ['avoid_denormals', 'hoist_prodders', 'relax_incr_dimensions', 'is_on_device']
@iet_pass
def avoid_denormals(iet):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.
    """
    # Elemental functions are always called from a parent routine, which is
    # where the flush mode is set; nothing to do here
    if iet.is_ElementalFunction:
        return iet, {}

    flush_prologue = (
        cgen.Comment('Flush denormal numbers to zero in hardware'),
        cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
        cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'),
        cgen.Line(),
    )

    # Prepend the flush macros to the routine body
    new_body = iet.body._rebuild(body=(List(header=flush_prologue),) + iet.body.body)
    iet = iet._rebuild(body=new_body)

    # The macros above live in the SSE/SSE3 intrinsics headers
    return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
@iet_pass
def hoist_prodders(iet):
    """
    Move Prodders within the outer levels of an Iteration tree.
    """
    # A periodic Prodder is hoisted right inside the innermost blocked
    # Iteration (non-unit step); if there is none, it lands in the tree root
    is_blocked = lambda i: i.dim.is_Block and i.dim.step != 1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for prodder in FindNodes(Prodder).visit(tree.root):
            if not prodder._periodic:
                continue
            blocked = filter_iterations(tree, is_blocked)
            # Fallback: use the outermost Iteration
            target = blocked[-1] if blocked else tree.root
            mapper[target] = target._rebuild(
                nodes=target.nodes + (prodder._rebuild(),)
            )
            # Drop the Prodder from its original position
            mapper[prodder] = None

    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {}
@iet_pass
def relax_incr_dimensions(iet, **kwargs):
    """
    This pass adjusts the bounds of blocked Iterations in order to include the
    "remainder regions". Without the relaxation that occurs in this pass, the
    only way to iterate over the entire iteration space is to have step
    increments that are perfect divisors of the iteration space (e.g. in case
    of an iteration space of size 67 and block size 8 only 64 iterations would
    be computed, as `67 - 67mod8 = 64`).

    A simple 1D example: nested Iterations are transformed from:

        <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
            <Iteration x; (x0_blk0, x0_blk0 + x0_blk0_size - 1, 1)>

    to:

        <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
            <Iteration x; (x0_blk0, MIN(x_M, x0_blk0 + x0_blk0_size - 1), 1)>
    """
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        iterations = [i for i in tree if i.dim.is_Block]
        if not iterations:
            continue

        root = iterations[0]
        if root in mapper:
            # Already processed as part of a previous tree sharing this root
            continue

        # This pass only supports forward-iterating blocked Iterations
        assert all(i.direction is Forward for i in iterations)
        outer, inner = split(iterations, lambda i: not i.dim.parent.is_Block)

        # Get root's `symbolic_max` out of each outer Dimension
        roots_max = {i.dim.root: i.symbolic_max for i in outer}

        # Process inner iterations and adjust their bounds
        for i in inner:
            # The Iteration's maximum is the MIN of (a) the `symbolic_max` of current
            # Iteration e.g. `x0_blk0 + x0_blk0_size - 1` and (b) the `symbolic_max`
            # of the current Iteration's root Dimension e.g. `x_M`. The generated
            # maximum will be `MIN(x0_blk0 + x0_blk0_size - 1, x_M)`

            # In some corner cases an offset may be added (e.g. after CIRE passes)
            # E.g. assume `i.symbolic_max = x0_blk0 + x0_blk0_size + 1` and
            # `i.dim.symbolic_max = x0_blk0 + x0_blk0_size - 1` then the generated
            # maximum will be `MIN(x0_blk0 + x0_blk0_size + 1, x_M + 2)`
            root_max = roots_max[i.dim.root] + i.symbolic_max - i.dim.symbolic_max

            iter_max = evalrel(min, [i.symbolic_max, root_max])
            mapper[i] = i._rebuild(limits=(i.symbolic_min, iter_max, i.step))

    if mapper:
        iet = Transformer(mapper, nested=True).visit(iet)

        # Emit the C macros that implement the symbolic MIN/MAX used above
        headers = [('%s(a,b)' % MIN.name, ('(((a) < (b)) ? (a) : (b))')),
                   ('%s(a,b)' % MAX.name, ('(((a) > (b)) ? (a) : (b))'))]
    else:
        headers = []

    return iet, {'headers': headers}
def is_on_device(obj, gpu_fit):
    """
    True if the given object is allocated in the device memory, False otherwise.

    Parameters
    ----------
    obj : Indexed or Function
        The target object.
    gpu_fit : list of Function
        The Function's which are known to definitely fit in the device memory. This
        information is given directly by the user through the compiler option
        `gpu-fit` and is propagated down here through the various stages of lowering.
    """
    # Only TimeFunctions with an integer `save` may be too large for the device
    fsave = [f for f in (obj.function,)
             if f.is_TimeFunction and is_integer(f.save)]

    if not fsave:
        # Nothing potentially oversized -> trivially on device
        return True

    if 'all-fallback' in gpu_fit:
        # User asked to optimistically assume everything fits
        warning("TimeFunction %s assumed to fit the GPU memory" % fsave)
        return True

    # Otherwise, on device only if the user explicitly declared each one to fit
    return all(f in gpu_fit for f in fsave)