-
Notifications
You must be signed in to change notification settings - Fork 29
/
stream-scaling
executable file
·280 lines (244 loc) · 9.34 KB
/
stream-scaling
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
#!/bin/bash
# Automated download, compile, and run of the stream memory bandwidth
# test to show memory scaling as number of CPU cores increases.
#
# Takes a single optional parameter input for the maximum number of
# cores to test. Defaults to 8, unless this is a system where it can
# determine that information from /proc/cpuinfo. It will very likely
# guess correctly on Linux, for example.
#
# Compiling stream on mainstream Linux systems requires gcc 4.2
# for the OpenMP libraries used here to be available.
#
# The default way stream is compiled, it operates on an array of
# 2,000,000 elements taking up approximately 46MB of RAM. If the
# total amount of processor cache on your system exceeds this amount,
# that means more of the data will fit in cache than intended, and
# the results will be inflated. Accordingly, this cache size is
# estimated (in a way that only works on Linux), and the size of
# the array used is increased to be ten times as large as that total.
# Limit the maximum array size used so that the data structure fits
# into a memory block without overflow. The default makes for about 3GB
# of memory just for the main array, plus some other structures,
# and just fits on most 64-bit systems. A lower limit may
# be needed on some systems: set MAX_ARRAY_SIZE in the environment
# before running this script to override the default without editing
# the file. Values that are not positive integers are ignored.
MAX_ARRAY_SIZE="${MAX_ARRAY_SIZE:-130000000}"
if ! [[ "$MAX_ARRAY_SIZE" =~ ^[1-9][0-9]*$ ]] ; then
  echo "Warning: ignoring invalid MAX_ARRAY_SIZE '$MAX_ARRAY_SIZE', using 130000000" >&2
  MAX_ARRAY_SIZE=130000000
fi
#
# Determine maximum cores to test
#
# Pick the highest thread count to test. An explicit first argument
# wins; otherwise count the CPU entries listed in /proc/cpuinfo.
if [ -n "$1" ] ; then
  # Anything other than a positive integer would make the test loop
  # further down silently run zero times, so reject it up front.
  if ! [[ "$1" =~ ^[1-9][0-9]*$ ]] ; then
    echo "Error: maximum core count must be a positive integer, got '$1'" >&2
    exit 1
  fi
  MAX_CORES="$1"
elif [ -r "/proc/cpuinfo" ] ; then
  # Anchor the pattern so that only lines beginning with "processor"
  # are counted, not any line that merely contains the word.
  MAX_CORES=$(grep -c '^processor' /proc/cpuinfo)
fi
if ! [[ "$MAX_CORES" =~ ^[1-9][0-9]*$ ]] ; then
  # Might as well have a default bigger than most systems ship with
  # if all else fails. This also covers a /proc/cpuinfo in which no
  # processor lines were found (count of 0).
  MAX_CORES=8
fi
# Uncomment this to get verbose output of every stream run
# By default, the first one includes full details, while later
# ones only show the Triad output and a confirmation of
# core count
#VERBOSE=1
# Uncomment to show more debugging output
#DEBUG=1
function total_cache_size {
  # Total up all of the non-instruction caches for every CPU
  # on the system.
  #
  # Arguments:
  #   $1 - name of the variable that receives the computed total, in
  #        bytes (assigned with printf -v).
  #   $2 - optional directory that holds the cpu* entries; defaults to
  #        /sys/devices/system/cpu.
  # Outputs:
  #   One "CPU <dir> Level <n> Cache: <size> (<type>)" line on stdout
  #   for every cache index found, instruction caches included.
  #   An error message on stderr if a size cannot be interpreted.
  # Returns:
  #   0 on success; 2 if no result variable name was given.
  #   On an uninterpretable size it returns 1 and leaves the result
  #   variable untouched, so a caller can detect the failure by finding
  #   its variable still empty.
  #
  # Inside of <dir>/cpuN/cache/ are a series of directories named
  # index[0..n] that represent each of the layers of cache on that
  # CPU.  Each is labeled with a level, size, and type, contained in
  # files with those names.  Types handled here are "Instruction"
  # (skipped) and everything else (summed).  Sizes are expected to be
  # digits followed by "K".
  #
  # No de-duplication is attempted: every index entry under every cpu*
  # directory is added to the total.
  #
  # Locals carry a __tcs_ prefix so they cannot shadow the variable the
  # caller asked to have filled in.
  local __tcs_result=$1
  local __tcs_root=${2:-/sys/devices/system/cpu}
  local __tcs_total_kb=0
  local __tcs_cpu __tcs_index __tcs_level __tcs_size __tcs_type

  if [ -z "$__tcs_result" ] ; then
    echo "Error: total_cache_size needs the name of a result variable" >&2
    return 2
  fi

  for __tcs_cpu in "$__tcs_root"/cpu* ; do
    for __tcs_index in "$__tcs_cpu"/cache/index* ; do
      # Skips entries without cache data, including the literal
      # pattern that an unmatched glob leaves behind.
      if [ ! -f "$__tcs_index/size" ] ; then
        continue
      fi

      __tcs_level=$(cat "$__tcs_index/level")
      __tcs_size=$(cat "$__tcs_index/size")
      __tcs_type=$(cat "$__tcs_index/type")
      echo "CPU $__tcs_cpu Level $__tcs_level Cache: $__tcs_size ($__tcs_type)"

      if [ "$__tcs_type" = "Instruction" ] ; then
        # Don't count instruction caches, just data & unified
        continue
      fi

      # Only sizes written as <digits>K are understood; anything
      # else means we don't know what we're looking at here.
      if [[ "$__tcs_size" =~ ^[0-9]+K$ ]] ; then
        # 10# forces base 10 so a leading zero is not read as octal
        (( __tcs_total_kb += 10#${__tcs_size%K} ))
      else
        echo "Error: can't interpret format of CPU cache information in $__tcs_index/size" >&2
        return 1
      fi
    done
  done

  printf -v "$__tcs_result" '%s' "$(( __tcs_total_kb * 1024 ))"
}
function simple_cache_size {
  # Original, simpler cache size computation.  Doesn't give accurate
  # results at all on processors with L3 caches.  Intel CPUs will
  # typically publish that size into /proc/cpuinfo, while some
  # AMD processors with large L3 caches will instead publish
  # their L2 cache size to there.  Ultimately total_cache_size is the
  # better approach anyway, because it sums all of the various cache
  # levels, rather than just using the one that gets published to the
  # CPU summary value.
  #
  # Left here as example code, in case some future processors that
  # provide cache info in /proc/cpuinfo but not /sys/devices/system/cpu
  # turn up.
  #
  # Arguments:
  #   $1 - name of the variable that receives the total, in bytes.
  #   $2 - optional path of the cpuinfo file; defaults to /proc/cpuinfo.
  # Returns:
  #   0 on success; 2 if no result variable name was given.
  #   If the file cannot be read the total is 0.
  #
  # Locals carry a __scs_ prefix so they cannot shadow the variable the
  # caller asked to have filled in (e.g. TOTAL_CACHE).
  local __scs_result=$1
  local __scs_cpuinfo=${2:-/proc/cpuinfo}
  local __scs_total_kb=0
  local __scs_line
  # Captures the number that follows "cache size ... :" on a line
  local __scs_re='cache size[^:]*:[[:space:]]*([0-9]+)'

  if [ -z "$__scs_result" ] ; then
    echo "Error: simple_cache_size needs the name of a result variable" >&2
    return 2
  fi

  if [ -r "$__scs_cpuinfo" ] ; then
    # The || clause keeps a final line that lacks a trailing newline
    while IFS= read -r __scs_line || [ -n "$__scs_line" ] ; do
      if [[ "$__scs_line" =~ $__scs_re ]] ; then
        # 10# forces base 10 so a leading zero is not read as octal
        (( __scs_total_kb += 10#${BASH_REMATCH[1]} ))
      fi
    done < "$__scs_cpuinfo"
  fi

  # Convert this from its unit of kilobytes into regular bytes, because "MB"
  # figures from stream are 1M, not 2^20
  printf -v "$__scs_result" '%s' "$(( __scs_total_kb * 1024 ))"
}
#
# stream_array_elements determines how large the array stream
# runs against needs to be to avoid caching effects.
#
# Takes one input: the name of the variable to save the needed
# array size to.
#
function stream_array_elements {
  # Work out how many array elements stream should be compiled with so
  # that the data set is large compared with the CPU caches.
  #
  # Arguments:
  #   $1 - name of the variable that receives the element count.
  # Globals read:
  #   MAX_ARRAY_SIZE - upper bound on the element count (130000000 is
  #                    used if it is unset).
  # Calls:
  #   total_cache_size, which is expected to store the cache total in
  #   bytes into the variable it is given, or to leave it empty when
  #   the total cannot be determined.
  # Outputs:
  #   Progress messages on stdout.
  #
  # Bash normally doesn't let functions return values usefully.
  # Assigning to the caller-named variable lets this set variables
  # outside of the function more cleanly than using globals here.
  local __resultvar=$1
  local NEEDED_SIZE=2000000
  local max_elements=${MAX_ARRAY_SIZE:-130000000}
  # Local and initially empty, so a value inherited from the
  # environment can never be mistaken for a measured cache size.
  local TOTAL_CACHE=

  total_cache_size TOTAL_CACHE

  # Empty (or otherwise non-numeric) means the cache size is unknown
  if ! [[ "$TOTAL_CACHE" =~ ^[0-9]+$ ]] ; then
    echo Unable to guess cache size on this system.  Using default.
    NEEDED_SIZE=2000000
    printf -v "$__resultvar" '%s' "$NEEDED_SIZE"
    return
  fi

  echo "Total CPU system cache: $TOTAL_CACHE bytes"

  # We know that every 1 million array entries in stream produces approximately
  # 22 million bytes (not megabytes!) of data.  Round that down to make more
  # entries required.  And then increase the estimated sum of cache sizes by
  # an order of magnitude to compute how large the array should be, to make
  # sure cache effects are minimized.
  local BYTES_PER_ARRAY_ENTRY=22
  # 10# forces base 10 so a leading zero is not read as octal
  (( NEEDED_SIZE = 10 * 10#$TOTAL_CACHE / BYTES_PER_ARRAY_ENTRY ))

  echo "Suggested minimum array elements needed: $NEEDED_SIZE"

  # Never go below the size used when the cache size is unknown
  if [ "$NEEDED_SIZE" -lt 2000000 ] ; then
    NEEDED_SIZE=2000000
  fi

  # The array sizing code will overflow 32 bits on systems with many
  # processors having lots of cache.  The crash looks like this:
  #
  # $ gcc -O3 -DN=133823657 -fopenmp stream.c -o stream
  # /tmp/ccecdC49.o: In function `checkSTREAMresults':
  # stream.c:(.text+0x34): relocation truncated to fit: R_X86_64_32S against `.bss'
  # /tmp/ccecdC49.o: In function `main.omp_fn.6':
  # stream.c:(.text+0x2a6): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x348): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x388): relocation truncated to fit: R_X86_64_32S against `.bss'
  # /tmp/ccecdC49.o: In function `main.omp_fn.8':
  # stream.c:(.text+0x4ed): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x514): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x548): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x58c): relocation truncated to fit: R_X86_64_32S against `.bss'
  # /tmp/ccecdC49.o: In function `main.omp_fn.9':
  # stream.c:(.text+0x615): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x660): relocation truncated to fit: R_X86_64_32S against `.bss'
  # stream.c:(.text+0x6ab): additional relocation overflows omitted from the output
  # collect2: ld returned 1 exit status
  #
  # Clamp the upper value to a smaller maximum size to try and avoid this
  # error.  130,000,000 makes for approximately a 3GB array.
  if [ "$NEEDED_SIZE" -gt "$max_elements" ] ; then
    NEEDED_SIZE=$max_elements
    echo Limiting array size to fit into a 32 bit structure
  fi

  # Given the sizing above uses a factor of 10X cache size, this reduced size
  # is still large enough for current generation processors up to the 48 core
  # range.  For example, a system containing 8 Intel Xeon L7555 processors with
  # 4 cores having 24576 KB cache each will suggest:
  #
  # Total CPU system cache: 814743552 bytes
  # Computed minimum array elements needed: 370337978
  #
  # So using 130,000,000 instead of 370,337,978 is still an array >3X the
  # size of cache sum.  Really large systems with >48 processors might overflow
  # this still, but hopefully this limitation will be addressed by the
  # underlying stream code being called here eventually, rather than
  # trying to work around it here.

  echo "Array elements used: $NEEDED_SIZE"
  printf -v "$__resultvar" '%s' "$NEEDED_SIZE"
  return
}
#
# Execute cache size estimations
#
# Size the benchmark array once, then build the preprocessor define
# that is later handed to the compiler.
echo '=== CPU cache information ==='
stream_array_elements ARRAY_SIZE
ARRAY_FLAG="-DN=${ARRAY_SIZE}"

# Extra detail is printed only when DEBUG has been set
if [[ -n ${DEBUG} ]] ; then
  echo Array size is ${ARRAY_SIZE}
  echo Array flag is ${ARRAY_FLAG}
fi
#
# Download and compile stream
#
echo
echo === Check and build stream ===

# Fetch the benchmark source only when there is no local copy.  A
# failed download is fatal: without stream.c there is nothing to build.
if [ ! -f stream.c ] ; then
  if ! wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c ; then
    echo "Error: unable to download stream.c, aborting" >&2
    # Don't leave a partial file behind to be picked up by the next run
    rm -f stream.c
    exit 1
  fi
fi

# NOTE(review): newer upstream releases of stream.c size their arrays
# with the STREAM_ARRAY_SIZE macro instead of N -- confirm against the
# copy in use.  When the source mentions that macro, define it, so the
# computed array size actually takes effect.
if [ -n "$ARRAY_SIZE" ] && grep -q 'STREAM_ARRAY_SIZE' stream.c ; then
  ARRAY_FLAG="-DSTREAM_ARRAY_SIZE=$ARRAY_SIZE"
  if [ -n "$DEBUG" ] ; then
    echo "Array flag is $ARRAY_FLAG"
  fi
fi

# Since the array size is fixed at compile time, we have to
# recompile it each time, in case the binary already there
# was generated on a system with a smaller cache
rm -f stream

# ${ARRAY_FLAG:+...} drops the argument entirely if the flag is empty
if ! gcc -O3 ${ARRAY_FLAG:+"$ARRAY_FLAG"} -fopenmp stream.c -o stream \
    || [ ! -x stream ] ; then
  echo Error:  did not find valid stream program compiled here, aborting
  exit 1
fi
#
# Run the test
#
echo
echo === Testing up to $MAX_CORES cores ===
echo

# Run stream once for each thread count from 1 to MAX_CORES.
# OMP_NUM_THREADS is exported so each ./stream run sees the count.
i=1
while [[ $i -le $MAX_CORES ]] ; do
  export OMP_NUM_THREADS="$i"
  if [ -n "$VERBOSE" ] || [ "$i" -eq 1 ] ; then
    # Full output: always for the first run, and for every run when
    # VERBOSE is set.
    ./stream
  else
    # Show just a summary after the first run.  grep -E replaces the
    # obsolescent egrep, which newer GNU grep releases warn about on
    # every invocation.
    ./stream | grep -E "Number of Threads requested|Function|Triad"
  fi
  ((i++))
  echo
done