/
backend.h
438 lines (351 loc) · 21.1 KB
/
backend.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
#pragma once
// Standard C++ includes
#include <algorithm>
#include <array>
#include <functional>
#include <map>
#include <numeric>
#include <string>
// Standard C includes
#include <cassert>
// CUDA includes
#include <cuda.h>
#include <cuda_runtime.h>
#if __has_include(<nccl.h>)
#include <nccl.h>
#define NCCL_AVAILABLE
#endif
// GeNN includes
#include "backendExport.h"
// GeNN code generator includes
#include "code_generator/backendSIMT.h"
#include "code_generator/codeStream.h"
// Forward declarations
namespace filesystem
{
class path;
}
//--------------------------------------------------------------------------
// GeNN::CodeGenerator::CUDA::DeviceSelect
//--------------------------------------------------------------------------
namespace GeNN::CodeGenerator::CUDA
{
//! Methods for selecting CUDA device
enum class DeviceSelect
{
    OPTIMAL,        //!< Pick optimal device based on how well kernels can be simultaneously simulated and occupancy
    MOST_MEMORY,    //!< Pick device with most global memory
    MANUAL,         //!< Use device specified by user (see Preferences::manualDeviceID)
};
//--------------------------------------------------------------------------
// CodeGenerator::CUDA::BlockSizeSelect
//--------------------------------------------------------------------------
//! Methods for selecting CUDA kernel block size
enum class BlockSizeSelect
{
    OCCUPANCY,      //!< Pick optimal blocksize for each kernel based on occupancy
    MANUAL,         //!< Use block sizes specified by user (see Preferences::manualBlockSizes)
};
//--------------------------------------------------------------------------
// CodeGenerator::CUDA::Preferences
//--------------------------------------------------------------------------
//! Preferences for CUDA backend
struct Preferences : public PreferencesBase
{
    Preferences()
    {
        // Default every kernel's manual block size to a single warp (32 threads)
        std::fill(manualBlockSizes.begin(), manualBlockSizes.end(), 32);
    }

    //! Should PTX assembler information be displayed for each CUDA kernel during compilation?
    bool showPtxInfo = false;

    //! Should line info be included in resultant executable for debugging/profiling purposes?
    bool generateLineInfo = false;

    //! Generate corresponding NCCL batch reductions
    bool enableNCCLReductions = false;

    //! How to select GPU device
    DeviceSelect deviceSelectMethod = DeviceSelect::MANUAL;

    //! If device select method is set to DeviceSelect::MANUAL, id of device to use
    unsigned int manualDeviceID = 0;

    //! How to select CUDA blocksize
    BlockSizeSelect blockSizeSelectMethod = BlockSizeSelect::OCCUPANCY;

    //! If block size select method is set to BlockSizeSelect::MANUAL, block size to use for each kernel
    KernelBlockSize manualBlockSizes;

    //! How much constant cache is already used and therefore can't be used by GeNN?
    /*! Each of the five modules which include CUDA headers (neuronUpdate, synapseUpdate, custom update, init and runner)
        takes 72 bytes of constant memory for a lookup table used by cuRAND — hence the default of 72 * 5.
        If your application requires additional constant cache, increase this */
    size_t constantCacheOverhead = 72 * 5;

    //! NVCC compiler options for all GPU code
    std::string userNvccFlags = "";

    //! Update hash with the preferences that affect generated code
    void updateHash(boost::uuids::detail::sha1 &hash) const
    {
        // Superclass
        PreferencesBase::updateHash(hash);

        // **NOTE** showPtxInfo, generateLineInfo and userNvccFlags only affect makefiles/msbuild
        // **NOTE** block size optimization is also not relevant, the chosen block size is hashed in the backend
        // **NOTE** the chosen device itself is hashed in the backend, but the selection method is hashed here
        // (the original comment referenced DeviceSelect::MANUAL_OVERRIDE, which is not an enumerator of the
        //  DeviceSelect enum above — presumably DeviceSelect::MANUAL was meant; confirm against the backend)
        Utils::updateHash(deviceSelectMethod, hash);
        Utils::updateHash(constantCacheOverhead, hash);
        Utils::updateHash(enableNCCLReductions, hash);
    }
};
//--------------------------------------------------------------------------
// CodeGenerator::CUDA::State
//--------------------------------------------------------------------------
//! Backend-specific runtime state, exposing NCCL multi-GPU communicator setup
class BACKEND_EXPORT State : public Runtime::StateBase
{
public:
    State(const Runtime::Runtime &base);

    //------------------------------------------------------------------------
    // Public API
    //------------------------------------------------------------------------
    //! To be called on one rank to generate ID before creating communicator
    void ncclGenerateUniqueID();

    //! Get pointer to unique ID
    unsigned char *ncclGetUniqueID();

    //! Get size of unique ID in bytes
    size_t ncclGetUniqueIDSize() const;

    //! Initialise communicator
    void ncclInitCommunicator(int rank, int numRanks);

private:
    //----------------------------------------------------------------------------
    // Type defines
    //----------------------------------------------------------------------------
    typedef void (*VoidFunction)(void);
    typedef unsigned char* (*BytePtrFunction)(void);
    typedef void (*NCCLInitCommunicatorFunction)(int, int);

    //------------------------------------------------------------------------
    // Members
    //------------------------------------------------------------------------
    // Function pointers which the NCCL wrappers above forward to
    // (presumably resolved from the generated model code by the constructor — confirm in backend source)
    VoidFunction m_NCCLGenerateUniqueID;
    BytePtrFunction m_NCCLGetUniqueID;
    NCCLInitCommunicatorFunction m_NCCLInitCommunicator;

    // Size in bytes of an NCCL unique ID (points at a value owned elsewhere)
    const size_t *m_NCCLUniqueIDSize;
};
//--------------------------------------------------------------------------
// CodeGenerator::CUDA::Backend
//--------------------------------------------------------------------------
//! CUDA implementation of the SIMT code-generator backend
class BACKEND_EXPORT Backend : public BackendSIMT
{
public:
    Backend(const KernelBlockSize &kernelBlockSizes, const Preferences &preferences,
            int device, bool zeroCopy);

    //--------------------------------------------------------------------------
    // CodeGenerator::BackendSIMT virtuals
    //--------------------------------------------------------------------------
    //! On some older devices, shared memory atomics are actually slower than global memory atomics so should be avoided
    virtual bool areSharedMemAtomicsSlow() const final;

    //! Get the prefix to use for shared memory variables
    virtual std::string getSharedPrefix() const final{ return "__shared__ "; }

    //! Get the ID of the current thread within the threadblock
    virtual std::string getThreadID(unsigned int axis = 0) const final;

    //! Get the ID of the current thread block
    virtual std::string getBlockID(unsigned int axis = 0) const final;

    //! How many 'lanes' does underlying hardware have?
    /*! This is typically used for warp-shuffle algorithms */
    virtual unsigned int getNumLanes() const final;

    //! Get the name of the count-leading-zeros function
    virtual std::string getCLZ() const final { return "__clz"; }

    //! Get name of atomic operation
    virtual std::string getAtomic(const Type::ResolvedType &type,
                                  AtomicOperation op = AtomicOperation::ADD,
                                  AtomicMemSpace memSpace = AtomicMemSpace::GLOBAL) const final;

    //! Generate a warp reduction across getNumLanes lanes into lane 0
    virtual void genWarpReduction(CodeStream& os, const std::string& variable,
                                  VarAccessMode access, const Type::ResolvedType& type) const final;

    //! Generate a shared memory barrier
    virtual void genSharedMemBarrier(CodeStream &os) const final;

    //! For SIMT backends which initialize RNGs on device, initialize population RNG with specified seed and sequence
    virtual void genPopulationRNGInit(CodeStream &os, const std::string &globalRNG, const std::string &seed, const std::string &sequence) const final;

    //! Generate a preamble to add substitution name for population RNG
    virtual std::string genPopulationRNGPreamble(CodeStream &os, const std::string &globalRNG) const final;

    //! If required, generate a postamble for population RNG
    /*! For example, in OpenCL, this is used to write local RNG state back to global memory*/
    virtual void genPopulationRNGPostamble(CodeStream &os, const std::string &globalRNG) const final;

    //! Generate code to skip ahead local copy of global RNG
    virtual std::string genGlobalRNGSkipAhead(CodeStream &os, const std::string &sequence) const final;

    //! Get type of population RNG
    virtual Type::ResolvedType getPopulationRNGType() const final;

    //--------------------------------------------------------------------------
    // CodeGenerator::BackendBase virtuals
    //--------------------------------------------------------------------------
    virtual void genNeuronUpdate(CodeStream &os, ModelSpecMerged &modelMerged, BackendBase::MemorySpaces &memorySpaces,
                                 HostHandler preambleHandler) const final;

    virtual void genSynapseUpdate(CodeStream &os, ModelSpecMerged &modelMerged, BackendBase::MemorySpaces &memorySpaces,
                                  HostHandler preambleHandler) const final;

    virtual void genCustomUpdate(CodeStream &os, ModelSpecMerged &modelMerged, BackendBase::MemorySpaces &memorySpaces,
                                 HostHandler preambleHandler) const final;

    virtual void genInit(CodeStream &os, ModelSpecMerged &modelMerged, BackendBase::MemorySpaces &memorySpaces,
                         HostHandler preambleHandler) const final;

    virtual void genDefinitionsPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const final;
    virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const final;
    virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const final;
    virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const final;
    virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const final;

    //! Create backend-specific runtime state object
    /*! \param runtime  runtime object */
    virtual std::unique_ptr<GeNN::Runtime::StateBase> createState(const Runtime::Runtime &runtime) const final;

    //! Create backend-specific array object
    /*! \param type         data type of array
        \param count        number of elements in array, if non-zero will allocate
        \param location     location of array e.g. device-only*/
    virtual std::unique_ptr<Runtime::ArrayBase> createArray(const Type::ResolvedType &type, size_t count,
                                                            VarLocation location, bool uninitialized) const final;

    //! Create array of backend-specific population RNGs (if they are initialised on host this will occur here)
    /*! \param count        number of RNGs required*/
    virtual std::unique_ptr<Runtime::ArrayBase> createPopulationRNG(size_t count) const final;

    //! Generate code to allocate variable with a size known at runtime
    virtual void genLazyVariableDynamicAllocation(CodeStream &os,
                                                  const Type::ResolvedType &type, const std::string &name, VarLocation loc,
                                                  const std::string &countVarName) const final;

    //! Generate code for pushing a variable with a size known at runtime to the 'device'
    virtual void genLazyVariableDynamicPush(CodeStream &os,
                                            const Type::ResolvedType &type, const std::string &name,
                                            VarLocation loc, const std::string &countVarName) const final;

    //! Generate code for pulling a variable with a size known at runtime from the 'device'
    virtual void genLazyVariableDynamicPull(CodeStream &os,
                                            const Type::ResolvedType &type, const std::string &name,
                                            VarLocation loc, const std::string &countVarName) const final;

    //! Generate code for pushing a new pointer to a dynamic variable into the merged group structure on 'device'
    virtual void genMergedDynamicVariablePush(CodeStream &os, const std::string &suffix, size_t mergedGroupIdx,
                                              const std::string &groupIdx, const std::string &fieldName,
                                              const std::string &egpName) const final;

    //! When generating function calls to push to merged groups, backend without equivalent of Unified Virtual Addressing e.g. OpenCL 1.2 may use different types on host
    virtual std::string getMergedGroupFieldHostTypeName(const Type::ResolvedType &type) const final;

    //! Generate a single RNG instance
    /*! On single-threaded platforms this can be a standard RNG like M.T. but, on parallel platforms, it is likely to be a counter-based RNG */
    virtual void genGlobalDeviceRNG(CodeStream &definitions, CodeStream &runner, CodeStream &allocations, CodeStream &free) const final;

    virtual void genTimer(CodeStream &definitions, CodeStream &runner, CodeStream &allocations, CodeStream &free,
                          CodeStream &stepTimeFinalise, const std::string &name, bool updateInStepTime) const final;

    //! Generate code to return amount of free 'device' memory in bytes
    virtual void genReturnFreeDeviceMemoryBytes(CodeStream &os) const final;

    //! On backends which support it, generate a runtime assert
    virtual void genAssert(CodeStream &os, const std::string &condition) const final;

    virtual void genMakefilePreamble(std::ostream &os) const final;
    virtual void genMakefileLinkRule(std::ostream &os) const final;
    virtual void genMakefileCompileRule(std::ostream &os) const final;

    virtual void genMSBuildConfigProperties(std::ostream &os) const final;
    virtual void genMSBuildImportProps(std::ostream &os) const final;
    virtual void genMSBuildItemDefinitions(std::ostream &os) const final;
    virtual void genMSBuildCompileModule(const std::string &moduleName, std::ostream &os) const final;
    virtual void genMSBuildImportTarget(std::ostream &os) const final;

    //! As well as host pointers, are device objects required?
    virtual bool isArrayDeviceObjectRequired() const final{ return true; }

    //! As well as host pointers, are additional host objects required e.g. for buffers in OpenCL?
    virtual bool isArrayHostObjectRequired() const final{ return false; }

    //! Different backends seed RNGs in different ways. Does this one initialise population RNGS on device?
    virtual bool isPopulationRNGInitialisedOnDevice() const final { return true; }

    //! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels
    virtual bool isHostReductionRequired() const final { return getPreferences<Preferences>().enableNCCLReductions; }

    //! How many bytes of memory does 'device' have
    virtual size_t getDeviceMemoryBytes() const final{ return m_ChosenDevice.totalGlobalMem; }

    //! Some backends will have additional small, fast, memory spaces for read-only data which might
    //! Be well-suited to storing merged group structs. This method returns the prefix required to
    //! Place arrays in these and their size in preferential order
    virtual MemorySpaces getMergedGroupMemorySpaces(const ModelSpecMerged &modelMerged) const final;

    //! Get hash digest of this backends identification and the preferences it has been configured with
    virtual boost::uuids::detail::sha1::digest_type getHashDigest() const final;

    //--------------------------------------------------------------------------
    // Public API
    //--------------------------------------------------------------------------
    const cudaDeviceProp &getChosenCUDADevice() const{ return m_ChosenDevice; }
    int getChosenDeviceID() const{ return m_ChosenDeviceID; }
    int getRuntimeVersion() const{ return m_RuntimeVersion; }
    std::string getNVCCFlags() const;

private:
    //--------------------------------------------------------------------------
    // Private methods
    //--------------------------------------------------------------------------
    std::string getNCCLReductionType(VarAccessMode mode) const;
    std::string getNCCLType(const Type::ResolvedType &type) const;

    void genKernelDimensions(CodeStream &os, Kernel kernel, size_t numThreadsX, size_t batchSize, size_t numBlockThreadsY = 1) const;

    //! Emit, for each merged group, a device-side struct array plus a host push function
    //! which copies one group's struct into it with cudaMemcpyToSymbolAsync
    template<typename T>
    void genMergedStructArrayPush(CodeStream &os, const std::vector<T> &groups) const
    {
        // Loop through groups
        for(const auto &g : groups) {
            // Check that a memory space has been assigned
            assert(!g.getMemorySpace().empty());

            // Implement merged group array in previously assigned memory space
            os << g.getMemorySpace() << " Merged" << T::name << "Group" << g.getIndex() << " d_merged" << T::name << "Group" << g.getIndex() << "[" << g.getGroups().size() << "];" << std::endl;

            // Write function to update
            os << "void pushMerged" << T::name << "Group" << g.getIndex() << "ToDevice(unsigned int idx, ";
            g.generateStructFieldArgumentDefinitions(os, *this);
            os << ")";
            {
                CodeStream::Scope b(os);

                // Loop through sorted fields and build struct on the stack
                os << "Merged" << T::name << "Group" << g.getIndex() << " group = {";
                const auto sortedFields = g.getSortedFields(*this);
                for(const auto &f : sortedFields) {
                    os << f.name << ", ";
                }
                os << "};" << std::endl;

                // Push to device
                os << "CHECK_CUDA_ERRORS(cudaMemcpyToSymbolAsync(d_merged" << T::name << "Group" << g.getIndex() << ", &group, ";
                os << "sizeof(Merged" << T::name << "Group" << g.getIndex() << "), idx * sizeof(Merged" << T::name << "Group" << g.getIndex() << ")));" << std::endl;
            }
        }
    }

    //! Emit host code performing an ncclAllReduce over every reduction-target variable
    //! and variable reference of a merged custom update group
    template<typename G>
    void genNCCLReduction(EnvironmentExternalBase &env, G &cg) const
    {
        CodeStream::Scope b(env.getStream());
        env.getStream() << "// merged custom update host reduction group " << cg.getIndex() << std::endl;
        env.getStream() << "for(unsigned int g = 0; g < " << cg.getGroups().size() << "; g++)";
        {
            CodeStream::Scope b(env.getStream());

            // Get reference to group
            env.getStream() << "const auto *group = &merged" << G::name << "Group" << cg.getIndex() << "[g]; " << std::endl;

            EnvironmentGroupMergedField<G> groupEnv(env, cg);
            buildSizeEnvironment(groupEnv);

            // Loop through variables
            const auto *cm = cg.getArchetype().getModel();
            for(const auto &v : cm->getVars()) {
                // If variable is reduction target
                if(v.access & VarAccessModeAttribute::REDUCE) {
                    // Add pointer field
                    const auto resolvedType = v.type.resolve(cg.getTypeContext());
                    groupEnv.addField(resolvedType.createPointer(), "_" + v.name, v.name,
                                      [v](const auto &runtime, const auto &g, size_t)
                                      {
                                          return runtime.getArray(g, v.name);
                                      });

                    // Add NCCL reduction
                    groupEnv.print("CHECK_NCCL_ERRORS(ncclAllReduce($(_" + v.name + "), $(_" + v.name + "), $(_size)");
                    groupEnv.printLine(", " + getNCCLType(resolvedType) + ", " + getNCCLReductionType(getVarAccessMode(v.access)) + ", ncclCommunicator, 0));");
                }
            }

            // Loop through variable references
            for(const auto &v : cm->getVarRefs()) {
                // If variable reference is reduction target
                if(v.access & VarAccessModeAttribute::REDUCE) {
                    // Add pointer field
                    const auto resolvedType = v.type.resolve(cg.getTypeContext());
                    groupEnv.addField(resolvedType.createPointer(), "_" + v.name, v.name,
                                      [v](const auto &runtime, const auto &g, size_t)
                                      {
                                          const auto varRef = g.getVarReferences().at(v.name);
                                          return varRef.getTargetArray(runtime);
                                      });

                    // Add NCCL reduction (reuse resolvedType rather than re-resolving the type,
                    // matching the variable loop above)
                    groupEnv.print("CHECK_NCCL_ERRORS(ncclAllReduce($(_" + v.name + "), $(_" + v.name + "), $(_size)");
                    groupEnv.printLine(", " + getNCCLType(resolvedType) + ", " + getNCCLReductionType(v.access) + ", ncclCommunicator, 0));");
                }
            }
        }
    }

    //! Get the safe amount of constant cache we can use
    size_t getChosenDeviceSafeConstMemBytes() const
    {
        return m_ChosenDevice.totalConstMem - getPreferences<Preferences>().constantCacheOverhead;
    }

    //--------------------------------------------------------------------------
    // Members
    //--------------------------------------------------------------------------
    const int m_ChosenDeviceID;
    cudaDeviceProp m_ChosenDevice;
    int m_RuntimeVersion;
};
}   // namespace GeNN::CodeGenerator::CUDA