Skip to content

Commit

Permalink
CUDA: Support NVVM70 / CUDA 11.2
Browse files Browse the repository at this point in the history
Starting with CUDA 11.2, a new version of NVVM is provided that is based
on LLVM 7.0. This requires a number of changes to support, which must be
maintained in parallel with the existing support for NVVM based on LLVM
3.4. This PR adds these changes, which consist of:

- Addition of a function to query the NVVM IR version, and a property
  indicating whether the NVVM in use is based on LLVM 3.4 or 7.0
  (`is_nvvm70`).
- The CAS hack (inserting a text-based implementation of `cmpxchg` with
  pre-LLVM 3.5 semantics in a function) is only needed with NVVM 3.4 -
  on NVVM 7.0, llvmlite is used to build `cmpxchg` instructions directly
  instead.
- Templates for other atomics (inc, dec, min, max) have the right form
  of the `cmpxchg` instruction inserted depending on the NVVM version.
- The datalayout shorthand is now only replaced for NVVM 3.4.
- There are now two variants of the functions to rewrite the IR -
  `llvm100_to_70_ir` and `llvm100_to_34_ir`. `llvm100_to_34_ir` is the
  old `llvm_39_to_34_ir` with a name reflecting what it currently does.
- `llvm100_to_70_ir` removes the `willreturn` attribute from functions,
  as it is not supported by LLVM 7.0. It also converts DISPFlags to main
  subprogram DIFlags. For example, `spFlags: DISPFlagDefinition |
  DISPFlagOptimized` is rewritten as `isDefinition: true, isOptimized:
  true`.
- For NVVM 7.0, the `DIBuilder` also used for the CPU target can be used,
  instead of the `NvvmDIBuilder` that was needed to support NVVM 3.4.
- Some tests are updated to support modified function names, and also to
  expect a CUDA version of 11.2.
- `test_nvvm_driver` is updated to include appropriate IR for both NVVM
  3.4 and 7.0. Some refactoring also makes its code clearer (e.g.
  renaming `get_ptx()` to `get_nvvmir()`, because it returns NVVM IR
  and not PTX).
- Some optimizations in LLVM 7.0 result in different code generation in
  `test_constmem`, so alternative expected results are added for when
  NVVM 7.0 is used. Note that this recovers some optimizations that were
  lost when IR optimization using llvmlite was switched off (PR numba#6030,
  "Don't optimize IR before sending it to NVVM").
- `test_debuginfo` is updated to match the format of the debuginfo
  section produced by both NVVM 3.4 and 7.0 (there is some variation in
  whitespace between these versions).
  • Loading branch information
gmarkall committed Jan 25, 2021
1 parent f57bd25 commit 43d5090
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 118 deletions.
223 changes: 168 additions & 55 deletions numba/cuda/cudadrv/nvvm.py
Expand Up @@ -101,6 +101,11 @@ class NVVM(object):

# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),

# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
}

# Singleton reference
Expand All @@ -127,13 +132,34 @@ def __new__(cls):

return cls.__INSTANCE

def __init__(self):
    """Cache the NVVM IR / debug-metadata versions on the instance.

    The cached major/minor IR version drives the ``is_nvvm70`` property.
    """
    (self._majorIR,
     self._minorIR,
     self._majorDbg,
     self._minorDbg) = self.get_ir_version()

@property
def is_nvvm70(self):
    # IR version 1.6 is the version reported by the LLVM 7.0-based NVVM
    # that ships with CUDA 11.2; anything older is the LLVM 3.4-based NVVM.
    return (self._majorIR, self._minorIR) >= (1, 6)

def get_version(self):
    """Return the NVVM library version as a ``(major, minor)`` tuple.

    Raises NvvmError (via ``check_error``) if the driver call fails.
    """
    major, minor = c_int(), c_int()
    status = self.nvvmVersion(byref(major), byref(minor))
    self.check_error(status, 'Failed to get version.')
    return major.value, minor.value

def get_ir_version(self):
    """Query the NVVM IR and debug-metadata versions.

    Returns a 4-tuple ``(majorIR, minorIR, majorDbg, minorDbg)``.
    Raises NvvmError (via ``check_error``) if the driver call fails.
    """
    results = [c_int() for _ in range(4)]
    status = self.nvvmIRVersion(*(byref(r) for r in results))
    self.check_error(status, 'Failed to get IR version.')
    return tuple(r.value for r in results)

def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
Expand Down Expand Up @@ -399,15 +425,26 @@ def get(self):
return self.bc


ir_numba_atomic_cas = """
ir_numba_cas_hack = """
define internal {T} @___numba_atomic_{T}_cas_hack({T}* %ptr, {T} %cmp, {T} %val) alwaysinline {{
%out = cmpxchg volatile {T}* %ptr, {T} %cmp, {T} %val monotonic
ret {T} %out
}}
""" # noqa: E501

# cmpxchg on NVVM 7.0 (LLVM 7.0): needs both success and failure orderings,
# and yields a { value, i1 success } struct from which the loaded value is
# extracted into %cas.
cas_nvvm70 = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501


# cmpxchg on NVVM 3.4 (LLVM 3.4): single ordering, pre-LLVM 3.5 semantics —
# the instruction returns the loaded value directly.
cas_nvvm34 = """
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic
""" # noqa: E501


# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary = """
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
Expand All @@ -419,7 +456,7 @@ def get(self):
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -429,18 +466,18 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_inc = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %ptr, {T} %val) alwaysinline {{
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %ptr
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
%cas = cmpxchg volatile {T}* %ptr, {T} %old, {T} %new monotonic
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -449,18 +486,18 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_dec = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %ptr, {T} %val) alwaysinline {{
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %ptr
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
%cas = cmpxchg volatile {T}* %ptr, {T} %old, {T} %new monotonic
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -469,7 +506,7 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_minmax = """
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
Expand All @@ -488,10 +525,10 @@ def get(self):
attempt:
; Attempt to swap in the value
%iold = bitcast {T} %dold to {Ti}
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%ival = bitcast {T} %val to {Ti}
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %iold, {Ti} %ival monotonic
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
Expand All @@ -501,6 +538,33 @@ def get(self):
""" # noqa: E501


def ir_cas(Ti):
    """Return a cmpxchg IR snippet for integer type ``Ti``, matching the
    semantics of the NVVM version in use (LLVM 7.0 vs. 3.4)."""
    template = cas_nvvm70 if NVVM().is_nvvm70 else cas_nvvm34
    return template.format(Ti=Ti)


def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    """Render the atomic binary-op IR template for value type ``T``,
    integer cast type ``Ti``, LLVM op ``OP`` and function suffix ``FUNC``,
    with the version-appropriate cmpxchg inserted."""
    return ir_numba_atomic_binary_template.format(
        T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))


def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    """Render the atomic min/max IR template, with the version-appropriate
    cmpxchg inserted for the integer cast type ``Ti``."""
    return ir_numba_atomic_minmax_template.format(
        T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, FUNC=FUNC,
        CAS=ir_cas(Ti))


def ir_numba_atomic_inc(T, Tu):
    """Render the atomic increment IR template for signed type ``T`` /
    unsigned name ``Tu``, with the version-appropriate cmpxchg inserted."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_inc_template.format(**params)


def ir_numba_atomic_dec(T, Tu):
    """Render the atomic decrement IR template for signed type ``T`` /
    unsigned name ``Tu``, with the version-appropriate cmpxchg inserted."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_dec_template.format(**params)


def _replace_datalayout(llvmir):
"""
Find the line containing the datalayout and replace it
Expand All @@ -525,59 +589,58 @@ def llvm_to_ptx(llvmir, **opts):

cu = CompilationUnit()
libdevice = LibDevice(arch=opts.get('arch', 'compute_20'))
# New LLVM generate a shorthand for datalayout that NVVM does not know
llvmir = _replace_datalayout(llvmir)
# Replace with our cmpxchg and atomic implementations because LLVM 3.5 has
# a new semantic for cmpxchg.

replacements = [
('declare double @___numba_atomic_double_add(double*, double)',
ir_numba_atomic_binary.format(T='double', Ti='i64', OP='fadd',
FUNC='add')),
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @___numba_atomic_float_sub(float*, float)',
ir_numba_atomic_binary.format(T='float', Ti='i32', OP='fsub',
FUNC='sub')),
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @___numba_atomic_double_sub(double*, double)',
ir_numba_atomic_binary.format(T='double', Ti='i64', OP='fsub',
FUNC='sub')),
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @___numba_atomic_u64_inc(i64*, i64)',
ir_numba_atomic_inc.format(T='i64', Tu='u64')),
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @___numba_atomic_u64_dec(i64*, i64)',
ir_numba_atomic_dec.format(T='i64', Tu='u64')),
('declare i32 @___numba_atomic_i32_cas_hack(i32*, i32, i32)',
ir_numba_atomic_cas.format(T='i32')),
('declare i64 @___numba_atomic_i64_cas_hack(i64*, i64, i64)',
ir_numba_atomic_cas.format(T='i64')),
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @___numba_atomic_float_max(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='',
OP='nnan olt', PTR_OR_VAL='ptr',
FUNC='max')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @___numba_atomic_double_max(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='',
OP='nnan olt', PTR_OR_VAL='ptr',
FUNC='max')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @___numba_atomic_float_min(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='',
OP='nnan ogt', PTR_OR_VAL='ptr',
FUNC='min')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @___numba_atomic_double_min(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='',
OP='nnan ogt', PTR_OR_VAL='ptr',
FUNC='min')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @___numba_atomic_float_nanmax(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='nan',
OP='ult', PTR_OR_VAL='', FUNC='max')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @___numba_atomic_double_nanmax(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='nan',
OP='ult', PTR_OR_VAL='', FUNC='max')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @___numba_atomic_float_nanmin(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='nan',
OP='ugt', PTR_OR_VAL='', FUNC='min')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @___numba_atomic_double_nanmin(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='nan',
OP='ugt', PTR_OR_VAL='', FUNC='min')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]

if not NVVM().is_nvvm70:
# Replace with our cmpxchg implementation because LLVM 3.5 has a new
# semantic for cmpxchg.
replacements += [
('declare i32 @___numba_atomic_i32_cas_hack(i32*, i32, i32)',
ir_numba_cas_hack.format(T='i32')),
('declare i64 @___numba_atomic_i64_cas_hack(i64*, i64, i64)',
ir_numba_cas_hack.format(T='i64'))
]
# Newer LLVMs generate a shorthand for datalayout that NVVM34 does not
# know
llvmir = _replace_datalayout(llvmir)

for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)

Expand All @@ -587,7 +650,11 @@ def llvm_to_ptx(llvmir, **opts):
# pass to NVVM.
llvmir = llvmir.replace('llvm.numba_nvvm.atomic', 'llvm.nvvm.atomic')

llvmir = llvm39_to_34_ir(llvmir)
if NVVM().is_nvvm70:
llvmir = llvm100_to_70_ir(llvmir)
else:
llvmir = llvm100_to_34_ir(llvmir)

cu.add_module(llvmir.encode('utf8'))
cu.add_module(libdevice.get())

Expand Down Expand Up @@ -642,10 +709,51 @@ def patch_ptx_debug_pubnames(ptx):

re_parenthesized_list = re.compile(r"\((.*)\)")

# Matches the spFlags field (and its trailing comma) in an LLVM 10.0
# !DISubprogram metadata line.
re_spflags = re.compile(r"spFlags: (.*),")

# Maps each LLVM 10.0 DISPFlag to the LLVM 7.0 boolean DISubprogram field
# that expresses the same thing.
spflagmap = {
    'DISPFlagDefinition': 'isDefinition',
    'DISPFlagOptimized': 'isOptimized',
}


def llvm100_to_70_ir(ir):
    """
    Convert LLVM 10.0 IR for LLVM 7.0.
    """
    converted = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            # LLVM 7.0 does not know the 'willreturn' function attribute,
            # so strip it from attribute group definitions.
            match = re_attributes_def.match(line)
            kept = ' '.join(attr for attr in match.group(1).split()
                            if attr != 'willreturn')
            line = line.replace(match.group(1), kept)

        if '!DISubprogram' in line:
            # Replace the DISPFlags (LLVM 10.0) with main subprogram DIFlags
            # (LLVM 7.0). Example:
            #
            #   spFlags: DISPFlagDefinition | DISPFlagOptimized
            #
            # becomes:
            #
            #   isDefinition: true, isOptimized: true
            match = re_spflags.search(line)
            flags = match.group(1).split(' | ')
            rewritten = ", ".join('%s: true' % spflagmap[f] for f in flags)
            # match.end() - 1 keeps the trailing comma captured by the
            # pattern, so the following metadata field stays separated.
            line = line[:match.start()] + rewritten + line[match.end() - 1:]

        converted.append(line)

    return '\n'.join(converted)

def llvm39_to_34_ir(ir):

def llvm100_to_34_ir(ir):
"""
Convert LLVM 3.9 IR for LLVM 3.4.
Convert LLVM 10.0 IR for LLVM 3.4.
"""
def parse_out_leading_type(s):
par_level = 0
Expand Down Expand Up @@ -790,7 +898,12 @@ def set_cuda_kernel(lfunc):

# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
if NVVM().is_nvvm70:
ir_versions = [i32(1), i32(6), i32(3), i32(0)]
else:
ir_versions = [i32(1), i32(2), i32(2), i32(0)]

md_ver = m.add_metadata(ir_versions)
m.add_named_metadata('nvvmir.version', md_ver)


Expand Down
4 changes: 1 addition & 3 deletions numba/cuda/cudaimpl.py
Expand Up @@ -774,9 +774,7 @@ def ptx_atomic_cas_tuple(context, builder, sig, args):
if aryty.dtype in (cuda.cudadecl.integer_numba_types):
lmod = builder.module
bitwidth = aryty.dtype.bitwidth
return builder.call(nvvmutils.declare_atomic_cas_int(lmod,
bitwidth),
(ptr, old, val))
return nvvmutils.atomic_cmpxchg(builder, lmod, bitwidth, ptr, old, val)
else:
raise TypeError('Unimplemented atomic compare_and_swap '
'with %s array' % dtype)
Expand Down
9 changes: 9 additions & 0 deletions numba/cuda/nvvmutils.py
Expand Up @@ -12,6 +12,15 @@ def declare_atomic_cas_int(lmod, isize):
lc.Type.int(isize)))
return lmod.get_or_insert_function(fnty, fname)


def atomic_cmpxchg(builder, lmod, isize, ptr, cmp, val):
    """Emit an atomic compare-and-swap and return the loaded value.

    On NVVM 7.0 a native cmpxchg instruction is built with llvmlite; on
    NVVM 3.4 a call to the text-based ``cas_hack`` function is emitted
    instead, since its cmpxchg uses pre-LLVM 3.5 semantics.
    """
    if not nvvm.NVVM().is_nvvm70:
        fn = declare_atomic_cas_int(lmod, isize)
        return builder.call(fn, (ptr, cmp, val))
    pair = builder.cmpxchg(ptr, cmp, val, 'monotonic', 'monotonic')
    return builder.extract_value(pair, 0)

# For atomic intrinsics, "numba_nvvm" prevents LLVM 9 onwards auto-upgrading
# them into atomicrmw instructions that are not recognized by NVVM. It is
# replaced with "nvvm" in llvm_to_ptx later, after the module has been parsed
Expand Down

0 comments on commit 43d5090

Please sign in to comment.