Skip to content

Commit

Permalink
CUDA: Support NVVM70 / CUDA 11.2
Browse files Browse the repository at this point in the history
Starting with CUDA 11.2, a new version of NVVM is provided that is based
on LLVM 7.0. This requires a number of changes to support, which must be
maintained in parallel with the existing support for NVVM based on LLVM
3.4. This PR adds these changes, which consist of:

- Addition of a function to query the NVVM IR version, and a property
  indicating whether the NVVM in use is based on LLVM 3.4 or 7.0
  (`is_nvvm70`).
- The CAS hack (inserting a text-based implementation of `cmpxchg` with
  pre-LLVM 3.5 semantics in a function) is only needed with NVVM 3.4 -
  on NVVM 7.0, llvmlite is used to build `cmpxchg` instructions directly
  instead.
- Templates for other atomics (inc, dec, min, max) have the right form
  of the `cmpxchg` instruction inserted depending on the NVVM version.
- The datalayout shorthand is now only replaced for NVVM 3.4.
- There are now two variants of the functions to rewrite the IR -
  `llvm100_to_70_ir` and `llvm100_to_34_ir`. `llvm100_to_34_ir` is the
  old `llvm_39_to_34_ir` with a name reflecting what it currently does.
- `llvm100_to_70_ir` removes the `willreturn` attribute from functions,
  as it is not supported by LLVM 7.0. It also converts DISPFlags to main
  subprogram DIFlags. For example, `spFlags: DISPFlagDefinition |
  DISPFlagOptimized` is rewritten as `isDefinition: true, isOptimized:
  true`.
- For NVVM 7.0, the `DIBuilder` also used for the CPU target can be used,
  instead of the `NvvmDIBuilder` that was needed to support NVVM 3.4.
- Some tests are updated to support modified function names, and also to
  expect a CUDA version of 11.2.
- `test_nvvm_driver` is updated to include appropriate IR for both NVVM
  3.4 and 7.0. Some refactoring also makes its code clearer (e.g.
  renaming `get_ptx()` to `get_nvvmir()`, because it returns NVVM IR
  and not PTX).
- Some optimizations in LLVM 7.0 result in different code generation in
  `test_constmem`, so alternative expected results are added for when
  NVVM 7.0 is used. Note that this recovers some optimizations that were
  lost when IR optimization using llvmlite was switched off (PR numba#6030,
  "Don't optimize IR before sending it to NVVM").
- `test_debuginfo` is updated to match the format of the debuginfo
  section produced by both NVVM 3.4 and 7.0 (there is some variation in
  whitespace between these versions).
  • Loading branch information
gmarkall committed Jan 25, 2021
1 parent f57bd25 commit 43d5090
Show file tree
Hide file tree
Showing 10 changed files with 234 additions and 118 deletions.
223 changes: 168 additions & 55 deletions numba/cuda/cudadrv/nvvm.py
Expand Up @@ -101,6 +101,11 @@ class NVVM(object):

# nvvmResult nvvmGetProgramLog(nvvmProgram cu, char *buffer)
'nvvmGetProgramLog': (nvvm_result, nvvm_program, c_char_p),

# nvvmResult nvvmIRVersion (int* majorIR, int* minorIR, int* majorDbg,
# int* minorDbg )
'nvvmIRVersion': (nvvm_result, POINTER(c_int), POINTER(c_int),
POINTER(c_int), POINTER(c_int)),
}

# Singleton reference
Expand All @@ -127,13 +132,34 @@ def __new__(cls):

return cls.__INSTANCE

def __init__(self):
    """Cache the NVVM IR / debug-metadata versions on the instance.

    The cached major/minor IR version drives the ``is_nvvm70`` property.
    """
    (self._majorIR,
     self._minorIR,
     self._majorDbg,
     self._minorDbg) = self.get_ir_version()

@property
def is_nvvm70(self):
    # IR version 1.6 is the version reported by the LLVM 7.0-based NVVM
    # that ships with CUDA 11.2; anything older is the LLVM 3.4-based NVVM.
    return (self._majorIR, self._minorIR) >= (1, 6)

def get_version(self):
    """Return the NVVM library version as a ``(major, minor)`` tuple.

    Raises NvvmError (via ``check_error``) if the driver call fails.
    """
    major, minor = c_int(), c_int()
    status = self.nvvmVersion(byref(major), byref(minor))
    self.check_error(status, 'Failed to get version.')
    return major.value, minor.value

def get_ir_version(self):
    """Query the NVVM IR and debug-metadata versions.

    Returns a 4-tuple ``(majorIR, minorIR, majorDbg, minorDbg)``.
    Raises NvvmError (via ``check_error``) if the driver call fails.
    """
    results = [c_int() for _ in range(4)]
    status = self.nvvmIRVersion(*(byref(r) for r in results))
    self.check_error(status, 'Failed to get IR version.')
    return tuple(r.value for r in results)

def check_error(self, error, msg, exit=False):
if error:
exc = NvvmError(msg, RESULT_CODE_NAMES[error])
Expand Down Expand Up @@ -399,15 +425,26 @@ def get(self):
return self.bc


ir_numba_atomic_cas = """
ir_numba_cas_hack = """
define internal {T} @___numba_atomic_{T}_cas_hack({T}* %ptr, {T} %cmp, {T} %val) alwaysinline {{
%out = cmpxchg volatile {T}* %ptr, {T} %cmp, {T} %val monotonic
ret {T} %out
}}
""" # noqa: E501

# cmpxchg on NVVM 7.0 (LLVM 7.0): needs both success and failure orderings,
# and yields a { value, i1 success } struct from which the loaded value is
# extracted into %cas.
cas_nvvm70 = """
%cas_success = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic monotonic
%cas = extractvalue {{ {Ti}, i1 }} %cas_success, 0
""" # noqa: E501


# cmpxchg on NVVM 3.4 (LLVM 3.4): single ordering, pre-LLVM 3.5 semantics —
# the instruction returns the loaded value directly.
cas_nvvm34 = """
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic
""" # noqa: E501


# Translation of code from CUDA Programming Guide v6.5, section B.12
ir_numba_atomic_binary = """
ir_numba_atomic_binary_template = """
define internal {T} @___numba_atomic_{T}_{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%iptr = bitcast {T}* %ptr to {Ti}*
Expand All @@ -419,7 +456,7 @@ def get(self):
%dold = bitcast {Ti} %old to {T}
%dnew = {OP} {T} %dold, %val
%new = bitcast {T} %dnew to {Ti}
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %old, {Ti} %new monotonic
{CAS}
%repeat = icmp ne {Ti} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -429,18 +466,18 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_inc = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %ptr, {T} %val) alwaysinline {{
ir_numba_atomic_inc_template = """
define internal {T} @___numba_atomic_{Tu}_inc({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %ptr
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%bndchk = icmp ult {T} %old, %val
%inc = add {T} %old, 1
%new = select i1 %bndchk, {T} %inc, {T} 0
%cas = cmpxchg volatile {T}* %ptr, {T} %old, {T} %new monotonic
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -449,18 +486,18 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_dec = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %ptr, {T} %val) alwaysinline {{
ir_numba_atomic_dec_template = """
define internal {T} @___numba_atomic_{Tu}_dec({T}* %iptr, {T} %val) alwaysinline {{
entry:
%old2 = load volatile {T}, {T}* %ptr
%old2 = load volatile {T}, {T}* %iptr
br label %attempt
attempt:
%old = phi {T} [ %old2, %entry ], [ %cas, %attempt ]
%dec = add {T} %old, -1
%bndchk = icmp ult {T} %dec, %val
%new = select i1 %bndchk, {T} %dec, {T} %val
%cas = cmpxchg volatile {T}* %ptr, {T} %old, {T} %new monotonic
{CAS}
%repeat = icmp ne {T} %cas, %old
br i1 %repeat, label %attempt, label %done
Expand All @@ -469,7 +506,7 @@ def get(self):
}}
""" # noqa: E501

ir_numba_atomic_minmax = """
ir_numba_atomic_minmax_template = """
define internal {T} @___numba_atomic_{T}_{NAN}{FUNC}({T}* %ptr, {T} %val) alwaysinline {{
entry:
%ptrval = load volatile {T}, {T}* %ptr
Expand All @@ -488,10 +525,10 @@ def get(self):
attempt:
; Attempt to swap in the value
%iold = bitcast {T} %dold to {Ti}
%old = bitcast {T} %dold to {Ti}
%iptr = bitcast {T}* %ptr to {Ti}*
%ival = bitcast {T} %val to {Ti}
%cas = cmpxchg volatile {Ti}* %iptr, {Ti} %iold, {Ti} %ival monotonic
%new = bitcast {T} %val to {Ti}
{CAS}
%dcas = bitcast {Ti} %cas to {T}
br label %lt_check
Expand All @@ -501,6 +538,33 @@ def get(self):
""" # noqa: E501


def ir_cas(Ti):
    """Return a cmpxchg IR snippet for integer type ``Ti``, matching the
    semantics of the NVVM version in use (LLVM 7.0 vs. 3.4)."""
    template = cas_nvvm70 if NVVM().is_nvvm70 else cas_nvvm34
    return template.format(Ti=Ti)


def ir_numba_atomic_binary(T, Ti, OP, FUNC):
    """Render the atomic binary-op IR template for value type ``T``,
    integer cast type ``Ti``, LLVM op ``OP`` and function suffix ``FUNC``,
    with the version-appropriate cmpxchg inserted."""
    return ir_numba_atomic_binary_template.format(
        T=T, Ti=Ti, OP=OP, FUNC=FUNC, CAS=ir_cas(Ti))


def ir_numba_atomic_minmax(T, Ti, NAN, OP, PTR_OR_VAL, FUNC):
    """Render the atomic min/max IR template, with the version-appropriate
    cmpxchg inserted for the integer cast type ``Ti``."""
    return ir_numba_atomic_minmax_template.format(
        T=T, Ti=Ti, NAN=NAN, OP=OP, PTR_OR_VAL=PTR_OR_VAL, FUNC=FUNC,
        CAS=ir_cas(Ti))


def ir_numba_atomic_inc(T, Tu):
    """Render the atomic increment IR template for signed type ``T`` /
    unsigned name ``Tu``, with the version-appropriate cmpxchg inserted."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_inc_template.format(**params)


def ir_numba_atomic_dec(T, Tu):
    """Render the atomic decrement IR template for signed type ``T`` /
    unsigned name ``Tu``, with the version-appropriate cmpxchg inserted."""
    params = dict(T=T, Tu=Tu, CAS=ir_cas(T))
    return ir_numba_atomic_dec_template.format(**params)


def _replace_datalayout(llvmir):
"""
Find the line containing the datalayout and replace it
Expand All @@ -525,59 +589,58 @@ def llvm_to_ptx(llvmir, **opts):

cu = CompilationUnit()
libdevice = LibDevice(arch=opts.get('arch', 'compute_20'))
# New LLVM generate a shorthand for datalayout that NVVM does not know
llvmir = _replace_datalayout(llvmir)
# Replace with our cmpxchg and atomic implementations because LLVM 3.5 has
# a new semantic for cmpxchg.

replacements = [
('declare double @___numba_atomic_double_add(double*, double)',
ir_numba_atomic_binary.format(T='double', Ti='i64', OP='fadd',
FUNC='add')),
ir_numba_atomic_binary(T='double', Ti='i64', OP='fadd', FUNC='add')),
('declare float @___numba_atomic_float_sub(float*, float)',
ir_numba_atomic_binary.format(T='float', Ti='i32', OP='fsub',
FUNC='sub')),
ir_numba_atomic_binary(T='float', Ti='i32', OP='fsub', FUNC='sub')),
('declare double @___numba_atomic_double_sub(double*, double)',
ir_numba_atomic_binary.format(T='double', Ti='i64', OP='fsub',
FUNC='sub')),
ir_numba_atomic_binary(T='double', Ti='i64', OP='fsub', FUNC='sub')),
('declare i64 @___numba_atomic_u64_inc(i64*, i64)',
ir_numba_atomic_inc.format(T='i64', Tu='u64')),
ir_numba_atomic_inc(T='i64', Tu='u64')),
('declare i64 @___numba_atomic_u64_dec(i64*, i64)',
ir_numba_atomic_dec.format(T='i64', Tu='u64')),
('declare i32 @___numba_atomic_i32_cas_hack(i32*, i32, i32)',
ir_numba_atomic_cas.format(T='i32')),
('declare i64 @___numba_atomic_i64_cas_hack(i64*, i64, i64)',
ir_numba_atomic_cas.format(T='i64')),
ir_numba_atomic_dec(T='i64', Tu='u64')),
('declare float @___numba_atomic_float_max(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='',
OP='nnan olt', PTR_OR_VAL='ptr',
FUNC='max')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare double @___numba_atomic_double_max(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='',
OP='nnan olt', PTR_OR_VAL='ptr',
FUNC='max')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan olt',
PTR_OR_VAL='ptr', FUNC='max')),
('declare float @___numba_atomic_float_min(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='',
OP='nnan ogt', PTR_OR_VAL='ptr',
FUNC='min')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare double @___numba_atomic_double_min(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='',
OP='nnan ogt', PTR_OR_VAL='ptr',
FUNC='min')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='', OP='nnan ogt',
PTR_OR_VAL='ptr', FUNC='min')),
('declare float @___numba_atomic_float_nanmax(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='nan',
OP='ult', PTR_OR_VAL='', FUNC='max')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare double @___numba_atomic_double_nanmax(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='nan',
OP='ult', PTR_OR_VAL='', FUNC='max')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ult',
PTR_OR_VAL='', FUNC='max')),
('declare float @___numba_atomic_float_nanmin(float*, float)',
ir_numba_atomic_minmax.format(T='float', Ti='i32', NAN='nan',
OP='ugt', PTR_OR_VAL='', FUNC='min')),
ir_numba_atomic_minmax(T='float', Ti='i32', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('declare double @___numba_atomic_double_nanmin(double*, double)',
ir_numba_atomic_minmax.format(T='double', Ti='i64', NAN='nan',
OP='ugt', PTR_OR_VAL='', FUNC='min')),
ir_numba_atomic_minmax(T='double', Ti='i64', NAN='nan', OP='ugt',
PTR_OR_VAL='', FUNC='min')),
('immarg', '')
]

if not NVVM().is_nvvm70:
# Replace with our cmpxchg implementation because LLVM 3.5 has a new
# semantic for cmpxchg.
replacements += [
('declare i32 @___numba_atomic_i32_cas_hack(i32*, i32, i32)',
ir_numba_cas_hack.format(T='i32')),
('declare i64 @___numba_atomic_i64_cas_hack(i64*, i64, i64)',
ir_numba_cas_hack.format(T='i64'))
]
# Newer LLVMs generate a shorthand for datalayout that NVVM34 does not
# know
llvmir = _replace_datalayout(llvmir)

for decl, fn in replacements:
llvmir = llvmir.replace(decl, fn)

Expand All @@ -587,7 +650,11 @@ def llvm_to_ptx(llvmir, **opts):
# pass to NVVM.
llvmir = llvmir.replace('llvm.numba_nvvm.atomic', 'llvm.nvvm.atomic')

llvmir = llvm39_to_34_ir(llvmir)
if NVVM().is_nvvm70:
llvmir = llvm100_to_70_ir(llvmir)
else:
llvmir = llvm100_to_34_ir(llvmir)

cu.add_module(llvmir.encode('utf8'))
cu.add_module(libdevice.get())

Expand Down Expand Up @@ -642,10 +709,51 @@ def patch_ptx_debug_pubnames(ptx):

re_parenthesized_list = re.compile(r"\((.*)\)")

# Matches the spFlags field (and its trailing comma) in an LLVM 10.0
# !DISubprogram metadata line.
re_spflags = re.compile(r"spFlags: (.*),")

# Maps each LLVM 10.0 DISPFlag to the LLVM 7.0 boolean DISubprogram field
# that expresses the same thing.
spflagmap = {
    'DISPFlagDefinition': 'isDefinition',
    'DISPFlagOptimized': 'isOptimized',
}


def llvm100_to_70_ir(ir):
    """
    Convert LLVM 10.0 IR for LLVM 7.0.
    """
    converted = []
    for line in ir.splitlines():
        if line.startswith('attributes #'):
            # LLVM 7.0 does not know the 'willreturn' function attribute,
            # so strip it from attribute group definitions.
            match = re_attributes_def.match(line)
            kept = ' '.join(attr for attr in match.group(1).split()
                            if attr != 'willreturn')
            line = line.replace(match.group(1), kept)

        if '!DISubprogram' in line:
            # Replace the DISPFlags (LLVM 10.0) with main subprogram DIFlags
            # (LLVM 7.0). Example:
            #
            #   spFlags: DISPFlagDefinition | DISPFlagOptimized
            #
            # becomes:
            #
            #   isDefinition: true, isOptimized: true
            match = re_spflags.search(line)
            flags = match.group(1).split(' | ')
            rewritten = ", ".join('%s: true' % spflagmap[f] for f in flags)
            # match.end() - 1 keeps the trailing comma captured by the
            # pattern, so the following metadata field stays separated.
            line = line[:match.start()] + rewritten + line[match.end() - 1:]

        converted.append(line)

    return '\n'.join(converted)

def llvm39_to_34_ir(ir):

def llvm100_to_34_ir(ir):
"""
Convert LLVM 3.9 IR for LLVM 3.4.
Convert LLVM 10.0 IR for LLVM 3.4.
"""
def parse_out_leading_type(s):
par_level = 0
Expand Down Expand Up @@ -790,7 +898,12 @@ def set_cuda_kernel(lfunc):

# set nvvm ir version
i32 = ir.IntType(32)
md_ver = m.add_metadata([i32(1), i32(2), i32(2), i32(0)])
if NVVM().is_nvvm70:
ir_versions = [i32(1), i32(6), i32(3), i32(0)]
else:
ir_versions = [i32(1), i32(2), i32(2), i32(0)]

md_ver = m.add_metadata(ir_versions)
m.add_named_metadata('nvvmir.version', md_ver)


Expand Down
4 changes: 1 addition & 3 deletions numba/cuda/cudaimpl.py
Expand Up @@ -774,9 +774,7 @@ def ptx_atomic_cas_tuple(context, builder, sig, args):
if aryty.dtype in (cuda.cudadecl.integer_numba_types):
lmod = builder.module
bitwidth = aryty.dtype.bitwidth
return builder.call(nvvmutils.declare_atomic_cas_int(lmod,
bitwidth),
(ptr, old, val))
return nvvmutils.atomic_cmpxchg(builder, lmod, bitwidth, ptr, old, val)
else:
raise TypeError('Unimplemented atomic compare_and_swap '
'with %s array' % dtype)
Expand Down
9 changes: 9 additions & 0 deletions numba/cuda/nvvmutils.py
Expand Up @@ -12,6 +12,15 @@ def declare_atomic_cas_int(lmod, isize):
lc.Type.int(isize)))
return lmod.get_or_insert_function(fnty, fname)


def atomic_cmpxchg(builder, lmod, isize, ptr, cmp, val):
    """Emit an atomic compare-and-swap and return the loaded value.

    On NVVM 7.0 a native cmpxchg instruction is built with llvmlite; on
    NVVM 3.4 a call to the text-based ``cas_hack`` function is emitted
    instead, since its cmpxchg uses pre-LLVM 3.5 semantics.
    """
    if not nvvm.NVVM().is_nvvm70:
        fn = declare_atomic_cas_int(lmod, isize)
        return builder.call(fn, (ptr, cmp, val))
    pair = builder.cmpxchg(ptr, cmp, val, 'monotonic', 'monotonic')
    return builder.extract_value(pair, 0)

# For atomic intrinsics, "numba_nvvm" prevents LLVM 9 onwards auto-upgrading
# them into atomicrmw instructions that are not recognized by NVVM. It is
# replaced with "nvvm" in llvm_to_ptx later, after the module has been parsed
Expand Down

0 comments on commit 43d5090

Please sign in to comment.