In [1]:
%env LD_LIBRARY_PATH=/usr/local/cuda/lib
%env PATH=/usr/local/cuda/bin:/usr/bin

env: LD_LIBRARY_PATH=/usr/local/cuda/lib
env: PATH=/usr/local/cuda/bin:/usr/bin


In [2]:
import tvm
import tvm.testing
from tvm import te
import numpy as np

In [8]:
tgt_gpu = tvm.target.Target(target='cuda', host='llvm')
tgt_cpu = tvm.target.Target(target='llvm', host='llvm')

In [24]:
mask = te.placeholder((n,), name='mask')
print(mask)

Tensor(shape=[n], op.name=mask)


In [25]:
n = te.var("n")
A = te.placeholder((n, ), name='A')
B = te.compute((n,), lambda i: A[i]+mask[i], name='B')


In [27]:
s = te.create_schedule(B.op)

test_kernel = tvm.build(s, [A, B, mask, n], target=tgt_cpu, name='test_kernel')

In [28]:
print(tvm.lower(s, [A, B, mask, n], simple_mode=True))

# from tvm.script import ir as I
# from tvm.script import tir as T

@I.ir_module
class Module:
    @T.prim_func
    def main(A: T.handle, B: T.handle, mask: T.handle, n: T.int32):
        T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True})
        stride = T.var("int32")
        A_1 = T.match_buffer(A, (n,), strides=(stride,), type="auto")
        stride_1 = T.var("int32")
        B_1 = T.match_buffer(B, (n,), strides=(stride_1,), type="auto")
        n_1 = T.var("int32")
        stride_2 = T.var("int32")
        mask_1 = T.match_buffer(mask, (n_1,), strides=(stride_2,), type="auto")
        for i in range(n):
            B_2 = T.Buffer((stride_1 * n,), data=B_1.data, type="auto")
            A_2 = T.Buffer((stride * n,), data=A_1.data, type="auto")
            mask_2 = T.Buffer((stride_2 * n_1,), data=mask_1.data, type="auto")
            B_2[i * stride_1] = A_2[i * stride] + mask_2[i * stride_2]


In [4]:
run_cuda = True

tgt_gpu = tvm.target.Target(target='cuda', host='llvm')

n = te.var("n")
A = te.placeholder((n, ), name='A')
B = te.placeholder((n, ), name="B")
C = te.compute(A.shape, lambda i : A[i] + B[i], name="C")
print(type(C))

s = te.create_schedule(C.op)

bx, tx = s[C].split(C.op.axis[0], factor=64)

s[C].bind(bx, te.thread_axis("blockIdx.x"))
s[C].bind(tx, te.thread_axis("threadIdx.x"))

fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name='myadd')

dev = tvm.device(tgt_gpu.kind.name, 0)

n=1024
a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
fadd(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy())

if(tgt_gpu.kind.name=='cuda'
    or tgt_gpu.kind.name=='rocm'
    or tgt_gpu.kind.name.startwith('opencl')):
    dev_module = fadd.imported_modules[0]
    print("-----GPU code-----")
    print(dev_module.get_source())

    

<class 'tvm.te.tensor.Tensor'>
-----GPU code-----

#ifdef _WIN32
  using uint = unsigned int;
  using uchar = unsigned char;
  using ushort = unsigned short;
  using int64_t = long long;
  using uint64_t = unsigned long long;
#else
  #define uint unsigned int
  #define uchar unsigned char
  #define ushort unsigned short
  #define int64_t long long
  #define uint64_t unsigned long long
#endif
extern "C" __global__ void __launch_bounds__(64) myadd_kernel0(float* __restrict__ C, float* __restrict__ A, float* __restrict__ B, int n, int stride, int stride_1, int stride_2) {
  if (((int)blockIdx.x) < (n >> 6)) {
    C[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride)] = (A[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride_1)] + B[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride_2)]);
  } else {
    if (((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) < n) {
      C[(((((int)blockIdx.x) * 64) + ((int)threadIdx.x)) * stride)] = (A[(((((int)blockIdx.x) * 64) + ((int)thr

In [5]:
from tvm.contrib import cc
from tvm.contrib import utils

temp = utils.tempdir()
fadd.save(temp.relpath('myadd.o')) # `fadd` is host function
fadd.imported_modules[0].save(temp.relpath('myadd.cubin')) # `fadd.imported_functions[0]` is the gpu kernel
cc.create_shared(temp.relpath('myadd.so'), [temp.relpath('myadd.o')])
print(temp.listdir())


['myadd.cubin', 'myadd.o', 'myadd.so', 'myadd.tvm_meta.json']


In [6]:
fadd1 = tvm.runtime.load_module(temp.relpath('myadd.so'))
fadd1_dev = tvm.runtime.load_module(temp.relpath('myadd.cubin'))
fadd1.import_module(fadd1_dev)

In [7]:
fadd1(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy())

In [8]:
fadd.export_library(temp.relpath('myadd_pack.so'))
fadd2 = tvm.runtime.load_module(temp.relpath('myadd_pack.so'))
fadd2(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy()+b.numpy())