In [2]:
import tvm
from tvm.script import tir as T
from tvm.script import relax as R
from tvm import relay
from tvm import relax
import numpy as np
import IPython

from tvm import meta_schedule as ms

In [3]:
@tvm.script.ir_module
class MNIST:
    @T.prim_func
    def div255(X: T.Buffer((1,1,28,28), "float32"),
               Y: T.Buffer((28,28), "float32")):
        for i, j in T.grid(28,28):
            with T.block("Y"):
                vi, vj = T.axis.remap("SS", (i,j))
                Y[vi, vj] = X[0,0,vi, vj] * T.float32(1/255)

    @T.prim_func
    def Conv2DRelu_1(X: T.Buffer((28,28),"float32"),
                     W: T.Buffer((8, 1, 5, 5), "float32"),
                     B: T.Buffer((8, 1, 1), "float32"),
                     Y: T.Buffer((8, 28, 28), "float32")):
        padded = T.alloc_buffer((32,32), "float32")
        A = T.alloc_buffer((8,28,28), "float32")
        for i, j in T.grid(32, 32):
            with T.block("pad_init"):
                vi, vj = T.axis.remap("SS", [i, j])
                padded[vi, vj] = 0
        for i, j in T.grid(28, 28):
            with T.block("copy"):
                vi, vj = T.axis.remap("SS", [i, j])
                padded[vi+2, vj+2] = X[vi, vj]
        for q, i, j, di, dj in T.grid(8, 28, 28, 5, 5):
            with T.block("conv"):
                vq, vi, vj, vdi, vdj = T.axis.remap("SSSSS", [q, i, j, di, dj])
                with T.init():
                    A[vq, vi, vj] = 0
                A[vq, vi, vj] += padded[vi+vdi-2, vj+vdj-2] * W[vq, 1, vdi, vdj]
        for q, i, j in T.grid(8, 28, 28):
            with T.block("biasrelu"):
                vq, vi, vj = T.axis.remap("SSS", [q, i, j])
                Y[vq, vi, vj] = T.max(A[vq, vi, vj] + B[vq, 0, 0], 0)

    @T.prim_func
    def MaxPooling2D_1(X: T.Buffer((8, 28, 28), "float32"),
                       Y: T.Buffer((8, 14,14), "float32")):
        for q, i, j in T.grid(8, 14, 14):
            with T.block("Y"):
                vq, vi, vj = T.axis.remap("SSS", [q,i,j])
                Y[vq, vi, vj] = T.max(T.max(X[vq, 2*vi, 2*vj], X[vq, 2*vi, 2*vj+1]), T.max(X[vq, 2*vi+1, 2*vj], X[vq, 2*vi+1, 2*vj+1]))

    @T.prim_func
    def Conv2DRelu_2(X: T.Buffer((8, 14, 14),"float32"),
                     W: T.Buffer((16, 8, 5, 5), "float32"),
                     B: T.Buffer((16, 1, 1), "float32"),
                     Y: T.Buffer((16, 14, 14), "float32")):
        padded = T.alloc_buffer((8,20,20), "float32")
        A = T.alloc_buffer((16,14,14),"float32")
        for k, i, j in T.grid(8, 20, 20):
            with T.block("pad_init"):
                vk, vi, vj = T.axis.remap("SSS", [k, i, j])
                padded[vk, vi, vj] = 0
        for k, i, j in T.grid(8, 14, 14):
            with T.block("copy"):
                vk, vi, vj = T.axis.remap("SSS", [k, i, j])
                padded[vk, vi+2, vj+2] = X[vk, vi, vj]
        for k, i, j, q, di, dj in T.grid(16, 14, 14, 8, 5, 5):
            with T.block("conv"):
                vq, vi, vj, vk, vdi, vdj = T.axis.remap("SSSSSS", [q, i, j, k, di, dj])
                with T.init():
                    A[vq, vi, vj] = 0
                A[vq, vi, vj] += padded[vk, vi+vdi-2, vj+vdj-2] * W[vq, vk, vdi, vdj]
        for q, i, j in T.grid(16, 14, 14):
            with T.block("biasrelu"):
                vq, vi, vj = T.axis.remap("SSS", [q, i, j])
                Y[vq, vi, vj] = T.max(A[vq, vi, vj] + B[vq, 0, 0], 0)

    @T.prim_func
    def MaxPooling2D_2(X: T.Buffer((16, 14, 14), "float32"),
                       Y: T.Buffer((16, 4, 4), "float32")):
        for q, i, j, di, dj in T.grid(16, 4, 4, 3, 3):
            with T.block("Y"):
                vq, vi, vj, vdi, vdj = T.axis.remap("SSSSS", [q, i, j, di, dj])
                with T.init():
                    Y[vq, vi, vj] = 0
                Y[vq, vi, vj] = T.max(Y[vq, vi, vj], X[vq, 3*vi+vdi, 3*vj+vdj])

    @T.prim_func
    def Dense(X: T.Buffer((16, 4, 4), "float32"),
              W: T.Buffer((10, 256), "float32"),
              B: T.Buffer((1, 10), "float32"),
              Y: T.Buffer((10), "float32")):
        temp = T.alloc_buffer((10), "float32")
        for i, k1, k2, k3 in T.grid(10, 16, 4, 4):
            with T.block("mul"):
                vi, vk1, vk2, vk3 = T.axis.remap("SRRR", [i, k1, k2, k3])
                with T.init():
                    temp[vi] = 0
                temp[vi] = temp[vi] + X[vk1, vk2, vk3]*W[vi, 16*vk1+4*vk2+vk3]
        for i in range(10):
            with T.block("bias"):
                vi = T.axis.spatial(10, i)
                Y[vi] = temp[vi] + B[1, vi]

    @R.function
    def main(x: R.Tensor((1, 1, 28, 28), "float32"),
             w0: R.Tensor((8, 1, 5, 5), "float32"),
             b0: R.Tensor((8, 1, 1), "float32"),
             w1: R.Tensor((16, 8, 5, 5), "float32"),
             b1: R.Tensor((16, 1, 1), "float32"),
             w2: R.Tensor((10, 256), "float32"),
             b2: R.Tensor((1, 10), "float32")
             ):
        with R.dataflow():
            lv0 = R.call_dps_packed("div255", (x, ), R.Tensor((28, 28), "float32"))
            lv1 = R.call_dps_packed("Conv2DRelu_1", (lv0, w0, b0), R.Tensor((8, 28, 28), "float32"))
            lv2 = R.call_dps_packed("MaxPooling2D_1", (lv1, ), R.Tensor((8, 14, 14), "float32"))
            lv3 = R.call_dps_packed("Conv2DRelu_2", (lv2, w1, b1), R.Tensor((16, 14, 14), "float32"))
            lv4 = R.call_dps_packed("MaxPooling2D_2", (lv3, ), R.Tensor((16, 4, 4), "float32"))
            out = R.call_dps_packed("Dense", (lv4, w2, b2), R.Tensor((10,), "float32"))
            R.output(out)
        return out

In [8]:
conv = MNIST["Conv2DRelu_2"].with_attr("global_symbol", "main")
conv.show()

In [9]:
database = ms.tune_tir(
    mod = conv,
    target="llvm --num-cores=4",
    max_trials_global=64,
    num_trials_per_iter=64,
    work_dir="./tune_tmp",
)
sch = ms.tir_integration.compile_tir(database, conv, "llvm --num-cores=4")

2025-04-03 18:19:52 [INFO] Logging directory: ./tune_tmp/logs
2025-04-03 18:19:52 [INFO] LocalBuilder: max_workers = 2
2025-04-03 18:19:54 [INFO] LocalRunner: max_workers = 1
2025-04-03 18:19:56 [INFO] [task_scheduler.cc:159] Initializing Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,,,,0,


2025-04-03 18:19:56 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |            N/A |          N/A |                   N/A |      0 |      
------------------------------------------------------------------------------------------------------
Total trials: 0
Total latency (us): 0


Total trials: 0
Total latency (us): 0

2025-04-03 18:19:56 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:04 [INFO] [task_scheduler.cc:193] Sending 2 sample(s) to builder
2025-04-03 18:20:06 [INFO] [task_scheduler.cc:195] Sending 2 sample(s) to runner
2025-04-03 18:20:07 [DEBUG] XGB iter   0: tr-p-rmse: 0.247840	tr-a-peak@32: 0.990882	tr-rmse: 0.578186	tr-rmse: 0.578186
2025-04-03 18:20:07 [DEBUG] XGB iter  25: tr-p-rmse: 0.009118	tr-a-peak@32: 0.990882	

Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,


2025-04-03 18:20:07 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |      
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305


Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:07 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:16 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2025-04-03 18:20:16 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2025-04-03 18:20:16 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,


2025-04-03 18:20:16 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |      
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305


Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:16 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:23 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2025-04-03 18:20:23 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2025-04-03 18:20:23 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,


2025-04-03 18:20:23 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |      
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305


Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:23 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:30 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2025-04-03 18:20:30 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2025-04-03 18:20:30 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,



Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:30 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |      
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:30 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:37 [INFO] [task_scheduler.cc:193] Sending 0 sample(s) to builder
2025-04-03 18:20:37 [INFO] [task_scheduler.cc:195] Sending 0 sample(s) to runner
2025-04-03 18:20:37 [INFO] [task_scheduler.cc:237] [Updated] Task #0: "main"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,


2025-04-03 18:20:37 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |      
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305


Total trials: 2
Total latency (us): 191.305

2025-04-03 18:20:37 [INFO] [task_scheduler.cc:180] TaskScheduler picks Task #0: "main"
2025-04-03 18:20:43 [INFO] [task_scheduler.cc:260] Task #0 has finished. Remaining task(s): 0


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,main,1260672,1,6.5899,191.3049,191.3049,2,Y


2025-04-03 18:20:43 [DEBUG] [task_scheduler.cc:318] 
 ID | Name |    FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
------------------------------------------------------------------------------------------------------
  0 | main | 1260672 |      1 |         6.5899 |     191.3049 |              191.3049 |      2 |    Y 
------------------------------------------------------------------------------------------------------
Total trials: 2
Total latency (us): 191.305


Total trials: 2
Total latency (us): 191.305



In [13]:
sch.mod.show()

In [12]:
x = np.random.rand(8,14,14).astype("float32")
x = tvm.nd.array(x, device=tvm.cpu())
w = np.random.rand(16,8,5,5).astype("float32")
w = tvm.nd.array(w, device=tvm.cpu())
b = np.random.rand(16,1,1).astype("float32")
b = tvm.nd.array(b, device=tvm.cpu())
y = tvm.nd.empty((16,14,14), dtype="float32")

mod_conv= tvm.IRModule.from_expr(MNIST["Conv2DRelu_2"].with_attr("global_symbol", "main"))

lib = tvm.build(mod_conv, target="llvm")
f_timer_before = lib.time_evaluator("main", tvm.cpu())
print("Time cost of MyModule before tuning: %.3f ms" % (f_timer_before(x,w,b,y).mean * 1000))

lib = tvm.build(sch.mod, target="llvm")
f_timer_after = lib.time_evaluator("main", tvm.cpu())
print("Time cost of MyModule after tuning: %.3f ms" % (f_timer_after(x,w,b,y).mean * 1000))

Time cost of MyModule before tuning: 0.356 ms
Time cost of MyModule after tuning: 0.313 ms


In [8]:
sch = tvm.tir.Schedule(MNIST)

conv = sch.get_block("conv", func_name="Conv2DRelu_1")
k, i, j, di, dj = sch.get_loops(conv)

#sch.reorder(k, i, j, di, dj)

sch.parallel(k)
sch.unroll(di)
sch.vectorize(dj)


conv2 = sch.get_block("conv", func_name="Conv2DRelu_2")
k, i, j, q, di, dj = sch.get_loops(conv2)

sch.reorder(q, k, i, j, di, dj)

sch.parallel(q)
sch.unroll(di)
sch.vectorize(dj)

InternalError: Traceback (most recent call last):
  6: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)>::AssignTypedLambda<tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}>(tvm::__mk_TVM24::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
  5: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
  4: tvm::codegen::Build(tvm::IRModule, tvm::Target)
  3: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::IRModule, tvm::Target)>::AssignTypedLambda<tvm::codegen::__mk_TVM0::{lambda(tvm::IRModule, tvm::Target)#1}>(tvm::codegen::__mk_TVM0::{lambda(tvm::IRModule, tvm::Target)#1}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, tvm::runtime::TVMRetValue)
  2: tvm::codegen::LLVMModuleNode::Init(tvm::IRModule const&, tvm::Target const&)
  1: tvm::tir::PrimFunc tvm::runtime::Downcast<tvm::tir::PrimFunc, tvm::BaseFunc>(tvm::BaseFunc)
  0: _ZN3tvm7runtime6deta
  File "/home/gael/tvm/include/tvm/runtime/object.h", line 938
InternalError: Check failed: (ref->template IsInstance<typename SubRef::ContainerType>()) is false: Downcast from relax.expr.Function to tir.PrimFunc failed.