In [1]:
import onnx
import tvm
from tvm import relay, relax
import numpy as np
from tvm.script import tir as T
from tvm.script import relax as R

In [2]:
with open("tinyyolov2-8.onnx", "rb") as f:
    onnx_model = onnx.load(f)

print(onnx_model.graph.input[0])

mod, params = relay.frontend.from_onnx(onnx_model, {"image": (1,3,416,416)})

name: "image"
type {
  tensor_type {
    elem_type: 1
    shape {
      dim {
        dim_param: "None"
        denotation: "DATA_BATCH"
      }
      dim {
        dim_value: 3
        denotation: "DATA_CHANNEL"
      }
      dim {
        dim_value: 416
        denotation: "DATA_FEATURE"
      }
      dim {
        dim_value: 416
        denotation: "DATA_FEATURE"
      }
    }
  }
  denotation: "IMAGE"
}
doc_string: "Input image. Image(s) in RGB format. It is a [N, C, H, W]-tensor. The 1st/2nd/3rd slices along the C-axis are red, green, and blue channels, respectively."



In [3]:
from tvm.relax.testing.relay_translator import from_relay

relax_mod = from_relay(mod["main"], target="llvm")
#relax_mod.show()

In [4]:
# En benchmarkant le modèle on remarque que quelques convolutions monopolisent la puissance de calcul


x = np.random.rand(1, 3, 416, 416).astype("float32")
tvm_x = tvm.nd.array(x)

ex = relax.build(relax_mod, target="llvm")
vm = relax.VirtualMachine(ex, tvm.cpu(), profile=True)
evaluator = vm.profile("main",
    tvm_x, *params
)
evaluator

Name                          Duration (us)  Percent  Device  Count                                                                          Argument Shapes  
contrib_conv2d_NCHWc7          3 038 160,84    46.56    cpu0      1    float32[1, 256, 15, 15, 4], float32[256, 256, 3, 3, 4, 4], float32[1, 256, 13, 13, 4]  
contrib_conv2d_NCHWc6          1 533 685,92    23.50    cpu0      1    float32[1, 128, 15, 15, 4], float32[256, 128, 3, 3, 4, 4], float32[1, 256, 13, 13, 4]  
contrib_conv2d_NCHWc4            404 158,68     6.19    cpu0      1        float32[1, 32, 28, 28, 4], float32[64, 32, 3, 3, 4, 4], float32[1, 64, 26, 26, 4]  
contrib_conv2d_NCHWc5            390 758,51     5.99    cpu0      1      float32[1, 64, 15, 15, 4], float32[128, 64, 3, 3, 4, 4], float32[1, 128, 13, 13, 4]  
contrib_conv2d_NCHWc3            369 654,51     5.66    cpu0      1        float32[1, 16, 54, 54, 4], float32[32, 16, 3, 3, 4, 4], float32[1, 32, 52, 52, 4]  
contrib_conv2d_NCHWc2            350 578,98   

In [5]:
# implémentation relax de la conv7

class RelaxConv(relax.frontend.nn.Module):
    def __init__(self):
        super(RelaxConv, self).__init__()
        self.conv = relax.frontend.nn.Conv2D(1024, 1024, kernel_size=3, stride=1, padding=0, bias=False)

    def forward(self, x):
        x = self.conv(x)
        return x

input_shape = (1, 1024, 15, 15)
relax_mod, _ = RelaxConv().export_tvm({"forward": {"x": relax.frontend.nn.spec.Tensor(input_shape, "float32")}})
relax_mod = relax.transform.LegalizeOps()(relax_mod) # tvm ne veut pas compiler si on ne descend pas en tir manuellement
# relax_mod.show()

In [6]:
# implémentation perso de la même convolution, en passant par des layouts différents


@tvm.script.ir_module
class MyConv:
    @T.prim_func
    def conv(X: T.Buffer((1, 1024, 15, 15), "float32"),
             W: T.Buffer((1024, 3,3, 1024), "float32"),
             Y: T.Buffer((1, 13,13, 1024), "float32")):
        for di, dj, i, j, c_in, c_out in T.grid(3,3,13,13,1024,1024):
            with T.block("out"):
                v_c_in, v_c_out, v_i, v_j, v_di, v_dj = T.axis.remap("RSSSRR", (c_in, c_out, i, j, di, dj))
                with T.init():
                    Y[0, v_i, v_j, v_c_out] = 0
                Y[0, v_i, v_j, v_c_out] = Y[0, v_i, v_j, v_c_out] + X[0, v_c_in, v_i+v_di, v_j+v_dj] * W[v_c_in, v_di, v_dj, v_c_out]

    @R.function
    def forward(X: R.Tensor((1, 1024, 15, 15), "float32"),
                W: R.Tensor((1024, 1024, 3, 3), "float32")):
        cls = MyConv
        with R.dataflow():
            # transposed_x = relax.op.permute_dims(X, [0,2,3,1])
            transposed_w = relax.op.permute_dims(W, [0,2,3,1])
            lv0 = R.call_tir(cls.conv, (X,transposed_w), out_sinfo=R.Tensor((1, 13, 13, 1024), dtype="float32"))
            lv1 = relax.op.permute_dims(lv0, [0,3,1,2])
            R.output(lv1)
        return lv1

#my_conv = relax.transform.LegalizeOps()(MyConv)
#my_conv.show()

In [7]:
ex1 = relax.build(relax_mod, target="llvm -mcpu=core-avx2")
vm1 = relax.VirtualMachine(ex1, tvm.cpu(), profile=True)
ex2 = relax.build(MyConv, target="llvm -mcpu=core-avx2")
vm2 = relax.VirtualMachine(ex2, tvm.cpu(), profile=True)

In [8]:
# on vérifie que la version custom fonctionne

x = np.random.rand(1,1024,15,15).astype("float32")
tvm_x = tvm.nd.array(x)
w = np.random.rand(1024,1024,3,3).astype("float32")
tvm_w = tvm.nd.array(w)

res1 = vm1["forward"](tvm_x, tvm_w).numpy()
res2 = vm2["forward"](tvm_x, tvm_w).numpy()
np.testing.assert_allclose(res1, res2, atol=1e-4, rtol=1)

In [9]:
evaluator = vm1.profile("forward",
    tvm_x, tvm_w
)
evaluator

Name                          Duration (us)  Percent  Device  Count                                                                Argument Shapes  
conv2d                         2 966 032,76   100.00    cpu0      1  float32[1, 1024, 15, 15], float32[1024, 1024, 3, 3], float32[1, 1024, 13, 13]  
vm.builtin.check_tensor_info           6,77     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
vm.builtin.match_shape                 2,97     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
----------                                                                                                                                          
Sum                            2 966 042,49   100.00              3                                                                                 
Total                          2 966 121,69             cpu0      1                                       

In [10]:
# on obtient un gain de vitesse entre x3 et x4 si on ignore la transposition du kernel

evaluator = vm2.profile("forward",
    tvm_x, tvm_w
)
evaluator

Name                          Duration (us)  Percent  Device  Count                                                                Argument Shapes  
conv                             560 013,21    97.60    cpu0      1  float32[1, 1024, 15, 15], float32[1024, 3, 3, 1024], float32[1, 13, 13, 1024]  
transpose                         13 102,19     2.28    cpu0      1                           float32[1024, 1024, 3, 3], float32[1024, 3, 3, 1024]  
transpose1                           474,37     0.08    cpu0      1                             float32[1, 13, 13, 1024], float32[1, 1024, 13, 13]  
vm.builtin.check_tensor_info           4,24     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
vm.builtin.match_shape                 2,42     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
vm.builtin.check_tensor_info           2,34     0.00    cpu0      1                                       

In [11]:


sch=tvm.tir.Schedule(MyConv)
out = sch.get_block("out", func_name="conv")
di, dj, i, j, c_in, c_out = sch.get_loops(out)

sch.reorder(i, c_in, j, di, dj, c_out)

sch.parallel(i)

# on obtient des performances moins bonnes en unrollant/vectorisant

#sch.unroll(c_out)
#sch.vectorize(c_out)

sch.mod.show()

ex_ = relax.build(sch.mod, target="llvm -mcpu=core-avx2")
vm_ = relax.VirtualMachine(ex_, tvm.cpu(), profile=True)
evaluator = vm_.profile("forward",
    tvm_x, tvm_w
)

evaluator

Name                          Duration (us)  Percent  Device  Count                                                                Argument Shapes  
conv                             117 834,46    89.75    cpu0      1  float32[1, 1024, 15, 15], float32[1024, 3, 3, 1024], float32[1, 13, 13, 1024]  
transpose                         12 756,29     9.72    cpu0      1                           float32[1024, 1024, 3, 3], float32[1024, 3, 3, 1024]  
transpose1                           480,60     0.37    cpu0      1                             float32[1, 13, 13, 1024], float32[1, 1024, 13, 13]  
vm.builtin.check_tensor_info           5,90     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
vm.builtin.match_shape                 4,27     0.00    cpu0      1                                                       float32[1, 1024, 15, 15]  
vm.builtin.match_shape                 4,04     0.00    cpu0      1                                       

In [39]:
import torch
import timeit
import time


x_ = torch.Tensor(x)
w_ = torch.Tensor(w)

#timeit.timeit(lambda: torch.nn.functional.conv2d(x_, w_), number=10)/10

start = time.time()
torch.nn.functional.conv2d(x_, w_)
end = time.time()

print("Torch:", end - start)

start = time.time()
vm_["forward"](tvm_x, tvm_w)
end = time.time()

print("Mon schedule:", end - start)

Torch: 0.10340332984924316
Mon schedule: 0.13658976554870605


In [29]:
# Modèle relay pour la même convolution 


# Define the input shape (batch size, channels, height, width)
input_shape = (1, 1024, 15, 15)
# Define the kernel shape (output channels, input channels, kernel height, kernel width)
kernel_shape = (1024, 1024, 3,3)

# Create a random input tensor
x_var = relay.var("X", shape = input_shape)
# Create a random weight tensor for the convolution
w_var = relay.var("W", shape = kernel_shape)

# Define the convolution operation
conv = relay.nn.conv2d(
    x_var,
    w_var,
    strides=(1, 1),
    padding=(0,0),
    kernel_size=(3, 3),
    channels=1024,
    out_dtype="float32"
)

# Create a Relay function
func = relay.Function([x_var, w_var], conv)

# Create a Relay module
relay_mod = tvm.IRModule.from_expr(func)
relay_mode = relay.transform.InferType()(relay_mod)

# Print the Relay module
relay_mod.show()

In [37]:
# en spécifiant -avx2 à Relay on obtient mieux que torch

from tvm.contrib import graph_executor

dev = tvm.cpu()  # or tvm.cuda() for GPU
lib = relay.build(relay_mod, target="llvm -mcpu=core-avx2", params=None)
graph_mod = graph_executor.GraphModule(lib["default"](dev))

print(graph_mod.benchmark(tvm.cpu(), number=5, repeat=5))

Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  55.2157      55.0933      56.0975      54.6721       0.5027                  


In [31]:
from tvm.autotvm.tuner import XGBTuner, GATuner, RandomTuner, GridSearchTuner
from tvm import autotvm

target = tvm.target.Target("llvm -mcpu=core-avx2")

In [32]:
tasks = autotvm.task.extract_from_program(func, target=target, params={})

In [33]:
tuner = autotvm.tuner.GATuner(tasks[0])

measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=5, repeat=1, min_repeat_ms=100)
)

tuner.tune(
        n_trial=10,
        measure_option=measure_option,
        callbacks=[autotvm.callback.log_to_file(f"conv2d.log")]
)

In [34]:
with autotvm.apply_history_best(f"conv2d.log"):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(func, target=target, params={})


In [38]:
graph_mod = graph_executor.GraphModule(lib["default"](dev))

print(graph_mod.benchmark(tvm.cpu(), number=5, repeat=5))

Execution time summary:
 mean (ms)   median (ms)    max (ms)     min (ms)     std (ms)  
  56.4165      55.9922      59.5721      54.0611       1.7836                  
