In [1]:
import os
import numpy as np
import torch
from torch.export import export
from torchvision.models.resnet import ResNet18_Weights, resnet18

torch_model = resnet18(weights=ResNet18_Weights.DEFAULT).eval()

In [3]:
import tvm
from tvm import relax
from tvm.relax.frontend.torch import from_exported_program

# Give an example argument to torch.export
example_args = (torch.randn(1, 3, 224, 224, dtype=torch.float32),)

# Convert the model to IRModule
with torch.no_grad():
    exported_program = export(torch_model, example_args)
    mod = from_exported_program(exported_program, keep_params_as_input=True)

mod, params = relax.frontend.detach_params(mod)
mod.show()

In [None]:
TOTAL_TRIALS = 8000  # Change to 20000 for better performance if needed
target = tvm.target.Target("nvidia/geforce-rtx-3090-ti")  # Change to your target device
work_dir = "tuning_logs"

# Skip running in CI environment
IS_IN_CI = os.getenv("CI", "") == "true"
if not IS_IN_CI:
    mod = relax.get_pipeline("static_shape_tuning", target=target, total_trials=TOTAL_TRIALS)(mod)

    # Only show the main function
    mod["main"].show()

2024-11-29 02:28:56 [INFO] Logging directory: tuning_logs/logs
2024-11-29 02:28:56 [INFO] LocalBuilder: max_workers = 10
2024-11-29 02:28:58 [INFO] LocalRunner: max_workers = 1
2024-11-29 02:28:58 [INFO] [task_scheduler.cc:159] Initializing Task #0: "fused_matmul_add13"
2024-11-29 02:28:58 [INFO] [task_scheduler.cc:159] Initializing Task #1: "transpose"
2024-11-29 02:28:58 [INFO] [task_scheduler.cc:159] Initializing Task #2: "reshape"
2024-11-29 02:28:58 [INFO] [task_scheduler.cc:159] Initializing Task #3: "adaptive_avg_pool2d"
2024-11-29 02:28:58 [INFO] [task_scheduler.cc:159] Initializing Task #4: "fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #5: "fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu"


  meta_schedule.cuda.meta_schedule.adaptive_pool_avg
  meta_schedule.gpu.meta_schedule.adaptive_pool_avg


2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #6: "fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #7: "fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #8: "fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #9: "fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #10: "fused_conv2d10_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #11: "max_pool2d"
2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #12: "fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dim

  meta_schedule.cuda.meta_schedule.pool_max
  meta_schedule.gpu.meta_schedule.pool_max


2024-11-29 02:28:59 [INFO] [task_scheduler.cc:159] Initializing Task #14: "fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_add3_relu1"
2024-11-29 02:29:00 [INFO] [task_scheduler.cc:159] Initializing Task #15: "fused_conv2d2_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2"
2024-11-29 02:29:00 [INFO] [task_scheduler.cc:159] Initializing Task #16: "fused_conv2d5_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3"
2024-11-29 02:29:00 [INFO] [task_scheduler.cc:159] Initializing Task #17: "fused_conv2d7_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8"
2024-11-29 02:29:00 [INFO] [task_scheduler.cc:159] Initializing Task #18: "fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_add6_relu2"
2024-11-29 02:29:00 [INFO] [task_scheduler.cc:159] Initializing Task #19: "fused_conv2d6_subtract3_divide3_expand_dims2_multiply3_expand_dims2_add8_relu3"


Unnamed: 0,Name,FLOP,Weight,Speed (GFLOPS),Latency (us),Weighted Latency (us),Trials,Done
0,fused_matmul_add13,1025000,1,,,,0,
1,transpose,1,1,,,,0,
2,reshape,1,1,,,,0,
3,adaptive_avg_pool2d,25600,1,,,,0,
4,fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4,231336448,1,,,,0,
5,fused_conv2d_subtract_divide_expand_dims_multiply_expand_dims_add1_relu,240041984,1,,,,0,
6,fused_conv2d1_subtract1_divide1_expand_dims_multiply1_expand_dims_add2_relu1,232214528,2,,,,0,
7,fused_conv2d8_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_relu4,115730944,1,,,,0,
8,fused_conv2d9_subtract4_divide4_expand_dims3_multiply4_expand_dims3_add11_add12_relu4,231361536,2,,,,0,
9,fused_conv2d3_subtract2_divide2_expand_dims1_multiply2_expand_dims1_add5_relu2,231712768,1,,,,0,



Total trials: 0
Total latency (us): 0

2024-11-29 02:29:00 [DEBUG] [task_scheduler.cc:318] 
 ID |                                                                                  Name |      FLOP | Weight | Speed (GFLOPS) | Latency (us) | Weighted Latency (us) | Trials | Done 
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
  0 |                                                                    fused_matmul_add13 |   1025000 |      1 |            N/A |          N/A |                   N/A |      0 |      
  1 |                                                                             transpose |         1 |      1 |            N/A |          N/A |                   N/A |      0 |      
  2 |                                                                               reshape |         1 |      1 |            N/A |          N/A |                 

In [7]:
if not IS_IN_CI:
    ex = relax.build(mod, target="cuda")
    dev = tvm.device("cuda", 0)
    vm = relax.VirtualMachine(ex, dev)
    # Need to allocate data and params on GPU device
    gpu_data = tvm.nd.array(np.random.rand(1, 3, 224, 224).astype("float32"), dev)
    gpu_params = [tvm.nd.array(p, dev) for p in params["main"]]
    gpu_out = vm["main"](gpu_data, *gpu_params).numpy()

    print(gpu_out.shape)

TVMError: Traceback (most recent call last):
  9: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::runtime::Module (tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)>::AssignTypedLambda<tvm::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#6}>(tvm::{lambda(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target)#6}, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >)::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  8: tvm::TIRToRuntime(tvm::runtime::Map<tvm::Target, tvm::IRModule, void, void> const&, tvm::Target const&)
  7: tvm::SplitMixedModule(tvm::IRModule, tvm::Target const&, tvm::Target const&)
  6: tvm::ApplyPasses(tvm::IRModule, tvm::transform::Sequential)
  5: tvm::transform::Pass::operator()(tvm::IRModule) const
  4: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  3: tvm::transform::SequentialNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  2: tvm::transform::Pass::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  1: tvm::transform::ModulePassNode::operator()(tvm::IRModule, tvm::transform::PassContext const&) const
  0: tvm::runtime::PackedFuncObj::Extractor<tvm::runtime::PackedFuncSubObj<tvm::runtime::TypedPackedFunc<tvm::IRModule (tvm::IRModule, tvm::transform::PassContext)>::AssignTypedLambda<tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1}>(tvm::tir::transform::VerifyMemory()::{lambda(tvm::IRModule, tvm::transform::PassContext)#1})::{lambda(tvm::runtime::TVMArgs const&, tvm::runtime::TVMRetValue*)#1}> >::Call(tvm::runtime::PackedFuncObj const*, tvm::runtime::TVMArgs, tvm::runtime::TVMRetValue*)
  Did you forget to bind?
    Variable `lv74` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `lv71` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
    Variable `T_add` is directly accessed by host memory (it is not contained in a thread environment or in the function arguments.
  File "/home/yrx/develop/tvm/src/tir/analysis/verify_memory.cc", line 205
RuntimeError: Memory verification failed with the following errors:
# from tvm.script import tir as T

@T.prim_func
def add3(lv71: T.Buffer((T.int64(1), T.int64(512), T.int64(7), T.int64(7)), "float32"), lv74: T.Buffer((T.int64(1), T.int64(512), T.int64(7), T.int64(7)), "float32"), T_add: T.Buffer((T.int64(1), T.int64(512), T.int64(7), T.int64(7)), "float32")):
    T.func_attr({"target": T.target({"arch": "sm_89", "host": {"keys": ["cpu"], "kind": "llvm", "mtriple": "x86_64-unknown-linux-gnu", "tag": ""}, "keys": ["cuda", "gpu"], "kind": "cuda", "max_num_threads": 1024, "tag": "", "thread_warp_size": 32}), "tir.noalias": T.bool(True)})
    for ax1, ax2, ax3 in T.grid(512, 7, 7):
        cse_var_1: T.int32 = ax1 * 49 + ax2 * 7 + ax3
        T_add_1 = T.Buffer((T.int64(25088),), data=T_add.data)
        lv71_1 = T.Buffer((T.int64(25088),), data=lv71.data)
        lv74_1 = T.Buffer((T.int64(25088),), data=lv74.data)
        T_add_1[cse_var_1] = lv71_1[cse_var_1] + lv74_1[cse_var_1]