In [1]:
import numpy as np

import tvm
from tvm import relay, auto_scheduler
from tvm.relay import data_dep_optimization as ddo
import tvm.relay.testing
from tvm.contrib import graph_executor

In [2]:
def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False):
    """Get the symbol definition and weights of a named network.

    Parameters
    ----------
    name : str
        Network to build: "resnet-<N>", "resnet3d-<N>", "mobilenet",
        "squeezenet_v1.1", "inception_v3", "mxnet" or "mlp".
    batch_size : int
        Leading dimension of the returned input and output shapes.
    layout : str, optional
        2D image layout, "NHWC" or "NCHW". For "resnet3d-<N>" it is mapped
        to the matching 3D layout ("NDHWC" / "NCDHW").
    dtype : str, optional
        Data type forwarded to the workload builders.
    use_sparse : bool, optional
        When true, the dense model is converted to a sparse one (bs_r=4,
        random parameters) before being returned.

    Returns
    -------
    tuple
        ``(mod, params, input_shape, output_shape)``.

    Raises
    ------
    ValueError
        If ``layout`` is not "NHWC"/"NCHW" or ``name`` is not recognised.
    AssertionError
        If "squeezenet_v1.1" or "mxnet" is requested with a non-NCHW layout.
    """

    # auto-scheduler works better with the NHWC layout
    if layout == "NHWC":
        image_shape = (224, 224, 3)
    elif layout == "NCHW":
        image_shape = (3, 224, 224)
    else:
        raise ValueError("Invalid layout: " + layout)

    input_shape = (batch_size,) + image_shape
    output_shape = (batch_size, 1000)

    if name.startswith("resnet-"):
        n_layer = int(name.split("-")[1])
        mod, params = relay.testing.resnet.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name.startswith("resnet3d-"):
        n_layer = int(name.split("-")[1])
        # This branch used to call the 2D ResNet builder (copy-paste of the
        # branch above). The 3D builder needs a depth (frame) axis, so the
        # 2D layout and image shape are extended to their 3D counterparts.
        depth = 16
        if layout == "NHWC":
            layout_3d = "NDHWC"
            image_shape_3d = (depth,) + image_shape
        else:
            layout_3d = "NCDHW"
            image_shape_3d = image_shape[:1] + (depth,) + image_shape[1:]
        input_shape = (batch_size,) + image_shape_3d
        mod, params = relay.testing.resnet_3d.get_workload(
            num_layers=n_layer,
            batch_size=batch_size,
            layout=layout_3d,
            dtype=dtype,
            image_shape=image_shape_3d,
        )
    elif name == "mobilenet":
        mod, params = relay.testing.mobilenet.get_workload(
            batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
        )
    elif name == "squeezenet_v1.1":
        assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
        mod, params = relay.testing.squeezenet.get_workload(
            version="1.1",
            batch_size=batch_size,
            dtype=dtype,
            image_shape=image_shape,
        )
    elif name == "inception_v3":
        # NOTE(review): `layout` only affects the reported input_shape here; it is
        # not forwarded to the inception_v3 builder — confirm the two agree for NHWC.
        input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
    elif name == "mxnet":
        # An example of importing an MXNet model (imported lazily so MXNet
        # is only required when this branch is used)
        from mxnet.gluon.model_zoo.vision import get_model
        assert layout == "NCHW"

        block = get_model("resnet50_v1", pretrained=True)
        mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
        # Wrap the imported function body in a softmax
        net = mod["main"]
        net = relay.Function(
            net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
        )
        mod = tvm.IRModule.from_expr(net)
    elif name == "mlp":
        mod, params = relay.testing.mlp.get_workload(
            batch_size=batch_size, dtype=dtype, image_shape=image_shape, num_classes=1000
        )
    else:
        raise ValueError("Network not found.")

    if use_sparse:
        from tvm.topi.sparse.utils import convert_model_dense_to_sparse

        mod, params = convert_model_dense_to_sparse(mod, params, bs_r=4, random_params=True)

    return mod, params, input_shape, output_shape

# Choose the neural network to tune and the compilation target.
# If the target machine supports avx512 instructions, replace
# "llvm -mcpu=core-avx2" with "llvm -mcpu=skylake-avx512".
network = "resnet-50"
batch_size = 1
layout = "NHWC"
dtype = "float32"
use_sparse = False
target = tvm.target.Target("llvm -mcpu=core-avx2 --num-cores=1")
# Tuning records are written to this file
log_file = f"{network}-{layout}-B{batch_size:d}-{target.kind.name}.json"

In [3]:
# Build the model, then extract the tuning tasks from it
print("Get model...")
mod, params, input_shape, output_shape = get_network(
    name=network,
    batch_size=batch_size,
    layout=layout,
    dtype=dtype,
    use_sparse=use_sparse,
)

print("Extract tasks...")
main_func = mod["main"]
tasks, task_weights = auto_scheduler.extract_tasks(main_func, params, target)

# Optional: uncomment to print every extracted task and its compute DAG
# for idx, task in enumerate(tasks):
#     print("========== Task %d  (workload key: %s) ==========" % (idx, task.workload_key))
#     print(task.compute_dag)

Get model...
Extract tasks...


In [None]:
def run_tuning(trials=50):
    """Tune the extracted tasks with the auto-scheduler task scheduler.

    Reads the notebook-level ``tasks``, ``task_weights``, ``log_file`` and
    ``use_sparse``. Measurement records are reported through a
    ``RecordToFile(log_file)`` callback.

    Parameters
    ----------
    trials : int, optional
        Value passed as ``num_measure_trials``. Defaults to 50, the value
        that used to be hard-coded; raise it to 20000 for best performance.
    """
    print("Begin tuning...")
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=trials,  # change this to 20000 to achieve the best performance
        runner=auto_scheduler.LocalRunner(repeat=10, enable_cpu_cache_flush=True),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )

    if not use_sparse:
        tuner.tune(tune_option)
        return

    # Sparse models: one sketch policy per task, initialised with the sparse sketch rules
    from tvm.topi.sparse.utils import sparse_sketch_rules

    search_policy = [
        auto_scheduler.SketchPolicy(
            task,
            program_cost_model=auto_scheduler.XGBModel(),
            init_search_callbacks=sparse_sketch_rules(),
        )
        for task in tasks
    ]

    tuner.tune(tune_option, search_policy=search_policy)

# Tuning takes a long time, which is why it is not run on the web server.
# NOTE(review): the original comment said to uncomment the line below, but the
# call is already active; comment it out to skip tuning.
run_tuning()

Begin tuning...
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
-----------------------------------------------------------------------------------------------------------------
|    0 |                          vm_mod_fused_nn_conv2d_add_nn_relu_5 |            - |              - |      0 |
|    1 |                                    vm_mod_fused_nn_conv2d_add |            - |              - |      0 |
|    2 |                                        vm_mod_fused_nn_conv2d |            - |              - |      0 |
|    3 |                          vm_mod_fused_nn_conv2d_add_nn_relu_1 |            - |              - |      0 |
|    4 |                                     vm_mod_fused_nn_dense_add |            - |              - |      0 |
|    5 |                        vm_mod_fused_nn_conv2d_add_add_nn_relu |            - |              - |      0 |
|    6 |                                      vm_mod_fused_nn_conv2d_3 |

Placeholder: p0, p1
parallel ax0@ax1@ (0,56)
  pool_max auto_unroll: 16
  for ax0 (None)
    for ax1 (None)
      for ax2 (None)
        for ax3 (None)
          for rv0 (None)
            for rv1 (None)
              pool_max = ...
  for ax2 (0,56)
    for ax0 (None)
      for ax1 (None)
        for ax2 (None)
          for ax3 (None)
            pad_temp = ...
    for ax3 (0,64)
      T_relu = ...

with: [00:59:15] /home/yrx/develop/tvm/src/te/schedule/bound.cc:175: InternalError: Check failed: (found_attach || stage_attach.size() == 0) is false: Invalid Schedule, cannot find the producer compute(pad_temp, body=[T.if_then_else(ax1 >= 1 and ax1 < 113 and ax2 >= 1 and ax2 < 113, p0[ax0, ax1 - 1, ax2 - 1, ax3], T.float32(-340282346638528859811704183484516925440.0))], axis=[T.iter_var(ax0, T.Range(0, 1), "DataPar", ""), T.iter_var(ax1, T.Range(0, 114), "DataPar", ""), T.iter_var(ax2, T.Range(0, 114), "DataPar", ""), T.iter_var(ax3, T.Range(0, 64), "DataPar", "")], reduce_axis=[], tag=ele

GA Iter: 4	Max score: 0.9999	Min score: 0.9999	#Pop: 2	#M+: 389	#M-: 6587
EvolutionarySearch		#s: 2	Time elapsed: 7.75
----------------------------------------------------------------------
------------------------------  [ Measure ]
----------------------------------------------------------------------
Get 1 programs to measure:
.*
Time elapsed for measurement: 2.63 s
----------------------------------------------------------------------
------------------------------  [ Train cost model ]
----------------------------------------------------------------------
Time elapsed for training: 0.11 s
----------------------------------------------------------------------
------------------------------  [ Task Scheduler ]
----------------------------------------------------------------------
|  ID  |                       Task Description                        | Latency (ms) | Speed (GFLOPS) | Trials |
--------------------------------------------------------------------------------------------