In [1]:
%env CONDUCTOR_PATH=/home/damian/.conductor

import time
import numpy as np
import mxnet as mx
import gluonnlp as nlp
import tvm
from tvm import relay
import tvm.contrib.graph_runtime as runtime
import conductor
from conductor.mediation import Tasker

env: CONDUCTOR_PATH=/home/damian/.conductor


In [2]:
def timer(thunk, repeat=1, number=10, dryrun=3, min_repeat_ms=1000):
    """Time ``thunk`` and return its average per-call latency in milliseconds.

    Parameters
    ----------
    thunk : callable
        Zero-argument callable to benchmark.
    repeat : int
        Number of measurements to collect (length of the returned list).
    number : int
        Initial number of ``thunk`` calls per timed batch. It is expected to
        be positive. Whenever a batch finishes in less than ``min_repeat_ms``
        the batch size is enlarged and the batch is re-run; the enlarged size
        is kept for the following measurements.
    dryrun : int
        Number of untimed warm-up calls made before any measurement.
    min_repeat_ms : float
        Minimum wall time, in milliseconds, a batch must take to be accepted.

    Returns
    -------
    list of float
        One entry per measurement: batch time in ms divided by the number of
        calls in that batch.
    """
    # Warm-up calls; nothing is recorded for them.
    for _ in range(dryrun):
        thunk()
    ret = []
    for _ in range(repeat):
        while True:
            # perf_counter() is monotonic and high-resolution; time.time() can
            # jump with system clock adjustments and may tick too coarsely.
            beg = time.perf_counter()
            for _ in range(number):
                thunk()
            end = time.perf_counter()
            lat = (end - beg) * 1e3
            if lat >= min_repeat_ms:
                break
            # Batch was too short: estimate how many calls are needed to reach
            # min_repeat_ms. If the clock did not advance at all there is no
            # rate to extrapolate from, so rely on geometric growth instead of
            # dividing by zero.
            estimate = min_repeat_ms / (lat / number) + 1 if lat > 0 else 0
            # ``number + 1`` guarantees progress even when number * 1.618
            # truncates back to the same integer (e.g. number == 1).
            number = int(max(estimate, number * 1.618, number + 1))
        ret.append(lat / number)
    return ret


# Benchmark configuration: problem size, element type and target device.
seq_length = 128
batch = 1
dtype = "float32"
mx_ctx = mx.gpu(0)

# Build the BERT backbone without requesting pretrained weights
# (pretrained=False); only the pooler head is enabled.
model_name = 'bert_12_768_12'
dataset = 'book_corpus_wiki_en_uncased'
bert, _ = nlp.model.get_model(
    name=model_name,
    ctx=mx.cpu(0),
    dataset_name=dataset,
    pretrained=False,
    use_pooler=True,
    use_decoder=False,
    use_classifier=False,
)

# Wrap the backbone in a 2-class classifier, initialize it on the benchmark
# device and hybridize it with static allocation.
model = nlp.model.BERTClassifier(bert, dropout=0.1, num_classes=2)
model.initialize(ctx=mx_ctx)
model.hybridize(static_alloc=True)

# Random host-side inputs, all cast to `dtype`:
#   inputs       - integers drawn from [0, 2000), shape (batch, seq_length)
#   token_types  - uniform samples from [0, 1), shape (batch, seq_length)
#   valid_length - `seq_length` repeated once per batch element
inputs = np.random.randint(low=0, high=2000, size=(batch, seq_length)).astype(dtype)
token_types = np.random.uniform(size=(batch, seq_length)).astype(dtype)
valid_length = np.full(shape=(batch,), fill_value=seq_length).astype(dtype)



In [3]:
# Copy the three host arrays onto the benchmark device as MXNet NDArrays.
inputs_nd, token_types_nd, valid_length_nd = (
    mx.nd.array(host_array, ctx=mx_ctx)
    for host_array in (inputs, token_types, valid_length)
)

# One reference forward pass; wait_to_read() blocks until the result is ready.
mx_out = model(inputs_nd, token_types_nd, valid_length_nd)
mx_out.wait_to_read()

# Benchmark the MXNet latency. Each timed call waits on the output so the
# measurement covers the whole forward pass, not just its launch.
res = timer(
    lambda: model(inputs_nd, token_types_nd, valid_length_nd).wait_to_read(),
    repeat=3,
    dryrun=5,
    min_repeat_ms=1000,
)
print(f"MXNet latency for batch {batch} and seq length {seq_length}: {np.mean(res):.2f} ms")

MXNet latency for batch 1 and seq length 128: 7.97 ms


In [4]:
# Input shapes handed to the Relay MXNet frontend, keyed by input name.
shape_dict = dict(
    data0=(batch, seq_length),
    data1=(batch, seq_length),
    data2=(batch,),
)

# Convert the Gluon model into a Relay module plus its parameters.
mod, params = relay.frontend.from_mxnet(model, shape_dict)

# Extract the tensor programs for the "cuda" / "llvm" pair under the name
# "bert", then report and save each one.
# NOTE(review): argument meanings are inferred from the call site - confirm
# against the conductor Tasker API.
tasks = Tasker.extract_tensor_programs(
    mod,
    params,
    "bert",
    "cuda",
    "llvm",
)
for t in tasks:
    print(t.identifier)
    t.save()

def @main(%bertmodel0_word_embed_embedding0_weight: Tensor[(30522, 768), float32], %data0: Tensor[(1, 128), float32], %bertmodel0_token_type_embed_embedding0_weight: Tensor[(2, 768), float32], %data1: Tensor[(1, 128), float32], %bertencoder0_position_weight: Tensor[(512, 768), float32], %bertencoder0_layernorm0_gamma: Tensor[(768), float32], %bertencoder0_layernorm0_beta: Tensor[(768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_query_weight: Tensor[(768, 768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_key_weight: Tensor[(768, 768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_value_weight: Tensor[(768, 768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_query_bias: Tensor[(768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_key_bias: Tensor[(768), float32], %bertencoder0_transformer0_dotproductselfattentioncell0_value_bias: Tensor[(768), float32], %data2: Tensor[(1), float32], %b

: 