In [1]:
from __future__ import absolute_import, print_function

import argparse, json, os, requests, sys, time
from io import BytesIO
from os.path import join, isfile
from PIL import Image

from mxnet.gluon.model_zoo import vision
import numpy as np
from matplotlib import pyplot as plt

import tvm
from tvm import te
from tvm import rpc, autotvm, relay
from tvm.contrib import graph_runtime, utils, download
from tvm.contrib.debugger import debug_runtime
from tvm.relay import transform

import vta
from vta.testing import simulator
from vta.top import graph_pack

import torch
import torchvision
from tvm.contrib.download import download_testdata
from neurob_obf_models import custom_cnn_9



# Fail fast if this TVM build was not compiled with RPC=1: the cells below
# reach the device (or a local session) through ``tvm.rpc``.
assert tvm.runtime.enabled("rpc")

  warn(f"Failed to load image Python extension: {e}")


In [2]:
# Read the VTA hardware parameters (3rdparty/vta-hw/config/vta_config.json).
env = vta.get_env()

# ``device="vta"`` runs inference on the FPGA accelerator;
# ``device="arm_cpu"`` keeps inference on the CPU.
device = "vta"
target = env.target_vta_cpu if device != "vta" else env.target

# For every supported network name: [start_pack, stop_pack], i.e. the Relay
# operators at which bit packing (offloading to VTA) begins and ends.
pack_dict = {
    network: ["nn.max_pool2d", stop_op]
    for network, stop_op in (
        ("resnet18_v1", "nn.global_avg_pool2d"),
        ("resnet18", "nn.adaptive_avg_pool2d"),
        ("resnet34", "nn.adaptive_avg_pool2d"),
        ("resnet50", "nn.adaptive_avg_pool2d"),
        ("resnet101", "nn.adaptive_avg_pool2d"),
        ("vgg11", "nn.dense"),
        ("vgg16", "nn.dense"),
        ("resnet34_v1", "nn.global_avg_pool2d"),
        ("resnet18_v2", "nn.global_avg_pool2d"),
        ("resnet34_v2", "nn.global_avg_pool2d"),
        ("resnet50_v2", "nn.global_avg_pool2d"),
        ("resnet101_v2", "nn.global_avg_pool2d"),
        ("mobilenetv2_1.0", "nn.global_avg_pool2d"),
    )
}

# Key into ``pack_dict`` that selects the start/stop packing operators used
# by the compile cell (the Gluon-style alternative would be "resnet18_v1").
model = "resnet18"

In [None]:
# Display the batch size from the VTA environment; the cells below use it as
# the leading dimension of every input tensor.
env.BATCH

In [None]:
# Obtain an RPC session to the execution target and (re)program the FPGA.
#
# Environment variables honoured on real hardware:
#   TVM_TRACKER_HOST / TVM_TRACKER_PORT  - request the device from a tracker
#   VTA_RPC_HOST / VTA_RPC_PORT          - connect to the board directly
#   VTA_BITSTREAM                        - bitstream file to program
# Each variable falls back to the value that used to be hard-coded here, so
# the cell behaves as before when nothing is set.
remote = None
if env.TARGET not in ["sim", "tsim", "intelfocl"]:

    # Get remote from tracker node if environment variable is set.
    # To set up the tracker, you'll need to follow the "Auto-tuning
    # a convolutional network for VTA" tutorial.
    tracker_host = os.environ.get("TVM_TRACKER_HOST", None)
    tracker_port = os.environ.get("TVM_TRACKER_PORT", None)
    # Otherwise, to program a device directly from the host, point
    # VTA_RPC_HOST / VTA_RPC_PORT at the board's RPC server.
    device_host = os.environ.get("VTA_RPC_HOST", "10.42.0.188")
    device_port = os.environ.get("VTA_RPC_PORT", "9091")
    if not tracker_host or not tracker_port:
        remote = rpc.connect(device_host, int(device_port))
    else:
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, int(tracker_port), timeout=10000
        )

    # Bitstream to load; override with VTA_BITSTREAM to try another build
    # without editing the notebook.
    bitstream_path = os.environ.get(
        "VTA_BITSTREAM",
        "/mnt/hgfs/vmware_ubuntu_sf/vta_4x8x8/vta_new_1x16x16_memory_trojan_runtime_sampling.bit",
    )

    # Reconfigure the JIT runtime and FPGA.
    reconfig_start = time.time()
    vta.reconfig_runtime(remote)
    vta.program_fpga(remote, bitstream=bitstream_path)
    reconfig_time = time.time() - reconfig_start
    print("Reconfigured FPGA and RPC runtime in {0:.2f}s!".format(reconfig_time))

# In simulation mode, host the RPC server locally.
else:
    remote = rpc.LocalSession()

    if env.TARGET in ["intelfocl"]:
        # program intelfocl aocx
        vta.program_fpga(remote, bitstream="vta.bitstream")

# Get execution context from remote
ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)

In [3]:
# Size of one flattened input sample: 3 * 224 * 224 = 150528, matching the
# (BATCH, 3, 224, 224) shape used when the model is traced further down.
input_features = 150528
# Obfuscation settings passed positionally to ``custom_cnn_9`` in the next
# cell. Their exact meaning is defined in ``neurob_obf_models`` (not visible
# here) — presumably one entry per layer; verify against that module.
widen_list = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
decompo_list = [0, 0, 0, 0, 2, 0, 1, 1, 2, 0, 2, 0, 4, 0, 0, 4, 4, 0, 0, 0, 4]
dummy_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
deepen_list = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0]
skipcon_list = [0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1]
# NOTE(review): kerneladd_list looks one entry shorter than the lists above —
# confirm this is what custom_cnn_9 expects.
kerneladd_list = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
fuse_list = [10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 5.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 8.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 5.0, 10.0, 10.0, 10.0, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
# NOTE(review): neither ``fuse_list`` (previous line) nor ``prune_list`` is
# passed to ``custom_cnn_9`` or referenced by any other visible cell —
# confirm whether they are still needed.
prune_list = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]



In [4]:
# Instantiate the obfuscated CNN from the settings defined in the previous
# cell. The meaning of the positional ``True`` flag is defined by
# ``custom_cnn_9`` itself (not visible here).
obf_model = custom_cnn_9(
    input_features,
    True,
    widen_list,
    decompo_list,
    dummy_list,
    deepen_list,
    skipcon_list,
    kerneladd_list,
)

# Random batch of flattened inputs used to smoke-test the model in PyTorch.
X = torch.randn(env.BATCH, input_features)

In [5]:
# Switch the model to inference mode, then run one forward pass on the random
# batch so the PyTorch output is displayed before compiling with TVM.
obf_model.eval()
obf_model(X)

To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /opt/conda/conda-bld/pytorch_1646755853042/work/aten/src/ATen/native/BinaryOps.cpp:607.)
  X1_0 = self.conv12_0(X1[:, :int(torch.floor_divide(X1_shape[1],4)), :, :])


tensor([[ 0.0016, -0.0010, -0.0006, -0.0003,  0.0035, -0.0006,  0.0012, -0.0050,
          0.0020, -0.0030]], grad_fn=<AddBackward0>)

In [None]:
import glob

# AutoTVM tuning logs for the VTA configuration being deployed. Other
# configurations live in sibling directories of ../logs/tuning_logs/
# (vta_2x16x16, vta_1x8x32, vta_4x8x8); switch the directory to match the
# bitstream in use.
# Sorted so the list handed to AutoTVM is deterministic across runs
# (glob.glob returns matches in arbitrary order).
schedule_log_files = sorted(glob.glob(r'../logs/tuning_logs/vta_1x16x16/*.log'))

# An empty match would otherwise silently drop the custom schedules.
if not schedule_log_files:
    print("WARNING: no tuning logs matched ../logs/tuning_logs/vta_1x16x16/*.log; "
          "only the default TopHub schedules will be used.")

In [None]:
# Load pre-configured AutoTVM schedules
# Trace the obfuscated PyTorch model, import it into Relay, quantize and
# graph-pack it when targeting VTA, build it, and load the resulting library
# on the remote device. Everything runs inside the TopHub context so the
# AutoTVM schedules (plus the extra tuning logs) are applied during the build.
with autotvm.tophub.context(target, extra_files=schedule_log_files):

    input_name = "input0"

    # Populate the shape and data type dictionary for the classifier input
    dtype_dict = {input_name: "float32"}
    shape_dict = {input_name: (env.BATCH, 3, 224, 224)}

    # Trace in inference mode. ``eval()`` is idempotent and returns the module,
    # so this cell no longer depends on an earlier cell having switched modes.
    pytorch_model = obf_model.eval()

    input_shape = [env.BATCH, 3, 224, 224]
    input_data = torch.randn(input_shape)
    scripted_model = torch.jit.trace(pytorch_model, input_data)

    shape_list = [(input_name, input_shape)]

    # Measure build start time
    build_start = time.time()

    # Start front end compilation
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)

    # Update shape and type dictionary
    shape_dict.update({k: v.shape for k, v in params.items()})
    dtype_dict.update({k: str(v.dtype) for k, v in params.items()})

    if target.device_name == "vta":
        # Perform quantization in Relay
        # Note: We set opt_level to 3 in order to fold batch norm
        with tvm.transform.PassContext(opt_level=3):
            with relay.quantize.qconfig(global_scale=8.0, skip_conv_layers=[0]):
                mod = relay.quantize.quantize(mod, params=params)
                print(mod.astext(show_meta_data=False))
            # Perform graph packing and constant folding for VTA target;
            # do device annotation if target is intelfocl.
            # ``pack_dict[model]`` supplies the operators where packing starts
            # and stops.
            relay_prog = graph_pack(
                mod["main"],
                env.BATCH,
                env.BLOCK_IN,
                env.BLOCK_OUT,
                env.WGT_WIDTH,
                start_name=pack_dict[model][0],
                stop_name=pack_dict[model][1],
                device_annot=(env.TARGET == "intelfocl"),
            )
    else:
        relay_prog = mod["main"]

    # Keep the global ``target`` untouched: rebinding it to a dict (as needed
    # for intelfocl) would break ``target.device_name`` and the TopHub context
    # the next time this cell is run.
    build_target = target

    # Compile Relay program with AlterOpLayout disabled
    if target.device_name != "vta":
        with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
            graph, lib, params = relay.build(
                relay_prog, target=build_target, params=params, target_host=env.target_host
            )
    else:
        if env.TARGET == "intelfocl":
            # multiple targets to run both on cpu and vta
            build_target = {"cpu": env.target_vta_cpu, "ext_dev": target}
        with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
            graph, lib, params = relay.build(
                relay_prog, target=build_target, params=params, target_host=env.target_host
            )

    # Measure Relay build time
    build_time = time.time() - build_start
    print(model + " inference graph built in {0:.2f}s!".format(build_time))

    # Send the inference library over to the remote RPC server
    temp = utils.tempdir()
    lib.export_library(temp.relpath("graphlib.tar"))
    remote.upload(temp.relpath("graphlib.tar"))
    lib = remote.load_module("graphlib.tar")

    if env.TARGET == "intelfocl":
        ctxes = [remote.ext_dev(0), remote.cpu(0)]
        m = graph_runtime.create(graph, lib, ctxes)
    else:
        # Graph runtime
        m = graph_runtime.create(graph, lib, ctx)

In [None]:
# Interactive inspection: show the help text of the module loaded on the remote.
help(lib)

In [None]:
# Dump the Relay module in text form (without metadata) for inspection; on the
# VTA path ``mod`` is the quantized module produced by the compile cell.
print(mod.astext(show_meta_data=False))

In [None]:
import ast

# Download ImageNet categories
categ_url = "https://github.com/uwsampl/web-data/raw/main/vta/models/"
categ_fn = "synset.txt"
download.download(join(categ_url, categ_fn), categ_fn)
# The file holds a Python literal. Parse it with ast.literal_eval rather than
# eval() so downloaded content can never execute code, and close the file
# handle deterministically.
with open(categ_fn) as synset_file:
    synset = ast.literal_eval(synset_file.read())

# Download test image
image_url = "https://homes.cs.washington.edu/~moreau/media/vta/cat.jpg"
image_fn = "cat.png"
download.download(image_url, image_fn)

# Prepare test image for inference
image = Image.open(image_fn).resize((224, 224))
plt.imshow(image)
plt.show()
# Per-channel mean subtraction and scaling, then HWC -> CHW and a leading
# batch axis repeated env.BATCH times.
image = np.array(image) - np.array([123.0, 117.0, 104.0])
image /= np.array([58.395, 57.12, 57.375])
image = image.transpose((2, 0, 1))
image = image[np.newaxis, :]
image = np.repeat(image, env.BATCH, axis=0)

# Set the network parameters and inputs
m.set_input(**params)
m.set_input(input_name, image)

In [None]:


# Perform inference and gather execution statistics
# More on: :py:method:`tvm.runtime.Module.time_evaluator`
num = 4  # number of times we run module for a single measurement
rep = 3  # number of measurements (we derive std dev from this)

timer = m.module.time_evaluator("run", ctx, number=num, repeat=rep)

# Accumulators for per-run timing statistics. Nothing in this cell fills them
# (the timing / power-monitor experiments are disabled), but a later cell
# displays ``means``.
stds = []
means = []

if env.TARGET in ["sim", "tsim"]:
    simulator.clear_stats()
    timer()
    sim_stats = simulator.stats()
    print("\nExecution statistics:")
    for k, v in sim_stats.items():
        # Since we execute the workload many times, we need to normalize stats
        # Note that there is always one warm up run
        # Therefore we divide the overall stats by (num * rep + 1)
        print("\t{:<16}: {:>16}".format(k, v // (num * rep + 1)))
else:
    # On hardware, run a single un-timed inference. To time it instead, call
    # ``tcost = timer()`` and use ``tcost.mean`` / ``np.std(tcost.results)``.
    m.run()

# Get classification results.
# Let the runtime allocate the output instead of assuming a (BATCH, 1000)
# buffer: the compiled model need not have 1000 classes.
tvm_output = m.get_output(0)
scores = tvm_output.asnumpy()  # copy back from the device once, not per sample
num_classes = scores.shape[-1]
# ImageNet labels (and the cat sanity check) only make sense when the model
# emits one score per synset entry.
has_imagenet_labels = num_classes == len(synset)
top_k = min(5, num_classes)

for b in range(env.BATCH):
    top_categories = np.argsort(scores[b])
    # Report top-5 classification results
    print("\n{} prediction for sample {}".format(model, b))
    for rank in range(1, top_k + 1):
        class_idx = top_categories[-rank]
        if has_imagenet_labels:
            label = synset[class_idx]
        else:
            label = "class {}".format(class_idx)
        print("\t#{}:".format(rank), label)
    if has_imagenet_labels:
        # This just checks that one of the 5 top categories
        # is one variety of cat; this is by no means an accurate
        # assessment of how quantization affects classification
        # accuracy but is meant to catch changes to the
        # quantization pass that would degrade accuracy in the CI.
        cat_detected = any("cat" in synset[k] for k in top_categories[-5:])
        assert cat_detected

In [None]:
# Display the collected mean timings. The inference cell initialises this list
# but does not append to it, so it is empty unless filled elsewhere.
means


In [None]:
# Interactive reference: show the documentation of the time_evaluator API used
# to build ``timer`` in the inference cell.
help(tvm.runtime.Module.time_evaluator)

In [None]:
import subprocess
import os
import signal

# Start the register poller on the board over ssh. The trailing ">>" and file
# name are part of the remote command line, so the redirection into
# tmp_ro_csvs/test.csv happens on the board.
#
# The password is handed to sshpass through the SSHPASS environment variable
# (``sshpass -e``) rather than ``-p <password>``, which keeps it out of the
# local process list and lets it be overridden without editing the notebook.
# The previous hard-coded value is kept as the fallback.
ssh_env = dict(os.environ)
ssh_env.setdefault("SSHPASS", "xilinx")

proc = subprocess.Popen(
    [
        "sshpass", "-e", "ssh", "-t", "xilinx@{}".format(device_host),
        "sudo", "python3", "/home/xilinx/tvm_il/vta/python/vta/read_trojan.py",
        "--base-address", "0xa0010000", "--offset", "0x0008",
        "--poll", "--auto-stop",
        ">>", "tmp_ro_csvs/test.csv",
    ],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    env=ssh_env,
)

In [None]:
# Kill the local child process held in ``proc`` (the sshpass/ssh client started
# in the previous cell).
# NOTE(review): this only signals the local process — confirm the remote
# poller actually stops when the ssh session goes away.
proc.kill()

In [None]:
# Copy the recorded CSV from the board into the current directory.
# The password goes through SSHPASS (``sshpass -e``) instead of the command
# line; the previous hard-coded value is kept as the fallback.
scp_env = dict(os.environ)
scp_env.setdefault("SSHPASS", "xilinx")

proc = subprocess.Popen(
    ["sshpass", "-e", "scp", "xilinx@{}:/home/xilinx/tmp_ro_csvs/test.csv".format(device_host), "./"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    env=scp_env,
)

# Wait for the transfer to finish: a later cell reads ./test.csv and would
# otherwise race with an scp that is still running in the background.
scp_stdout, scp_stderr = proc.communicate()
if proc.returncode != 0:
    raise RuntimeError(
        "scp of test.csv failed with exit code {}: {}".format(
            proc.returncode, scp_stderr.decode("utf-8", errors="replace").strip()
        )
    )

In [None]:
# Delete the recorded CSVs on the board. The "*" is passed through ssh as part
# of the remote command line, so it is expanded on the board.
# The password goes through SSHPASS (``sshpass -e``) instead of the command
# line; the previous hard-coded value is kept as the fallback.
rm_env = dict(os.environ)
rm_env.setdefault("SSHPASS", "xilinx")

proc = subprocess.Popen(
    ["sshpass", "-e", "ssh", "-t", "xilinx@{}".format(device_host), "rm", "tmp_ro_csvs/*"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    env=rm_env,
)

In [None]:
import pandas as pd
# Load the CSV that the scp cell copies into the current directory.
# NOTE(review): the file is produced by the remote poller — confirm whether
# its first row is a header, since read_csv treats it as one by default.
df1 = pd.read_csv("test.csv")

In [None]:
# Number of rows loaded from test.csv.
len(df1)

In [None]:
# Read every line from the stdout pipe of the process currently bound to
# ``proc`` (returns once that process closes its stdout).
# NOTE(review): when the notebook is run top to bottom, ``proc`` is the remote
# ``rm`` command from the preceding subprocess cell, not the poller — confirm
# which process this is meant to read.
lines = proc.stdout.readlines()

In [None]:
# Display the raw (bytes) lines captured from the subprocess.
lines

In [None]:
# Convert the raw subprocess output (a list of bytes lines) into integers.
# The decoded text is what gets parsed (the decode result used to be thrown
# away), and blank lines are skipped instead of raising ValueError.
readings = []
for line in lines:
    text = line.decode("utf-8").strip()
    if text:
        readings.append(int(text))

In [None]:
import pandas as pd

# Wrap the parsed integer readings in a single-column DataFrame named "data".
df = pd.DataFrame(readings, columns=['data'])

In [None]:
# Display the readings DataFrame.
df