In [None]:
# Path of the compiled DPU model (.xmodel) that is passed to `overlay.load_model` in the next cell.
model_file="./ldfgnet.xmodel"

In [2]:
# Create the DPU overlay from `dpu.bit`, load the model file into it, and keep
# a handle to the runner that later cells use for inference.
from pynq_dpu import DpuOverlay
overlay = DpuOverlay("dpu.bit")
overlay.load_model(model_file)
dpu = overlay.runner
# Printing the runner is a quick sanity check that a runner object was created.
print(dpu)

vart::Runner@0xaaab15e83060


## 2. Utility functions

In this section, we will prepare a few functions for later use.

In [3]:
import math
import os
import time
from glob import glob

import cv2
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display
from PIL import Image
from tqdm import tqdm

%matplotlib inline

### Power rails reported by the PYNQ PMBus interface on the ZCU104

Each entry lists the reported rail label followed by its description (where the label is numeric it encodes the expected voltage, e.g. `1V8` = 1.8 V):

- `12V`: main input power.
- `INT`: internal power rail.
- `1V8`: common auxiliary rail for the FPGA and peripherals (VCCAUX, VCCO, etc.).
- `1V2`: powers DDR4 memory (VCC_DDR4) and transceiver PLLs (VCC_PLL).
- `MGTA`: likely VCC_GTX, the power for the GT transceivers.
- `3V3`: supplies general I/O (VCCO for certain banks), MIO, and peripherals.
- `1V13`: could be an adaptive supply for a specific FPGA block (possibly VCCINT or VCCBRAM).
- `5V0`: powers USB ports, some peripherals, and auxiliary circuits.
- `FMC` (default): VADJ_FMC, the configurable voltage for the FMC connector.
- `MGTRA`: another GT transceiver rail, similar to MGTA.

rails['12V'].power, rails['INT'].power, rails['1V8'].power, rails['1V2'].power, rails['MGTA'].power, rails['3V3'].power, 
rails['1V13'].power, rails['5V0'].power, rails['FMC'].power, rails['MGTRA'].power

VCCINT (0.85V) → Most Important
Powers the FPGA fabric, where the DPU executes computations.
DPU inference primarily runs here, consuming most of the dynamic power.
Power = VCCINT × Current (I_VCCINT) → Directly linked to DPU workload.

12V INT (Total Board Power)
Includes power losses in regulators and peripheral components (USB, PMIC, clocks, etc.).
Good for estimating total board power but not precise for isolating DPU inference power.

✅ VCCINT (0.85V) → Most directly linked to DPU computations.
✅ VCC_DDR4 (1.2V) → Significant for memory-intensive workloads.
✅ 12V INT → Good for estimating overall board power.



12V (Main Input Power)
This is the primary 12V supply coming into the board. It powers all onboard regulators that generate lower voltages and is used to estimate total board power consumption.

INT (Internal Power Rail)
Could refer to VCCINT (0.85V), which powers the FPGA fabric. Alternatively, it might represent the total internal power consumption after conversion losses from 12V.
If labeled separately, it might be measured after regulation (e.g., post-PMIC power delivery).

Which One to Monitor for DPU Inference Power?
If "INT" refers to VCCINT, then it is the best indicator of DPU power usage.

In [4]:
from pynq import pmbus, DataRecorder

# Labels of the PMBus rails whose power sensors are logged while the notebook runs.
rail_labels = ('12V', 'INT', '1V8', '1V2', 'MGTA', '3V3', '1V13', '5V0', 'FMC', 'MGTRA')

rails = pmbus.get_rails()
recorder = DataRecorder(*[rails[label].power for label in rail_labels])
recorder.reset()
# 0.2 is the sampling interval handed to the recorder (presumably seconds - confirm against the PYNQ docs).
recorder.record(0.2)

<pynq.pmbus.DataRecorder at 0xffff59aa1870>

In [None]:
def read_dataset(colitis, normal, polyps, esophag, pylorus, height,width):
    """Load five per-class image path lists into one image array plus integer labels.

    The lists are consumed in lock-step: each iteration reads one image from
    every class, so the output is interleaved (colitis, normal, polyps,
    esophag, pylorus, colitis, ...). Iteration stops at the shortest list.

    Args:
        colitis, normal, polyps, esophag, pylorus: sequences of image file
            paths; images from them receive class indices 0, 1, 2, 3 and 4
            respectively.
        height: target image height in pixels.
        width: target image width in pixels.

    Returns:
        ``[x_data, y_data]`` where ``x_data`` is a float32 array of shape
        ``(n, height, width, 3)`` holding RGB pixel values divided by 255, and
        ``y_data`` is an array of the ``n`` integer class indices (not one-hot).

    Raises:
        OSError: if ``cv2.imread`` cannot decode one of the paths.
    """
    class_paths = (colitis, normal, polyps, esophag, pylorus)
    imgdata = []
    imgclass = []

    # zip() stops at the shortest list, so that is the real number of iterations.
    n_groups = min(len(paths) for paths in class_paths)

    for group in tqdm(zip(*class_paths), total=n_groups):
        for clasnum, path in enumerate(group):
            img_bgr = cv2.imread(path, cv2.IMREAD_COLOR)
            if img_bgr is None:
                # imread signals failure by returning None instead of raising.
                raise OSError("cv2.imread could not read image: {}".format(path))
            img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
            # cv2.resize expects dsize as (width, height); the resulting array
            # has shape (height, width, 3), which matches the reshape below.
            imgdata.append(cv2.resize(img_rgb, (width, height)))
            imgclass.append(clasnum)

    x_data = (np.array(imgdata) / 255.0).astype(np.float32)
    # A no-op for non-empty input; gives an empty input the documented 4-D shape.
    x_data = x_data.reshape(x_data.shape[0], height, width, 3)
    y_data = np.array(imgclass)

    return [x_data, y_data]


# Time the whole loading stage: file discovery, decoding and resizing.
rdstart = time.time()
test_path = './dataset/'

# One sorted list of .jpg paths per class sub-directory.
test_xim, test_yim, test_zim, test_zaim, test_zbim = (
    sorted(glob(os.path.join(test_path, class_dir, "*.jpg")))
    for class_dir in ("colitis", "normal", "polyps", "esophag", "pylorus")
)

# Read the test images at the resolution requested here.
height, width = 128, 128
x_test, y_test = read_dataset(test_xim, test_yim, test_zim, test_zaim, test_zbim, height, width)

separator = '----------------------------------------'
print(separator)
print('test shape :', x_test.shape)
print(separator)
print('test label shape :', y_test.shape)
print(separator)

# `images` is the name the inference cells read from.
images = x_test
rdstop = time.time()
print('The read dataset time: ',rdstop-rdstart,' sec')

Let's first define a few useful preprocessing functions. These functions
will make sure the DPU can take input images with arbitrary sizes.

We will also define a few functions to calculate softmax and provide 
the output class after running a DPU task.

In [None]:
def CPUCalcSoftmax(data,size):
    """Return the softmax of the first ``size`` entries of ``data``.

    Args:
        data: indexable sequence of numeric scores (logits).
        size: number of leading entries of ``data`` to include.

    Returns:
        A list of ``size`` floats that sum to 1; an empty list when
        ``size`` is not positive.
    """
    if size <= 0:
        return []

    logits = [float(data[i]) for i in range(size)]
    # Shift by the largest logit so every exponent is <= 0: the result is
    # mathematically unchanged, but math.exp can no longer overflow.
    peak = max(logits)
    exps = [math.exp(value - peak) for value in logits]
    total = sum(exps)
    return [e / total for e in exps]

def predict_label(softmax):
    """Return the index of the largest score in ``softmax`` (first index on ties)."""
    scores = np.asarray(softmax)
    return scores.argmax()


Keep in mind that our original images are 640x480, so they need to be resized
to match the model's input shape before inference.

## 3. Use VART
Now we should be able to use VART to do image classification.

In [None]:
# Runner handle and the tensor descriptors it exposes.
dpu = overlay.runner
inputTensors = dpu.get_input_tensors()
outputTensors = dpu.get_output_tensors()

first_input, first_output = inputTensors[0], outputTensors[0]
shapeIn = tuple(first_input.dims)
shapeOut = tuple(first_output.dims)
# Output data size divided by the leading input dimension
# (presumably the batch size, giving elements per sample - confirm).
outputSize = int(first_output.get_data_size() / shapeIn[0])

softmax = np.empty(outputSize)
print(shapeIn, shapeOut, outputSize, sep="\n")

# Input/output buffers allocated once here and reused by every call to run().
input_data = [np.empty(shapeIn, dtype=np.float32, order="C")]
output_data = [np.empty(shapeOut, dtype=np.float32, order="C")]
# `image` aliases the input buffer, so writing into it fills `input_data[0]`.
image = input_data[0]


We can define a few buffers to store input and output data. They will be reused
during multiple runs.

Remember that the preprocessed test images are stored in `images`. 
We can now define a new function `run()` which takes the image index as 
the input and calculates the softmax as the classification result.
With the argument `display` set to `True`, the image as well as the
predicted label is rendered.

The valid range of `image_index` is [0, `len(images)`-1].

In [10]:
def run(image_index, display=False):
    """Classify ``images[image_index]`` on the DPU and return its softmax scores.

    Relies on notebook globals set up in earlier cells: ``images``, ``image``,
    ``inputTensors``, ``input_data``, ``output_data``, ``outputSize`` and ``dpu``.

    Args:
        image_index: index into ``images`` of the sample to classify.
        display: when True, also show the image and print the predicted label.

    Returns:
        The list of softmax scores produced by ``CPUCalcSoftmax`` for the
        first ``no_of_classes`` outputs.
    """
    no_of_classes = 5
    preprocessed = images[image_index]

    # Copy the sample into slot 0 of the shared input buffer, shaped like the
    # non-batch dimensions of the model's input tensor.
    image[0,...] = preprocessed.reshape(
        inputTensors[0].dims[1],
        inputTensors[0].dims[2],
        inputTensors[0].dims[3])

    # Submit the job and block until the DPU has written `output_data`.
    job_id = dpu.execute_async(input_data, output_data)
    dpu.wait(job_id)

    temp = [j.reshape(1, outputSize) for j in output_data]
    softmax = CPUCalcSoftmax(temp[0][0],no_of_classes)
    if display:
        # `images` was divided by 255 when the dataset was read, and imshow
        # expects float RGB data in [0, 1]. Scaling back up by 255 would be
        # clipped to 1 and render an almost-white picture, so show it as-is.
        _, ax = plt.subplots(1)
        _ = ax.imshow(preprocessed)
        print("Classification: {}".format(predict_label(softmax)))
    return softmax

Let's run it for 1 image and print out the predicted label.

We can also run it for multiple images as shown below. In this example
we have only used 1 thread; in principle, users should be able to boost
the performance by employing more threads.

In [None]:
import math
from pynq import get_rails, DataRecorder

# Run every image through the DPU one at a time and time the whole sweep.
total = len(images)
time1 = time.time()
pred = np.array([run(index, False) for index in range(total)])
y_pred = np.argmax(pred, axis=1)
time2 = time.time()

elapsed = time2 - time1
fps = total / elapsed

print("Inference Time : ", elapsed, ' sec')
print("Throughput Performance: {} FPS".format(fps))

In [None]:
import pandas
import matplotlib.pyplot as plt

# Stop sampling first, then read the frame, so the frame is built from a
# recording that is no longer being appended to.
recorder.stop()
results = recorder.frame

# The PYNQ DataRecorder frame presumably carries a `Mark` bookkeeping column
# next to the sensor columns (confirm against the PYNQ docs); it is not a power
# reading, so keep it out of the total and the plot. `errors='ignore'` makes
# this a no-op if the column is absent.
power_results = results.drop(columns=['Mark'], errors='ignore')

# NOTE(review): this adds up every recorded rail, including the 12V input that
# presumably feeds the other rails, so the "total" may double-count - confirm.
f_results = power_results.sum(axis='columns')

print('\n')
print('Maximum Total Power Consumed:',f_results.max(),'Watt' )

head = list(power_results.columns)
plt.figure(figsize=(20,8))
plt.plot(power_results)
plt.legend(head)
plt.xlabel('Time')
plt.ylabel('Power Consumption(Watt)')
plt.show()



To accurately assess the ZCU104 board's power consumption during DPU inference, 
it's essential to monitor the power rails supplying the Zynq UltraScale+ MPSoC, 
particularly those powering the Programmable Logic (PL) where the DPU operates. 
Among the recorded power rails, **VCCINT (`rails['INT'].power`)** is the most critical, 
as it supplies power to the internal logic of the PL. 
Monitoring this rail provides direct insight into the power consumption of the DPU during inference tasks.

While the other rails, such as VCCPSINT_FP, VCCPSINT_LP, VCCPSAUX, and VCCPSPLL, 
supply power to various domains of the Processing System (PS), 
their contribution to the overall power consumption during DPU operations is less significant compared to VCCINT. 
Therefore, focusing on the VCCINT rail will give you the most relevant data regarding the DPU's power usage.

In [None]:
import matplotlib.pyplot as plt

# Power trace of the INT rail on its own, taken from the recorded frame.
p_result = results['INT_power'].values
peak_int_power = max(p_result)

fig, ax = plt.subplots(figsize=(20, 5))
ax.plot(p_result)
ax.set_xlabel('Time')
ax.set_ylabel('Power Consumption(Watt)')
print('The maximum power consumption on INT:', peak_int_power,'Watt')
plt.show()

In [None]:
# Final summary: timing figures first, then the power figures, framed by rules.
rule = '---------------------------------------------------'

print(rule)
print("Inference Time : ",time2-time1,' sec')
print("Throughput Performance: {} FPS".format(fps))

print(rule)
print('Maximum Total Power Consumed:',f_results.max(),'Watt' )
print('The maximum power consumption on VCCINT:', max(p_result),'Watt')
print(rule)

In [18]:
# Drop the notebook's references to the overlay and its runner.
del overlay, dpu