In [1]:
import sys
import math
import numpy as np
import os
import time
from PIL import Image
from matplotlib import pyplot
import cv2
from datetime import datetime
import pynq
from pynq import Xlnk
from pynq import Overlay
from pynq.mmio import MMIO
from preprocessing import Agent
from preprocessing import BATCH_SIZE
from preprocessing import get_image_path
from IPython.display import display

# Team identifier: handed to the Agent here and reused later in the notebook
# to name the coordinate output file.
team = 'SystemsETHZ'
agent = Agent(team)

# Show the batch size exported by the preprocessing module.
print(BATCH_SIZE)

500


In [2]:
# File locations used by this notebook: the bitstream handed to Overlay() and
# the text file the weights are parsed from.
OVERLAY_PATH = (
    '/home/xilinx/jupyter_notebooks/dac_2019_contest/'
    'overlay/SystemsETHZ/ultra96_v04.bit'
)
WEIGHTS_FILE_NAME = 'weights_file_v04.txt'

# Instantiate the overlay and keep a handle to its `axi_dma_0` attribute,
# which all later DMA transfers go through.
overlay = Overlay(OVERLAY_PATH)
dma = overlay.axi_dma_0

In [3]:
# Allocator handle; every cma_array buffer in this notebook is obtained from it.
xlnk = Xlnk()

# Memory-mapped control registers of the accelerator: a 1024-byte window
# starting at 0xA0010000.
NN_CTRL_BASE_ADDR = 0xA0010000
NN_CTRL_ADDR_RANGE = 1024
nn_ctrl = MMIO(NN_CTRL_BASE_ADDR, length=NN_CTRL_ADDR_RANGE)
print('Got nn_ctrl!')

## Allocate Buffers
# Geometry of the (resized) network input and of one input "line".
# With 24 bits per pixel, 384/24 gives 16 pixels per line (kept as a float,
# exactly as before); num_lines is the number of such lines per image.
MINIBATCH_SIZE = 20
height = 176
width = 320
pixel_bits = 24
pixels_per_line = 384/pixel_bits
num_lines = int((height*width)/pixels_per_line)

# Two input buffers so one can be filled while the other is being streamed.
# Each row is 64 bytes wide; the inference loop writes columns 0:48 only.
in_buffers = [
    xlnk.cma_array(shape=(MINIBATCH_SIZE*num_lines, 64), dtype=np.uint8)
    for _ in range(2)
]
in_buffer1, in_buffer2 = in_buffers


def _alloc_feature_buffer(num_out_lines):
    """Allocate a flat uint32 CMA buffer with 16 words per output line."""
    return xlnk.cma_array(shape=(int(16*num_out_lines),), dtype=np.uint32)


# Output line counts per minibatch after each downsampling stage
# (spatial size divided by 4, 8 and 16 respectively).
fire1_num_out_lines = (height/4)*(width/4)*MINIBATCH_SIZE
fire2_num_out_lines = (height/8)*(width/8)*MINIBATCH_SIZE
fire3_num_out_lines = (height/16)*(width/16)*MINIBATCH_SIZE

fire1_out_buffer = _alloc_feature_buffer(fire1_num_out_lines)
fire2_out_buffer = _alloc_feature_buffer(fire2_num_out_lines)
# fire3/fire4/fire5 all share the 1/16-resolution size.
fire3_out_buffer = _alloc_feature_buffer(fire3_num_out_lines)
fire4_out_buffer = _alloc_feature_buffer(fire3_num_out_lines)
fire5_out_buffer = _alloc_feature_buffer(fire3_num_out_lines)

# One output tensor per detection head: (image, grid cell, 16 int32 words).
final_num_lines = int((height/16)*(width/16))
bndboxes = [
    xlnk.cma_array(shape=(MINIBATCH_SIZE, final_num_lines, 16), dtype=np.int32)
    for _ in range(4)
]
# Per-image, per-grid-cell objectness accumulator (rebound during inference).
obj_array = np.zeros((MINIBATCH_SIZE, final_num_lines))

## Allocate SW weight buffers and load from text file
# Line formats understood by this parser (derived from the parsing code below):
#   * a line containing "layer"            -> "<text>: <layer index>"
#   * a line containing "total_iterations" -> "<text>: <count>"
#   * a line containing "obj_factor"       -> "<text> <head index> <value>"
#   * a line containing "box_factor"       -> "<text> <head index> <value>"
#   * any other line                       -> hex digits after the last "0x",
#                                             consumed as 32-bit words
NUM_LAYERS = 3+4*4

# Read the file once. The context manager guarantees the handle is closed
# (the previous version left its second open() handle unclosed).
with open(WEIGHTS_FILE_NAME, "r") as weights_file:
    weights_lines = weights_file.readlines()

# Pass 1: find how many 16-word rows each layer needs.
layer = 0
total_iterations = np.zeros(NUM_LAYERS)
for line in weights_lines:
    if "layer" in line:
        layer = int(line.split(": ")[1])
    if "total_iterations" in line:
        total_iterations[layer] = int(line.split(": ")[1])

# Allocate one flat uint32 CMA buffer per layer, 16 words per iteration.
weightfactors_length = np.zeros(NUM_LAYERS)
weightsfactors = []
for i in range(0, NUM_LAYERS):
    weightfactors_length[i] = int(total_iterations[i])
    weightsfactors.append(
        xlnk.cma_array(shape=(int(16*weightfactors_length[i]),), dtype=np.uint32)
    )
obj_factors = np.zeros(4)
box_factors = np.zeros(4)

# Pass 2: fill the buffers and the per-head scale factors.
index = 0  # row (group of 16 words) currently being written within `layer`
for line in weights_lines:
    if "layer" in line:
        layer = int(line.split(": ")[1])
        index = 0
    elif "total_iterations" not in line:
        if "obj_factor" in line:
            temp = line.split(' ')
            obj_factors[int(temp[1])] = int(temp[2])
        elif "box_factor" in line:
            temp = line.split(' ')
            box_factors[int(temp[1])] = int(temp[2])
        else:
            # Strip the line ending explicitly so that "\r\n" endings or a
            # missing final newline cannot shift the 8-digit word boundaries.
            hex_digits = line.split('0x')[-1].strip()
            # Consume 8 hex digits (one 32-bit word) at a time from the right:
            # the right-most word lands in slot 0 of the row.
            word = 0
            chunk_end = len(hex_digits)
            while chunk_end > 0:
                chunk_start = max(chunk_end - 8, 0)
                weightsfactors[layer][index*16 + word] = int(hex_digits[chunk_start:chunk_end], 16)
                word += 1
                chunk_end = chunk_start
            index += 1

## Define transfer functions
def weightsfactors_transfer(weightsfactors):
    """Send one weight/factor buffer to the accelerator and wait for completion.

    Writes 13 to control register 0x40 and 0 to register 0x48, toggles
    register 0x0 (0 = reset, 1 = release), then pushes ``weightsfactors``
    through the DMA send channel and blocks until that transfer has finished.

    Args:
        weightsfactors: buffer handed unchanged to ``dma.sendchannel.transfer``.
    """
    # The order of these writes is preserved exactly.
    register_sequence = (
        (0x40, 13),
        (0x48, 0),
        (0x0, 0),  # Reset
        (0x0, 1),  # Deassert reset
    )
    for offset, value in register_sequence:
        nn_ctrl.write(offset, value)

    send_channel = dma.sendchannel
    send_channel.transfer(weightsfactors)
    send_channel.wait()
    
def fire(inbuffer, outbuffer,
         squeeze_din_w, squeeze_din_h,
         expand_din_w, expand_din_h,
         expand_din_w_afterpool, expand_din_h_afterpool,
         whichfire):
    """Configure the accelerator for one stage and start its DMA transfers.

    Holds the core in reset (register 0x0 = 0), programs the six dimension
    registers 0x10-0x38 (each value truncated with ``int``), the stage
    selector 0x40 and the repetition count 0x48 (always ``MINIBATCH_SIZE``),
    releases reset, then starts the receive transfer followed by the send
    transfer.

    This function does not wait for completion; the caller is expected to
    call ``dma.recvchannel.wait()`` afterwards.

    Args:
        inbuffer: buffer streamed to the accelerator via the send channel.
        outbuffer: buffer filled by the accelerator via the receive channel.
        squeeze_din_w, squeeze_din_h: values for registers 0x10 / 0x18.
        expand_din_w, expand_din_h: values for registers 0x20 / 0x28.
        expand_din_w_afterpool, expand_din_h_afterpool: values for
            registers 0x30 / 0x38.
        whichfire: stage selector written as-is to register 0x40.
    """
    nn_ctrl.write(0x0, 0)  # Reset

    dimension_registers = (
        (0x10, squeeze_din_w),
        (0x18, squeeze_din_h),
        (0x20, expand_din_w),
        (0x28, expand_din_h),
        (0x30, expand_din_w_afterpool),
        (0x38, expand_din_h_afterpool),
    )
    for offset, dimension in dimension_registers:
        nn_ctrl.write(offset, int(dimension))

    nn_ctrl.write(0x40, whichfire)
    nn_ctrl.write(0x48, MINIBATCH_SIZE)  # set numReps
    nn_ctrl.write(0x0, 1)  # Deassert reset

    # Arm the receive side before starting to send.
    dma.recvchannel.transfer(outbuffer)
    dma.sendchannel.transfer(inbuffer)

## Inference (Main Part)
# Running totals for the whole run. interval_time is what gets passed to
# agent.send() for every batch.
interval_time = 0
total_time = 0
total_energy = 0

# One result row per image: columns 0-15 receive four raw values from each of
# the four detection heads, column 16 the index of the selected grid cell.
total_num_img = len(agent.img_list)
result = np.zeros((total_num_img, 17))

agent.reset_batch_count()
# Power rails, sampled with a DataRecorder during inference.
rails = pynq.get_rails()
print('total_num_img: ', total_num_img, sep='')

def _stage_image(buffer, slot, image_name):
    """Read one image, resize it to (width, height) and copy it into `buffer`.

    The image occupies rows [slot*num_lines, (slot+1)*num_lines) of `buffer`
    and only columns 0:48 of each row are written.

    Raises:
        FileNotFoundError: if OpenCV cannot read the image (cv2.imread
            returned None), instead of failing later inside cv2.resize.
    """
    image_path = get_image_path(image_name)
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError('Could not read image: ' + str(image_path))
    image = cv2.resize(image, (width, height), interpolation=cv2.INTER_NEAREST)
    buffer[slot*num_lines:(slot+1)*num_lines, 0:48] = np.reshape(image, (num_lines, 48))


processed_images = 0
for batch_idx in range(math.ceil(total_num_img/BATCH_SIZE)):
    # get a batch from agent
    batch = agent.send(interval_time, agent.img_batch)
    sorted_batch = sorted(batch)
    images_to_process_in_batch = len(sorted_batch)
    print('images_to_process_in_batch: ' + str(images_to_process_in_batch))
    minibatches_to_process = math.ceil(images_to_process_in_batch/MINIBATCH_SIZE)

    for k in range(0, minibatches_to_process):
        # Only the last minibatch of a batch may be shorter than MINIBATCH_SIZE.
        size = MINIBATCH_SIZE
        next_size = MINIBATCH_SIZE
        if k == minibatches_to_process-1:
            size = images_to_process_in_batch - k*MINIBATCH_SIZE
        if k+1 == minibatches_to_process-1:
            next_size = images_to_process_in_batch - (k+1)*MINIBATCH_SIZE

        # Double buffering: the accelerator reads current_buffer while the CPU
        # stages the following minibatch into next_buffer.
        has_next = k < minibatches_to_process-1
        current_buffer = in_buffers[k%2]
        next_buffer = in_buffers[(k+1)%2]
        next_base = (k+1)*MINIBATCH_SIZE
        # The first 80% of the next minibatch is staged while fire1 runs, the
        # rest except the last image during fire2, the last image during fire3.
        prefetch_split = int(next_size*0.8)

        start = time.time()
        recorder = pynq.DataRecorder(rails['power1'].power)
        with recorder.record(0.01):
            if k == 0:
                # Nothing was prefetched for the first minibatch of a batch.
                for slot in range(0, size):
                    _stage_image(current_buffer, slot, sorted_batch[k*MINIBATCH_SIZE + slot])

            weightsfactors_transfer(weightsfactors[0])
            fire(current_buffer, fire1_out_buffer,
                 width/2, height/2, width/2, height/2, width/4, height/4, 1)
            if has_next:
                for slot in range(0, prefetch_split):
                    _stage_image(next_buffer, slot, sorted_batch[next_base + slot])
            dma.recvchannel.wait()

            weightsfactors_transfer(weightsfactors[1])
            fire(fire1_out_buffer, fire2_out_buffer,
                 width/4, height/4, width/4, height/4, width/8, height/8, 2)
            if has_next:
                for slot in range(prefetch_split, next_size-1):
                    _stage_image(next_buffer, slot, sorted_batch[next_base + slot])
            dma.recvchannel.wait()

            weightsfactors_transfer(weightsfactors[2])
            fire(fire2_out_buffer, fire3_out_buffer,
                 width/8, height/8, width/8, height/8, width/16, height/16, 3)
            if has_next:
                slot = next_size-1
                _stage_image(next_buffer, slot, sorted_batch[next_base + slot])
            dma.recvchannel.wait()

            grid_w = width/16
            grid_h = height/16
            for head in range(0, 4):
                # Each head runs four stages (selectors 4..7) using weight
                # buffers 3+4*head .. 6+4*head, ping-ponging between the
                # fire4/fire5 buffers and ending in bndboxes[head].
                stages = (
                    (fire3_out_buffer, fire4_out_buffer),
                    (fire4_out_buffer, fire5_out_buffer),
                    (fire5_out_buffer, fire4_out_buffer),
                    (fire4_out_buffer, bndboxes[head]),
                )
                for stage, (stage_in, stage_out) in enumerate(stages):
                    weightsfactors_transfer(weightsfactors[3 + head*4 + stage])
                    fire(stage_in, stage_out,
                         grid_w, grid_h, grid_w, grid_h, grid_w, grid_h, 4 + stage)
                    dma.recvchannel.wait()

                # Word 4 of every grid cell is the raw objectness value; scale
                # it by 2^-16 and the head's factor, then sum over the heads.
                temp_obj = np.multiply(np.divide(bndboxes[head][:,:,4],float((1<<16))), float(obj_factors[head]))
                if head == 0:
                    obj_array = temp_obj
                else:
                    obj_array = np.add(obj_array, temp_obj)

            # Keep, per image, the grid cell with the highest summed objectness
            # and the four raw box words of each head at that cell. Only the
            # first `size` images of the minibatch are stored.
            grid_cell = np.argmax(obj_array, axis=1)
            result[processed_images:processed_images+size,16] = grid_cell[0:size]
            for p in range(0, size):
                for head in range(0, 4):
                    result[processed_images+p, head*4:(head+1)*4] = bndboxes[head][p,grid_cell[p],0:4]

        end = time.time()
        elapsed = end - start
        # Energy = mean recorded power over the minibatch * elapsed time.
        energy = recorder.frame["power1_power"].mean() * elapsed

        total_time += elapsed
        total_energy += energy
        processed_images += size
        
# Final summary of the run: image count, accumulated time, throughput, energy.
print('Total processed images: ', processed_images, sep='')
print('Total time: ', total_time, sep='')
print('FPS: ', processed_images/total_time, sep='')
print('Total energy: ', total_energy, sep='')

Got nn_ctrl!
total_num_img: 2324
images_to_process_in_batch: 500
images_to_process_in_batch: 500
images_to_process_in_batch: 500
images_to_process_in_batch: 500
images_to_process_in_batch: 324
Total processed images: 2324
Total time: 38.73975706100464
FPS: 59.99005095309009
Total energy: 231.701417642


In [4]:
# Dump the raw result matrix, one row (as numpy prints it) per write,
# to "<agent.coord_team>/<team>.txt".
coord_path = agent.coord_team + '/{}.txt'.format(team)
with open(coord_path, 'w+') as fcoord:
    for row in result:
        fcoord.write(str(row) + '\n')
print("Coordinate results written successfully.")

written_count = 0
result_rectangles = []

# Constants hoisted out of the per-image loop.
fixed_point_scale = float((1<<16))      # raw head outputs are divided by 2^16
sum_divisor = 4.0*float((1 << 22))      # applied to the sum of the four heads
grid_w = width/16                        # grid cells per row
x_scale = 640/width                      # presumably maps back to 640x360 source frames -- confirm
y_scale = 360/height

for row in result:
    # Column 16 holds the flat index of the selected grid cell.
    cell_index = row[16].astype('float')

    # De-quantise each head's four box values and weight them by the head's
    # box factor; the heads are then summed in order 0, 1, 2, 3.
    head_boxes = [
        np.multiply(
            np.divide(row[4*head:4*head+4].astype('float'), fixed_point_scale),
            float(box_factors[head]),
        )
        for head in range(4)
    ]
    box = np.divide(head_boxes[0]+head_boxes[1]+head_boxes[2]+head_boxes[3], sum_divisor)

    # Flat cell index -> (row, column) in the grid.
    cell_row = int(cell_index/grid_w)
    cell_col = int(cell_index%grid_w)

    # Box values are offsets relative to the cell origin (16 px per cell),
    # rescaled to the output coordinate system.
    x_min = int((box[0] + cell_col*16) *x_scale)
    y_min = int((box[1] + cell_row*16) *y_scale)
    x_max = int((box[2] + cell_col*16) *x_scale)
    y_max = int((box[3] + cell_row*16) *y_scale)

    result_rectangles.append([x_min, x_max, y_min, y_max])

print(len(result_rectangles))
agent.save_results_xml(result_rectangles, total_time, total_energy)
print("XML results written successfully.")

Coordinate results written successfully.
2324
XML results written successfully.


In [None]:
# Final cleanup once the results are written: reset the Xlnk allocator.
# NOTE(review): presumably this releases every CMA buffer allocated above, so
# none of those arrays should be used afterwards -- confirm against PYNQ docs.
xlnk.xlnk_reset()