# OpenVINO Demo: Multiple Networks in Action

### Here is a demo using multiple networks!

We are going to be combining what we know in this demo. Please look at the other walkthroughs if the code is hard to follow.

**Note: this demo will not work with the Intel NCS 2 due to limited memory resources.**

First, let's import what we need.

In [2]:
import cv2
import numpy as np
from openvino.inference_engine import IENetwork, IECore
import time

# Windows account name. setup_obj_net() splices it into the path of
# cpu_extension.dll (C:\Users\<User>\Documents\Intel\OpenVINO\...), so change it
# to match the account the OpenVINO samples were built under.
User = 'fcrey'

Next, let's define the functions needed to set up the network, pre-process the input data, and post-process the data from the output layer to draw bounding boxes.

In [3]:
# Methods for object detection

def setup_obj_net(precision, device_choice, OS, ie):
    """Load the gesture-detection network and describe its first input/output.

    Args:
        precision: Name of the sub-directory of ``gesture_optimized`` holding
            the ``frozen_inference_graph`` .xml/.bin pair.
        device_choice: Inference Engine device name. When it equals ``'cpu'``
            (case-insensitive) the CPU extension library is registered first.
        OS: ``'windows'`` (case-insensitive) selects the Windows-style paths;
            any other value selects the POSIX-style paths.
        ie: Object exposing ``add_extension`` and ``load_network``.

    Returns:
        dict with keys ``'net'`` (the loaded network), ``'input_layer'``,
        ``'output_layer'`` (first input/output names) and ``'input_shape'``.
    """
    on_windows = OS.lower() == 'windows'

    if on_windows:
        model_xml = 'gesture_optimized\\' + precision + '\\frozen_inference_graph.xml'
        model_bin = 'gesture_optimized\\' + precision + '\\frozen_inference_graph.bin'
    else:
        model_xml = './gesture_optimized/' + precision + '/frozen_inference_graph.xml'
        model_bin = './gesture_optimized/' + precision + '/frozen_inference_graph.bin'

    network = IENetwork(model=model_xml, weights=model_bin)
    first_input = next(iter(network.inputs))
    first_output = next(iter(network.outputs))
    shape = network.inputs[first_input].shape

    # The extension path is always computed, but only registered for CPU.
    if on_windows:
        cpu_extension = 'C:\\Users\\' + User + '\\Documents\\Intel\\OpenVINO\\inference_engine_samples_build\\intel64\\Release\\cpu_extension.dll'
    else:
        cpu_extension = '/opt/intel/openvino/deployment_tools/inference_engine/lib/intel64/libcpu_extension_avx2.so'

    if device_choice.lower() == 'cpu':
        ie.add_extension(cpu_extension, device_name=device_choice)

    return {
        'net': ie.load_network(network=network, device_name=device_choice, num_requests=1),
        'input_layer': first_input,
        'output_layer': first_output,
        'input_shape': shape,
    }

def pre_obj_processing(obj_frame, input_shape):
    """Resize a frame to the network's input size and reshape it into a blob.

    Args:
        obj_frame: Image array accepted by ``cv2.resize``.
        input_shape: Four-element ``(n, c, h, w)`` shape of the network input.

    Returns:
        dict with ``'blob'`` (array reshaped to ``input_shape``) and
        ``'frame'`` (the untouched ``obj_frame`` that was passed in).
    """
    batch, channels, height, width = input_shape

    # cv2.resize takes the target size as (width, height).
    resized = cv2.resize(obj_frame, (width, height))
    # Move the last axis to the front, then add the batch dimension.
    blob = resized.transpose((2, 0, 1)).reshape((batch, channels, height, width))

    return {'blob': blob, 'frame': obj_frame}

def draw_bb(obj_det, frame):
    """Annotate the most confident detection and crop a window around it.

    Detections are read from ``obj_det[0][0]``; each row is indexed as
    ``[_, class_id, confidence, xmin, ymin, xmax, ymax]`` with the box
    coordinates scaled by the frame width/height. Only rows with confidence
    above 0.5 are considered and the highest-scoring one is used.

    The bounding box and class id are drawn on ``frame`` in green. A window
    spanning 149 px before and 150 px after the box centre is then outlined in
    white and returned as ``'miniframe'``.

    Args:
        obj_det: Detection output, indexable as ``obj_det[0][0]``.
        frame: Image array; it is drawn on in place.

    Returns:
        dict with keys ``'status'``, ``'miniframe'``, ``'frame'``, ``'class'``
        and ``'offset'``. On success ``'offset'`` is the window's
        ``(xmin, ymin)``. When nothing passes the threshold, or the window's
        top-left corner falls outside the frame, ``'status'`` is False,
        ``'miniframe'``/``'offset'`` are None and ``'class'`` is 8.
    """
    i_h, i_w = frame.shape[0], frame.shape[1]
    failure = {'status': False, 'miniframe': None, 'frame': frame, 'class': 8, 'offset': None}

    proposals = [obj for obj in obj_det[0][0] if obj[2] > 0.5]
    if not proposals:
        return failure

    best = max(proposals, key=lambda det: det[2])
    class_id = int(best[1])
    box_min = (int(best[3] * i_w), int(best[4] * i_h))
    box_max = (int(best[5] * i_w), int(best[6] * i_h))

    # Fixed-size window centred on the detected box.
    xmid = (box_min[0] + box_max[0]) / 2
    ymid = (box_min[1] + box_max[1]) / 2
    xmin = int(xmid - 149)
    xmax = int(xmid + 150)
    ymin = int(ymid - 149)
    ymax = int(ymid + 150)
    window_ok = 0 <= xmin < i_w and 0 <= ymin < i_h

    # Copy the crop BEFORE drawing: a plain slice is a view of `frame`, so the
    # label and rectangles drawn below would otherwise appear in the image
    # handed to the next network. The slice is clipped at the right/bottom
    # edges, so the crop can be smaller than the nominal window.
    miniframe = frame[ymin:ymax, xmin:xmax].copy() if window_ok else None

    green = (0, 255, 0)
    cv2.putText(frame, str(class_id), box_min, cv2.FONT_HERSHEY_SIMPLEX, 0.8, green, 2, cv2.LINE_AA)
    cv2.rectangle(frame, box_min, box_max, green, 2)

    if not window_ok:
        return failure

    # Outline the cropped window on the full frame.
    cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (255, 255, 255), 2)

    return {'status': True, 'miniframe': miniframe,
            'frame': frame, 'class': class_id, 'offset': (xmin, ymin)}

Now, we will define all the necessary functions for loading the optimized HandPose net, pre-processing input data, and post-processing data to draw hand keypoints on a frame.

In [4]:
# Methods for hand pose detection

def setup_net(shape, precision, device_choice, OS, ie):
    """Load the hand-pose network, reshaped to the requested input size.

    Args:
        shape: ``(width, height)`` pair; the network's first input is reshaped
            so its last two dimensions become ``(shape[1], shape[0])``.
        precision: Name of the sub-directory of ``handpose_optimized`` holding
            the ``pose_iter_102000`` .xml/.bin pair.
        device_choice: Inference Engine device name passed to ``load_network``.
        OS: ``'windows'`` (case-insensitive) selects the Windows-style paths;
            any other value selects the POSIX-style paths.
        ie: Object exposing ``load_network``.

    Returns:
        dict with keys ``'net'``, ``'input_layer'`` and ``'output_layer'``.
    """
    if OS.lower() == 'windows':
        model_xml = 'handpose_optimized\\' + precision + '\\pose_iter_102000.xml'
        model_bin = 'handpose_optimized\\' + precision + '\\pose_iter_102000.bin'
    else:
        model_xml = './handpose_optimized/' + precision + '/pose_iter_102000.xml'
        model_bin = './handpose_optimized/' + precision + '/pose_iter_102000.bin'

    network = IENetwork(model=model_xml, weights=model_bin)

    # Based on prototxt, original input: [2, 3, 368, 368]
    first_input = next(iter(network.inputs))
    first_output = next(iter(network.outputs))

    # Keep batch and channel counts; replace only the spatial dimensions.
    batch, channels, _old_h, _old_w = network.inputs[first_input].shape
    target_w, target_h = shape[0], shape[1]
    network.reshape({first_input: (batch, channels, target_h, target_w)})

    loaded = ie.load_network(network=network, device_name=device_choice, num_requests=1)
    return {'net': loaded, 'input_layer': first_input, 'output_layer': first_output}

def pre_hp_processing(frame, shape):
    """Convert a frame into a hand-pose input blob, reporting failure by flag.

    Args:
        frame: Image passed to ``cv2.dnn.blobFromImage``.
        shape: Target size passed to ``blobFromImage``.

    Returns:
        On success ``{'blob': <blob>, 'frame': frame, 'status': True}``.
        If blob creation raises, ``{'blob': None, 'frame': None,
        'status': False}`` so callers can skip the frame.
    """
    try:
        # Pixels are scaled by 1/255, no mean is subtracted, and the R and B
        # channels are swapped.
        inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, shape, (0, 0, 0), swapRB=True, crop=False)
    except Exception:
        # Deliberately best-effort, but no longer a bare `except:` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return {'blob': None, 'frame': None, 'status': False}

    return {'blob': inpBlob, 'frame': frame, 'status': True}

def draw_skeleton(output, frame, offset, big_frame, class_id):
    """Draw hand keypoints and the connecting skeleton onto ``big_frame``.

    For each of the 22 confidence maps ``output[0, i]`` the map is resized to
    the size of ``frame`` and its global maximum is taken as keypoint ``i``
    when the peak exceeds 0.15. Accepted keypoints are shifted by ``offset``,
    labelled with their index, and joined according to ``POSE_PAIRS``.

    When ``class_id`` is 3 or 4, the first adjacent pair of detected points
    along the wrist/index-finger chain (points 0, 5, 6, 7, 8) decides an extra
    label drawn at ``offset``: "3" if the x coordinate increases along the
    pair, otherwise "4".
    NOTE(review): what classes 3 and 4 stand for is not visible here.

    Args:
        output: Network output indexed as ``output[0, i, :, :]``.
        frame: Image whose height/width define the keypoint coordinate space.
        offset: ``(x, y)`` position of ``frame`` inside ``big_frame``.
        big_frame: Image that is drawn on in place.
        class_id: Detected class id, or None to skip the extra label.

    Returns:
        None.
    """
    frameHeight, frameWidth = frame.shape[0], frame.shape[1]
    nPoints = 22
    POSE_PAIRS = [[0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20]]
    threshold = 0.15

    # Bound unconditionally: both are also used by the class label below.
    font = cv2.FONT_HERSHEY_SIMPLEX
    color = (0, 0, 255)

    points = []
    for i in range(nPoints):
        # Confidence map of the corresponding part, scaled to the frame size.
        probMap = cv2.resize(output[0, i, :, :], (frameWidth, frameHeight))

        # Global maximum of the map and where it occurs.
        _, prob, _, point = cv2.minMaxLoc(probMap)

        if prob > threshold:
            coord = (int(point[0]) + offset[0], int(point[1]) + offset[1])
            cv2.putText(big_frame, "{}".format(i), coord, font, .8, color, 2, lineType=cv2.LINE_AA)
            points.append(coord)
        else:
            # Keep a placeholder so list positions stay equal to point ids.
            points.append(None)

    # Draw skeleton segments whose two endpoints were both detected.
    for partA, partB in POSE_PAIRS:
        if points[partA] is not None and points[partB] is not None:
            cv2.line(big_frame, points[partA], points[partB], (0, 255, 255), 2)

    if class_id in (3, 4):
        # Wrist followed by the index-finger points.
        vector_points = [points[0], points[5], points[6], points[7], points[8]]
        for first, second in zip(vector_points, vector_points[1:]):
            if first is not None and second is not None:
                label = 3 if first[0] < second[0] else 4
                cv2.putText(big_frame, str(label), offset, font, .8, color, 2, lineType=cv2.LINE_AA)
                # Only the first usable pair decides the label.
                return
                    

Putting the pieces together...

In [9]:
# SSD + Hand Pose Estimation
def main():
    """Run the two-network webcam demo: gesture detection, then hand pose.

    Each frame goes through the object-detection network. When draw_bb()
    reports a usable crop, the crop is run through the hand-pose network and
    the skeleton is drawn on the full frame. Processing time and FPS are
    overlaid and the frame is shown in the "Hand Key Points" window.
    The loop ends when 'q' is pressed or the camera stops delivering frames.
    """
    ie = IECore()
    OS = 'windows'
    device = 'CPU'
    precision = 'fp32' if device.lower() == 'cpu' else 'fp16'

    # object detection net
    net_dict = setup_obj_net(precision, device, OS, ie)
    obj_exec_net = net_dict['net']
    obj_input_shape = net_dict['input_shape']

    # hand pose estimation net
    hp_shape = (229, 229)
    hp_net_dict = setup_net(hp_shape, precision, device, OS, ie)
    hp_net = hp_net_dict['net']
    hp_input_layer = hp_net_dict['input_layer']
    hp_output_layer = hp_net_dict['output_layer']

    font = cv2.FONT_HERSHEY_SIMPLEX
    text_color = (255, 0, 0)

    vs = cv2.VideoCapture(0)
    try:
        while True:
            start = time.time()

            # obj detection routine
            ret, vframe = vs.read()
            if not ret:
                # No frame delivered; stop instead of crashing on None.
                break

            image_dict = pre_obj_processing(vframe, obj_input_shape)
            obj_res = obj_exec_net.infer({'image_tensor': image_dict['blob']})
            obj_det = obj_res['DetectionOutput']

            proposed_frame = draw_bb(obj_det, image_dict['frame'])
            big_frame = proposed_frame['frame']

            if proposed_frame['status']:
                # hand pose estimation routine
                sub_image_dict = pre_hp_processing(proposed_frame['miniframe'], hp_shape)
                if sub_image_dict['status']:
                    hp_res = hp_net.infer({hp_input_layer: sub_image_dict['blob']})
                    draw_skeleton(hp_res[hp_output_layer], sub_image_dict['frame'],
                                  proposed_frame['offset'], big_frame, proposed_frame['class'])

            # Every path falls through to here so the window always refreshes
            # and the quit key is always polled.
            stamp = time.time() - start
            fps = 1 / stamp if stamp > 0 else float('inf')
            cv2.putText(vframe, "Time: " + str(stamp), (30, 30), font, 0.8, text_color, 2, cv2.LINE_AA)
            cv2.putText(vframe, "FPS: " + str(fps), (30, 60), font, 0.8, text_color, 2, cv2.LINE_AA)
            cv2.imshow("Hand Key Points", big_frame)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
    finally:
        # do a bit of cleanup, even if inference raised
        vs.release()
        cv2.destroyAllWindows()

# Start the two-network demo; press 'q' in the preview window to stop it.
main()

For comparison, I have a demo here using just the handpose model. This hopefully illustrates how the utilization of two networks can speed up inference.

In [5]:
# Hand Pose Estimation Demo
def main():
    """Run the hand-pose network alone on full webcam frames.

    The first frame fixes the network input size: height 268 and a width
    derived from the frame's aspect ratio. Each later frame is run through the
    hand-pose network, the skeleton is drawn on it, processing time and FPS
    are overlaid, and the result is shown in the "Frame" window.
    The loop ends when 'q' is pressed or the camera stops delivering frames.

    Raises:
        RuntimeError: If the first frame cannot be read from the camera.
    """
    ie = IECore()
    OS = 'windows'
    device = 'CPU'
    precision = 'fp32' if device.lower() == 'cpu' else 'fp16'

    font = cv2.FONT_HERSHEY_SIMPLEX
    text_color = (255, 0, 0)

    vs = cv2.VideoCapture(0)
    try:
        ret, vframe = vs.read()
        if not ret:
            raise RuntimeError("Could not read a frame from camera 0")

        aspect_ratio = vframe.shape[1]/vframe.shape[0]
        inHeight = 268
        # NOTE(review): (x*8)//8 only floors x; if a multiple of 8 was intended
        # this does not produce one. Kept as-is so the input size is unchanged.
        inWidth = int(((aspect_ratio*inHeight)*8)//8)
        in_size = (inWidth, inHeight)

        net_dict = setup_net(in_size, precision, device, OS, ie)
        net = net_dict['net']
        input_layer = net_dict['input_layer']
        output_layer = net_dict['output_layer']

        while True:
            start = time.time()
            ret, vframe = vs.read()
            if not ret:
                # No frame delivered; stop instead of crashing on None.
                break

            image_dict = pre_hp_processing(vframe, in_size)
            if image_dict['status']:
                res = net.infer({input_layer: image_dict['blob']})
                # The whole frame is the region of interest, so offset is (0, 0).
                draw_skeleton(res[output_layer], vframe, (0, 0), vframe, None)

            stamp = time.time() - start
            fps = 1 / stamp if stamp > 0 else float('inf')
            cv2.putText(vframe, "Time: " + str(stamp), (30, 30), font, 0.8, text_color, 2, cv2.LINE_AA)
            cv2.putText(vframe, "FPS: " + str(fps), (30, 60), font, 0.8, text_color, 2, cv2.LINE_AA)
            cv2.imshow("Frame", vframe)

            key = cv2.waitKey(1) & 0xFF
            if key == ord('q'):
                break
    finally:
        # do a bit of cleanup, even if inference raised
        vs.release()
        cv2.destroyAllWindows()

# Start the hand-pose-only demo; press 'q' in the preview window to stop it.
main()