# Vision System

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import cv2

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from sklearn.metrics import confusion_matrix

In [2]:
WIDTH, HEIGHT = 300, 300
MAX_LABELS = 1

class_encoding = {
    "Wire": 0.0, 
    "Bolt": 1.0, 
    "Label": 2.0, 
    "LuggageTag": 3.0
}

inv_class_encoding = {v: k for k, v in class_encoding.items()}

print(class_encoding)
print(inv_class_encoding)

{'Wire': 0.0, 'Bolt': 1.0, 'Label': 2.0, 'LuggageTag': 3.0}
{0.0: 'Wire', 1.0: 'Bolt', 2.0: 'Label', 3.0: 'LuggageTag'}


In [3]:
def label_image(mat, bbox_coordinates):
    for box in bbox_coordinates:
        label, x1, y1, x2, y2 = box

        x1 = int(x1)
        y1 = int(y1)
        x2 = int(x2)
        y2 = int(y2)

        cv2.rectangle(mat, (x1, y1), (x2, y2), (255,0,0), 2)
        cv2.putText(mat, label, (x1, y1-8), 0, 0.5, (255,0,0))

In [5]:
from tensorflow.python.keras import backend as K

def calculate_iou(target_boxes, pred_boxes):
    # Intersection of both bounding boxes
    x1 = K.maximum(target_boxes[..., 0], pred_boxes[..., 0])
    y1 = K.maximum(target_boxes[..., 1], pred_boxes[..., 1])
    x2 = K.minimum(target_boxes[..., 2], pred_boxes[..., 2])
    y2 = K.minimum(target_boxes[..., 3], pred_boxes[..., 3])

    inter_area = K.maximum(0.0, x2 - x1) * K.maximum(0.0, y2 - y1)
    
    # Areas of the individual bounding boxes
    target_area = (target_boxes[..., 2] - target_boxes[..., 0]) * (target_boxes[..., 3] - target_boxes[..., 1])
    pred_area = (pred_boxes[..., 2] - pred_boxes[..., 0]) * (pred_boxes[..., 3] - pred_boxes[..., 1])
    
    return inter_area / (target_area + pred_area - inter_area)

def loss_iou(y_truth, y_pred):
    # Use mean absolute error to avoid vanishing/exploding gradient
    mae = tf.losses.mean_absolute_error(y_truth, y_pred)
    iou = calculate_iou(y_truth, y_pred)
    return mae + (1 - iou)

In [7]:
backbone = tf.keras.applications.MobileNetV2(weights = "imagenet", include_top = False, input_shape = (HEIGHT, WIDTH, 3))

# Freeze layers of base model
backbone.trainable = False

# Dual branch for classification and bounding box regression
classifier = layers.GlobalAveragePooling2D()(backbone.output)
classifier = layers.Dropout(0.7)(classifier)
classifier = layers.Dense(256, activation="relu")(classifier)
classifier = layers.Dense(128, activation="relu")(classifier)
classifier = layers.Dense(MAX_LABELS*len(class_encoding), activation="softmax", name="classifier")(classifier) 

locator = layers.GlobalAveragePooling2D()(backbone.output)
locator = layers.Dropout(0.3)(locator)
locator = layers.Dense(512, activation="relu")(locator)
locator = layers.Dense(256, activation="relu")(locator)
locator = layers.Dense(64, activation="sigmoid")(locator)
locator = layers.Dense(MAX_LABELS*4, activation="sigmoid", name="locator")(locator)

# Build model
model = Model(backbone.input, outputs=[classifier, locator])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 300, 300, 3  0           []                               
                                )]                                                                
                                                                                                  
 Conv1 (Conv2D)                 (None, 150, 150, 32  864         ['input_1[0][0]']                
                                )                                                                 
                                                                                                  
 bn_Conv1 (BatchNormalization)  (None, 150, 150, 32  128         ['Conv1[0][0]']                  
                                )                                                             

In [8]:
model.load_weights("models/mnv2_transfer")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1e3ca634d00>

In [9]:
cap = cv2.VideoCapture(0)

if not cap.isOpened():
    print("Cannot open camera")

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()

    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break

    frame = cv2.resize(frame, (WIDTH, HEIGHT), interpolation = cv2.INTER_AREA)
    #frame = cv2.resize(frame, (224, 224), interpolation = cv2.INTER_AREA)
    input_frame = np.expand_dims(frame, axis=0)

    class_inds, bboxes = model.predict(input_frame)
    #print(result)

    #class_inds, bboxes = np.split(result, [MAX_LABELS*len(class_encoding), MAX_LABELS*4])
    class_inds = np.reshape(class_inds, [MAX_LABELS, len(class_encoding)])
    bboxes = np.reshape(bboxes, [MAX_LABELS, 4])

    print(class_inds, bboxes)

    for i in range(MAX_LABELS):
        class_label = inv_class_encoding[class_inds[i].argmax()]
        class_label += " " + str(class_inds[i].max())
        bbox = bboxes[i]
        bbox = bbox * 300

        label_image(frame, [[class_label, *tuple(bbox)]])
        cv2.putText(frame, class_label, (0, 10), 0, 0.5, (255,0,0))

    # Display the resulting frame
    cv2.imshow("frame", frame)
    if cv2.waitKey(1) == ord("q"):
        break

# When everything done, release the capture
cap.release()
cv2.destroyAllWindows()

[[4.4093697e-04 1.3629083e-03 9.9818075e-01 1.5500891e-05]] [[0.43182427 0.7287268  0.53522927 0.9209268 ]]
[[9.1547281e-06 1.7796154e-07 9.9999058e-01 1.2791847e-07]] [[0.41816995 0.6173909  0.58843356 0.8550034 ]]
[[8.5475898e-05 2.2698430e-06 9.9991047e-01 1.7857094e-06]] [[0.4174466  0.62052745 0.5905536  0.8847686 ]]
[[2.1004782e-06 3.5373020e-08 9.9999785e-01 2.2081808e-08]] [[0.4131955  0.5875708  0.5847267  0.82522935]]
[[5.2294581e-05 1.1632993e-06 9.9994564e-01 9.9398324e-07]] [[0.41549394 0.58298796 0.5926288  0.8248892 ]]
[[2.9260833e-05 9.1008206e-07 9.9996924e-01 6.2696353e-07]] [[0.40276   0.5682832 0.5951832 0.8110776]]
[[4.2552583e-06 1.8323756e-07 9.9999547e-01 1.3129912e-07]] [[0.42843753 0.62513983 0.577404   0.8450468 ]]
[[1.1766582e-05 1.4104219e-07 9.9998796e-01 6.5510569e-08]] [[0.4195033  0.5936344  0.5880729  0.83486605]]
[[3.0589395e-06 7.2330515e-08 9.9999690e-01 3.8028727e-08]] [[0.4140571 0.5919242 0.5910435 0.8279593]]
[[3.0589395e-06 7.2330515e-08 9.9999

KeyboardInterrupt: 

: 