### Faster RCNN 

We are looking into the Faster R-CNN model with a MobileNetV3 backbone, which is supposed to be faster than the Faster R-CNN model with a ResNet backbone. However, it is also expected to be less accurate.

We follow essentially the same procedure as in the other notebook, which uses an RCNN model.

In [1]:
import torchvision.transforms as transforms
from torchvision.models.detection import FasterRCNN_MobileNet_V3_Large_320_FPN_Weights
from torchvision.models.detection import fasterrcnn_mobilenet_v3_large_320_fpn
from PIL import Image
from os import listdir
from os import path
import numpy as np
from torch import device
from torch import cuda
import cv2
import torch
from IPython.display import clear_output
import warnings
warnings.filterwarnings("ignore")

In [2]:
join = path.join

# Dataset locations: the frame images and their annotation files.
img_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/images/frames'
ann_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/annotations'

# Full path of every frame, in sorted file-name order.
img_names = [join(img_path, entry) for entry in sorted(listdir(img_path))]

# Full path of every annotation file, in sorted file-name order.
ann_names = [join(ann_path, entry) for entry in sorted(listdir(ann_path))]

In [3]:
# Class names indexed by integer label id: index 0 is '__background__' and
# index 1 is 'person' (the label value the drawing and evaluation code in this
# notebook filters on).
# NOTE(review): this looks like the COCO category list shipped with
# torchvision's pretrained detectors, with 'N/A' presumably padding unused ids
# so the remaining names keep their positions -- confirm against
# `weights.meta["categories"]`.
CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# Preprocessing pipeline used by `predict`: the only step is the conversion
# of the input image to a tensor (no resizing or normalisation is added here).
transform = transforms.Compose([
    transforms.ToTensor(),
])

def predict(image, model, device, detection_threshold):
    """Run the detector on a single image and keep the confident detections.

    Args:
        image: Input image, passed through the module-level ``transform``.
        model: Detection model, called on a batch holding one image. Its
            first output must provide ``'labels'``, ``'scores'`` and
            ``'boxes'`` tensors.
        device: Device the input tensor is moved to before inference.
        detection_threshold: Minimum score (inclusive) a detection needs in
            order to be returned.

    Returns:
        A ``(boxes, labels)`` tuple: the kept boxes cast to ``np.int32`` and
        the class ids of those same detections.
    """
    # Tensor conversion, device transfer, then a leading batch dimension.
    batch = transform(image).to(device).unsqueeze(0)

    # Only one image was submitted, so only the first result is of interest.
    result = model(batch)[0]

    # Bring every prediction back to host memory as NumPy arrays.
    all_labels = result['labels'].cpu().numpy()
    all_scores = result['scores'].detach().cpu().numpy()
    all_boxes = result['boxes'].detach().cpu().numpy()

    # Boolean mask selecting the detections that reach the threshold.
    confident = all_scores >= detection_threshold

    return all_boxes[confident].astype(np.int32), all_labels[confident]

In [4]:
def draw_boxes(boxes, labels, image):
    """Draw a rectangle around every detection whose label is 1 (person).

    Args:
        boxes: Sequence of boxes indexed as ``(x1, y1, x2, y2)``.
        labels: Class ids, indexed in parallel with ``boxes``.
        image: Image to draw on; it is converted to a NumPy array first.

    Returns:
        The channel-swapped array with the rectangles drawn on it.
    """
    # NOTE(review): for an RGB input (e.g. a PIL image) the swap below yields
    # a BGR canvas, on which this colour would show as blue in `cv2.imshow`
    # -- confirm that is the intended colour.
    box_color = [255, 0, 0]
    thickness = 2

    # Swap the first and third channels so the array suits OpenCV display.
    canvas = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2RGB)

    for idx, box in enumerate(boxes):
        # Anything that is not a person is left undrawn.
        if labels[idx] != 1:
            continue
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        cv2.rectangle(canvas, top_left, bottom_right, box_color, thickness)

    return canvas

In [5]:
# Create the model: a Faster R-CNN detector with a MobileNetV3-Large 320 FPN
# backbone, initialised from the DEFAULT entry of its weights enum.
weights = FasterRCNN_MobileNet_V3_Large_320_FPN_Weights.DEFAULT
model = fasterrcnn_mobilenet_v3_large_320_fpn(weights=weights)

In [6]:
# Set the device: prefer the Metal (MPS) backend, then CUDA, else the CPU.
#
# The constructor is reached through `torch.device` rather than the bare
# imported `device` name: this assignment rebinds `device`, so calling the bare
# name would fail with "'torch.device' object is not callable" as soon as the
# cell is executed a second time.
#
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`,
# which only reports whether PyTorch was built with MPS support, not whether
# an MPS device can actually be used on this machine.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [7]:
# Prepare a sample prediction: open the first frame, put the model in
# evaluation mode and move it to the selected device.
image = Image.open(img_names[0])
model.eval().to(device)
# Wipe whatever this cell has written to the notebook output so far.
clear_output()

In [8]:
# Run inference on the sample frame with gradient tracking disabled; only
# detections scoring at least 0.7 are returned.
with torch.no_grad():
    boxes, labels = predict(image, model, device, 0.7)

In [9]:
# Inspect the class ids of the detections kept for the sample frame
# (1 would be 'person' in CATEGORY_NAMES).
labels

array([2, 2, 2, 2])

In [10]:
# Draw the person boxes (label 1) onto the sample frame.
img = draw_boxes(boxes, labels, image)

# display the image.
cv2.imshow('Image', img)
# Keep the window open for up to 5000 ms, or until a key is pressed.
cv2.waitKey(5000)
cv2.destroyWindow('Image')
# NOTE(review): presumably this extra 1 ms wait gives the GUI event loop a
# chance to process the close request so the window really disappears --
# confirm on the target platform.
cv2.waitKey(1)

-1

This model performs significantly worse than the Faster R-CNN with a ResNet-50 backbone, but it is much, much faster and delivers results extremely quickly. However, it only catches pedestrians who are in clear sight and close to the front of the image.

Now we will do some evaluations of the model.

In [11]:
from collections import namedtuple
import numpy as np
from scipy.io import loadmat
import pandas as pd

In [12]:
# Lightweight record with three fields: `image_path`, `gt` and `pred`
# (by their names: the image location, its ground truth and the prediction).
Detection = namedtuple("Detection", "image_path gt pred")

def iou(boxA, boxB):
    """Return the Intersection over Union of two boxes.

    Both boxes are indexed as ``(x_min, y_min, x_max, y_max)``. Widths and
    heights are computed as ``max - min + 1``, i.e. the max coordinates are
    treated as inclusive.
    """
    # Corners of the rectangle shared by both boxes.
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp each side at zero so boxes that do not overlap share no area.
    overlap_w = max(0, right - left + 1)
    overlap_h = max(0, bottom - top + 1)
    overlap = overlap_w * overlap_h

    # Individual areas of the two boxes.
    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # Union = both areas minus the overlap, which would be counted twice.
    return overlap / float(area_a + area_b - overlap)

In [17]:
def get_boxes(ann):
    """Extract the annotated boxes as ``(xmin, ymin, xmax, ymax)`` rows.

    The boxes are read from the first key present in ``ann`` among
    ``'box_new'``, ``'anno_file'`` and ``'anno_previous'`` (in that order).
    Each stored row has its first column dropped; the remaining columns are
    read as ``(xmin, ymin, width, height)`` and converted to corner form.

    The conversion is done on a copy, so ``ann`` is left untouched and
    calling the function again on the same annotation gives the same result.

    Args:
        ann: Mapping holding the annotation arrays (e.g. the result of
            ``scipy.io.loadmat``).

    Returns:
        A list with one 1-D NumPy array ``[xmin, ymin, xmax, ymax]`` per
        annotated box; an empty list when the annotation holds no boxes.

    Raises:
        ValueError: If none of the expected keys is present.
    """
    # get the correct key for the bounding boxes (first match wins).
    candidates = ('box_new', 'anno_file', 'anno_previous')
    key = next((k for k in candidates if k in ann), None)
    if key is None:
        raise ValueError("Invalid Annotation Error")

    # np.array copies, so the in-place additions below never reach the
    # caller's annotation data.
    rows = np.array(ann[key])
    if rows.size == 0:
        return []

    # Drop the leading column, then turn (width, height) into (xmax, ymax).
    coords = rows[:, 1:]
    coords[:, 2] += coords[:, 0]
    coords[:, 3] += coords[:, 1]

    return list(coords)

In [22]:
# Best IoU of every person detection that matched a ground-truth box.
total_iou = []
# Per image: (number of ground-truth boxes, number of person detections).
total_count = []
total = len(img_names)

# evaluate the model.
# NOTE: frames and annotation files are paired purely by their position in
# the two sorted lists.
for img, ann in zip(img_names, ann_names):
    data = loadmat(ann)
    gt = get_boxes(data)

    # get a prediction; the context manager closes the frame's file handle
    # as soon as inference is done, instead of leaving one open per image.
    with Image.open(img) as photo, torch.no_grad():
        pred = predict(photo, model, device, 0.7)

    # keep only the detections labelled as person (class id 1).
    boxes, labels = pred
    person_boxes = [bbox for bbox, label in zip(boxes, labels) if int(label) == 1]

    # go through the predictions and calculate the iou.
    for bbox in person_boxes:
        # best overlap of this detection with any ground-truth box.
        best_iou = max((iou(b, bbox) for b in gt), default=0)
        if best_iou > 0.25:
            # `total_iou += value` would try to extend the list with a
            # float and raise TypeError; one value per match is appended.
            total_iou.append(best_iou)

    # get a count of the boxes: ground truth vs. person detections (not the
    # number of all detections, whatever their class).
    total_count.append((len(gt), len(person_boxes)))

In [23]:
# Number of entries collected in `total_iou` during the evaluation loop.
print(len(total_iou))
# Number of evaluated images (one `total_count` entry is appended per image).
print(len(total_count))

0
11816
