### Faster RCNN

We will be looking into the Faster R-CNN model available in torchvision. This model is trained on the MS COCO dataset (a widely used public dataset with 80 object classes). In this notebook we apply the model to our own data and extract only the detections that the model labels as a person (pedestrian).

We are going to use OpenCV and Numpy to process our images. 

In [1]:
import torchvision.transforms as transforms
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from PIL import Image
from os import listdir
from os import path
import numpy as np
from torch import device
from torch import cuda
import cv2
import torch

Get the image and annotation files.

In [2]:
join = path.join

# image path and annotations path.
img_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/images/frames'
ann_path = r'/Users/chiahaohsutai/Documents/GitHub/PRW/annotations'


def _list_visible_files(directory):
    """Return the sorted full paths of the non-hidden entries of `directory`.

    Entries whose name starts with '.' (e.g. macOS `.DS_Store`, notebook
    checkpoint folders) are skipped: they are not images or annotations, and a
    single extra entry in one folder would shift the image/annotation pairing
    that later cells build by zipping the two sorted lists.
    """
    return [
        join(directory, name)
        for name in sorted(listdir(directory))
        if not name.startswith('.')
    ]


# get the image names.
img_names = _list_visible_files(img_path)

# get the annotation names.
ann_names = _list_visible_files(ann_path)

In [3]:
# Class names indexed by the integer label ids the detector outputs
# (index 0 is the background, index 1 is 'person' -- the only class `predict` keeps).
# NOTE(review): the 'N/A' entries are presumably placeholders for label ids that
# have no class, kept so that list positions line up with the ids -- confirm.
CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# define the torchvision image transforms
# (applied to every input image inside `predict` before it is moved to the device)
transform = transforms.Compose([
    transforms.ToTensor(),
])

def predict(image, model, device, detection_threshold):
    """Run the detector on one image and keep the confident person detections.

    Args:
        image: input accepted by the module-level `transform` (e.g. a PIL image).
        model: detection model, called as `model(batch)`; it must return a list
            whose first element is a dict with 'boxes', 'labels' and 'scores'
            tensors. Assumes the caller already put it in eval mode and on
            `device`.
        device: device the image tensor is moved to before the forward pass.
        detection_threshold: minimum score (inclusive) a detection needs to be kept.

    Returns:
        A `(boxes, labels, outputs)` tuple: `boxes` is an int32 array with the
        kept boxes (coordinates truncated to integers), `labels` the matching
        label ids (all equal to 1), and `outputs` the raw model output.
    """
    # transform the image to a tensor and add a batch dimension
    batch = transform(image).to(device).unsqueeze(0)

    # inference only: without no_grad every call builds an autograd graph that
    # is kept alive by the returned outputs and wastes memory.
    with torch.no_grad():
        outputs = model(batch)

    detections = outputs[0]

    # predicted label ids, scores and bounding boxes as numpy arrays
    pred_labels = detections['labels'].detach().cpu().numpy()
    pred_scores = detections['scores'].detach().cpu().numpy()
    pred_bboxes = detections['boxes'].detach().cpu().numpy()

    # keep detections above the score threshold that are labelled 1 (person)
    keep = (pred_scores >= detection_threshold) & (pred_labels == 1)
    boxes = pred_bboxes[keep].astype(np.int32)
    labels = pred_labels[keep]

    return boxes, labels, outputs

In [4]:
def draw_boxes(boxes, labels, image, color=(255, 0, 0), thickness=2):
    """Draw a rectangle around every box whose label is 1 (person).

    Args:
        boxes: sequence of boxes; the first four values of each box are used as
            the two opposite corners (x1, y1, x2, y2) of the rectangle.
        labels: label ids, one per box.
        image: image convertible with `np.asarray` (e.g. a PIL image).
            Assumes a 3-channel RGB image -- the channel order is reversed
            before drawing.
        color: rectangle colour passed to `cv2.rectangle`. The default
            (255, 0, 0) is blue when the returned array is read as BGR.
        thickness: rectangle line thickness in pixels.

    Returns:
        A new array holding the image with reversed channel order (BGR for an
        RGB input) and the rectangles drawn on it; `image` is not modified.
    """
    # RGB -> BGR (OpenCV's channel order). Same channel swap as the previous
    # COLOR_BGR2RGB flag, but named for the conversion that actually happens.
    canvas = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)

    # draw only the boxes which are persons.
    for box, label in zip(boxes, labels):
        if label != 1:
            continue
        top_left = (int(box[0]), int(box[1]))
        bottom_right = (int(box[2]), int(box[3]))
        cv2.rectangle(canvas, top_left, bottom_right, color, thickness)

    return canvas

In [5]:
# create the model.
# `DEFAULT` selects the default pretrained weights shipped for this architecture;
# `weights` is kept as a separate name so it can be inspected later.
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)

In [6]:
# set the device.
# `torch.device` is called through the module because this line rebinds the
# name `device`: calling the bare imported `device(...)` only works the first
# time the cell runs and fails on any re-run.
# `torch.backends.mps.is_available()` replaces the deprecated `torch.has_mps`,
# which only reports whether torch was built with MPS support.
device = torch.device(
    'mps' if torch.backends.mps.is_available()
    else 'cuda' if torch.cuda.is_available()
    else 'cpu'
)

In [7]:
# make a prediction.
image = Image.open(img_names[0])
model.eval().to(device)

# inference only: disable autograd so the forward pass does not build (and the
# returned `output` does not retain) a gradient graph.
with torch.no_grad():
    boxes, labels, output = predict(image, model, device, 0.7)



In [8]:
# label ids of every raw detection for the first image (no score or class filtering).
output[0]['labels']

tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  2,  2,  2,  2,  2,  2,
         2,  2,  1,  1,  2,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  2,
         2,  2,  2,  3,  2,  2,  1,  2,  2,  2,  2,  2,  2,  2, 19,  1,  2,  2,
         4,  1,  1,  2,  2,  2,  2,  1,  2,  2,  2,  2,  1,  2,  1,  2,  2,  2,
         2,  2,  1,  4,  2,  1,  2,  1,  2,  2,  2,  1,  2,  2,  2,  1,  1,  1,
         2,  2,  2,  1,  2,  2,  2,  2,  2,  2], device='mps:0')

In [9]:
# labels kept by `predict`: score >= 0.7 and label id 1 (person) only.
labels

array([1, 1, 1, 1])

In [10]:
# scores of every raw detection for the first image.
output[0]['scores']

tensor([0.9874, 0.9821, 0.9802, 0.9695, 0.9644, 0.9636, 0.9558, 0.9507, 0.9361,
        0.9227, 0.9154, 0.8968, 0.8703, 0.8656, 0.8588, 0.8526, 0.8521, 0.8397,
        0.8306, 0.8241, 0.8190, 0.8141, 0.8094, 0.7290, 0.7092, 0.6875, 0.6872,
        0.6726, 0.6535, 0.6455, 0.6414, 0.6342, 0.6295, 0.6283, 0.5871, 0.5818,
        0.5434, 0.4937, 0.4889, 0.4810, 0.4511, 0.4498, 0.4422, 0.4325, 0.4130,
        0.4045, 0.3921, 0.3814, 0.3713, 0.3525, 0.3462, 0.3417, 0.3318, 0.3283,
        0.3257, 0.3245, 0.3197, 0.3169, 0.3167, 0.3010, 0.2748, 0.2674, 0.2585,
        0.2541, 0.2529, 0.2526, 0.2429, 0.2426, 0.2309, 0.2282, 0.2282, 0.2273,
        0.2239, 0.2123, 0.2110, 0.2054, 0.1897, 0.1849, 0.1657, 0.1610, 0.1566,
        0.1465, 0.1391, 0.1391, 0.1342, 0.1322, 0.1267, 0.1222, 0.1209, 0.1207,
        0.1196, 0.1161, 0.1145, 0.1143, 0.1069, 0.1018, 0.1002, 0.0978, 0.0949,
        0.0948], device='mps:0', grad_fn=<IndexBackward0>)

In [11]:
# int32 corner coordinates of the person boxes kept by `predict`.
boxes

array([[1197,  438, 1231,  482],
       [1327,  430, 1349,  481],
       [1035,  420, 1078,  565],
       [ 195,  486,  259,  601]], dtype=int32)

In [12]:
# draw the kept person boxes on the first image.
img = draw_boxes(boxes, labels, image)

In [13]:
# shape of the annotated image array returned by `draw_boxes`.
img.shape

(1080, 1920, 3)

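Now we are going to evaluate the model performance using RMSE and MAPE on the per-image pedestrian counts, and the average IoU of the predicted boxes that overlap a ground-truth box.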

In [14]:
from collections import namedtuple
import numpy as np
from scipy.io import loadmat
import pandas as pd

In [15]:
# define the `Detection` object: a record with an image path, a ground-truth
# entry (`gt`) and a predicted entry (`pred`).
# NOTE(review): not referenced by the cells visible in this notebook section.
Detection = namedtuple("Detection", ["image_path", "gt", "pred"])

def iou(boxA, boxB):
    """Calculates Intersection Over Union of two boxes.

    Both boxes are indexable as (x_min, y_min, x_max, y_max). Widths and
    heights are computed as `max - min + 1`, i.e. both corners are treated as
    inclusive pixel coordinates.

    Args:
        boxA: first box.
        boxB: second box.

    Returns:
        The intersection area divided by the union area, or 0.0 when the union
        area is not positive (degenerate boxes), instead of dividing by zero.
    """
    # determine the (x, y)-coordinates of the intersection rectangle
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # area of the intersection rectangle (0 when the boxes do not overlap)
    inter_area = max(0, xB - xA + 1) * max(0, yB - yA + 1)

    # areas of the two input rectangles
    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    # union = sum of both areas minus the part counted twice
    union_area = float(area_a + area_b - inter_area)
    if union_area <= 0:
        return 0.0

    return inter_area / union_area

def get_boxes(ann):
    """Extracts the ground-truth boxes of one annotation as corner coordinates.

    The boxes are read from the first of the keys 'box_new', 'anno_file',
    'anno_previous' that is present in `ann`. The first column of every row is
    dropped (presumably an identity id -- confirm against the dataset) and the
    remaining `xmin, ymin, w, h` values are converted to
    `xmin, ymin, xmax, ymax`.

    Args:
        ann: mapping holding the annotation rows under one of the keys above.

    Returns:
        A list with one 1-D array `[xmin, ymin, xmax, ymax]` per row. The
        arrays are copies, so `ann` is left untouched and the function can be
        called repeatedly on the same annotation.

    Raises:
        ValueError: if none of the expected keys is present.
    """
    # get the correct key for the bounding boxes.
    key = next(
        (k for k in ('box_new', 'anno_file', 'anno_previous') if k in ann),
        None,
    )
    if key is None:
        raise ValueError("Invalid Annotation Error")

    # get the bounding boxes and convert to corner coordinates.
    bbox = []
    for row in ann[key]:
        # np.array copies: slicing alone yields a view, and editing it in
        # place would overwrite the width/height stored in `ann`.
        box = np.array(row[1:])
        xmin, ymin, w, h = box
        box[-2] = xmin + w
        box[-1] = ymin + h
        bbox.append(box)

    return bbox

In [16]:
# best IoU of every predicted box that overlaps a ground-truth box enough.
total_iou = []
# per image: (number of ground-truth boxes, number of predicted boxes)
total_count = []
total = len(img_names)

# images and annotations are paired by position, so a length mismatch means
# the pairing is wrong; fail loudly instead of letting zip truncate silently.
if total != len(ann_names):
    raise ValueError(
        f"Found {total} images but {len(ann_names)} annotation files"
    )

# evaluate the model.
for img, ann in zip(img_names, ann_names):

    # get the ground-truth boxes for this image.
    data = loadmat(ann)
    gt = get_boxes(data)

    # get a prediction; the context manager closes the image file once the
    # prediction is done, so handles do not pile up over the whole dataset.
    with Image.open(img) as photo, torch.no_grad():
        pred = predict(photo, model, device, 0.7)

    # go through the predictions and calculate the IoU.
    boxes, labels, _ = pred
    for bbox in boxes:
        # best overlap of this prediction with any ground-truth box
        # (0 when the image has no ground-truth boxes).
        best_iou = max((iou(b, bbox) for b in gt), default=0)
        # predictions overlapping no ground-truth box by more than 0.25 are
        # left out of the average IoU.
        if best_iou > 0.25:
            total_iou.append(best_iou)

    # get a count of the boxes.
    total_count.append((len(gt), len(labels)))

In [28]:
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_percentage_error as MAPE

In [29]:
# extract the counts.
y_true = [c[0] for c in total_count]
y_pred = [c[1] for c in total_count]

# RMSE is taken as the square root of the MSE: the `squared=False` shortcut is
# deprecated and removed in recent scikit-learn releases.
mse_eval = float(np.sqrt(MSE(y_true, y_pred)))
# NOTE: despite its name, `mae_eval` holds the mean absolute *percentage* error.
mae_eval = MAPE(y_true, y_pred)
# average IoU of the kept predictions; NaN when no prediction was kept, instead
# of dividing by zero.
avg_iou = sum(total_iou) / len(total_iou) if total_iou else float('nan')

In [30]:
# report the count errors and the average IoU (`mse_eval` holds the RMSE, `mae_eval` the MAPE).
print(f'RMSE: {mse_eval}, MAPE: {mae_eval}, Avg. IOU: {avg_iou}')

RMSE: 2.7832132932300984, MAPE: 0.8232211789430309, Avg. IOU: 0.7728651640533937
