### Faster RCNN

We will be looking into the Faster R-CNN model available in torchvision. This model is trained on the MS COCO dataset (a common, publicly accessible dataset with over 80 classes). In this notebook we will run the model on our own data and extract every pedestrian that the model detects.

We are going to use PIL, OpenCV and NumPy to load and process our images.

In [1]:
from os import listdir
from os import path

import cv2
import numpy as np
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch import cuda
from torch import device
from torchvision.models.detection import FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection import fasterrcnn_resnet50_fpn

Get the image and annotation files.

In [2]:
join = path.join

# directories holding the frames and their annotations.
img_path = 'PRW/frames'
ann_path = 'PRW/annotations'

# full paths of the frames, in sorted file-name order.
img_names = [join(img_path, fname) for fname in sorted(listdir(img_path))]

# full paths of the annotation files, in sorted file-name order.
ann_names = [join(ann_path, fname) for fname in sorted(listdir(ann_path))]

In [11]:
# Lookup table from integer label to class name (91 entries, index 0 is
# '__background__'). Index 1 is 'person', which is the label id draw_boxes
# filters on. 'N/A' marks positions that have no class name in this list.
# NOTE(review): presumably mirrors the COCO category ids emitted by the
# pretrained detector — confirm against the weights' metadata.
CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

# preprocessing pipeline applied to every image before inference:
# a single step that converts the input image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

def predict(image, model, device, detection_threshold):
    """Run the detector on a single image and keep the confident detections.

    Args:
        image: input accepted by the module-level ``transform`` (the notebook
            passes a PIL image).
        model: detection model, called on a batch holding one image tensor.
            The notebook switches it to eval mode before calling this function.
        device: device the image tensor is moved to before the forward pass.
        detection_threshold: minimum score a detection needs to be kept
            (the comparison is ``score >= detection_threshold``).

    Returns:
        A tuple ``(boxes, labels, outputs)``:
            boxes: int32 NumPy array with the boxes whose score passed the
                threshold (coordinates are truncated to integers).
            labels: NumPy array with the integer label ids of those boxes.
            outputs: the raw, unfiltered model output for the batch.
    """
    # transform the image to a tensor and add a batch dimension.
    batch = transform(image).to(device).unsqueeze(0)

    # Inference only: run the forward pass without building an autograd graph,
    # so the returned tensors do not carry a grad_fn or keep activations alive.
    with torch.no_grad():
        outputs = model(batch)

    detections = outputs[0]

    # predicted integer label ids (not names), scores and bounding boxes.
    pred_labels = detections['labels'].detach().cpu().numpy()
    pred_scores = detections['scores'].detach().cpu().numpy()
    pred_bboxes = detections['boxes'].detach().cpu().numpy()

    # one mask, shared by boxes and labels so they stay aligned.
    keep = pred_scores >= detection_threshold
    boxes = pred_bboxes[keep].astype(np.int32)
    labels = pred_labels[keep]

    return boxes, labels, outputs

In [4]:
def draw_boxes(boxes, labels, image):
    """Draw a rectangle around every detection labelled as a person.

    Args:
        boxes: sequence of boxes; the first four values of each box are used
            as the (x1, y1) and (x2, y2) corners of the rectangle.
        labels: sequence of integer label ids aligned with ``boxes``.
        image: image convertible with ``np.asarray``. The notebook passes a
            PIL image, so the channels are presumably in RGB order.

    Returns:
        A new array with the channels swapped to OpenCV's BGR order and the
        person boxes drawn on it. The input image is not modified.
    """
    # label id used for persons (CATEGORY_NAMES[1] == 'person').
    PERSON_LABEL = 1

    # box colour; the array is in BGR order when drawn on, so this is blue.
    COLOR = [255, 0, 0]

    # Convert the RGB input to the BGR layout OpenCV draws and displays in.
    # (RGB2BGR and BGR2RGB perform the same channel swap; this name states
    # the actual direction of the conversion.)
    image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)

    # draw only the boxes which are persons.
    for box, label in zip(boxes, labels):
        if label == PERSON_LABEL:
            x1, y1, x2, y2 = (int(v) for v in box[:4])
            cv2.rectangle(image, (x1, y1), (x2, y2), COLOR, 2)

    return image

In [5]:
# create the model.
# `DEFAULT` selects the default pretrained weight set for this architecture;
# the weights object is kept in its own variable and passed to the constructor.
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights)

In [6]:
# set the device.
# The class is reached through the `torch` module on purpose: the result is
# stored under the name `device`, which rebinds the bare `device` imported
# from torch. Calling `device(...)` here would therefore fail with a
# TypeError the second time this cell is run.
device = torch.device('cuda' if cuda.is_available() else 'cpu')

In [12]:
# run the detector on the first frame.
image = Image.open(img_names[0])

# switch the model to evaluation mode and move it to the selected device.
model.eval()
model.to(device)

# keep only detections scoring at least 0.7.
boxes, labels, output = predict(image, model, device, detection_threshold=0.7)

In [14]:
# raw, unfiltered detections for the single image in the batch.
output[0]

{'boxes': tensor([[8.9638e-01, 6.5949e+02, 1.6961e+02, 8.8538e+02],
         [1.9116e+00, 5.1553e+02, 7.7712e+01, 5.9065e+02],
         [1.1603e+02, 6.3123e+02, 2.8724e+02, 8.4227e+02],
         [1.1942e+03, 4.5441e+02, 1.2893e+03, 5.1249e+02],
         [8.3765e+02, 5.2466e+02, 9.2328e+02, 6.2705e+02],
         [3.6395e+02, 5.8677e+02, 5.1083e+02, 7.5238e+02],
         [4.6648e+02, 5.6279e+02, 5.9832e+02, 7.3327e+02],
         [1.3083e+03, 4.5091e+02, 1.3507e+03, 4.9087e+02],
         [2.5544e+02, 5.9312e+02, 4.1038e+02, 7.7712e+02],
         [9.7126e+02, 4.9954e+02, 1.0314e+03, 5.9224e+02],
         [5.6701e+02, 5.6997e+02, 6.4556e+02, 7.1699e+02],
         [1.1979e+03, 4.3838e+02, 1.2316e+03, 4.8274e+02],
         [2.5808e+02, 4.9751e+02, 3.2825e+02, 5.5663e+02],
         [3.2818e+02, 4.8907e+02, 4.3496e+02, 5.4716e+02],
         [7.5894e+02, 5.4281e+02, 8.5454e+02, 6.4946e+02],
         [6.5781e+02, 5.3686e+02, 7.7704e+02, 6.7708e+02],
         [1.0600e+03, 4.9029e+02, 1.1190e+03, 5

In [15]:
# integer label id of every raw detection (1 is the id draw_boxes treats as a person).
output[0]['labels']

tensor([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  2,  2,  2,  2,  2,  2,
         2,  2,  1,  1,  2,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  2,
         2,  2,  2,  3,  2,  2,  1,  2,  2,  2,  2,  2,  2,  2, 19,  1,  2,  2,
         4,  1,  1,  2,  2,  2,  2,  1,  2,  2,  2,  2,  1,  2,  1,  2,  2,  2,
         2,  2,  1,  4,  2,  1,  2,  1,  2,  2,  2,  1,  2,  2,  2,  1,  1,  1,
         2,  2,  2,  1,  2,  2,  2,  2,  2,  2])

In [17]:
# score of every raw detection, before the 0.7 threshold is applied.
output[0]['scores']

tensor([0.9874, 0.9821, 0.9802, 0.9695, 0.9644, 0.9636, 0.9558, 0.9507, 0.9361,
        0.9227, 0.9154, 0.8968, 0.8703, 0.8656, 0.8588, 0.8526, 0.8521, 0.8397,
        0.8306, 0.8242, 0.8191, 0.8141, 0.8094, 0.7290, 0.7092, 0.6875, 0.6872,
        0.6726, 0.6535, 0.6455, 0.6413, 0.6342, 0.6295, 0.6282, 0.5871, 0.5818,
        0.5434, 0.4937, 0.4889, 0.4809, 0.4511, 0.4498, 0.4422, 0.4325, 0.4130,
        0.4045, 0.3921, 0.3814, 0.3713, 0.3525, 0.3462, 0.3417, 0.3318, 0.3283,
        0.3257, 0.3245, 0.3197, 0.3170, 0.3167, 0.3010, 0.2748, 0.2674, 0.2585,
        0.2541, 0.2529, 0.2526, 0.2429, 0.2426, 0.2309, 0.2282, 0.2282, 0.2273,
        0.2239, 0.2123, 0.2110, 0.2054, 0.1897, 0.1849, 0.1657, 0.1610, 0.1566,
        0.1466, 0.1391, 0.1391, 0.1342, 0.1322, 0.1267, 0.1222, 0.1209, 0.1207,
        0.1196, 0.1161, 0.1145, 0.1143, 0.1069, 0.1018, 0.1002, 0.0978, 0.0949,
        0.0948], grad_fn=<IndexBackward0>)

: 

In [8]:
# render the person boxes on the frame.
img = draw_boxes(boxes, labels, image)

# show the result in an OpenCV window and wait for a key press.
window_title = 'Image'
cv2.imshow(window_title, img)
cv2.waitKey(0)

# close the window; the trailing waitKey call is the value the cell displays.
cv2.destroyWindow(window_title)
cv2.waitKey(1)

-1

In [9]:
# dimensions of the annotated image array returned by draw_boxes.
img.shape

(1080, 1920, 3)