# Validation of Faster RCNN ResNet50


This code performs object detection using a pre-trained Faster R-CNN ResNet-50 model trained on the COCO dataset. first the mode is sets up the model for evaluation, and defines functions for calculating Intersection over Union (IoU), average precision (AP), mean Average Precision (mAP), parsing predictions and XML annotations. Additionally, it loads images and corresponding XML annotations, sorts bounding boxes, calculates mAP for each class, and prints relevant information including bounding boxes and classes. Finally, it calculates and prints the mAP value and frames per second (fps) metrics.

In [1]:
#!pip install torch torchvision
#!pip install opencv-python
#!pip install pycocotools
#!pip install torch opencv-python numpy Pillow

In [2]:
# @title importing packages
import torchvision
import torch
import torchvision.transforms as transforms
from torchvision.models.detection import fasterrcnn_resnet50_fpn
import cv2
import numpy as np
from PIL import Image
import time
np.random.seed(42)

In [3]:
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'aeroplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'sheep', 'horse', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]
# Categories to be replaced with 'N/A'
categories_to_replace = [ 'airplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair',
                         'cow', 'dining table', 'dog', 'horse', 'motorcycle', 'person', 'potted plant',
                         'sheep',  'train', 'tv']

In [4]:
# @title Download or load model if True or False respectively
# Define the computation device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model  = fasterrcnn_resnet50_fpn(weights='FasterRCNN_ResNet50_FPN_Weights.COCO_V1', weights_backbone=None).to(device)
# Set the model to evaluation mode
model = model.eval()

In [5]:
np.random.seed(42)
total_elapsed_time=[]
def get_model(image_path):
   '''
    Functionality: Perform object detection on an image using a pre-trained model.

    Parameters:
    - image_path (str): Path to the input image file.

    Returns:
    - boxes (numpy.ndarray): Array of predicted bounding boxes for detected objects.
    - pred_classes (list): List of predicted class names for detected objects.
    '''
   transform = transforms.Compose([
      transforms.ToTensor(),
  ])
   device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Read the image.
   image = Image.open(image_path).convert('RGB')
    # Create a BGR copy of the image for annotation.
   image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
   image = transform(image).to(device)
   image = image.unsqueeze(0)
   with torch.no_grad():
    start_time = time.time()  # Record the start time
    outputs = model(image)
    end_time = time.time()  # Record the end time
    elapsed_time = end_time - start_time
    total_elapsed_time .append(elapsed_time)
    print(f"Time taken for prediction: {elapsed_time:.4f} seconds")
    # Get score for all the predicted objects.
    pred_scores = outputs[0]['scores'].detach().cpu().numpy()
    # Get all the predicted bounding boxes.
    pred_bboxes = outputs[0]['boxes'].detach().cpu().numpy()
    # Get boxes above the threshold score.
    # boxes = pred_bboxes[pred_scores >= 0.8].astype(np.int32)

    pred_labels = outputs[0]['labels'].detach().cpu().numpy()
    # Filter out boxes for the desired classes.
    desired_indices = [i for i, label in enumerate(pred_labels) if COCO_INSTANCE_CATEGORY_NAMES [label] in categories_to_replace and pred_scores[i] >= 0.8]
    boxes = pred_bboxes[desired_indices].astype(np.int32)
    labels = pred_labels[desired_indices]
    # Get all the predicted class names.
    pred_classes = [COCO_INSTANCE_CATEGORY_NAMES[i] for i in labels]
    return boxes, pred_classes

In [6]:
def calculate_iou(box1, box2):

   '''
    Functionality: Calculate the Intersection over Union (IoU) between two bounding boxes.

    Parameters:
    - box1 (tuple): Tuple representing the coordinates of the first bounding box (x1, y1, x2, y2).
    - box2 (tuple): Tuple representing the coordinates of the second bounding box (x1, y1, x2, y2).

    Returns:
    - iou (float): Intersection over Union (IoU) score between the two bounding boxes.
    '''
   x1 = max(box1[0], box2[0])
   y1 = max(box1[1], box2[1])
   x2 = min(box1[2], box2[2])
   y2 = min(box1[3], box2[3])

   intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
   box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
   box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])

   union_area = box1_area + box2_area - intersection_area

   iou = intersection_area / float(union_area) if union_area > 0 else 0
   return iou

def calculate_average_precision(gt_boxes, pred_boxes, iou_threshold=0.5):

   '''
    Functionality: Calculate the average precision (AP) and recall for a set of ground truth and predicted bounding boxes.

    Parameters:
    - gt_boxes (list): List of ground truth bounding boxes.
    - pred_boxes (list): List of predicted bounding boxes.
    - iou_threshold (float): IoU threshold for considering a detection as true positive.

    Returns:
    - precision (float): Precision score.
    - recall (float): Recall score.
    '''
   tp = 0  # True Positives
   fp = 0  # False Positives
   fn = 0  # False Negatives

   for i in range(min(len(gt_boxes), len(pred_boxes))):
    iou = calculate_iou(gt_boxes, pred_boxes)
    if iou >= iou_threshold:
        tp += 1
    else:
       fp += 1

    fn = len(gt_boxes) - tp

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0

    return precision, recall


total = []

def calculate_mAP(gt_boxes_list, pred_boxes_list, iou_threshold=0.5):

  '''
    Functionality: Calculate the mean Average Precision (mAP) for a set of ground truth and predicted bounding boxes.

    Parameters:
    - gt_boxes_list (list): List of lists containing ground truth bounding boxes for each class.
    - pred_boxes_list (list): List of lists containing predicted bounding boxes for each class.
    - iou_threshold (float): IoU threshold for considering a detection as true positive.
    '''
  total_precision = 0.0
  total_recall = 0.0
  for class_idx, (gt_boxes, pred_boxes) in enumerate(zip(gt_boxes_list, pred_boxes_list)):
        precision, recall = calculate_average_precision(gt_boxes, pred_boxes, iou_threshold)
        total_precision += precision
        total_recall += recall
        print(f"Class {class_idx + 1}: Precision={precision}, Recall={recall}")
        total.append(total_precision)

def cal_mAP(ground_truth):
   '''
    Functionality: Calculate the mean Average Precision (mAP) for a set of ground truth bounding boxes.

    Parameters:
    - ground_truth (list): List of ground truth bounding boxes.
    '''
   mAP =  sum(ground_truth)/ sum(total)
   mAP = mAP  * 100
   print(f"mAP: {mAP}%")

In [7]:
from xml.etree import ElementTree as ET
import numpy as np
import os

image_dir = 'voc challenge2007/VOC2007/JPEGImages'
annotation_dir = 'voc challenge2007/VOC2007/Annotations'

def parse_prediction(prediction):
  '''
    Functionality: Parse predicted bounding boxes from a list of coordinates to a list of lists of integers.

    Parameters:
    - prediction (list): List of predicted bounding box coordinates.

    Returns:
    - parsed_prediction (list): Parsed list of lists containing integer coordinates.
    '''
  parsed_prediction = [list(map(int, box)) for box in prediction]
  return parsed_prediction

# Function to parse XML annotations and extract ground truth bounding boxes
def parse_annotation(xml_file):
  '''
    Functionality: Parse XML annotations to extract ground truth bounding boxes and object names.

    Parameters:
    - xml_file (str): Path to the XML annotation file.

    Returns:
    - gt_boxes (list): List of ground truth bounding boxes.
    - gt_name (list): List of object names corresponding to the ground truth bounding boxes.
    '''
  tree = ET.parse(xml_file)
  root = tree.getroot()
  gt_boxes = []
  gt_name = []
  for obj in root.findall('object'):
      name = obj.find('name').text
      if name not in COCO_INSTANCE_CATEGORY_NAMES:
          continue
      bbox = obj.find('bndbox')
      xmin = int(bbox.find('xmin').text)
      ymin = int(bbox.find('ymin').text)
      xmax = int(bbox.find('xmax').text)
      ymax = int(bbox.find('ymax').text)
      gt_boxes.append([xmin, ymin, xmax, ymax])
      gt_name.append(name)


  return gt_boxes, gt_name

# image_path = 'VOC2007/JPEGImages/000004.jpg'

all_pred_scores = []
# Iterate over each image in the dataset
image_files = os.listdir(image_dir)
ground_truth = []
for image_file in image_files:
    # Load the image
    image_path = os.path.join(image_dir, image_file)

    # image = Image.open(image_path).convert('RGB')
    # Load the corresponding XML annotation
    boxes, pred_classes = get_model(image_path)
    annotation_file = os.path.splitext(image_file)[0] + '.xml'
    annotation_path = os.path.join(annotation_dir, annotation_file)
    print(f'Currently working on, annotation: {annotation_path} and image : {image_path}')
    # Parse the XML file to get ground truth bounding boxes
    gt_boxes, gt_name = parse_annotation(annotation_path)
    ground_truth.append(len(gt_name))

    def compare_function(lst):
        return lst[0]

    # Sort the first list based on the comparison function
    gt_boxes_sorted = sorted(gt_boxes, key=compare_function)
    pred_boxes_sorted = sorted(parse_prediction(boxes), key=compare_function)

    print(f'The boxes are, Ground truth: {gt_boxes_sorted} and predicted : {pred_boxes_sorted}\n')
    print(f'The classes are, Ground truth: {gt_name} and predicted : {pred_classes}\n')

    iou_threshold_value = 0.5
    calculate_mAP(gt_boxes_sorted, pred_boxes_sorted, iou_threshold_value)
print("The Code has finished running")

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'voc challenge2007/VOC2007/JPEGImages'

In [None]:
cal_mAP(ground_truth)

mAP: 82.94290834613415%


In [None]:
def fps():
   print('Total fps taken',sum(total_elapsed_time))
   print('Average fps taken',sum(total_elapsed_time)/ len(total_elapsed_time) )

In [None]:
fps()

Total fps taken 483.2442395687103
Average fps taken 0.09758567034909336
