## Object Detection using PyTorch

### Import necessary libraries

In [None]:
import torch
import torchvision

from PIL import Image
from pprint import pprint
from collections import Counter
import requests
import ast

In [None]:
### Set options

# Image to analyze. download_image() fetches values that start with "http"
# and returns anything else unchanged, to be opened as a local file path.
# It is empty here, so set it before running the cells below.
IMAGE_URL = ""


### Helper Functions

In [None]:
def get_mapping_dict():
    """Download and parse the COCO index-to-label mapping.

    Returns:
        The Python literal stored at the gist URL, parsed with
        ``ast.literal_eval``. Callers in this file index it with the
        stringified class index to obtain a class name.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    idx_to_labels_url = "https://gist.githubusercontent.com/suraj813/1fe4c9dd0bc7e1dd1ce79462712ac9ce/raw/0e2c65813946769a375d673a34a1c0236b0505f1/coco_idx_to_labels.txt"
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(idx_to_labels_url, timeout=30)
    # Fail loudly on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    # literal_eval only accepts Python literals, so the remote text is parsed
    # as data and never executed.
    return ast.literal_eval(response.text)

def load_input(img_path):
    """Open the image at ``img_path`` with PIL and return it as a tensor.

    The conversion is done by torchvision's ``ToTensor`` transform.
    """
    pil_image = Image.open(img_path)
    to_tensor = torchvision.transforms.ToTensor()
    return to_tensor(pil_image)

def count_objects(model_output, confidence_threshold=0.85, label_map=None):
    """Count the confidently detected objects in the first image of a batch.

    Args:
        model_output: Batch of detection results; only ``model_output[0]``
            is read. It must be a dict with exactly three values which are
            assumed to be, in order, the boxes, the labels and the
            confidence scores. The last two must expose ``tolist()``.
        confidence_threshold: Detections whose score is not strictly greater
            than this value are dropped. Defaults to 0.85.
        label_map: Optional mapping from stringified label index to class
            name. When omitted it is fetched with ``get_mapping_dict()``.

    Returns:
        A tuple ``(detected_objects, counts)`` where ``detected_objects`` is
        a list of ``(classname, confidence)`` tuples for the kept detections
        and ``counts`` is a ``Counter`` of class names.

    Raises:
        KeyError: If a kept label has no entry in the label map.
    """
    _, labels, scores = model_output[0].values()
    if label_map is None:
        # Only hit the network when the caller did not supply a mapping.
        label_map = get_mapping_dict()

    # Keep only the predictions above the confidence threshold.
    detected_objects = [
        (label_map[str(label)], score)
        for label, score in zip(labels.tolist(), scores.tolist())
        if score > confidence_threshold
    ]

    counts = Counter(classname for classname, _ in detected_objects)
    return detected_objects, counts

## Main pipeline

### Load a pretrained torchvision model

In [None]:
def load_model():
    """Return the pretrained Faster R-CNN (ResNet-50 FPN) detector in eval mode."""
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    # We only run inference, so switch the module to evaluation mode;
    # eval() hands back the module itself.
    return detector.eval()

# Build the detector once here; the inference cell further down calls it.
model = load_model()

### Get an image to analyze

In [None]:
def download_image(url):
    """Return a local file path for ``url``, downloading it first if remote.

    Args:
        url: Either an ``http://`` / ``https://`` URL or a local file path.

    Returns:
        ``"input.jpg"`` (written to the current working directory) when
        ``url`` was downloaded, otherwise ``url`` unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    # Match the full scheme so a local file such as "http_cat.jpg" is not
    # mistaken for a URL.
    if not url.startswith(("http://", "https://")):
        return url

    # A timeout keeps a stalled download from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    # Do not save an HTML error page as if it were the image.
    response.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open("input.jpg", "wb") as image_file:
        image_file.write(response.content)
    return "input.jpg"
    
# Resolve IMAGE_URL to a local file path (downloads it when it is a URL).
img_path = download_image(IMAGE_URL)
# Show the image for a quick visual check before running the model.
Image.open(img_path).show()

### Preprocess the image for inference
- we convert the human-readable image into a model-readable tensor

In [None]:
# Convert the image file into the tensor the model consumes.
img_tensor = load_input(img_path)

# Inspect the result: its dimensions, then (at most) the first 10 entries
# along the leading dimension.
print(img_tensor.shape)
print(img_tensor[:10])

### Batchify
- Since the operations on each image are identical and independent of each other, they can be performed in parallel. This is why inputs to deep learning models are batches of images (or text or audio or whatever your model consumes)
- In our case, we are interested in predictions on a single image. So we create a batch of size 1.

In [None]:
list_of_images = [img_tensor]  # singleton list
# torch.tensor() cannot build a tensor from a list of multi-element tensors;
# torch.stack() joins them along a new leading (batch) dimension instead.
X = torch.stack(list_of_images)  # input batch

### Run inference on the image
- Pass the input batch through the model
- The model returns an output batch, one entry for each input image

In [None]:
# Inference only: disable gradient tracking so no autograd graph is built
# for the forward pass.
with torch.no_grad():
    predictions = model(X)

print(predictions)

### Post-process output
For each object detected in an input image, the model returns to us:
- what it thinks the object is (_label_)
- how confident it is about its prediction (_confidence_)
- the coordinates of where in the image the detected object is (_bounding box_)

In our function, we are only interested in the objects the model detects with high confidence. Further, there might be multiple occurrences of an object in the image; we also want the function to count how many times each object appears in the image.


In [None]:
def count_detected_objects(model_output, confidence_threshold, label_map=None):
    """Filter one image's detections by confidence and count them per class.

    Args:
        model_output: Detection result for a single image: a dict with
            exactly three values which are assumed to be, in order, the
            bounding boxes, the labels and the confidence scores. The last
            two must expose ``tolist()``.
        confidence_threshold: Detections whose score is not strictly greater
            than this value are dropped.
        label_map: Optional mapping from stringified label index to class
            name. When omitted it is fetched with ``get_mapping_dict()``.

    Returns:
        A tuple ``(detected_objects, counts)`` where ``detected_objects`` is
        a list of ``(classname, confidence)`` tuples for the kept detections
        and ``counts`` is a ``Counter`` of class names.

    Raises:
        KeyError: If a kept label has no entry in the label map.
    """
    # The bounding boxes are not needed for counting.
    _, labels, scores = model_output.values()
    if label_map is None:
        # Only hit the network when the caller did not supply a mapping.
        label_map = get_mapping_dict()

    # Keep only the predictions above the confidence threshold.
    detected_objects = [
        (label_map[str(label)], score)
        for label, score in zip(labels.tolist(), scores.tolist())
        if score > confidence_threshold
    ]

    counts = Counter(classname for classname, _ in detected_objects)
    return detected_objects, counts
    

# predictions holds one entry per input image; the batch had a single image,
# so post-process entry 0 and keep detections scoring above 0.85.
detected_objects, counts = count_detected_objects(predictions[0], confidence_threshold=0.85)   

# Report every kept detection as a (classname, confidence) pair.
print("Detected objects:")
print("="*20)
pprint(detected_objects)
print()

# Report how many times each class was detected.
print("Count of objects:")
print("="*20)
pprint(counts)