## DeepSORT Tracking

### Introduction
This Jupyter notebook runs the DeepSORT algorithm on top of various object detection (bounding box) models to track objects in video files.


## Install Python Dependencies

Python version used: 3.7 <= version <= 3.9

In [None]:
!pip install -U tensorflow==2.5.0
!pip install -U numpy==1.18.5
!pip install -U Pillow==7.2.0
!pip install -U opencv-python==4.5.1.48
!pip install -U flask==2.1.2
!pip install -U tf-slim


## Import Python Libraries


In [None]:
import os
import cv2
import time
import os.path
import numpy as np
import tensorflow as tf

%cd deep_sort_realtime
from PIL import Image
from deep_sort_realtime.deepsort_tracker import DeepSort
%cd ..

## Load Label

In [None]:
def load_label(model_path):
    """Read ``label_map.pbtxt`` from a model directory and parse it into a dictionary.

    A line starts an entry only when its key (the text before the first colon)
    is exactly ``id``. The value of the line immediately following the ``id``
    line is used as the label name, with surrounding whitespace and single
    quotes removed.

    Args:
        model_path: directory that contains the ``label_map.pbtxt`` file

    Returns:
        dictionary with the format of {label_index: {'id': label_index, 'name': label_name}}

    Raises:
        FileNotFoundError: if ``label_map.pbtxt`` does not exist in ``model_path``
        ValueError: if an ``id`` value is not an integer, or an ``id`` line is
            the last line of the file
    """
    label_map_path = os.path.join(model_path, "label_map.pbtxt")

    if not os.path.exists(label_map_path):
        raise FileNotFoundError("No valid label map found.")

    label_map = {}

    with open(label_map_path, "r") as label_file:
        for line in label_file:
            # Compare the key exactly: a substring test would also match
            # lines such as "display_name: 'kid'" and crash in int().
            key, _, value = line.partition(":")
            if key.strip() != "id":
                continue

            label_index = int(value)

            # The label name is taken from the line right after the id line.
            name_line = next(label_file, None)
            if name_line is None:
                raise ValueError(
                    "Label map ends right after id %d; no name line found."
                    % label_index
                )

            # Split on the first colon only so names containing ':' survive.
            label_name = name_line.split(":", 1)[-1].strip().strip("'")
            label_map[label_index] = {"id": label_index, "name": label_name}

    return label_map


## Load Image

In [None]:
def load_image_into_numpy_array(img, height, width):
    """Resize an image and return it as a numpy array plus its original size.

    The resized array is what gets fed into the tensorflow model. By
    convention it has shape (height, width, channels).

    Args:
        img: image object that supports ``resize((width, height))`` and
            conversion via ``np.asarray`` (the caller in this notebook passes
            a PIL image)
        height: target height of the resized image
        width: target width of the resized image

    Returns:
        tuple ``(resized, original_size)`` where ``resized`` is the numpy
        array of the resized image and ``original_size`` holds the first two
        dimensions of the original image array, i.e. ``(height, width)``
    """
    original_shape = np.asarray(img).shape

    # PIL's Image.resize takes the size as (width, height); passing
    # (height, width) would transpose non-square targets.
    image_resized = img.resize((width, height))
    return np.array(image_resized), (original_shape[0], original_shape[1])


## Prediction Functions

### Bounding Box TensorFlow Prediction

In [None]:
def bbox_tf_predition(img, trained_model, size, threshold):
    """Run a trained tf bounding box model on one image and scale the boxes to pixels.

    Args:
        img: input image (the caller in this notebook passes a PIL image)
        trained_model: loaded tensorflow model, called directly on the batch.
        size: model input size as a "height,width" string.
        threshold: confidence threshold; detections must score above it.

    Returns:
        None when no detection scores above the threshold, otherwise a list of
        ``(box, score, class_id)`` tuples, one per kept detection:
        box: four numbers built from the normalized model box
            ``b`` as ``[b[1] * W, b[0] * H, (b[3] - b[1]) * W, (b[2] - b[0]) * H]``
            with ``(H, W)`` the original image size,
        score: confidence score,
        class_id: integer class index
    """
    height_text, width_text = size.split(",")

    ## original_size holds the first two dimensions of the original image
    ## array, i.e. (rows, columns)
    resized_image, original_size = load_image_into_numpy_array(
        img, int(height_text), int(width_text)
    )
    rows, cols = original_size[0], original_size[1]

    ## Convert to a tensor and prepend the batch axis the model expects
    batch = tf.convert_to_tensor(resized_image)[tf.newaxis, ...]

    ## Feed image into model
    model_output = trained_model(batch)

    ## Keep only the first num_detections entries of every output, as numpy
    num_detections = int(model_output.pop("num_detections"))
    detections = {}
    for name, tensor in model_output.items():
        detections[name] = tensor[0, :num_detections].numpy()
    detections["num_detections"] = num_detections

    ## Select predictions scoring above the threshold
    keep = np.where(detections["detection_scores"] > float(threshold))
    kept_boxes = detections["detection_boxes"][keep].tolist()
    kept_classes = detections["detection_classes"][keep].astype(np.int64).tolist()
    kept_scores = detections["detection_scores"][keep].tolist()

    if not kept_boxes:
        return None

    detection_out = []
    for box, score, class_id in zip(kept_boxes, kept_scores, kept_classes):
        ## NOTE(review): presumably the model box is
        ## [ymin, xmin, ymax, xmax] (normalized) and the tracker wants
        ## [left, top, width, height] in pixels -- confirm
        first_a, first_b, second_a, second_b = box[0], box[1], box[2], box[3]
        scaled_box = [
            first_b * cols,
            first_a * rows,
            (second_b - first_b) * cols,
            (second_a - first_a) * rows,
        ]
        detection_out.append((scaled_box, score, class_id))

    return detection_out


## DeepSORT

In [None]:
def _write_rgb_frame(writer, frame, idx):
    """Convert an RGB frame to BGR, append it to ``writer`` and log the frame index."""
    writer.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    print("Frame " + str(idx) + " is written!")


def DeepSORT(
    video_path, model_path, size, threshold, output_format, output_vid_trk_path, COLORS
):
    """Track objects in a video and write an annotated copy of it.

    Every frame is run through the detection model. Frames with detections
    are passed to the tracker, and each returned track that has a box is
    drawn on the frame with its class name and track ID. Frames without
    detections are written unchanged and the tracker is not updated for them.

    Args:
        video_path: path of the input video.
        model_path: directory holding ``saved_model`` and ``label_map.pbtxt``.
        size: model input size as a "height,width" string.
        threshold: confidence threshold forwarded to the detector.
        output_format: four-character codec code, e.g. "mp4v".
        output_vid_trk_path: path of the annotated output video.
        COLORS: indexable collection of colour triples; one is picked per
            track as ``COLORS[track_id % len(COLORS)]``.

    Returns:
        None. The result is the video file written to ``output_vid_trk_path``.
    """
    # load model
    trained_model = tf.saved_model.load(os.path.join(model_path, "saved_model"))

    # load map
    category_index = load_label(model_path)

    # read in video
    vid = cv2.VideoCapture(video_path)
    out = None
    try:
        # initialize video writer
        width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = int(vid.get(cv2.CAP_PROP_FPS))
        codec = cv2.VideoWriter_fourcc(*output_format)
        out = cv2.VideoWriter(output_vid_trk_path, codec, fps, (width, height))

        # initialize tracker
        tracker = DeepSort(
            max_age=100,
            nn_budget=20,
            nms_max_overlap=0.3,
            override_track_class=None,
            trained_model=os.path.join(model_path, "saved_model"),
        )

        # Drawing sizes depend only on the frame size, so compute them once.
        frame_scale = width + height
        box_thickness = int(frame_scale / 600)
        label_height = int(frame_scale / 108)
        class_text_offset = int(frame_scale / 300)
        id_text_offset = int(frame_scale / 136)
        font_scale = frame_scale / 7500
        text_thickness = int(frame_scale / 3000)

        idx = 0
        while True:
            # Read Video
            grabbed, frame = vid.read()
            if not grabbed:
                break

            # Do detection on every video frame (OpenCV reads BGR)
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            prediction = bbox_tf_predition(
                Image.fromarray(frame), trained_model, size, threshold
            )

            # No detections: write the frame unchanged and move on.
            if prediction is None:
                _write_rgb_frame(out, frame, idx)
                idx += 1
                continue

            tracks = tracker.update_tracks(prediction, frame=frame)

            # draw every track that currently has a box
            for track in tracks:
                # NOTE(review): presumably [left, top, right, bottom] -- confirm
                bbox = track.to_tlbr(oori=True)
                if bbox is None:
                    continue

                class_name = category_index[int(track.det_class)]["name"].strip('"')
                color = tuple(
                    int(c) for c in COLORS[int(track.track_id) % len(COLORS)]
                )

                # draw bbox on screen
                cv2.rectangle(
                    frame,
                    (int(bbox[0]), int(bbox[1])),
                    (int(bbox[2]), int(bbox[3])),
                    color,
                    box_thickness,
                )
                # draw filled label background just below the box
                cv2.rectangle(
                    frame,
                    (int(bbox[0]), int(bbox[3])),
                    (int(bbox[2]), int(bbox[3] + label_height)),
                    color,
                    -1,
                )
                cv2.putText(
                    frame,
                    class_name,
                    (int(bbox[0]), int(bbox[3] + class_text_offset)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    font_scale,
                    (0, 0, 0),
                    text_thickness,
                    cv2.LINE_AA,
                )
                cv2.putText(
                    frame,
                    "ID:" + str(track.track_id),
                    (int(bbox[0]), int(bbox[3] + id_text_offset)),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    font_scale,
                    (0, 0, 0),
                    text_thickness,
                    cv2.LINE_AA,
                )

            # save video
            _write_rgb_frame(out, frame, idx)
            idx += 1
            if cv2.waitKey(1) & 0xFF == ord("q"):
                break
    finally:
        # Release the capture and writer even if detection or tracking fails,
        # so the output file is finalized and the handles are not leaked.
        vid.release()
        if out is not None:
            out.release()


## Run DeepSORT for Video

In [None]:
# Input video, model directory and detector settings.
video_path = "./pexels-rodnae-productions-5699922.mp4"
model_path = "./model"

# Build "./output_<video name>.mp4" from the file name itself, so paths
# without a leading "./", with directories, or with extra dots also work.
video_stem = os.path.splitext(os.path.basename(video_path))[0]
output_vid_trk_path = "./output_" + video_stem + ".mp4"

size = "320,320"  # model input size as "height,width"
threshold = "0.7"  # confidence threshold
output_format = "mp4v"  # fourcc codec of the output video
# Random colour table; each track picks a colour by its track ID.
COLORS = np.random.randint(0, 255, size=(200, 3))

start = time.time()
output_vid = DeepSORT(
    video_path, model_path, size, threshold, output_format, output_vid_trk_path, COLORS
)
end = time.time()
print(f"Tracking by DeepSORT for {video_path} takes {int(end - start)}s.")


## Display Video

In [None]:
from IPython.display import display, HTML
from base64 import b64encode


def display_video(path):
    """Show the video file at ``path`` in the notebook output.

    The whole file is read into memory, base64-encoded and embedded as a data
    URL inside an HTML ``<video>`` element that is 400 wide with controls.

    Args:
        path: path of the video file to display
    """
    # Use a context manager so the file handle is closed after reading.
    with open(path, "rb") as video_file:
        mp4 = video_file.read()

    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    display(
        HTML(
            """
          <video width=400 controls>
                <source src="%s" type="video/mp4">
          </video>
      """
            % data_url
        )
    )


In [None]:
# Show the original input video.
display_video(video_path)

In [None]:
# Show the output video written by the tracking run.
display_video(output_vid_trk_path)