# Illustrating Example of how to use MTCF

Import the required modules and restrict GPU visibility. The MTCF code is set up to use only one GPU.

<span style="color:red">Note:</span> This code MUST be run from the directory that houses this file.

In [None]:
import MTCF.MTCF as MTCF
import MTCF.util as util
from MTCF.parseAnnotations import parseVOTAnnotation, parseOTBAnnotation

from IPython import display
from time import time
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
import json

# Expose a single GPU to this notebook. The device order is set to PCI bus ID
# and only the device with the index given below is made visible.
os.environ.update({
    'CUDA_DEVICE_ORDER': 'PCI_BUS_ID',
    'CUDA_VISIBLE_DEVICES': '0',  # GPU number
})

# Root path of the MTCF code base directory, always terminated by '/'.
# There should be a folder MTCF/ in this directory with code.
root_path = '{0}/'.format(os.path.abspath('.'))

Choose which video to track, and how many frames to run

In [None]:
# Dataset the sequence belongs to. MUST be in ['otb', 'vot']
dataset = 'vot'

# Name of the video sequence to track
sequence_name = 'ball1'

# Maximum number of frames to run
num_frames_to_run = 10000

### Set the parameters of the MTCF

In [None]:
# Configuration dictionary handed to the MTCF constructor.
# It is assembled group by group; the result is a single flat dict.
params = {}

# General settings
params.update({
    'verbosity': 1,
    'color_names_filepath': root_path + 'MTCF/lookup_tables/w2c.txt',
})

# MTCF params
params.update({
    'max_trackers': 8,
    'max_tracker_images': 50,
    'num_frames_between_training': 5,
    'tracker_num_initial_images': 5,
    'tracker_weight_decay': 0.2,
})

# Base tracker params
params.update({
    'label_sigma_factor': 1 / 16.,
    'image_learning_rate': 0.013,
    'reg_lambda': 1e-2,
    'search_window_factor': 5.,
    'search_shape': 'square',  # must be in ['proportional', 'square']
})

# Features params
params.update({
    'im_rep': 'VGG16',  # must be in ['HOG', 'HOG+CN', 'VGG16']
    'VGG_layer_name': 'block3_conv3',
    'use_PCA': True,
    'VGG_PCA_num_components': 64,
    'HOG_PCA_num_components': 15,
    'CN_PCA_num_components': 5,
    'HOG_cell_size': 4,
})

# Template size params
params.update({
    'max_template_sidelength': 48,
    'min_template_sidelength': 24,
})

# Scale params
params.update({
    'num_scales': 5,
    'scale_step': 1.02,
})

# Learning params
params.update({
    'LBFGSB_max_initial_learning_iters': 100,
    'LBFGSB_max_learning_iters': 5,
})

### Get the absolute file paths of images and ground truth

There are a couple of videos from OTB100/VOT2016 included with the codebase. If you have your own videos (with ground truth annotations in the same format as VOT/OTB), you can replace these absolute file paths.

In [None]:
# Everything that differs between the supported datasets, looked up once:
# (video sub-directory, video statistics helper, annotation parser)
dataset_handlers = {
    'vot': ('videos/vot2016/', util.VOT_video_statistics, parseVOTAnnotation),
    'otb': ('videos/otb100/', util.OTB_video_statistics, parseOTBAnnotation),
}
if dataset not in dataset_handlers:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError("dataset must be in ['otb', 'vot'], got {0!r}".format(dataset))
videos_subdir, video_statistics, parse_annotation = dataset_handlers[dataset]

# Absolute path of the folder holding the chosen sequence
video_path = root_path + videos_subdir + sequence_name + '/'

# Video statistics
vid_stats = video_statistics(video_path)
frame_height = vid_stats['fh']
frame_width = vid_stats['fw']
num_frames_total = vid_stats['n_frames']

# Get the ground truth bounding boxes.
# The annotations are indexed by frame number starting at 1; they are stacked
# into one array with a row (bx, by, width, height) per frame, so row i holds
# the box of frame i+1.
bounding_boxes = parse_annotation(video_path)
bboxes = np.stack(
    [
        (
            bounding_boxes[i]['bx'],
            bounding_boxes[i]['by'],
            bounding_boxes[i]['width'],
            bounding_boxes[i]['height'],
        )
        for i in range(1, num_frames_total + 1)
    ],
    axis=0,
)

### Instantiate the MTCF

First, we get the starting and ending frame numbers. 

Next, we get the first RGB image and ground truth (GT) bounding box.

Finally, we use that to instantiate the MTCF. We also plot the first image for visualization purposes.

In [None]:
# Get starting and ending frame numbers
if dataset == 'otb':
    # For OTB the first frame number is read from the sequence's cfg.json.
    # The file is opened in a `with` block so the handle is always closed.
    with open(video_path + 'cfg.json') as cfg_file:
        config = json.load(cfg_file)
    image_start_frame = int(config['startFrame'])
else:
    image_start_frame = 1

# Never run more frames than the video contains
num_frames_to_run = min(num_frames_to_run, num_frames_total)
image_end_frame = num_frames_to_run + image_start_frame - 1

# Get the first image and GT bounding box
image_filename = util.get_image_filename(image_start_frame, dataset)
if dataset == 'otb' and not video_path.endswith('img/'):
    # OTB frames are read from an img/ sub-folder of the sequence directory
    images_path = video_path + 'img/'
else:
    images_path = video_path
first_image = util.load_image_with_resize(images_path + image_filename)
first_bbox = bboxes[image_start_frame-1, :] # indexing starts at 0

In [None]:
# Instantiate MTCF
# The tracker is built from the first frame, that frame's ground truth
# bounding box, and the parameter dictionary.
mtcf = MTCF.MTCF(first_image, first_bbox, params)

In [None]:
%matplotlib inline

# Plot the first image with this GT bounding box
img_copy = first_image.copy()
cv2.rectangle(img_copy, tuple((first_bbox[:2] - np.array([first_bbox[2]/2, first_bbox[3]/2])).astype(int)), 
                        tuple((first_bbox[:2] + np.array([first_bbox[2]/2, first_bbox[3]/2])).astype(int)), 
                        (255, 0, 0), # red rectangle (RGB)
                        2) # thickness of rectangle is two pixels    
plt.imshow(img_copy.astype(np.uint8))
plt.title("First Frame w/ GT bbox")

### Run the tracker

We plot a visualization of the tracker against the ground truth. Each frame of the video will be overlaid with the <span style="color:green">predicted</span> bounding box in green, and the <span style="color:red">ground truth</span> bounding box in red.

<span style="color:red">Note:</span> The speed of the tracker is not reflected in this notebook. The bottleneck is due to the notebook display clearing and painting the images.

In [None]:
%matplotlib inline 

# Keep track of predictions, IoU, and scale factors.
predicted_bboxes = []
IoUs = []
scale_factors = [1.0]

# For loop over all frames for tracking. Start tracking at second frame
start_time = time()
for frame_num in xrange(image_start_frame+1, image_end_frame+1): 

    # Get the image
    image_filename = util.get_image_filename(frame_num, dataset)
    image = util.load_image_with_resize(images_path + image_filename)

    # Get the ground truth bounding box
    gt_bbox = bboxes[frame_num-1, :] # -1 because python indices start at 0

    # Get the predicted bounding box
    predicted_bbox = mtcf.track(image)
    predicted_bboxes.append(predicted_bbox)

    # Compute IOU, scale factor
    iou = util.IoU(predicted_bbox, gt_bbox)
    IoUs.append(iou)
    scale_factors.append(mtcf.current_scale_factor)
    
    # Plot the image with ground truth (red) and prediction (green)
    img_copy = image.copy()
    cv2.rectangle(img_copy, tuple((gt_bbox[:2] - np.array([gt_bbox[2]/2, gt_bbox[3]/2])).astype(int)), 
                            tuple((gt_bbox[:2] + np.array([gt_bbox[2]/2, gt_bbox[3]/2])).astype(int)), 
                            (255, 0, 0), # red rectangle (RGB)
                            2) # thickness of rectangle is two pixels    
    cv2.rectangle(img_copy, tuple((predicted_bbox[:2] - np.array([predicted_bbox[2]/2, predicted_bbox[3]/2])).astype(int)), 
                            tuple((predicted_bbox[:2] + np.array([predicted_bbox[2]/2, predicted_bbox[3]/2])).astype(int)), 
                            (0, 255, 0), # green rectangle (RGB)
                            2) # thickness of rectangle is two pixels
    
    # Clear the previous image and paint the current one.
    plt.gca().cla()
    plt.imshow(img_copy.astype(np.uint8))
    plt.title("Frame {0} out of {1}".format(frame_num, image_end_frame))
    display.clear_output(wait=True)
    display.display(plt.gcf()) 
    
# Print some stuff
print "Average IoU: {0}".format(np.mean(IoUs))
print "Time taken to track: {0} seconds".format(round(time() - start_time, 3))

Recall, <span style="color:green">predicted</span> bounding boxes are in green, and <span style="color:red">ground truth</span> bounding boxes in red.

### Results

We plot the IoU and the scale factor of the predictions over time.

In [None]:
fig = plt.figure(figsize=(13, 5))

# Left panel: IoU over time.
# IoU exists only for tracked frames, i.e. from the second frame onwards.
ax0 = fig.add_subplot(1, 2, 1)
tracked_frames = np.arange(image_start_frame+1, image_end_frame+1)
ax0.plot(tracked_frames, IoUs)
ax0.set_title('IoU. Average IoU: {0}'.format(np.mean(IoUs)))
ax0.set_xlabel('t')

# Right panel: scale factor over time.
# The scale factor list also has an entry for the first frame.
ax1 = fig.add_subplot(1, 2, 2)
all_frames = np.arange(image_start_frame, image_end_frame+1)
ax1.plot(all_frames, scale_factors)
ax1.set_title('Scale Factors')
ax1.set_xlabel('t')