In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/Shareddrives/EECS504_Computer_Vision_Project/ObjectTracking

/content/drive/Shareddrives/EECS504_Computer_Vision_Project/ObjectTracking


In [3]:
import os
import json
import cv2
from matplotlib import pyplot as plt
import numpy as np

os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import tensorflow as tf
from yolov3.utils import Load_Yolo_model, image_preprocess, postprocess_boxes, nms, draw_bbox, read_class_names
from yolov3.configs import *
import time
from google.colab.patches import cv2_imshow

from deep_sort import nn_matching
from deep_sort.detection import Detection
from deep_sort.tracker import Tracker
from deep_sort import generate_detections as gdet

# video_path   = "./IMAGES/test.mp4"
video_path   = "./IMAGES/three_ppl_video_30fps_260frames.avi"

YOLO Tracking function


In [21]:
def Object_tracking(Yolo, video_path, output_path, input_size=416, show=False, CLASSES=YOLO_COCO_CLASSES, score_threshold=0.3, iou_threshold=0.45, rectangle_colors='', Track_only = []):
    # Definition of the parameters
    max_cosine_distance = 0.7
    nn_budget = None
    
    #initialize deep sort object
    model_filename = 'model_data/mars-small128.pb'
    encoder = gdet.create_box_encoder(model_filename, batch_size=1)
    metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
    tracker = Tracker(metric)

    times, times_2 = [], []

    if video_path:
        vid = cv2.VideoCapture(video_path) # detect on video
    else:
        vid = cv2.VideoCapture(0) # detect from webcam

    # by default VideoCapture returns float instead of int
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(vid.get(cv2.CAP_PROP_FPS))
    codec = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter(output_path, codec, fps, (width, height)) # output_path must be .mp4

    NUM_CLASS = read_class_names(CLASSES)
    key_list = list(NUM_CLASS.keys()) 
    val_list = list(NUM_CLASS.values())
    
    all_tracked_bboxes = []

    while True:
        _, frame = vid.read()

        try:
            original_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            original_frame = cv2.cvtColor(original_frame, cv2.COLOR_BGR2RGB)
        except:
            break
        
        image_data = image_preprocess(np.copy(original_frame), [input_size, input_size])
        #image_data = tf.expand_dims(image_data, 0)
        image_data = image_data[np.newaxis, ...].astype(np.float32)

        t1 = time.time()
        if YOLO_FRAMEWORK == "tf":
            pred_bbox = Yolo.predict(image_data)
        elif YOLO_FRAMEWORK == "trt":
            batched_input = tf.constant(image_data)
            result = Yolo(batched_input)
            pred_bbox = []
            for key, value in result.items():
                value = value.numpy()
                pred_bbox.append(value)
        
        #t1 = time.time()
        #pred_bbox = Yolo.predict(image_data)
        t2 = time.time()
        
        pred_bbox = [tf.reshape(x, (-1, tf.shape(x)[-1])) for x in pred_bbox]
        pred_bbox = tf.concat(pred_bbox, axis=0)

        bboxes = postprocess_boxes(pred_bbox, original_frame, input_size, score_threshold)
        bboxes = nms(bboxes, iou_threshold, method='nms')

        # extract bboxes to boxes (x, y, width, height), scores and names
        boxes, scores, names = [], [], []
        for bbox in bboxes:
            if len(Track_only) !=0 and NUM_CLASS[int(bbox[5])] in Track_only or len(Track_only) == 0:
                boxes.append([bbox[0].astype(int), bbox[1].astype(int), bbox[2].astype(int)-bbox[0].astype(int), bbox[3].astype(int)-bbox[1].astype(int)])
                scores.append(bbox[4])
                names.append(NUM_CLASS[int(bbox[5])])

        # Obtain all the detections for the given frame.
        boxes = np.array(boxes) 
        names = np.array(names)
        scores = np.array(scores)
        features = np.array(encoder(original_frame, boxes))
        detections = [Detection(bbox, score, class_name, feature) for bbox, score, class_name, feature in zip(boxes, scores, names, features)]
        # print(boxes)
        # Pass detections to the deepsort object and obtain the track information.
        tracker.predict()
        tracker.update(detections)

        # Obtain info from the tracks
        tracked_bboxes = []
        for track in tracker.tracks:
            if not track.is_confirmed() or track.time_since_update > 5:
                continue 
            bbox = track.to_tlbr() # Get the corrected/predicted bounding box
            # print(bbox)
            class_name = track.get_class() #Get the class name of particular object
            tracking_id = track.track_id # Get the ID for the particular track
            index = key_list[val_list.index(class_name)] # Get predicted object index by object name
            tracked_bboxes.append(bbox.tolist() + [tracking_id, index]) # Structure data, that we could use it with our draw_bbox function
        
        # Save all tracked_bboxes
        all_tracked_bboxes.append(tracked_bboxes)

        # draw detection on frame
        print(np.array(tracked_bboxes[:4], dtype=np.int32))
        image = draw_bbox(original_frame, tracked_bboxes, CLASSES=CLASSES, tracking=True)

        t3 = time.time()
        times.append(t2-t1)
        times_2.append(t3-t1)
        
        times = times[-20:]
        times_2 = times_2[-20:]

        ms = sum(times)/len(times)*1000
        fps = 1000 / ms
        fps2 = 1000 / (sum(times_2)/len(times_2)*1000)
        
        image = cv2.putText(image, "Time: {:.1f} FPS".format(fps), (0, 30), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1, (0, 0, 255), 2)

        # draw original yolo detection
        #image = draw_bbox(image, bboxes, CLASSES=CLASSES, show_label=False, rectangle_colors=rectangle_colors, tracking=True)

        # print("Time: {:.2f}ms, Detection FPS: {:.1f}, total FPS: {:.1f}".format(ms, fps, fps2))
        if output_path != '': out.write(image)
        if show:
            cv2_imshow(image)
            
            if cv2.waitKey(25) & 0xFF == ord("q"):
                cv2.destroyAllWindows()
                break
            
    cv2.destroyAllWindows()
    return all_tracked_bboxes

Command to run OBJECT TRACKING

In [22]:
yolo = Load_Yolo_model()
all_tracked_bboxes = Object_tracking(yolo, video_path, "detection.mp4", input_size=YOLO_INPUT_SIZE, show=False, iou_threshold=0.1, rectangle_colors=(255,0,0), Track_only = ["person"])


GPUs [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[]
[]
[[481 176 594 424   1   0]
 [124 177 229 436   2   0]
 [279 201 364 438   3   0]]
[[475 176 591 430   1   0]
 [124 187 228 443   2   0]
 [282 201 368 439   3   0]]
[[473 178 587 429   1   0]
 [116 176 224 443   2   0]
 [284 201 370 438   3   0]]
[[468 178 581 427   1   0]
 [115 174 221 440   2   0]
 [285 201 371 437   3   0]]
[[465 181 575 425   1   0]
 [114 171 221 441   2   0]
 [285 201 372 437   3   0]]
[[456 181 567 428   1   0]
 [114 170 219 440   2   0]
 [285 198 374 439   3   0]]
[[447 176 561 433   1   0]
 [120 169 224 440   2   0]
 [285 197 376 439   3   0]]
[[443 175 556 431   1   0]
 [123 171 225 438   2   0]
 [287 198 377 436   3   0]]
[[440 174 552 432   1   0]
 [124 169 226 439   2   0]
 [289 199 380 436   3   0]]
[[438 173 550 432   1   0]
 [126 170 225 437   2   0]
 [292 201 381 434   3   0]]
[[430 169 544 437   1   0]
 [128 166 228 442   2   0]
 [292 199 384 437   3   0]]
[[425 168 539 437   

In [6]:
print(all_tracked_bboxes[5][0])


[468.9683067108188, 178.97057216566776, 581.4389672008724, 427.280836008199, 1, 0]


Get Centroids

In [57]:
# Assign -1 to centroid coord when no bbox available
# print(len(all_tracked_bboxes))
# print(len(all_tracked_bboxes[0]))
num_frames = 260
num_obj = 3
centroids_x = np.zeros((num_frames, num_obj)) 
centroids_y = np.zeros((num_frames, num_obj))
centroids_x -= 1
centroids_y -= 1
for i in range(num_frames):
  tracked_bboxes = all_tracked_bboxes[i]
  num_bboxes = len(tracked_bboxes)
  for j in range(num_bboxes):
    obj_ID = tracked_bboxes[j][4]
    corner_xmin = tracked_bboxes[j][0]
    corner_ymin = tracked_bboxes[j][1]
    corner_xmax = tracked_bboxes[j][2]
    corner_ymax = tracked_bboxes[j][3]
    centroids_x[i, obj_ID-1] = (corner_xmin + corner_xmax)/2
    centroids_y[i, obj_ID-1] = (corner_ymin + corner_ymax)/2
print(centroids_y)

[[ -1.          -1.          -1.        ]
 [ -1.          -1.          -1.        ]
 [300.70888917 307.22523505 320.29814956]
 [303.48033387 315.77012355 320.55348223]
 [304.04624242 309.98244226 319.87262783]
 [303.12570409 307.30590519 319.62649042]
 [303.12714894 306.30857871 319.18638318]
 [304.84961961 305.28273405 319.03079757]
 [304.77253645 304.92317289 318.30117816]
 [303.73058891 305.14839498 317.69798299]
 [303.64467011 304.58256585 317.81949827]
 [303.27333657 304.0547669  318.20939583]
 [303.78960948 304.54254036 318.69999833]
 [302.67273264 305.06237331 318.2204531 ]
 [300.58865264 305.60329181 318.3713632 ]
 [299.79349518 305.81837987 317.44410912]
 [300.16908016 305.57742195 311.79026037]
 [301.65659853 302.88951093 309.3451318 ]
 [302.86463981 302.17726585 314.36687614]
 [301.34349329 301.59149259 307.68733712]
 [301.42720512 301.71448459 306.55408069]
 [301.46022301 299.45878102 306.1685312 ]
 [300.47536035 298.96102341 305.2018558 ]
 [298.4738538  298.46501831 304.23

In [8]:
folder_path = "/content/drive/Shareddrives/EECS504_Computer_Vision_Project/Princeton Datasets/three_people"


In [9]:
depth_images_path = folder_path + "/depth"
rgb_images_path = folder_path + "/rgb"
json_path = folder_path + "/frames.json"

In [11]:
with open(json_path) as json_file:
    frames_json = json.load(json_file)


In [70]:
num_frames = frames_json['length']

K_matrix = frames_json['K']
cx = K_matrix[0][2]
cy = K_matrix[1][2]
fx = K_matrix[0][0]/1000  
fy = K_matrix[1][1]/1000
print(cx, cy, fx, fy)

rgb_timestamps = frames_json['imageTimestamp']
depth_timestamps = frames_json['depthTimestamp']

320 240 0.5758157496 0.5758157496


In [48]:
rgb_array = []
depth_array = []

for frame_id in range(1,num_frames+1):
  rgb_image = rgb_images_path + '/r-{}-{}.png'.format(frames_json['imageTimestamp'][frame_id-1], frames_json['imageFrameID'][frame_id-1])
  rgb = cv2.imread(rgb_image)
  depth_image = depth_images_path + '/d-{}-{}.png'.format(frames_json['depthTimestamp'][frame_id-1], frames_json['depthFrameID'][frame_id-1])
  depth = cv2.imread(depth_image)
  depth = np.bitwise_or(np.right_shift(depth,3), np.left_shift(depth,13))

  rgb_array.append(rgb)
  depth_array.append(depth)


In [55]:
print(depth_array[103][50,50,0])
# x_ = np.linspace(1,640,640) #----> this will be centroid x from yolo
# y_ = np.linspace(1,480,480) #----> this will be centroid y from yolo
# x, y = np.meshgrid(x_,y_)

0


In [82]:
cam_coords_x = np.zeros((num_frames, num_obj))
cam_coords_y = np.zeros((num_frames, num_obj))
cam_coords_z = np.zeros((num_frames, num_obj))
cam_coords_x += 1000
cam_coords_y += 1000
cam_coords_z += 1000

for i in range(num_frames):
  for j in range(num_obj):
    u = int(centroids_y[i,j])
    v = int(centroids_x[i,j])
    # print(u,v)
    if (u != -1):
      z = depth_array[i][u,v,0]/1000
      cam_coords_x[i,j] = (u - cx)*z*(1/fx)  
      cam_coords_y[i,j] = (v - cy)*z*(1/fy)                                  
      cam_coords_z[i,j] = z*10 # Wrong, just for now          
print(cam_coords_z)


[[1.0e+03 1.0e+03 1.0e+03]
 [1.0e+03 1.0e+03 1.0e+03]
 [8.0e-02 9.0e-02 1.1e-01]
 [8.0e-02 9.0e-02 1.1e-01]
 [8.0e-02 9.0e-02 1.1e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 9.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 1.0e-01]
 [8.0e-02 8.0e-02 9.0e-02]
 [7.0e-02 8.0e-02 9.0e-02]
 [7.0e-02 8.0e-02 8.0e-02]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 8.0e-02 1.0e+03]
 [7.0e-02 7.0e-02 9.0e-02]
 [7.0e-02 7.0e-02 9.0e-02]
 [8.0e-02 7.0e-02 9.0e-02]
 [8.0e-02 7.0e-02 9.0e-02]
 

Calculate velocity

In [95]:
# If 1000 is encountered, retain same position, velocity as before
fps = 13
vel_x = np.zeros((num_frames, num_obj))
vel_y = np.zeros((num_frames, num_obj))
vel_z = np.zeros((num_frames, num_obj))

for i in range(num_frames):
  for j in range(num_obj):
    if (cam_coords_x[i,j] != 1000 and cam_coords_x[i-1,j]):
      vel_x[i,j] = (cam_coords_x[i,j] - cam_coords_x[i-1,j])/fps    
      vel_y[i,j] = (cam_coords_y[i,j] - cam_coords_y[i-1,j])/fps                            
      vel_z[i,j] = (cam_coords_z[i,j] - cam_coords_z[i-1,j])/fps    # print(vel_x)

In [96]:
for frame_num in range(num_frames):
  plt.xlim(-0.3, 0.3)
  plt.ylim(0,0.15)
# frame_num = 100
  plt.quiver(cam_coords_x[frame_num,:],cam_coords_z[frame_num,:],vel_x[frame_num,:],vel_z[frame_num,:])
  # plt.show()
  plt.savefig('/content/drive/Shareddrives/EECS504_Computer_Vision_Project/ObjectTracking/2Dfigs/' + str(frame_num) + '.png')
  plt.clf()

  length = a * (widthu_per_lenu / (self.scale * self.width))
  length = a * (widthu_per_lenu / (self.scale * self.width))
  short = np.repeat(length < minsh, 8, axis=1)
  tooshort = length < self.minlength


<Figure size 432x288 with 0 Axes>

In [97]:
import os
from os.path import isfile, join

pathIn= '/content/drive/Shareddrives/EECS504_Computer_Vision_Project/ObjectTracking/2Dfigs'
pathOut = '/content/drive/Shareddrives/EECS504_Computer_Vision_Project/ObjectTracking/2D_vid.avi'
fps = 13
frame_array = []
files = [f for f in os.listdir(pathIn) if isfile(join(pathIn, f))]
#for sorting the file names properly
files.sort(key = lambda x: int((x.split(".")[0])))
total_frames = len(files)
num_files = 260
for i in range(num_files):
    filename = pathIn + '/' + files[i]
    # print(filename)
    #reading each files
    img = cv2.imread(filename) 
    height, width, layers = img.shape
    size = (width,height)
    #inserting the frames into an image array
    frame_array.append(img)
print("Frame array loaded!")
out = cv2.VideoWriter(pathOut,cv2.VideoWriter_fourcc(*'DIVX'), fps, size)
print("out initialized!")
for i in range(len(frame_array)):
    # writing to a image array
    out.write(frame_array[i])
out.release()
print("out loaded!")

Frame array loaded!
out initialized!
out loaded!
