## This notebook merges the features extracted by the object detector, converting them from per-frame predictions to per-clip predictions. 

## These are the features to be used in the notebook *Preprocessing obj detection for training*

# DiDeMo

In [8]:
import json
import tqdm
import math 
import sys

In [9]:
# Load the data (DiDeMo): raw per-frame detections and per-class thresholds.
CLIP_SIZE = 2.5   # clip length used to pick frames below (presumably seconds -- confirm)
FPS = 1           # not read by the cells visible here; presumably the extraction frame rate
root = '../data/processed/didemo/obj_detection/visual_genome/'
raw_predictions_file = f'{root}didemo_raw_obj_detection.json'
# Read through a context manager so the file handle is closed right after
# parsing instead of lingering until garbage collection.
with open(raw_predictions_file, 'r') as read_file:
    data = json.load(read_file)


perc = 50         # percentile that names the thresholds file to load
thresholds_file = f'{root}didemo_thresholds_percentile_{perc}.json'
with open(thresholds_file, 'r') as read_file:
    thresholds = json.load(read_file)
print('Done')

Done


In [27]:
def boxes_features(boxes,image_w,image_h):
    """Turn boxes into image-normalized ``[center, center, half-size, half-size]`` rows.

    Every box is read as four numbers.  Entries 0 and 2 are divided by
    ``image_w`` and entries 1 and 3 by ``image_h`` (presumably the box is
    given as ``x1, y1, x2, y2`` corners -- confirm against the detector).

    Args:
        boxes: iterable of indexable boxes with at least four numeric entries.
        image_w: divisor applied to entries 0 and 2 of each box.
        image_h: divisor applied to entries 1 and 3 of each box.

    Returns:
        A list with one 4-element list per box:
        ``[(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1) / 2, (y2 - y1) / 2]``
        computed on the normalized coordinates.  Note that the last two
        entries are *half* of the normalized width and height.
    """
    features = []
    for box in boxes:
        # Normalize the four coordinates by the image size first.
        x1 = box[0] / image_w
        y1 = box[1] / image_h
        x2 = box[2] / image_w
        y2 = box[3] / image_h
        features.append([
            (x1 + x2) / 2,   # center along the first axis
            (y1 + y2) / 2,   # center along the second axis
            (x2 - x1) / 2,   # half of the normalized width
            (y2 - y1) / 2,   # half of the normalized height
        ])
    return features

In [31]:
# Apply custom threshold to each predicted objects 
def threshold_predictions(data, thresholds):
    """Filter the detections of one frame with a per-class score threshold.

    Args:
        data: mapping with the keys ``'objects'``, ``'obj_conf'``, ``'boxes'``,
            ``'image_w'`` and ``'image_h'``.  The first three are iterated in
            lockstep, one entry per detection.
        thresholds: container indexed by the object class (``thresholds[obj]``)
            that gives the minimum score for that class.

    Returns:
        A list of ``[obj, conf, box_features]`` triples, in the original
        order, for the detections whose score is strictly greater than the
        threshold of their class.  ``box_features`` is the matching row
        produced by ``boxes_features``.
    """
    labels, scores = data['objects'], data['obj_conf']
    # Box features are computed for every detection before filtering.
    box_feats = boxes_features(data['boxes'], data['image_w'], data['image_h'])
    return [
        [label, score, feat]
        for label, score, feat in zip(labels, scores, box_feats)
        if score > thresholds[label]
    ]

In [32]:
# main loop (DiDeMo): keep a fixed subset of frames per video and threshold
# the detections of each kept frame.
post_processed_data = {}
video_keys = list(data.keys())
for k in tqdm.tqdm(video_keys):
    frame_keys = list(data[k].keys())                # get list of frames for video
    num_frames = len(frame_keys)                     # compute number of frames
    num_windows = math.ceil(num_frames/CLIP_SIZE)    # compute number of clips
    idx = list(range(num_frames))                    # ancillary indexes variable
    if num_frames == 1:
        # A one-frame video has no index 1; fall back to its only frame.
        # Same guard as the ActivityNet loop further down in this notebook.
        selected_idx = [0]
    else:
        # Frame 1 plus every int(CLIP_SIZE*2)-th frame starting at 3 and at 6
        # (1, 3, 6, 8, 11, ... when CLIP_SIZE is 2.5).
        selected_idx = sorted([1] + idx[3::int(CLIP_SIZE*2)] + idx[6::int(CLIP_SIZE*2)])   # Select best indexes
    selected_frames = [frame_keys[i] for i in selected_idx]   # distill the wanted frames
    post_processed_data[k] = {}
    for i,kk in enumerate(selected_frames):          # loop over the frames and threshold the predictions
        # Output is keyed by the position in the selection, not by the frame key.
        post_processed_data[k][i] = threshold_predictions(data[k][kk], thresholds)

print('Done')

100%|██████████| 10642/10642 [00:06<00:00, 1631.27it/s]

Done





In [33]:
# Write the thresholded DiDeMo predictions next to the raw ones as JSON.
dump_name = f'{root}didemo_obj_detection_perc_{perc}_with_scores.json'
with open(dump_name, mode="w") as write_file:
    json.dump(post_processed_data, fp=write_file)

print('Done')

Done


# Charades-sta

In [4]:
# Load the data (Charades-STA): raw per-frame detections and per-class thresholds.
CLIP_SIZE = 3     # number of consecutive frames grouped into one window below
FPS = 1           # not read by the cells visible here; presumably the extraction frame rate
root = '../data/processed/charades-sta/obj_detection/visual_genome/'
raw_predictions_file = f'{root}charades_sta_raw_obj_detection.json'
# Read through a context manager so the file handle is closed right after
# parsing instead of lingering until garbage collection.
with open(raw_predictions_file, 'r') as read_file:
    data = json.load(read_file)


perc = 50         # percentile that names the thresholds file to load
thresholds_file = f'{root}charades_sta_thresholds_percentile_{perc}.json'
with open(thresholds_file, 'r') as read_file:
    thresholds = json.load(read_file)
print('Done')

Done


In [None]:
# OK for Charades if the window size is 1.5, otherwise standard
# CLIP_SIZE = 3
# a = [i for i in range(len(frame_keys))]
# b = sorted([0] + a[2::CLIP_SIZE] + a[3::CLIP_SIZE])

In [None]:
def normalize_boxes_charades(boxes,image_w,image_h):
    """Placeholder for the Charades-STA box normalization; not written yet.

    Args:
        boxes: box data to normalize (unused for now).
        image_w: image width to normalize by (unused for now).
        image_h: image height to normalize by (unused for now).

    Raises:
        NotImplementedError: always, until the normalization is implemented.
    """
    # ``NotImplemented`` is a sentinel value for binary-operator fallbacks,
    # not an exception class: raising it fails with a confusing TypeError.
    raise NotImplementedError('normalize_boxes_charades is not implemented yet')

In [5]:
# Define useful function
def post_process_windod_perc_and_scores(data,i,w_size,thresholds):
    """Collect the above-threshold detections of the ``i``-th window of a video.

    The window covers the frames at positions ``i*w_size`` up to (excluding)
    ``(i+1)*w_size`` in the key order of ``data``, clipped to the number of
    frames available.

    Args:
        data: mapping from frame key to a prediction mapping with the keys
            ``'objects'``, ``'obj_conf'``, ``'boxes'``, ``'image_w'`` and
            ``'image_h'``, each indexed per detection.
        i: index of the window.
        w_size: number of frames per window.
        thresholds: container indexed by object class giving the minimum score.

    Returns:
        A flat list of ``[obj, conf, normalized_box]`` entries for every
        detection in the window whose score is strictly greater than the
        threshold of its class.
    """
    frame_keys = list(data.keys())
    first = i*w_size
    last = min(len(frame_keys), (i+1)*w_size)   # never run past the last frame
    kept = []
    for frame_pos in range(first, last):
        pred = data[frame_keys[frame_pos]]
        for ii, score in enumerate(pred['obj_conf']):
            label = pred['objects'][ii]
            if not score > thresholds[label]:
                continue   # score does not beat the class threshold
            kept.append([
                label,
                score,
                normalize_boxes_charades(pred['boxes'][ii], pred['image_w'][ii], pred['image_h'][ii]),
            ])
    return kept

In [15]:
# main loop (Charades-STA): split every video into windows of CLIP_SIZE
# frames and gather the thresholded detections of each window.
post_processed_data = {}
video_keys = list(data)
for k in tqdm.tqdm(video_keys):
    frame_keys = list(data[k])                       # frame keys of this video
    num_frames = len(frame_keys)                     # how many frames it has
    num_windows = math.ceil(num_frames/CLIP_SIZE)    # windows needed to cover them
    # One entry per window index, holding that window's kept detections.
    post_processed_data[k] = dict(
        (i, post_process_windod_perc_and_scores(data[k],i,CLIP_SIZE,thresholds))
        for i in range(num_windows)
    )
print('Done')

100%|██████████| 6670/6670 [00:01<00:00, 4239.93it/s]

Done





In [16]:
# Write the thresholded Charades-STA predictions next to the raw ones as JSON.
dump_name = f'{root}charades_sta_obj_detection_perc_{perc}_with_scores.json'
with open(dump_name, mode="w") as write_file:
    json.dump(post_processed_data, fp=write_file)

print('Done')

Done


# Activitynet - Captions

In [21]:
# Load the data (ActivityNet Captions): raw per-frame detections, plus one
# fixed score threshold shared by every class.
CLIP_SIZE = 2.5   # clip length used to pick frames below (presumably seconds -- confirm)
FPS = 1           # not read by the cells visible here; presumably the extraction frame rate
root = '../data/processed/activitynet-captions/obj_detection/visual_genome/'
raw_predictions_file = f'{root}activitynet_captions_raw_obj_detection.json'
# Read through a context manager so the file handle is closed right after
# parsing instead of lingering until garbage collection.
with open(raw_predictions_file, 'r') as read_file:
    data = json.load(read_file)


# Percentile-based thresholds are disabled for this dataset:
# perc=50
# thresholds_file = f'{root}activitynet_captions_thresholds_percentile_{perc}.json'
# thresholds = json.load(open(thresholds_file,'r'))
th = 0.4
# Same threshold for each of 1601 class indexes
# (presumably the size of the detector's class vocabulary -- confirm).
thresholds = [th] * 1601
print('Done')

Done


In [29]:
# main loop (ActivityNet Captions): keep a fixed subset of frames per video
# and threshold the detections of each kept frame.
post_processed_data = {}
video_keys = list(data)
for k in tqdm.tqdm(video_keys):
    frame_keys = list(data[k])                       # frame keys of this video
    num_frames = len(frame_keys)                     # how many frames it has
    num_windows = math.ceil(num_frames/CLIP_SIZE)    # number of clips covering them
    idx = list(range(num_frames))                    # positions 0..num_frames-1
    # A one-frame video keeps its only frame; otherwise keep frame 1 plus
    # every int(CLIP_SIZE*2)-th frame starting at 3 and at 6.
    selected_idx = [0] if num_frames == 1 else sorted(
        [1] + idx[3::int(CLIP_SIZE*2)] + idx[6::int(CLIP_SIZE*2)]
    )
    selected_frames = [frame_keys[pos] for pos in selected_idx]   # keys of the kept frames
    post_processed_data[k] = {}
    for i,kk in enumerate(selected_frames):
        # Output is keyed by the position in the selection, not by the frame key.
        post_processed_data[k][i] = threshold_predictions(data[k][kk], thresholds)

print('Done')

100%|██████████| 19994/19994 [00:08<00:00, 2270.55it/s]

Done





In [30]:
# Write the thresholded ActivityNet Captions predictions next to the raw ones as JSON.
dump_name = f'{root}activitynet_captions_obj_detection_th_{th}_with_scores.json'
with open(dump_name, mode="w") as write_file:
    json.dump(post_processed_data, fp=write_file)

print('Done')

Done
