# Code for visualization of Chamfer Distance alignment between objects/clips and language.

The data needs to be dumped from the `_eval_item` function in dataset untrimmed and from the Chamfer distance class. <br>
In particular, we try to trace back the mapping between each word and the corresponding object/scene.

In [1]:
import numpy as np
import json
import os
import sys
import tqdm
import time
import re
import cv2

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from utils import LanguageRepresentationMCN_glove, compare_glove, sentences_to_words

# set display defaults
plt.rcParams['figure.figsize'] = (12, 9)        # default figure size, 12 x 9 (matplotlib expresses figsize in inches)
plt.rcParams['image.interpolation'] = 'nearest'  # don't interpolate: show square pixels
plt.rcParams['image.cmap'] = 'gray'  # use grayscale output rather than a (potentially misleading) color heatmap

  from ._conv import register_converters as _register_converters


# Setup Glove

In [2]:
## Initialise the GloVe language interface (timed, since loading is slow)
t = time.time()
lang_interface = LanguageRepresentationMCN_glove(max_words=1)   # presumably one word per query — TODO confirm the meaning of max_words
print('Done in {:.2f} seconds.'.format(time.time()-t))
## Sanity-check the interface on a single word
word = 'hello'
feature = lang_interface(word)
print(feature.shape)        # shape of the feature returned for one word
print(lang_interface.dim)   # dimensionality reported by the interface itself

Done in 89.35 seconds.
(300,)
300


# Read Data

In [3]:
# Load obj vocab
classes_file = '../data/raw/language/visual_genome/objects_vocab.txt'
with open(classes_file, 'r') as f:
    # Keep only the text before the first comma of each line, lower-cased and
    # stripped. A comprehension is used so that no loop variable (previously
    # named `object`, shadowing the builtin) leaks into the notebook namespace.
    classes_VG = [line.split(',')[0].lower().strip() for line in f]
print('Done')

Done


In [4]:
# Keep only the Visual Genome classes for which the language interface returns a non-zero vector.
classes_VG_revisited = []   # names of the classes that were kept
map_glove_word = []         # their vectors, index-aligned with classes_VG_revisited
for c in classes_VG:
    v = lang_interface(c)
    if np.abs(v).sum() != 0.0:   # all-zero vector is treated as "no embedding" (presumably out-of-vocabulary — TODO confirm)
        map_glove_word.append(v)
        classes_VG_revisited.append(c)   

# NOTE(review): this is the number of kept classes; nothing here de-duplicates, so "unique" holds only if classes_VG has no repeats.
print('Number of unique codewords: {}'.format(len(map_glove_word)))

Number of unique codewords: 1078


In [5]:
# Map each Visual Genome class name to its embedding, dropping classes whose
# vector is all zeros. Each class is embedded exactly once (the inner generator
# pairs the name with its vector) instead of once for the filter and again for
# the stored value.
feat_VG_dict = {
    name: vec
    for name, vec in ((cls, lang_interface(cls)) for cls in classes_VG)
    if np.abs(vec).sum() != 0
}

In [6]:
# Find the downloaded frame folders that do not belong to the videos of this split.
# (Nothing is deleted here: `not_` only lists the candidates.)
filename = '../data/processed/didemo/test-01.json'
with open(filename, 'r') as split_file:   # close the handle once the JSON is parsed
    data = json.load(split_file)
moments  = data['moments']
videos   = list({m['video'] for m in moments})
print('Number of videos in split {}'.format(len(videos)))
frames   = os.listdir('../data/interim/matching_evaluation/didemo_frames/')
print('Tot number of videos in dataset: {}'.format(len(frames)))
# Membership is tested against a set so the filter is linear rather than
# (number of folders) x (number of videos).
videos_in_split = set(videos)
not_     = [f for f in frames if f not in videos_in_split]

Number of videos in split 1037
Tot number of videos in dataset: 10642


In [7]:
# Read dumped data (one JSON file per evaluated item):
t = time.time()
path = '../data/interim/matching_evaluation/dump_done/eval_items/'
# NOTE(review): the first sorted directory entry is skipped unconditionally —
# presumably a non-data file; confirm before running on another directory.
files = sorted(os.listdir(path))[1:]
dumped_data_input = {}
for f in files:
    # Open through a context manager so every dump file is closed right after parsing.
    with open(f'{path}{f}', 'r') as dump_file:
        d = json.load(dump_file)
    idx = next(iter(d))   # the item's id is the first key of the dump

    # Convert back to numpy
    d[idx]['times']     = np.asarray(d[idx]['times'])
    d[idx]['proposals'] = np.asarray(d[idx]['proposals'])
    d[idx]['feat']      = {k: np.asarray(v) for k, v in d[idx]['feat'].items()}

    # Store in variable for later use - aggregate all information
    dumped_data_input[int(idx)] = d[idx]
print('Done in {:.2f} s'.format(time.time()-t))

Done in 3.36 s


In [8]:
# Read dumped pairwise-distance data:
t = time.time()
path  = '../data/interim/matching_evaluation/dump_done/chamfer_distance/'
files = sorted(os.listdir(path))
idx   = 0
dumped_data_chamfer = {}
# Dumps are consumed in pairs by file number: the even-numbered file is stored
# under 'rgb' and the following odd-numbered one under 'obj'.
# NOTE(review): file names are rebuilt from the counter, not taken from `files`;
# this assumes the directory holds exactly 0000.npz ... with no other entries.
for i in range(0, len(files), 2):
    # np.load keeps the .npz archive open until closed; the context managers
    # release both archives once their arrays have been read into memory.
    with np.load(f'{path}{i:04d}.npz') as d_rgb, np.load(f'{path}{i+1:04d}.npz') as d_obj:
        dumped_data_chamfer[idx] = {'rgb': d_rgb['arr_0'],
                                    'obj': d_obj['arr_0']}
    idx += 1
print('Done in {:.2f} s'.format(time.time()-t))

Done in 0.02 s


In [9]:
# Quick inspection of the first dumped item: distance-array shapes, available fields, and a few raw values.
print(dumped_data_chamfer[0]['rgb'].shape)
# print(dumped_data_chamfer[0]['rgb'][0])
print(dumped_data_chamfer[0]['obj'].shape)
print(dumped_data_input[0].keys())
print(dumped_data_input[0]['feat']['mask-rgb'])   # one mask row per proposal; the main loop sums a row to get the video length
print(dumped_data_input[0]['proposals'])
print(dumped_data_input[0]['len_query'])


(21, 12, 50)
(21, 120, 50)
dict_keys(['description', 'times', 'video', 'annotation_id', 'annotation_id_original', 'time', 'video_index', 'proposals', 'len_query', 'feat'])
[[1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]
[[ 0.  5

In [10]:
# Read mapping between clips and object classes
input_file = '../data/processed/didemo/obj_classes_per_clip.json'
with open(input_file, 'r') as mapping_file:   # close the handle once parsed
    mapping_obj_with_boxes = json.load(mapping_file)

# Same video -> clip -> detections nesting, but each detection is reduced to its
# first element (the remaining elements are used as the box by the plotting code).
mapping_obj = {
    video: {clip: [detection[0] for detection in detections]
            for clip, detections in clips.items()}
    for video, clips in mapping_obj_with_boxes.items()
}
print('Done')

Done


# Analysis and mapping

In [11]:
# Global variables:
P_size = 21                  # number of proposals (sizes the per-annotation mapping dict in the main loop)
clip_size = 2.5              # clip size: divisor turning a proposal boundary into a clip index (presumably seconds — TODO confirm)

In [12]:
# Compute objects in moments
def _compute_obj_in_moment(objects,loc):
    """Collect the objects of every clip spanned by a moment.

    Args:
        objects: mapping from clip index (as a string) to an iterable of the
            objects found in that clip.
        loc: sequence whose first two items are the moment boundaries; each is
            converted with ``int`` and floor-divided by the module-level
            ``clip_size`` to obtain a clip index.

    Returns:
        A list of ``[obj, clip_index]`` pairs covering the start clip
        (inclusive) up to the end clip (exclusive). An empty list is returned
        when a clip is missing from ``objects`` or ``loc`` cannot be
        interpreted; partial results are discarded in that case.
    """
    try:
        im_start = int(int(loc[0]) // clip_size)
        im_end   = int(int(loc[1]) // clip_size)
        return [[obj, i] for i in range(im_start, im_end) for obj in objects[str(i)]]
    except (KeyError, IndexError, TypeError, ValueError):
        # Only lookup/conversion failures mean "no usable objects". Anything
        # else (e.g. an undefined clip_size, or an interrupt) must surface
        # instead of silently producing an empty result.
        return []

In [13]:
# mapping from language to video
def _mapping_tokens_to_video(tokens, pred_obj, reduced_pairwise,k):
    """Map every query token to its closest video element.

    Args:
        tokens: sequence of query words; token ``ii`` corresponds to column
            ``ii`` of ``reduced_pairwise``.
        pred_obj: list of ``[object, frame]`` pairs, indexed by the rows of
            ``reduced_pairwise``; only read when ``k == 'obj'``.
        reduced_pairwise: 2-D array of distances with one row per video
            element and one column per word.
        k: ``'obj'`` to report the matched object, anything else to report the
            matched clip.

    Returns:
        Dict keyed by token. Each value is a list with one entry per
        occurrence of that token, holding the index of the row with the
        smallest distance and that distance. Tokens without a column in
        ``reduced_pairwise`` keep an empty list, as does every token when the
        matrix has no rows.
    """
    proposal_mapping_lang_to_obj = {key:[] for key in tokens}
    num_elements, num_words = np.shape(reduced_pairwise)
    if num_elements == 0:
        # Nothing to match against.
        return proposal_mapping_lang_to_obj

    # The previous guard sliced rows by the word index (``[:ii]``), which
    # always skipped the first token; the bound that matters is the number of
    # columns, so tokens beyond it are left unmatched.
    for ii, word in enumerate(tokens[:num_words]):
        ind_  = int(np.argmin(reduced_pairwise[:, ii]))   # row whose distance to this word is smallest
        score = float(reduced_pairwise[ind_, ii])          # ...and that distance
        if k == 'obj':
            obj = pred_obj[ind_]
            proposal_mapping_lang_to_obj[word].append({"object": obj[0],
                                                       "obj_idx": ind_,
                                                       "score":   score,
                                                       "frame":   obj[1]})
        else:
            # Key is "clip_idx" (the stray trailing colon was removed) so it
            # matches the key emitted by _mapping_video_to_tokens.
            proposal_mapping_lang_to_obj[word].append({"clip_name": f'clip_{ind_}',
                                                       "clip_idx":  ind_,
                                                       "score":     score})
    return proposal_mapping_lang_to_obj

In [14]:
# mapping from video to language
def _mapping_video_to_tokens(tokens, pred_obj, reduced_pairwise, k, len_video):
    """Map every video element to its closest query token.

    Args:
        tokens: sequence of query words, indexed by the columns of
            ``reduced_pairwise``.
        pred_obj: list of ``[object, frame]`` pairs, one per row; only read
            when ``k == 'obj'``.
        reduced_pairwise: 2-D array of distances (rows: video elements,
            columns: words).
        k: ``'obj'`` to key the result by object name, anything else to key it
            by ``clip_<row>``.
        len_video: number of rows to process in the clip branch.

    Returns:
        Dict from object name (or clip name) to a list of matches, each with
        the matched token, its column index, the distance and, for objects,
        the frame.
    """
    def _closest_token(row):
        # Column with the smallest distance on this row, its value, and the word.
        col = int(np.argmin(reduced_pairwise[row, :]))
        return col, float(reduced_pairwise[row, col]), tokens[col]

    if k != 'obj':
        by_clip = {'clip_{}'.format(n): [] for n in range(len_video)}
        for row in range(len_video):
            col, dist, word = _closest_token(row)
            by_clip[f'clip_{row}'].append({"token":    word,
                                          "clip_idx": col,
                                          "score":    dist})
        return by_clip

    # Objects sharing a name share one list of matches.
    by_object = {detection[0]: [] for detection in pred_obj}
    for row, detection in enumerate(pred_obj):
        col, dist, word = _closest_token(row)
        # NOTE(review): "obj_idx" (and "clip_idx" above) store the token's
        # column index, not the row index — confirm this is intended.
        by_object[detection[0]].append({"token":   word,
                                        "obj_idx": col,
                                        "score":   dist,
                                        "frame":   detection[1]})
    return by_object

In [16]:
# Loop over the dumped annotations and, for each one, over its proposals

keys = list(dumped_data_chamfer.keys())

annotations_data = {}   # dump_id -> {proposal index -> mapping dict}
for dump_id in tqdm.tqdm(keys):
    # annotations information:
    elem        = dumped_data_input[dump_id]
    video_id    = elem['video']
    pairwise    = dumped_data_chamfer[dump_id]
    description = elem['description']
    proposals   = elem['proposals']
    tokens      = sentences_to_words(description)
    # Holds the information of every proposal in the video
    proposals_mapping = {i:{} for i in range(P_size)}                 # one slot per proposal, later filled with the word->element and element->word mappings; stays {} when the proposal has no objects
    
    for i, moment in enumerate(proposals):
        moment   = [str(int(e)) for e in moment] 
        pred_obj = _compute_obj_in_moment(mapping_obj[video_id], moment)
        if len(pred_obj) > 0:
            single_proposal_mapping={'lang_to_rgb':None, 'rgb_to_lang':None,
                                     'lang_to_obj':None, 'obj_to_lang':None}
            for k in ['rgb','obj']:
                input_feat  = elem['feat'][k][i]                     # not used further in this cell
                input_mask  = elem['feat']['-'.join(['mask',k])][i]  # mask row of this proposal ('mask-rgb' / 'mask-obj')
                feat_paiwise= pairwise[k][i]
                len_video   = int(np.sum(input_mask))                # number of video elements = sum of the mask row
                len_lang    = elem['len_query'][i]

                # Only take as many elements as the GT language and video info
                reduced_pairwise = feat_paiwise[:len_video, :len_lang]

                single_proposal_mapping['lang_to_{}'.format(k)] = _mapping_tokens_to_video(tokens, pred_obj, reduced_pairwise,k)
                single_proposal_mapping['{}_to_lang'.format(k)] = _mapping_video_to_tokens(tokens, pred_obj, reduced_pairwise,k, len_video)

            # Append whole information in proposal summary
            proposals_mapping[i] = single_proposal_mapping           
    
    # Append all proposals informations to annotations.
    annotations_data[dump_id] = proposals_mapping 

  0%|          | 0/11 [00:00<?, ?it/s]


KeyError: 10

In [None]:
# Dump the computed mappings to disk so they can be used for the visualization:
filename = '../data/interim/matching_evaluation/matchings.json'
with open(filename,'w') as f:
    json.dump(annotations_data,f)   # NOTE: JSON object keys are strings, so the integer ids used as keys are written as strings
print('Done')

In [None]:
# Read mapping between clips and object classes
input_file = '../data/processed/didemo/obj_classes_per_clip.json'
with open(input_file, 'r') as mapping_file:   # close the handle once parsed
    mapping_obj_with_boxes = json.load(mapping_file)
print('Done')

In [None]:
# Inspect the first annotation: its key, the available fields, its video and that video's detections.
annotation_keys_ = list(annotations_data.keys())
print(annotation_keys_[0])
elem        = dumped_data_input[annotation_keys_[0]]
video_id    = elem['video']
print(elem.keys())
print(video_id)
print(mapping_obj_with_boxes[video_id])

# Main loop to plot everything

In [None]:
# Load the raw object-detection predictions (indexed by video, then by frame).
root = '../data/processed/didemo/obj_detection/visual_genome/'
raw_predictions_file = f'{root}didemo_raw_obj_detection.json'
with open(raw_predictions_file, 'r') as predictions_file:   # close the handle once parsed
    data = json.load(predictions_file)
print('Done')

In [None]:
import math
def _get_frames_indices(k):
    """Return the paths of the frames selected for video ``k``.

    The frame names are the keys of ``data[k]`` (module-level raw
    predictions). Frame 1 is always selected, followed by every
    ``int(clip_size * 2)``-th frame starting at index 3 and at index 6; the
    selected indices are returned in ascending order.

    Args:
        k: video id, used both as the key into ``data`` and as the name of the
            frame folder.

    Returns:
        List of frame paths inside
        ``../data/interim/matching_evaluation/didemo_frames/<k>/``.

    Raises:
        IndexError: if the video has fewer than two frames, because index 1 is
            always selected.
    """
    # The folder is derived from the argument, not from whatever the global
    # `video_id` currently holds, so the result depends only on `k`.
    path = '../data/interim/matching_evaluation/didemo_frames/{}/'.format(k)
    frame_keys = list(data[k].keys())                # frame names of the video
    num_frames = len(frame_keys)
    stride = int(clip_size * 2)                      # two clips' worth of frames between picks (presumably — TODO confirm sampling rate)
    selected_idx = sorted([1]
                          + list(range(3, num_frames, stride))
                          + list(range(6, num_frames, stride)))
    return ['{}{}'.format(path, frame_keys[i]) for i in selected_idx]

In [None]:
 def _create_proposals_frames_list(list_frames, proposals):
    """Group the selected frames by proposal.

    Each proposal's first two items are converted with ``int`` and
    floor-divided by the module-level ``clip_size``; the frames at the
    resulting indices (start inclusive, end exclusive) form that proposal's
    list.

    Returns one list of frames per proposal, in proposal order.
    """
    def _clip_indices(loc):
        first = int(int(loc[0]) // clip_size)
        last  = int(int(loc[1]) // clip_size)
        return range(first, last)

    return [[list_frames[n] for n in _clip_indices(loc)] for loc in proposals]

In [None]:
 def _gather_obj_info_per_proposal(bbox_per_video, proposals):
    """Collect, for every proposal, the detections of all the clips it spans.

    ``bbox_per_video`` maps a clip index (as a string) to that clip's list of
    detections. Clip indices are obtained like in the other helpers: ``int``
    of the boundary floor-divided by the module-level ``clip_size``, start
    inclusive and end exclusive.

    Returns one flat list of detections per proposal, in proposal order.
    """
    gathered = []
    for loc in proposals:
        first = int(int(loc[0]) // clip_size)
        last  = int(int(loc[1]) // clip_size)
        per_clip = [bbox_per_video[str(n)] for n in range(first, last)]
        flat = []
        for detections in per_clip:
            flat.extend(detections)
        gathered.append(flat)
    return gathered

In [None]:
def _get_mapping_info_per_proposal(mapping_, proposals):
    """Return ``mapping_[0] ... mapping_[len(proposals) - 1]`` as a list.

    The mapping is already computed per proposal index, so this only lines it
    up with the order of ``proposals``.
    """
    ordered = []
    for position in range(len(proposals)):
        ordered.append(mapping_[position])
    return ordered

In [None]:
def _inverse_map_box(box, height, width):
    """Turn a ``(c1, c2, w, h)`` box into scaled corner coordinates.

    ``w`` and ``h`` are subtracted from / added to ``c1`` and ``c2``; the
    horizontal results are multiplied by ``width`` and the vertical ones by
    ``height``. (Presumably the input is normalised to the image size —
    TODO confirm.)

    Returns ``[x_min, y_min, x_max, y_max]``.
    """
    center_x, center_y, offset_x, offset_y = box
    x_min = (center_x - offset_x) * width
    y_min = (center_y - offset_y) * height
    x_max = (center_x + offset_x) * width
    y_max = (center_y + offset_y) * height
    return [x_min, y_min, x_max, y_max]

In [None]:
def _compute_box(frames_objects,proposals_mapping, height, width, ii):
    """Select the boxes to draw on frame ``ii`` of a proposal.

    Args:
        frames_objects: detections of the proposal; element 0 of a detection
            is its name and the remaining elements are passed to
            ``_inverse_map_box``.
        proposals_mapping: mapping dict of the proposal; only its
            ``'lang_to_obj'`` entry is read.
        height, width: image size forwarded to ``_inverse_map_box``.
        ii: position of the frame being drawn.

    Returns:
        ``(names, boxes)``: two parallel lists with one entry per token whose
        box is drawn on this frame.
    """
    names = []
    bboxes_ = []
    for matches in proposals_mapping['lang_to_obj'].values():
        if len(matches) == 0:
            # A token without any match falls back to the first detection,
            # whatever the frame.
            obj_index = 0
        else:
            obj_index = matches[0]['obj_idx']
            frame_idx = matches[0]['frame']
            # NOTE(review): 'frame' looks like a clip index within the video,
            # while ii counts frames within the proposal — confirm they are
            # meant to be compared directly.
            if frame_idx != ii:
                continue
        detection = frames_objects[obj_index]
        bboxes_.append(_inverse_map_box(detection[1:], height, width))
        names.append(detection[0])
    return names, bboxes_

In [None]:
path_dump = '../data/interim/matching_evaluation/images/'
def _plot_proposals(video_id,description, proposals_frames, proposals_objects, proposals_mapping):
    """Save one image strip per proposal with the matched boxes drawn on it.

    For every proposal, its frames are laid out side by side in one figure;
    on each frame the boxes returned by ``_compute_box`` are drawn in red with
    their names. The figure is saved as ``<path_dump><video_id>/<i>`` where
    ``i`` is the proposal index.

    Args:
        video_id: name of the output sub-folder.
        description: query text; only printed.
        proposals_frames: per proposal, the list of frame paths.
        proposals_objects: per proposal, the list of detections.
        proposals_mapping: per proposal, the mapping dict given to
            ``_compute_box``.
    """
    print(description)
    folder = '{}{}/'.format(path_dump,video_id)
    os.makedirs(folder, exist_ok=True)   # no separate existence check needed
    for i, frames in enumerate(proposals_frames):
        if not frames:
            # A proposal without frames has nothing to draw; skipping keeps the
            # remaining proposals' file names (their index) unchanged.
            continue
        frames_objects = proposals_objects[i]
        frames_mapping = proposals_mapping[i]
        fig = plt.figure()
        try:
            DPI = fig.get_dpi()
            # 320 px of width per frame, 2*240 px of height, expressed in inches.
            fig.set_size_inches(len(frames)*320/float(DPI),2*240.0/float(DPI))
            plt.axis('off')
            fig.subplots_adjust(left=None, bottom=0.0, right=None, top=None, wspace=0.01, hspace=None)
            for ii, frame in enumerate(frames):
                im = cv2.imread(frame)
                im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; matplotlib expects RGB
                height, width, _ = im.shape

                ax = fig.add_subplot(1, len(frames), ii + 1)
                ax.axis('off')
                ax.imshow(im)

                names, bboxes = _compute_box(frames_objects,frames_mapping, height, width, ii)
                for n,bbox in zip(names,bboxes):
                    ax.add_patch(patches.Rectangle((bbox[0], bbox[1]),
                                bbox[2] - bbox[0],
                                bbox[3] - bbox[1], fill=False,
                                edgecolor='red', linewidth=3, alpha=0.5))
                    ax.text(bbox[0], bbox[1] - 2,
                                '%s' % (n),
                                bbox=dict(facecolor='blue', alpha=0.1),
                                fontsize=10, color='white')

            dump_path = '{}{}'.format(folder,i)
            fig.savefig(dump_path,bbox_inches='tight')
        finally:
            # Always release the figure, even if reading or drawing a frame fails.
            plt.close(fig)

In [None]:
# Loop over the keys of annotation_keys_ (the slice below restricts it to the single annotation at position idx_)
idx_=4
for annotation_index in annotation_keys_[idx_:idx_+1]: 
    video_id          = dumped_data_input[annotation_index]['video']           #2 Get the video id
    description       = dumped_data_input[annotation_index]['description']
    list_frames       = _get_frames_indices(video_id)                          #3 Read the frames contained in the video's folder
    proposals         =  dumped_data_input[annotation_index]['proposals']
    proposals_frames  = _create_proposals_frames_list(list_frames, proposals)  #4 Frames per proposal - moment
    bbox_per_video    = mapping_obj_with_boxes[video_id]   #5 Get BBox information
    proposals_objects = _gather_obj_info_per_proposal(bbox_per_video,proposals)
    mapping_          = annotations_data[annotation_index]                     #6 Get mapping between everything
    proposals_mapping = _get_mapping_info_per_proposal(mapping_, proposals)

    # Important variables = proposals_frames, proposals_objects, proposals_mapping
    print(dumped_data_input[annotation_index]['times'])
    _plot_proposals(video_id, description, proposals_frames,proposals_objects,proposals_mapping)


In [None]:
# Smoke test: load the first selected frame and save it unchanged, to check reading and saving work.
print(list_frames[0])
im = cv2.imread(list_frames[0])
im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)   # OpenCV loads BGR; convert to RGB before showing with matplotlib
fig,ax = plt.subplots(1)
ax.imshow(im)

dump_path = '../data/interim/matching_evaluation/images/test.png'
plt.savefig(dump_path)
plt.close()