## This notebooks takes care of preprocessing the obj detection (after thresholding) to achieve an HDF5 file to use as input for training using this information.

NB: We need the files containing both obj predictions and confidence scores.

In [1]:
import json
import h5py
import numpy as np
import sys
import datetime 
import tqdm 
import numpy as np
import time
import torch 

  from ._conv import register_converters as _register_converters


## Setup Glove

In [2]:
## From glove.py 
GLOVE_DIM = 300
GLOVE_FILE = '../data/raw/glove.6B.%dd.txt' % GLOVE_DIM
VOCAB_FILE = '../data/raw/vocab_glove_complete.txt'


class GloveEmbedding(object):
    "Creates glove embedding object"

    def __init__(self, glove_file=GLOVE_FILE, glove_dim=GLOVE_DIM):
        with open(glove_file, encoding='utf-8') as fid:
            glove_txt = fid.readlines()
        glove_txt = [g.strip() for g in glove_txt]
        glove_vector = [g.split(' ') for g in glove_txt]
        glove_words = [g[0] for g in glove_vector]
        glove_dict = {w: i for i, w in enumerate(glove_words)}
        glove_vecs = [g[1:] for g in glove_vector]
        glove_array = np.zeros((glove_dim, len(glove_words)))
        for i, vec in enumerate(glove_vecs):
            glove_array[:,i] = np.array(vec)
        self.glove_array = glove_array
        self.glove_dict = glove_dict
        self.glove_words = glove_words
        self.glove_dim = glove_dim
        

class RecurrentEmbedding(object):
    "TODO"

    def __init__(self, glove_file=GLOVE_FILE, glove_dim=GLOVE_DIM,
                 vocab_file=VOCAB_FILE):
        self.glove_file = glove_file
        self.embedding = GloveEmbedding(self.glove_file, glove_dim)

        with open(vocab_file, encoding='utf-8') as fid:
            vocab = fid.readlines()
        vocab = [v.strip() for v in vocab]
        if '<unk>' in vocab:
            # don't have an <unk> vector.  Alternatively, could map to random
            # vector...
            vocab.remove('<unk>')

        self.vocab_dict = {}
        for i, word in enumerate(vocab):
            try:
                self.vocab_dict[word] = self.embedding.glove_array[
                    :, self.embedding.glove_dict[word]]
            except:
                print(f'{word} not in glove embedding')

In [3]:
class LanguageRepresentationMCN_glove(object):
    "Get representation of sentence"

    def __init__(self, max_words=50):
        self.max_words = max_words
        self.dim = None
        self.embedding = None
        self.embedding = RecurrentEmbedding()
        self.dim = self.embedding.embedding.glove_dim

    def __call__(self, word):
        "Return padded sentence feature"
        if word in self.embedding.vocab_dict:
            return self.embedding.vocab_dict[word]
        else:
            return np.zeros((self.dim,))

In [4]:
## Initialization glove
t = time.time()
lang_interface = LanguageRepresentationMCN_glove(max_words=1)
print('Done in {:.2f} seconds.'.format(time.time()-t))

Done in 121.92 seconds.


In [5]:
## Test GLove
word = 'hello'
feature = lang_interface(word)
print(feature.shape)
print(lang_interface.dim)

(300,)
300


In [6]:
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

class BERTEmbedding(object):
    def __init__(self, data_directory=None, model_name='bert-base-uncased', features_combination_mode=0):
        self.model_name = model_name
        self.features_combination_mode = features_combination_mode
        # Load pre-trained model tokenizer (vocabulary)
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # Load pre-trained model (weights)
        self.model = BertModel.from_pretrained(model_name)
        # Put the model in "evaluation" mode, meaning feed-forward operation.
        self.model.eval()
        # Determine the modality in which layers are combined to obtain the final features
        self._select_combination_mode()
        self._setup_dim()
        if data_directory:
            self.bert_dict = {}
            self._load_preprocessed_features(data_directory=data_directory)

    def __call__(self, key):
        '''
            returns tuple (feat[numpy],query_length) if integer key is provided
            return features[torch tensor] if tuple is provided.

            Usage:
                - In training we use the preprocessed sentences through annotation_id of each moment
                - For standalone processing we need to compute first the tokenized version of the 
                sentence and then call the model on that tokenization. 
                Check below UNIT TEST in main for more details.
        '''
        if type(key) == int:
            return self.bert_dict[str(key)]
        elif type(key) == tuple:
            return self._compute_features(key)
        else:
            raise('Invalid input to bert module')

    def _compute_features(self, tokens):
        # Compute tokens from sentence
        tokens_tensor, segments_tensors, num_tokens = tokens
        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers, check = self.model(tokens_tensor, segments_tensors)
        # Convert the hidden state embeddings into single token vectors [# tokens, # layers, # features]
        token_embeddings = self._compute_tokens_vectors(encoded_layers, num_tokens)
        # Word Vectors, compute features for each token
        features = self.features_combination(token_embeddings)
        # remove the special tokens [CLS]/[SEP] and transform to tensor
        features = torch.stack(features[1:-1])   
        return features

    def _load_preprocessed_features(self, data_directory):
        print('Loading language features')
        t = time.time()
        m = self.model_name.replace('-','_')
        f = self.features_combination_mode
        max_words = 50
        filename = f'./data/processed/{data_directory}/bert/{m}_comb_mode_{f}.json'
        feat = json.load(open(filename, 'r'))
        for k,f in feat.items():
            len_query = min(len(f), max_words)
            padding_size = max_words - len_query
            feature = np.pad(np.asarray(f), [(0,padding_size),(0,0)], mode='constant')
            self.bert_dict[k] = (torch.from_numpy(feature).type(torch.FloatTensor),len_query)
        print("Time to load precomputed language features {:.2f}".format(time.time()-t))

    def _tokenization(self, text):
        #TODO: return a dictionary and non a tuple, increase readability
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)
        num_tokens = len(tokenized_text)
        segments_ids = [1] * num_tokens
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        segments_tensors = torch.tensor([segments_ids])
        return tokens_tensor, segments_tensors, num_tokens

    def _compute_tokens_vectors(self, encoded_layers, num_tokens):
        token_embeddings = [] 
        for token_i in range(num_tokens):
            hidden_layers = [] 
            for layer_i in range(len(encoded_layers)):
                vec = encoded_layers[layer_i][0][token_i]
                hidden_layers.append(vec)
            token_embeddings.append(hidden_layers)
        return token_embeddings

    def _select_combination_mode(self):
        mode = self.features_combination_mode
        if mode == 0:
            self.features_combination = self._last_layer
        elif mode == 1:
            self.features_combination = self._summation_last_four_layers
        elif mode == 2:
            self.features_combination = self._concatenation_last_four_layers
        elif mode == 3:
            self.features_combination = self._summation_second_to_last
        else:
            raise('Feature combination modality unknown, specify value in list [0,1]')
        
    def _concatenation_last_four_layers(self, token_embeddings):
        return [torch.cat((layer[-1], layer[-2], layer[-3], layer[-4]), 0) for layer in token_embeddings] 
    
    def _summation_last_four_layers(self, token_embeddings):
        return [torch.sum(torch.stack(layer)[-4:], 0) for layer in token_embeddings] 
    
    def _last_layer(self, token_embeddings):
        return [layer[-1] for layer in token_embeddings]
    
    def _summation_second_to_last(self, token_embeddings):
        return [torch.sum(torch.stack(layer)[1:], 0) for layer in token_embeddings] 

    def _setup_dim(self):
        self.dim = 768
        if 'large' in self.model_name:
            self.dim = 1024
        if self.features_combination_mode == 2:
            self.dim = self.dim * 4
        
    def compute_text_tokens(self, text):
        '''
        DEPRECATED, USED FOR DEBUGGIN PURPOSES
        '''
        marked_text = "[CLS] " + text + " [SEP]"
        tokenized_text = self.tokenizer.tokenize(marked_text)[1:-1]
        return tokenized_text

In [7]:
def _process_word_BERT(word):
    BERT_clip_features = BERT(BERT._tokenization(word)).numpy()
    if BERT_clip_features.shape[0] > 1:
        BERT_clip_features = np.mean(np.stack(BERT_clip_features,axis=0),axis=0)
    else: 
        BERT_clip_features = np.squeeze(BERT_clip_features)
    return BERT_clip_features

In [8]:
# Set up model
model_name = 'bert-base-uncased'
features_combination_mode = 0
BERT = BERTEmbedding(model_name=model_name, features_combination_mode=features_combination_mode)
print('Done')

Done


In [9]:
# Test BERT
text = "KABIR"
feat = BERT(BERT._tokenization(text)).numpy()
print(feat.shape)
print(feat.dtype)
print(np.mean(np.stack(feat,axis=0),axis=0).shape)

(2, 768)
float32
(768,)


# DiDeMo

In [10]:
# Load data predictions
obj_file = '../data/processed/didemo/obj_detection/visual_genome/didemo_obj_detection_perc_50_with_scores.json'
obj_data = json.load(open(obj_file,'r'))

# Load obj vocab
classes_VG = ['__background__']
classes_file = '../data/raw/language/visual_genome/objects_vocab.txt'
with open(classes_file, 'r') as f:
    for object in f.readlines():
        classes_VG.append(object.split(',')[0].lower().strip())

# Load metadata for all splits
metadata_val   = json.load(open('../data/processed/didemo/val-01.json','r'))
metadata_train = json.load(open('../data/processed/didemo/train-01.json','r'))
metadata_test  = json.load(open('../data/processed/didemo/test-01.json','r'))
metadata = {}

for k in metadata_val['videos'].keys():
    metadata[k] = metadata_val['videos'][k]
    
for k in metadata_train['videos'].keys():
    metadata[k] = metadata_train['videos'][k]
    
for k in metadata_test['videos'].keys():
    metadata[k] = metadata_test['videos'][k]

print('Done')

Done


In [11]:
# Set clip size of pooled features
CLIP_SIZE = 2.5                                       # clip size in seconds
MAX_DURATION = 30                                     # max video duration (Didemo convention)
MAX_NUMBER_OF_CLIPS = int(MAX_DURATION // CLIP_SIZE)  # maximum number of clips per video

print(f'CLIP_SIZE={CLIP_SIZE}, MAX_DURATION={MAX_DURATION}, MAX_NUMBER_OF_CLIPS={MAX_NUMBER_OF_CLIPS}')

CLIP_SIZE=2.5, MAX_DURATION=30, MAX_NUMBER_OF_CLIPS=12


In [None]:
# Let's create the vector of logits we want to dump

data = {}
expanded_features  = []
glove_features_avg = {}
glove_features_weighted_avg = {}
glove_features_max = {}
glove_features_bb = {}
glove_features_bb_spatial = {}
# BERT_features_avg  = {}
BERT_features_bb_spatial  = {}
# BERT_features_max  = {}
metadata_keys = list(metadata.keys())

for k in tqdm.tqdm(metadata_keys):                              # Cycle over the videos key

    pred_obj_video = obj_data[k]                       # get video obj predictions
    clip_keys = list(pred_obj_video.keys())            # get number of features we computed
    gt_duration = metadata[k]['duration']//CLIP_SIZE   # get actual video duration and compute the number of clips
    
    if gt_duration > len(clip_keys):                   # store information about the videos for which we expand
        expanded_features.append(k)                    # the features size to match the number of clips
    
    video_feat_logit     = np.zeros((max(MAX_NUMBER_OF_CLIPS,len(clip_keys)),len(classes_VG)-1),dtype='float32')         #placeholder for features
    video_feat_glove_avg = np.zeros((max(MAX_NUMBER_OF_CLIPS,len(clip_keys)),lang_interface.dim),dtype='float32')    #placeholder for features
    video_feat_glove_bb  = np.zeros((max(MAX_NUMBER_OF_CLIPS,len(clip_keys)),lang_interface.dim + 4),dtype='float32')    #placeholder for features
    video_feat_glove_weighted_avg = np.zeros((max(MAX_NUMBER_OF_CLIPS,len(clip_keys)),lang_interface.dim),dtype='float32')    #placeholder for features
    
    video_feat_glove_bb_spatial = []
    video_feat_BERT_bb_spatial  = []

    for clip_idx, kk in enumerate(clip_keys):          # cycle over the clip features
        clip_data = pred_obj_video[kk]                 
        clip_feat = {}
        clip_w_feat = {}
        clip_feat_BERT = {}
        clip_feat_bb = {}
        for elem in clip_data:                         # For each predicted objects check the confidence score and the class
            if elem[1] > video_feat_logit[clip_idx,elem[0]-1]:      # if confidence score of current obj greater than previously seen objects for the same class and clip
                video_feat_logit[clip_idx,elem[0]-1] = elem[1]      # save the prediction in the logits variable. 
            
            word = classes_VG[elem[0]]                              # get obj class name
            clip_feat[elem[0]] = lang_interface(word)               # embedd with glove and save
            clip_w_feat[elem[0]] = elem[1] * lang_interface(word)   # embedd with glove and weight with the confidence score.
            clip_feat_bb[elem[0]] = np.asarray(list(lang_interface(word) ) + elem[2])  # embedd with glove, and concatenate the position information
            
            # Bert infor extraction in clip
            BERT_clip_features = _process_word_BERT(word)            # embedd with BERT, and concatenate the position information
            clip_feat_BERT[elem[0]] = np.asarray(list(BERT_clip_features) + elem[2])
            
        detected_obj      = [clip_feat[c] for c in clip_feat.keys()]
        detected_w_obj    = [clip_w_feat[c] for c in clip_w_feat.keys()]
        detected_bb_obj   = [clip_feat_bb[c] for c in clip_feat_bb.keys()]
        detected_obj_BERT = [clip_feat_BERT[c] for c in clip_feat_BERT.keys()]
       
        if len(detected_obj) == 0:
            detected_obj      = [np.zeros((lang_interface.dim,))]       # if no obj was detected we input the zero vector
            detected_w_obj    = [np.zeros((lang_interface.dim,))]       # if no obj was detected we input the zero vector
            detected_bb_obj   = [np.zeros((lang_interface.dim+4,))]     # if no obj was detected we input the zero vector
            detected_obj_BERT = [np.zeros((BERT.dim+4,))]               # if no obj was detected we input the zero vector

        video_feat_glove_avg[clip_idx] = np.mean(np.stack(detected_obj,axis=0),axis=0)
        video_feat_glove_weighted_avg[clip_idx] = np.mean(np.stack(detected_w_obj,axis=0),axis=0)
        video_feat_glove_bb[clip_idx] =  np.mean(np.stack(detected_bb_obj,axis=0),axis=0)
        video_feat_glove_bb_spatial.append(np.asarray(detected_bb_obj))
        video_feat_BERT_bb_spatial.append(np.asarray(detected_obj_BERT))
    
    
    glove_features_avg[k]= video_feat_glove_avg[:MAX_NUMBER_OF_CLIPS,:]
    glove_features_bb[k] = video_feat_glove_bb[:MAX_NUMBER_OF_CLIPS,:]
    glove_features_weighted_avg[k]= video_feat_glove_weighted_avg[:MAX_NUMBER_OF_CLIPS,:]
    glove_features_bb_spatial[k]  = video_feat_glove_bb_spatial[:MAX_NUMBER_OF_CLIPS]
    BERT_features_bb_spatial[k] = video_feat_BERT_bb_spatial[:MAX_NUMBER_OF_CLIPS]
    
    data[k] = video_feat_logit[:MAX_NUMBER_OF_CLIPS,:]                          # Create a dictionary of features  
    
print(f'Number of videos for which we expanded the features {len(expanded_features)}')

 17%|█▋        | 1814/10642 [1:22:29<6:41:27,  2.73s/it]

In [14]:
# Dump the data into a hdf5 file
output_file = '../data/processed/didemo/obj_predictions_perc_50_max_logit.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=data[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 10642/10642 [00:03<00:00, 2852.16it/s]

Done





In [15]:
# Dump the data into a hdf5 file
output_file = '../data/processed/didemo/obj_predictions_perc_50_avg_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_avg[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 10642/10642 [00:03<00:00, 3289.06it/s]

Done





In [17]:
# Dump the data into a hdf5 file
output_file = '../data/processed/didemo/obj_predictions_perc_50_weighted_avg_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_weighted_avg[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 10642/10642 [00:03<00:00, 3462.95it/s]

Done





In [19]:
# Dump the data into a hdf5 file
output_file = '../data/processed/didemo/obj_predictions_perc_50_glove_bb_spatial.h5'

# Lets padd the data
_keys_ = list(glove_features_bb_spatial.keys())
max_ = 0
feat_dim = glove_features_bb_spatial[_keys_[0]][0].shape[1]

for k in _keys_:
    for i in range(len(glove_features_bb_spatial[k])):
        tmp =  glove_features_bb_spatial[k][i].shape[0]
        if tmp > max_:
            max_ = tmp

for k in _keys_:
    for i,feat in enumerate(glove_features_bb_spatial[k]):
        tmp = np.zeros((max_, feat_dim))
        tmp[:feat.shape[0]] += feat 
        glove_features_bb_spatial[k][i] = tmp
    glove_features_bb_spatial[k] = np.asarray(glove_features_bb_spatial[k])

__keys__ = list(glove_features_bb_spatial.keys())
print('Number of videos {}'.format(len(__keys__)))
print('Number of clips first video {}'.format(len(glove_features_bb_spatial[__keys__[0]])))
print('Number of detections first clip {}'.format(len(glove_features_bb_spatial[__keys__[0]][0])))
print('Shape of feature of first detection {}'.format(glove_features_bb_spatial[__keys__[0]][0][0].shape))


with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_bb_spatial[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

  4%|▍         | 442/10642 [00:00<00:04, 2199.01it/s]

Number of videos 10642
Number of clips first video 12
Number of detections first clip 10
Shape of feature of first detection (304,)


100%|██████████| 10642/10642 [00:05<00:00, 1950.51it/s]


Done


In [None]:
# Dump the data into a hdf5 file
output_file = '../data/processed/didemo/obj_predictions_perc_50_BERT_bb_spatial.h5'

# Lets padd the data
_keys_ = list(BERT_features_bb_spatial.keys())
max_ = 0
feat_dim = BERT_features_bb_spatial[_keys_[0]][0].shape[1]

for k in _keys_:
    for i in range(len(BERT_features_bb_spatial[k])):
        tmp =  BERT_features_bb_spatial[k][i].shape[0]
        if tmp > max_:
            max_ = tmp

for k in _keys_:
    for i,feat in enumerate(BERT_features_bb_spatial[k]):
        tmp = np.zeros((max_, feat_dim))
        tmp[:feat.shape[0]] += feat 
        BERT_features_bb_spatial[k][i] = tmp
    BERT_features_bb_spatial[k] = np.asarray(BERT_features_bb_spatial[k])

__keys__ = list(BERT_features_bb_spatial.keys())
print('Number of videos {}'.format(len(__keys__)))
print('Number of clips first video {}'.format(len(BERT_features_bb_spatial[__keys__[0]])))
print('Number of detections first clip {}'.format(len(BERT_features_bb_spatial[__keys__[0]][0])))
print('Shape of feature of first detection {}'.format(BERT_features_bb_spatial[__keys__[0]][0][0].shape))


with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=BERT_features_bb_spatial[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

# Charades-sta

Note: BERT embeding for obj classes is not yet available for charades. Please check the above function for didemo to extend charades for that functionality. 

In [24]:
# Load data predictions
obj_file = '../data/processed/charades-sta/obj_detection/visual_genome/charades_sta_obj_detection_perc_50_with_scores.json'
obj_data = json.load(open(obj_file,'r'))

# Load obj vocab
classes_VG = ['__background__']
classes_file = '../data/raw/language/visual_genome/objects_vocab.txt'
with open(classes_file, 'r') as f:
    for object in f.readlines():
        classes_VG.append(object.split(',')[0].lower().strip())

# Load metadata for all splits
metadata_train = json.load(open('../data/processed/charades-sta/train-01.json','r'))
metadata_test  = json.load(open('../data/processed/charades-sta/test-01.json','r'))
metadata = {}
    
for k in metadata_train['videos'].keys():
    metadata[k] = metadata_train['videos'][k]
    
for k in metadata_test['videos'].keys():
    metadata[k] = metadata_test['videos'][k]

print('Done')

Done


In [25]:
# Set clip size of pooled features
CLIP_SIZE = 3                                         # clip size in seconds

print(f'CLIP_SIZE={CLIP_SIZE}')

CLIP_SIZE=3


In [26]:
# Let's create the vector of logits we want to dump

data = {}
expanded_features = []
glove_features_avg = {}
glove_features_max = {}

for k in tqdm.tqdm(metadata.keys()):                              # Cycle over the videos key
    
    pred_obj_video = obj_data[k]                       # get video obj predictions
    clip_keys = list(pred_obj_video.keys())            # get number of features we computed
    gt_duration = metadata[k]['duration']//CLIP_SIZE   # get actual video duration and compute the number of clips
    
#     if gt_duration > len(clip_keys):                   # store information about the videos for which we expand
#         expanded_features.append(k)                    # the features size to match the number of clips
    
    video_feat_logit = np.zeros((len(clip_keys),len(classes_VG)-1),dtype='float32')         #placeholder for features
    video_feat_glove_avg = np.zeros((len(clip_keys),lang_interface.dim),dtype='float32')    #placeholder for features
    video_feat_glove_max = np.zeros((len(clip_keys),lang_interface.dim),dtype='float32')    #placeholder for features
    
    for clip_idx, kk in enumerate(clip_keys):          # cycle over the clip features
        clip_data = pred_obj_video[kk]                 
        clip_feat = {}
        for elem in clip_data:                         # For each predicted objects check the confidence score and the class
            if elem[1] > video_feat_logit[clip_idx,elem[0]-1]:      # if confidence score of current obj greater than previously seen objects for the same class and clip
                video_feat_logit[clip_idx,elem[0]-1] = elem[1]      # save the prediction in the logits variable. 
                word = classes_VG[elem[0]]             # get obj class name
            clip_feat[elem[0]] = lang_interface(word)           # embedd with glove and save
        detected_obj = [clip_feat[k] for k in clip_feat.keys()]
        if len(detected_obj) == 0:
            detected_obj = [np.zeros((lang_interface.dim,))]       # if no obj was detected we input the zero vector

        video_feat_glove_avg[clip_idx] = np.mean(np.stack(detected_obj,axis=0),axis=0)
        video_feat_glove_max[clip_idx] = np.max(np.stack(detected_obj,axis=0),axis=0)

    glove_features_avg[k] = video_feat_glove_avg
    glove_features_max[k] = video_feat_glove_max
    data[k]               = video_feat_logit                          # Create a dictionary of features  
    
print(f'Number of videos for which we expanded the features {len(expanded_features)}')

100%|██████████| 6670/6670 [00:13<00:00, 501.78it/s]

Number of videos for which we expanded the features 0





In [27]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_max_logit.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=data[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 6670/6670 [00:01<00:00, 3718.71it/s]

Done





In [28]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_avg_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_avg[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 6670/6670 [00:01<00:00, 4394.23it/s]

Done





In [29]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_max_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_max[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

100%|██████████| 6670/6670 [00:01<00:00, 4391.74it/s]

Done





# Activitynet Captions

In [38]:
# Load data predictions
obj_file = '../data/processed/activitynet-captions/obj_detection/visual_genome/activitynet_captions_obj_detection_perc_50_with_scores.json'
obj_data = json.load(open(obj_file,'r'))

# Load obj vocab
classes_VG = ['__background__']
classes_file = '../data/raw/language/visual_genome/objects_vocab.txt'
with open(classes_file, 'r') as f:
    for object in f.readlines():
        classes_VG.append(object.split(',')[0].lower().strip())

# Load metadata for all splits
metadata_val   = json.load(open('../data/processed/activitynet-captions/val.json','r'))
metadata_train = json.load(open('../data/processed/activitynet-captions/train.json','r'))
metadata = {}

for k in metadata_val['videos'].keys():
    metadata[k] = metadata_val['videos'][k]
    
for k in metadata_train['videos'].keys():
    metadata[k] = metadata_train['videos'][k]

print('Done')

Done


In [39]:
print(len(list(metadata.keys())))
print(len(list(obj_data.keys())))

14926
19994


In [40]:
# Set clip size of pooled features
CLIP_SIZE = 2.5                                         # clip size in seconds

print(f'CLIP_SIZE={CLIP_SIZE}')

CLIP_SIZE=2.5


In [41]:
# Let's create the vector of logits we want to dump

data = {}
expanded_features = []
glove_features_avg = {}
glove_features_max = {}

for k in tqdm.tqdm(metadata.keys()):                              # Cycle over the videos key
    
    pred_obj_video = obj_data[k]                       # get video obj predictions
    clip_keys = list(pred_obj_video.keys())            # get number of features we computed
    gt_duration = metadata[k]['duration']//CLIP_SIZE   # get actual video duration and compute the number of clips
    
#     if gt_duration > len(clip_keys):                   # store information about the videos for which we expand
#         expanded_features.append(k)                    # the features size to match the number of clips
    
    video_feat_logit = np.zeros((len(clip_keys),len(classes_VG)-1),dtype='float32')         #placeholder for features
    video_feat_glove_avg = np.zeros((len(clip_keys),lang_interface.dim),dtype='float32')    #placeholder for features
    video_feat_glove_max = np.zeros((len(clip_keys),lang_interface.dim),dtype='float32')    #placeholder for features
    
    for clip_idx, kk in enumerate(clip_keys):          # cycle over the clip features
        clip_data = pred_obj_video[kk]                 
        clip_feat = {}
        for elem in clip_data:                         # For each predicted objects check the confidence score and the class
            if elem[1] > video_feat_logit[clip_idx,elem[0]-1]:      # if confidence score of current obj greater than previously seen objects for the same class and clip
                video_feat_logit[clip_idx,elem[0]-1] = elem[1]      # save the prediction in the logits variable. 
                word = classes_VG[elem[0]]             # get obj class name
            clip_feat[elem[0]] = lang_interface(word)           # embedd with glove and save
        detected_obj = [clip_feat[k] for k in clip_feat.keys()]
        if len(detected_obj) == 0:
            detected_obj = [np.zeros((lang_interface.dim,))]       # if no obj was detected we input the zero vector

        video_feat_glove_avg[clip_idx] = np.mean(np.stack(detected_obj,axis=0),axis=0)
        video_feat_glove_max[clip_idx] = np.max(np.stack(detected_obj,axis=0),axis=0)

    glove_features_avg[k] = video_feat_glove_avg
    glove_features_max[k] = video_feat_glove_max
    data[k]               = video_feat_logit                          # Create a dictionary of features  
    
print(f'Number of videos for which we expanded the features {len(expanded_features)}')

100%|██████████| 14926/14926 [01:27<00:00, 171.16it/s]

Number of videos for which we expanded the features 0





In [None]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_max_logit.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=data[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

In [None]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_avg_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_avg[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')

In [None]:
# Dump the data into a hdf5 file
output_file = '../data/processed/charades-sta/obj_predictions_perc_50_max_glove.h5'

with h5py.File(output_file, "w") as f:
    for k in tqdm.tqdm(data.keys()):
        f.create_dataset(k, data=glove_features_max[k])
    f.create_dataset('metadata/time_unit', data=np.asarray(CLIP_SIZE))     # dump information regarding the clip size
    
print('Done')