# Exploring Charades-STA

In [None]:
import pandas as pd
import h5py

def parse_charades_sta(filename):
    """Parse raw Charades-STA annotations.

    Every non-empty line is expected to look like
    ``<video_id> <t_start> <t_end>##<description>``.

    Args:
        filename (str or path-like) : path to the annotation text file.
    Returns:
        instances (list of dicts) : one dict per annotated moment with keys
            `video` (str), `times` (list holding a single
            `[t_start, t_end]` pair of floats) and `description` (str).
    Raises:
        ValueError : if a non-empty line does not follow the format above.
    TODO:
        update dict by class

    """
    instances = []
    with open(filename, 'r') as fid:
        for line in fid:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no annotation, skip them instead of crashing.
            if not line:
                continue
            # Split on the first separator only, such that a description
            # that happens to contain '##' is kept intact.
            video_info, description = line.split('##', 1)
            video_id, t_start, t_end = video_info.split()

            instances.append(
                {'video': video_id,
                 'times': [[float(t_start), float(t_end)]],
                 'description': description}
            )
    return instances

def make_annotations_df(instances, file_h5):
    """Build per-video and per-moment DataFrames to play easily with data.

    Args:
        instances (list of dicts) : moments as returned by
            function::parse_charades_sta.
        file_h5 (str) : HDF5 file whose top-level entries are named after
            the video ids.
    Returns:
        videos_df (DataFrame) : one row per annotated video found in
            `file_h5` with columns `video`, `num_frames` (first dimension of
            the corresponding HDF5 entry) and `num_instances`.
        instances_df (DataFrame) : one row per moment, i.e. the original
            fields plus the columns `t_start` and `t_end`.
    """
    rows = []
    for moment in instances:
        segment = moment['times'][0]
        rows.append(dict(moment, t_start=segment[0], t_end=segment[1]))
    instances_df = pd.DataFrame(rows)

    annotated_videos = set(instances_df['video'].unique())
    moments_by_video = instances_df.groupby('video')

    videos_info = []
    with h5py.File(file_h5, 'r') as f:
        for video_id, dataset in f.items():
            # Only videos with at least one annotation are of interest
            if video_id in annotated_videos:
                num_frames = dataset.shape[0]
                num_instances = moments_by_video.get_group(video_id).shape[0]
                videos_info.append({'video': video_id,
                                    'num_frames': num_frames,
                                    'num_instances': num_instances})
    return pd.DataFrame(videos_info), instances_df

## 1. Moments duration analysis

Why? to extend SMCN and all its variants, we need to do the dirty work done by DiDeMo setup.

> DiDeMo makes it easy by defining the search space up front.

What? We need to set a couple of parameters: (i) _minimum_ moment length, (ii) _maximum_ moment length, (iii) _type of range_, how to explore minimum -> maximum, and (iv) _striding_. Those parameters will define the search space, and will set the stage to define the size of the chunk/clip. 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Percentile reported in the titles, and the grid of percentiles to draw
Q = 95
QUANTILES = np.arange(25, 101, 2.5)
COLOR = ['blue', 'orange', 'green']

# Panels 0 and 1 show one subset each. The last panel overlays both subsets
# plus the pooled (train+test) curve.
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
all_duration = []
for i, subset in enumerate(['train', 'test']):
    filename = f'../data/raw/charades/charades_sta_{subset}.txt'
    data = parse_charades_sta(filename)
    # Duration of the single segment attached to each moment
    duration = [moment['times'][0][1] - moment['times'][0][0]
                for moment in data]
    all_duration.extend(duration)

    duration = np.array(duration)
    print('Negative durations: ', sum(duration <= 0))
    percentiles = np.percentile(duration, QUANTILES)
    for ax in (axs[i], axs[-1]):
        ax.plot(percentiles, QUANTILES, color=COLOR[i])
    axs[i].set(xlabel='Duration', ylabel='Quantile')
    title = (f'Duration stats {subset}\n'
             f'Min: {np.min(duration[duration > 0]):.2f}, '
             f'Median: {np.median(duration):.2f}, '
             f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
             f'Max: {np.max(duration):.2f}')
    axs[i].set_title(title)

# Pooled statistics over both subsets
duration = np.array(all_duration)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1])
axs[-1].set(xlabel='Duration', ylabel='Quantile')
title = ('Duration stats (train+test)\n'
         f'Min: {np.min(duration[duration > 0]):.2f}, '
         f'Median: {np.median(duration):.2f}, '
         f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
         f'Max: {np.max(duration):.2f}')
_ = axs[-1].set_title(title)

- Minimum length of a moment.

  The initial batch of experiments will be done with 3s as it's close to the minimum and there are psychological references that support that number.

  > DiDeMo makes it easy by defining the minimum length of the segments to 5s with variance 0.
  
- Maximum length of moment.
  
  We ended up taking 24s as it's close to the maximum moment duration in the testing set.
  
    > DiDeMo makes it easy by defining the maximum length of the segments to 30s and setting the length of the video also to 30s.

- Explore range from minimum to maximum moment length.

  TODO: describe
    
- Stride

  TODO: describe
  
Based on the parameters mentioned above, we study multiple configurations of proposals in terms of its size and its recall upper bound.

In [None]:
import sys
sys.path.append('..')
import numpy as np
from proposals import SlidingWindowMSRSS
from nb_utils import parse_moments
from nb_utils import recall_bound_and_search_space

# Recall upper-bound and search-space size for one sliding-window
# configuration, computed on the JSON file below.
filename = '../data/processed/charades-sta/test-01.json'
clip_length = 3
# Keyword arguments forwarded verbatim to SlidingWindowMSRSS
proposals_prm = dict(
    length=clip_length,
    scales=list(range(2, 9, 1)),
    stride=0.3
)

# NOTE(review): `dataset` is not used in the rest of this cell — confirm
# whether the call is still needed.
dataset = parse_moments(filename)
proposals_fn = SlidingWindowMSRSS(**proposals_prm)
# NOTE(review): despite the name, `train_results` is computed from the
# test-01 file set in `filename`.
train_results = recall_bound_and_search_space(
    filename, proposals_fn)
recall_ious, search_space, durations = train_results
# Number of clips of `clip_length` needed to cover every duration, summed
# over all the entries of `durations` (partial clips are rounded up).
num_clips = np.ceil(durations / clip_length).sum()
# Normalize (in-place) the last entry of `search_space` by the number of clips
search_space[-1] /= num_clips
print(recall_ious)
print(search_space)

TODO: Update graphs

In training, the results look as follows:

In [None]:
# Variables to edit
min_length = 3
# NOTE(review): `max_length` is not used in this cell.
max_length = 80
num_scales = 8
strides = [1, 2, 3, 4, 5]

annotation_file = '../data/raw/charades/charades_sta_train.txt'
features_file = '/home/escorciav/datasets/charades/features/resnet101-openimages_5fps_320x240.h5'

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
font_size = 12
COLOR_2ND_AXIS = 'red' 
IOU_COLORS = ['blue', 'orange', 'green']
# NOTE(review): `IOU_THRESHOLDS` is not defined in the cells visible here;
# it must exist in the notebook namespace before running this cell.
iou_thresholds = IOU_THRESHOLDS
# One color per tIOU threshold plus one for the average column appended below
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = parse_charades_sta(annotation_file)
videos_df, instances_df = make_annotations_df(instances, features_file)

# Sweep the stride, collecting one entry of recalls and one entry of
# search-space statistics per stride value.
recalls = []
search_space = []
for stride in tqdm(strides):
    # NOTE(review): this call signature differs from the one used with
    # `nb_utils.recall_bound_and_search_space` earlier in the notebook
    # (filename, proposals_fn), and `sliding_window` is not defined in the
    # cells visible here — presumably an older API; confirm before running.
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
# Stack the per-stride results: one row per stride
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
# Append the mean over all the thresholds as the last column
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

# Recall (left axis) and median search-space size (right axis) vs stride
fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    # The last column of `recalls` is the average over thresholds
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
# The curves are labelled above; without a legend those labels never show up
ax1.legend(fontsize=font_size)
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params('y', colors='r')
# `Tick.label` was deprecated in Matplotlib 3.1 and removed in 3.8;
# `tick_params(labelsize=...)` sets the font size of the major tick labels
# on every Matplotlib version.
ax1.tick_params(axis='both', which='major', labelsize=font_size)
ax2.tick_params(axis='y', which='major', labelsize=font_size)

To get an idea of the values

In [None]:
# Gather every curve of the plot into a single table, one row per stride
info = {'Stride': np.array(strides)}
for i, recall_column in enumerate(recalls.T):
    # Columns past the thresholds hold the average over all of them
    iou = (iou_thresholds[i] if i < len(iou_thresholds)
           else f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})')
    info[f'R@{iou}'] = recall_column
for i, label in enumerate(['(median)', '(std)']):
    info['Search space size ' + label] = search_space[:, i]
display(pd.DataFrame(info))

Let's make sure that the results look similar in testing, a.k.a. there is _NOT_ a distribution mismatch (training vs. testing).

In [None]:
# Same sweep as for the train split, now on the test annotations. It reuses
# `strides`, `min_length`, `num_scales` and `features_file` from the cell
# that ran the train sweep.
annotation_file = '../data/raw/charades/charades_sta_test.txt'

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
font_size = 12
COLOR_2ND_AXIS = 'red' 
IOU_COLORS = ['blue', 'orange', 'green']
# NOTE(review): `IOU_THRESHOLDS` is not defined in the cells visible here;
# it must exist in the notebook namespace before running this cell.
iou_thresholds = IOU_THRESHOLDS
# One color per tIOU threshold plus one for the average column appended below
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = parse_charades_sta(annotation_file)
videos_df, instances_df = make_annotations_df(instances, features_file)

# Sweep the stride, collecting one entry of recalls and one entry of
# search-space statistics per stride value.
recalls = []
search_space = []
for stride in tqdm(strides):
    # NOTE(review): this call signature differs from the one used with
    # `nb_utils.recall_bound_and_search_space` earlier in the notebook
    # (filename, proposals_fn), and `sliding_window` is not defined in the
    # cells visible here — presumably an older API; confirm before running.
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
# Stack the per-stride results: one row per stride
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
# Append the mean over all the thresholds as the last column
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

# Recall (left axis) and median search-space size (right axis) vs stride
fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    # The last column of `recalls` is the average over thresholds
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
# The curves are labelled above; without a legend those labels never show up
ax1.legend(fontsize=font_size)
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params('y', colors='r')
# `Tick.label` was deprecated in Matplotlib 3.1 and removed in 3.8;
# `tick_params(labelsize=...)` sets the font size of the major tick labels
# on every Matplotlib version.
ax1.tick_params(axis='both', which='major', labelsize=font_size)
ax2.tick_params(axis='y', which='major', labelsize=font_size)

To get an idea of the values

In [None]:
# Gather every curve of the plot into a single table, one row per stride
info = {'Stride': np.array(strides)}
for i, recall_column in enumerate(recalls.T):
    # Columns past the thresholds hold the average over all of them
    iou = (iou_thresholds[i] if i < len(iou_thresholds)
           else f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})')
    info[f'R@{iou}'] = recall_column
for i, label in enumerate(['(median)', '(std)']):
    info['Search space size ' + label] = search_space[:, i]
display(pd.DataFrame(info))

__Conclusions__

TODO: update this cell

- With a 1s stride, there is small gap due to _not_ regressing the exact boundaries for IOU=0.7.

- However, with such a low stride, the median size of the search space is an order of magnitude greater than DiDeMo. Note that the median duration of the videos is about the same as in DiDeMo (see section [1a.1](#1a.1-Video-duration)).

- In case we wanna reduce the search space, good values to consider are:
  - _min moment duration_: 3 seconds
  - _max moment duration_: 24 seconds
  - _scale from min to max_: linear, with unit slope, from min to max moment duration.
  - _stride_: 3 seconds
  - _clip/chunk size_: 3 seconds
  
- What would happen if min/max moment duration or stride are not a multiple of clip/chunk size?

  We need to do rounding or interpolation of features.
  
In case you are interested in the relationship between the size of the search space, $|\mathcal{S}|$, and the duration of the video, $d$, you can use this formula:

$|\mathcal{S}| = \sum_{w_l \in \mathcal{W}} (\frac{d}{s} + 1) = |\mathcal{W}| (\frac{d}{s} + 1) $

i.e. $|\mathcal{S}| \propto |\mathcal{W}| \frac{d}{s} $

where:

- $\mathcal{W}$ is the set of all possible durations. 

- $s$ stride of the window. We use the same stride for all the different $w_l$.

_TLDR_ More details. Skip if you are in a rush.

The above formula comes from the generic:

$|S| \propto \sum_{w_l \in \mathcal{W}} (\frac{d - w_l + 2p}{s} + 1)  $

where $2p$ is the amount of padding. In our case, $2p = w_l$ because we kept windows ending at a time longer than $d$.

We can get a better upper-bound by clamping ending time to $d$. However, that's tricky as some windows may have length which is not inside $\mathcal{W}$.

### Video duration

The previous analysis only considers the duration of the moments. What about the video duration?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Percentile reported in the titles, and the grid of percentiles to draw
Q = 95
# Frame rate used to turn the number of frames into seconds
FPS = 5
QUANTILES = np.arange(25, 101, 2.5)
COLOR = ['blue', 'orange', 'green']
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/charades/features/resnet152-imagenet_5fps_320x240.hdf5'

# Panels 0 and 1 show one subset each. The last panel overlays both subsets
# plus the pooled (train+test) curve.
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
all_duration = []
for i, subset in enumerate(['train', 'test']):
    filename = f'../data/raw/charades/charades_sta_{subset}.txt'
    data = parse_charades_sta(filename)
    videos_df, _ = make_annotations_df(data, H5_FILE_FEAT_PER_FRAME)
    # Video duration in seconds, estimated from the number of frames
    duration = videos_df['num_frames'] / FPS
    all_duration.append(duration)

    print('Negative durations: ', sum(duration <= 0))
    percentiles = np.percentile(duration, QUANTILES)
    for ax in (axs[i], axs[-1]):
        ax.plot(percentiles, QUANTILES, color=COLOR[i])
    axs[i].set(xlabel='Duration', ylabel='Quantile')
    title = (f'Duration stats {subset}\n'
             f'Min: {np.min(duration[duration > 0]):.2f}, '
             f'Median: {np.median(duration):.2f}, '
             f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
             f'Max: {np.max(duration):.2f}')
    axs[i].set_title(title)

# Pooled statistics over both subsets
duration = pd.concat(all_duration, axis=0, ignore_index=True)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1])
axs[-1].set(xlabel='Duration', ylabel='Quantile')
title = ('Duration stats (train+test)\n'
         f'Min: {np.min(duration[duration > 0]):.2f}, '
         f'Median: {np.median(duration):.2f}, '
         f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
         f'Max: {np.max(duration):.2f}')
_ = axs[-1].set_title(title)

_Conclusion_: comparing the two window arrangements [1a](#1a.-Search-space-with-sliding-windows) vs [1b.1](#1b.1-"DiDeMofying"-untrimmed-videos), we can conclude:

- For 1s stride, 1b generates more windows than 1a.

  In case we are open to explore the search space in a different way, it seems [1a](#1a.-Search-space-with-sliding-windows) reduces the amount of moments to explore.

## 2. Dump data for training and evaluation

### 2.a.1 JSON files

File to dump
```json
{
     'moments': [moment_dict, ...],
     'videos': {video_id: video_dict, ...},
     'date': str,
     'responsible': str,
 }
```

video_dict
```json
{
    'duration': float,
}
```
duration := approximate video durations

moment_dict
```json
{
    'description': str,
    'annotation_id': int,
    'video': str,
    'time': [float, float],
    'times': [[float, float]]
}
```

description := description provided by the annotators

times := list with all segments associated with a given description. Why a list? it is inherited from DiDeMo where you have multiple segments for a given description.

time := first item in `times`. Given that moments in Charades-STA only have a single segment, it's easier to create a new attribute with it. We keep both to not break our dashboards.

video := unique video-id to refer to the video. Make sure that this matches the HDF5 with the features.

__Note__: It requires running the 1st cell, which defines `function:parse_charades_sta` and `function:make_annotations_df`.

In [None]:
# Configuration of the JSON dump
SUBSETS = ['train', 'test']
# Mode used to open the output JSON files; 'w' overwrites existing files
MODE = 'w'
FPS = 5
CREATOR = 'EscorciaSSGR'
H5_FILE = '/home/escorciav/datasets/charades/features/resnet152_max.h5'
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/charades/features/resnet152-imagenet_5fps_320x240.hdf5'
# Safety latch: abort before overwriting the files. An explicit exception
# replaces the bare `raise`, which fails with the obscure
# "RuntimeError: No active exception to reraise".
if MODE == 'w':
    print('are you sure you wanna do this? comment these 3 lines!')
    raise RuntimeError("MODE == 'w' would overwrite existing JSON files")
assert SUBSETS == ['train', 'test']

import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import h5py
import sys
sys.path.append('..')
from utils import get_git_revision_hash

def extend_metadata(list_of_moments, videos_gbv, filename, offset=0, fps=FPS):
    """Augment moments' (in-place) metadata and create video metadata

    Moments are clamped such that they do not extend past the estimated
    video duration, and those left with a duration <= 0 are dropped. Each
    kept moment gains `annotation_id` and `time`.

    Args:
        list_of_moments (list of dicts) : the output of
            function::parse_charades_sta.
        videos_gbv (DataFrame groupedby) : DataFrame grouped by `video`. It
            is mandatory that the DataFrame has a column `num_frames` such that
            we can estimate the duration of the video.
        filename (str) : path to HDF5 with all features of the dataset.
        offset (int, optional) : ensure annotation-id across moments is unique
        fps (number, optional) : frame rate used to convert `num_frames`
            into a duration in seconds.
    Returns:
        videos (dict) : maps video-id to a dict with `duration`,
            `num_clips` (first dimension of the video entry in `filename`)
            and `num_moments` (number of kept moments).
        clean_list_of_moments (list of dicts) : the kept moments, in their
            original order. They are the same objects of `list_of_moments`.
    Raises:
        KeyError : if a video is missing in `videos_gbv` or in `filename`.
    """
    videos = {}
    clean_list_of_moments = []
    # Open read-only explicitly, as we only query shapes. Before h5py 3.0
    # the default mode depended on the presence and permissions of the file,
    # thus a wrong path or a writable file was not opened read-only.
    with h5py.File(filename, 'r') as fid:
        for i, moment in enumerate(list_of_moments):
            assert len(moment['times']) == 1
            video_id = moment['video']
            # Get estimated video duration
            num_frames = videos_gbv.get_group(
                video_id)['num_frames'].values[0]
            video_duration = num_frames / fps

            # TODO: sanitize by trimming moments up to video duration <= 0
            # Sanitize
            # i) clamp moments inside video
            segment = moment['times'][0]
            segment[0] = min(segment[0], video_duration)
            segment[1] = min(segment[1], video_duration)
            # ii) remove moments with duration <= 0
            if segment[1] <= segment[0]:
                continue

            moment['time'] = segment
            # we use the row index of the original CSV as unique identifier
            # for the moment. Of course, 0-indexed.
            moment['annotation_id'] = i + offset
            clean_list_of_moments.append(moment)

            # Update dict with video info
            if video_id not in videos:
                videos[video_id] = {'duration': video_duration,
                                    'num_clips': fid[video_id].shape[0],
                                    'num_moments': 0}
            videos[video_id]['num_moments'] += 1

    return videos, clean_list_of_moments

# Dump one JSON file per subset with the sanitized moments and video metadata
offset = 0
for subset in SUBSETS:
    FILENAME = Path(f'../data/raw/charades/charades_sta_{subset}.txt')
    OUTPUT_FILE = Path(f'../data/interim/charades-sta/{subset}.json')

    instances = parse_charades_sta(FILENAME)
    videos_df, _ = make_annotations_df(instances, H5_FILE_FEAT_PER_FRAME)
    videos_gbv = videos_df.groupby('video')
    videos, cleaned_instances = extend_metadata(
        instances, videos_gbv, H5_FILE, offset=offset)
    # annotation-ids index the raw (unfiltered) annotations, thus the offset
    # grows by the number of parsed instances and not by the kept ones.
    offset += len(instances)

    # Gather the payload before opening the output. Otherwise a failure while
    # collecting it (e.g. the git hash) leaves behind a truncated file.
    payload = {'videos': videos,
               'moments': cleaned_instances,
               'date': datetime.now().isoformat(),
               'git_hash': get_git_revision_hash(),
               'responsible': CREATOR,
              }

    if not OUTPUT_FILE.parent.is_dir():
        dirname = OUTPUT_FILE.parent
        dirname.mkdir(parents=True)
        print(f'Create dir: {dirname}')

    with open(OUTPUT_FILE, MODE) as fid:
        json.dump(payload, fid)

#### 2.a.2 Untied JSON and HDF5 inputs

TLDR; reference: minor-detail. Safe to skip unless you have problems loading data for dispatching training.

At some point, there was an undesired coupling between the JSON and HDF5 files (inputs) required by our implementation. 

- root `time_unit`. This is a property of the features, as such it should reside in the HDF5 and not in the JSON.

- `videos/ith-video/num_clips`. This is a property of the ith-video, as such we should grab it from the HDF5 instead of placing it in the JSON.

The following script was used to update the `*.json` files with metadata for training and evaluation.

```python
import json
from datetime import datetime

import sys
sys.path.append('..')
from utils import get_git_revision_hash

# One-off migration: drop the fields that belong to the HDF5 from each JSON
# file and rewrite the file in place.
subsets = ['train-01', 'test-01', 'train-02_01', 'val-02_01']

for subset in subsets:
    filename = f'../data/processed/charades-sta/{subset}.json'
    with open(filename, 'r') as fr:
        data = json.load(fr)
    # `del` raises KeyError when the key is missing, thus this script fails
    # on files that were already updated.
    del data['time_unit']
    for video_id in data['videos']:
        del data['videos'][video_id]['num_clips']
    # Refresh the provenance of the file
    data['date'] = datetime.now().isoformat()
    data['git_hash'] = get_git_revision_hash()
    with open(filename, 'w') as fw:
        json.dump(data, fw)
```

We also update the HDF5 such that it contains `metadata` [Group/Folder](http://docs.h5py.org/en/latest/high/group.html).

```bash
!h5ls /home/escorciav/datasets/charades-sta/features/resnet152_max_cs-5.h5 | grep metadata
```

In case the command above doesn't return anything, it means that you are using an old version of the data.
If you know the `FPS`, `CLIP_LENGTH` and `POOL`ing operation used to get those features, the following snippet will add the metadata required for the most recent version of our code.

```python
# Parameters describing how the features in `filename` were extracted
FPS = 5
CLIP_LENGTH = 3  # seconds
POOL = 'max'  # pooling operation over time
# verbose
COMMENTS = (f'ResNet152 trained on Imagenet-ILSVRC12, Pytorch model. '
            f'Extracted at {FPS} FPS with an image resolution of 320x240, '
            f'and {POOL} pooled over time every {CLIP_LENGTH} seconds.')
CREATOR = 'EscorciaSSGR'  # please add your name here to sign the file i.e. assign yourself as responsible
filename = f'/home/escorciav/datasets/charades/features/resnet152_rgb_{POOL}_cl-{CLIP_LENGTH}.h5'
from datetime import datetime
import h5py

# presumably ensures that a clip spans at least one frame — TODO confirm
assert CLIP_LENGTH * FPS >= 1
# Mode 'a' keeps the existing content of the file and adds the group below
with h5py.File(filename, 'a') as fw:
    # NOTE(review): expected to fail if the `metadata` group already exists;
    # run this only on files without it.
    grp = fw.create_group('metadata')
    grp.create_dataset('time_unit', data=CLIP_LENGTH)
    # Text fields are stored as variable-length strings
    grp.create_dataset('date', data=datetime.now().isoformat(),
                       dtype=h5py.special_dtype(vlen=str))
    grp.create_dataset('responsible', data=CREATOR,
                       dtype=h5py.special_dtype(vlen=str))
    grp.create_dataset('comments', data=COMMENTS,
                       dtype=h5py.special_dtype(vlen=str))
```

### 2b. Chunked features

Go to notebook `4-feature-extraction.ipynb` section `#Varied-length-videos` (remove the # if you use your browser string matching).

_TODO_ add procedure here to avoid jumping over the place.

### 2c. Train-val split

_TODO_

Motivation: create two disjoint partitions of the train set.

1. get action categories for each video in Charades-STA from the [annotations](http://ai2-website.s3.amazonaws.com/data/Charades.zip)([source](https://allenai.org/plato/charades/)).

2. The partition must be randomly generated, ideally with a [pseudo-random number generator](https://docs.python.org/3.6/library/random.html#random.seed), and by videos. That means that a given video $v_i$ cannot appear in both subsets.

  Requirements:
  
  - In one of the subsets the percentage of examples for a given action category  must be ~$p$% of the total number of videos associated with that category.

  - (try) make that all the categories in the $p$% subset have the same number of samples.
  
  Outcome:
  
  - Bar plot with the number of Charades-STA videos associated with each action category on both subsets.
  
  - Add the median number of videos per category in the title of the plots.

3. Dump splits with the format described above.

In [None]:
import json
from copy import deepcopy

# Identifier appended to the name of the generated files
trial = '01'

with open('data/processed/charades-sta/train.json', 'r') as fid:
    data_train = json.load(fid)
# annotation_id -> position of the moment inside data_train['moments']
id2ind = {
    moment['annotation_id']: position
    for position, moment in enumerate(data_train['moments'])
}

for subset, subset_ in (('train', 'training'), ('val', 'validation')):
    filename = f'data/interim/charades-sta/{subset_}_set_split_75-25_threshold_8.json'
    with open(filename, 'r') as fid:
        indices = json.load(fid)

    # Start from a copy of the train data to retain the remaining fields
    data_subset = deepcopy(data_train)
    # Annotation-ids missing from the train file are ignored
    moments = [data_train['moments'][id2ind[annotation_id]]
               for annotation_id in indices
               if annotation_id in id2ind]
    # Keep only the videos with at least one moment in this subset
    videos = {}
    for moment in moments:
        video_id = moment['video']
        if video_id in videos:
            continue
        videos[video_id] = data_train['videos'][video_id]
    data_subset['moments'] = moments
    data_subset['videos'] = videos

    # Mode 'x' refuses to overwrite a previously generated split
    with open(f'data/processed/charades-sta/{subset}-{trial}.json', 'x') as fid:
        json.dump(data_subset, fid)

### 2d. Update pooled flow features

We update the pooled flow features such that they span the same duration of our RGB features. In this way, we simplify the late fusion of different modalities.

Code used to pad flow features according to RGB length.

```python
import json
import h5py
import numpy as np

# Pad (in-place) the flow features of each video so that they have as many
# clips as the RGB features, and report how durations and moments compare.
CLIP_LENGTH = 3
file_flow = f'inceptionbn-imagenet-ucf101.1_max_cl-{CLIP_LENGTH}.h5'
file_rgb = f'resnet152_rgb_max_cl-{CLIP_LENGTH}.h5'
subsets = ['train', 'test']
weirds, edited = [], []
counter, toremove, toclamp = 0, 0, 0

# Merge videos and moments of all the subsets
videos = {}
moments = []
for subset in subsets:
    with open(f'data/processed/charades-sta/{subset}-01.json', 'r') as fr:
        data = json.load(fr)
    videos.update(data['videos'])
    moments += data['moments']
# For each video, collect the positions (in `moments`) of its moments
for i, moment_i in enumerate(moments):
    video_i = moment_i['video']
    if 'indices' not in videos[video_i]:
        videos[video_i]['indices'] = set()
    videos[video_i]['indices'].add(i)

# NOTE(review): the file is opened and closed without reading anything —
# presumably a check that it can be opened; confirm or remove.
fid = h5py.File(file_flow, 'r')
fid.close()
# The flow file is edited in place ('a'), the RGB file is only read
with h5py.File(file_flow, 'a') as fr_flow, h5py.File(file_rgb, 'r') as fr_rgb:
    for video_id, metadata in videos.items():
        assert video_id in fr_flow
        assert video_id in fr_rgb
        num_clips = fr_flow[video_id].shape[0]

        # Report videos where the number of flow clips is off by more than
        # one from the number expected from the duration
        if abs((metadata['duration'] // CLIP_LENGTH) - num_clips) > 1:
            weirds.append(video_id)
            # NOTE(review): `subset` holds the last value of the loop above,
            # not the subset of `video_id`.
            print(subset, video_id,
                  'Num clips: ', num_clips,
                  'Expected clips: ', metadata['duration'] // CLIP_LENGTH,
                  'Duration (s): ', metadata['duration'])

        duration = metadata['duration']
        estimated_duration = CLIP_LENGTH * num_clips
        if estimated_duration < duration:
            counter += 1
        # Count moments starting/ending at or after the time covered by the
        # flow features. This is only reported, moments are not modified.
        for ind in metadata['indices']:
            times = np.array(moments[ind]['times'])
            toremove += (times[:, 0] >= estimated_duration).sum()
            toclamp += (times[:, 1] >= estimated_duration).sum()
            
        num_clips_rgb = fr_rgb[video_id].shape[0]
        if num_clips != num_clips_rgb:
            assert num_clips < num_clips_rgb
            features = fr_flow[video_id][:]
            width = num_clips_rgb - num_clips
            edited.append((video_id, width))
            # Repeat the last row `width` times along the first axis
            padded_features = np.pad(features, ((0, width), (0, 0)), 'edge')
            assert padded_features.shape[0] == num_clips_rgb
            # Replace the dataset of the video with its padded version
            del fr_flow[video_id]
            fr_flow.create_dataset(video_id, data=padded_features, chunks=True)
print('duration-flow < duration-rgb: ', counter, f'{len(videos)}')
print('Num moments with t_sart >= duration:', toremove)
print('Num moments with t_end >= duration:', toclamp)

# Log the edited videos as `video_id,number_of_padded_clips`
with open(f'padded_inceptionbn-imagenet-ucf101.1_max_cl-{CLIP_LENGTH}.txt', 'w') as fid:
    for video_id in edited:
        fid.write('{},{}\n'.format(*video_id))
```

## 3. Baselines single video retrieval

### 3.1 CTRL

| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: | 
| CTRL (aln)       |   17.69     |    5.91     |    55.54    |     23.79   |
| CTRL (reg-p)     |   19.22     |    6.64     |    57.98    |     25.22   |
| CTRL (reg-np)    |   21.42     |    7.15     |    59.11    |     26.91   |

### 3.2 Moment Frequency Prior

Results in a train-val split from the train set for our search space (sliding windows between length 3s (seconds) and max length 24s with steps of 3s, stride 3s) and with `NMS = 0.6`. Please don't fool yourself and update the baseline according to your search strategy.

In [None]:
# Results of the moment frequency prior for different number of bins.
# presumably results[k] holds the scores obtained with bins[k], with the
# columns ordered as in `metrics` — TODO confirm.
metrics = 'r@1,0.5', 'r@5,0.5', 'r@1,0.7', 'r@5,0.7'
# NOTE(review): `bins` is not sorted (30 before 20, 100 before 75, 1000
# before 500); verify that its order matches the rows of `results`.
bins = [5, 10, 15, 30, 20, 50, 100, 75, 1000, 500]
results = [
    [0.0678, 0.5051, 0.0245, 0.2522],
    [0.0682, 0.5191, 0.0248, 0.2978],
    [0.1729, 0.5815, 0.0841, 0.3755],
    [0.1758, 0.5879, 0.0834, 0.3834], 
    [0.1019, 0.5971, 0.0449, 0.3920],
    [0.1825, 0.6019, 0.0904, 0.4013],
    [0.2051, 0.5939, 0.1057, 0.3758],
    [0.1825, 0.6032, 0.0933, 0.3831],
    [0.1866, 0.6061, 0.0946, 0.3739],
    [0.2111, 0.6048, 0.1025, 0.3857]
]

We chose 75 bins as it's a good compromise for all the four metrics. The rationale is similar to the [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion).

For a given number of bins, we proceed to compute the prior using the entire training set, and evaluate it on the entire testing set.