# Exploring Charades-STA

In [None]:
import pandas as pd
import h5py

def parse_charades_sta(filename):
    """Parse raw Charades-STA annotations

    Every non-empty line is expected to look like
    `<video_id> <t_start> <t_end>##<description>`.

    Args:
        filename (str or path-like) : annotation file to read.
    Returns:
        instances (list of dicts) : one dict per annotation with keys
            `video` (str), `times` (`[[t_start, t_end]]` as floats) and
            `description` (str).
    Raises:
        ValueError : if a non-empty line has no `##` separator, if the part
            before it does not hold exactly three fields, or if the times
            cannot be converted to float.
    TODO:
        update dict by class

    """
    instances = []
    with open(filename, 'r') as fid:
        for line in fid:
            line = line.strip()
            # Tolerate blank lines (e.g. an extra newline at the end of
            # the file) instead of failing on the unpacking below
            if not line:
                continue
            # Split on the first separator only, such that a description
            # containing '##' is kept intact
            video_info, description = line.split('##', 1)
            video_id, t_start, t_end = video_info.split()

            instances.append(
                {'video': video_id,
                 'times': [[float(t_start), float(t_end)]],
                 'description': description}
            )
    return instances

def make_annotations_df(instances, file_h5):
    """Build per-video and per-moment DataFrames from parsed annotations

    Args:
        instances (list of dicts) : items with at least `video` and `times`,
            e.g. the output of function::parse_charades_sta.
        file_h5 (str) : HDF5 file with one dataset per video-id. Only the
            size of the first dimension of each dataset is read.
    Returns:
        videos_df (DataFrame) : one row per video present in both the
            annotations and the HDF5 file, with `video`, `num_frames`
            and `num_instances`.
        instances_df (DataFrame) : the instances plus flat `t_start` and
            `t_end` columns taken from the first segment in `times`.
    """
    rows = []
    for item in instances:
        first_segment = item['times'][0]
        row = dict(item)
        row['t_start'] = first_segment[0]
        row['t_end'] = first_segment[1]
        rows.append(row)
    instances_df = pd.DataFrame(rows)

    annotated_videos = set(instances_df['video'].unique())
    moments_per_video = instances_df.groupby('video')

    videos_info = []
    with h5py.File(file_h5, 'r') as f:
        for video_id, dataset in f.items():
            # videos without annotations are of no interest here
            if video_id in annotated_videos:
                group = moments_per_video.get_group(video_id)
                videos_info.append(
                    {'video': video_id,
                     'num_frames': dataset.shape[0],
                     'num_instances': group.shape[0],
                    }
                )
    return pd.DataFrame(videos_info), instances_df

## 1. Moments duration analysis

Why? to extend SMCN and all its variants, we need to do the dirty work done by DiDeMo setup.

> DiDeMo makes it easy by defining the search space up front.

What? We need to set a couple of parameters: (i) _minimum_ moment length, (ii) _maximum_ moment length, (iii) _type of range_, how to explore minimum -> maximum, and (iv) _striding_. Those parameters will define the search space, and will set the stage to define the size of the chunk/clip. 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Quantile highlighted in the plot titles. It must be one of the values in
# QUANTILES because the titles look it up with `QUANTILES == Q`.
Q = 95
# Percentiles to evaluate: 25, 27.5, ..., 100
QUANTILES = np.arange(25, 101, 2.5)
# One colour per subset (train, test) plus one for the pooled curve
COLOR = ['blue', 'orange', 'green']

# Moment durations pooled over all subsets
all_duration = []
# One panel per subset; the last panel overlays every curve
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
for i, subset in enumerate(['train', 'test']):
    filename = f'../data/raw/charades/charades_sta_{subset}.txt'
    data = parse_charades_sta(filename)
    # Duration of each moment, t_end - t_start. The `i` of the comprehension
    # lives in its own scope, thus it does not clobber the loop index.
    duration = [i['times'][0][1] - i['times'][0][0]
                for i in data
#                 if i['times'][0][1] > i['times'][0][0]
               ]
    all_duration += duration

    duration = np.array(duration)
    # The count includes zero-length moments (the test is `<= 0`)
    print('Negative durations: ', sum(duration <= 0))
    percentiles = np.percentile(duration, QUANTILES)
    axs[i].plot(percentiles, QUANTILES, color=COLOR[i])
    axs[-1].plot(percentiles, QUANTILES, color=COLOR[i])
    axs[i].set_xlabel('Duration')
    axs[i].set_ylabel('Quantile')
    # `Min` ignores non-positive durations; the other stats use all of them
    axs[i].set_title(f'Duration stats {subset}\n'
                     f'Min: {np.min(duration[duration > 0]):.2f}, '
                     f'Median: {np.median(duration):.2f}, '
                     f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
                     f'Max: {np.max(duration):.2f}')
# Same statistics over the pooled (train + test) durations
duration = np.array(all_duration)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1])
axs[-1].set_xlabel('Duration')
axs[-1].set_ylabel('Quantile')
# Assigning to `_` keeps the notebook from echoing the returned Text object
_ = axs[-1].set_title('Duration stats (train+test)\n'
                      f'Min: {np.min(duration[duration > 0]):.2f}, '
                      f'Median: {np.median(duration):.2f}, '
                      f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
                      f'Max: {np.max(duration):.2f}')

- Minimum length of a moment.

  The initial batch of experiments will be done with 3s as it's close to the minimum and there are psychological references that support that number.

  > DiDeMo makes it easy by defining the minimum length of the segments to 5s with variance 0.
  
- Maximum length of moment.
  
  We ended up taking 24s as it's close to the maximum moment duration in the testing set.
  
    > DiDeMo makes it easy by defining the maximum length of the segments to 30s and setting the length of the video also to 30s.

- Explore range from minimum to maximum moment length.

  For simplicity we will consider a linear scale with unit slope.
    
- Amount of stride.

  The above parameters define $\mathcal{W}$, the set of all possible window durations. Where do we place any given $w \in \mathcal{W}$? A potential approach is to place them uniformly every $s$ seconds. The distance between two consecutive placements would be the stride.

Based on the parameters mentioned above, we will study the size of the search space and its recall upper bound.

There are multiple ways to generate the search space, here we consider two:

## 1a. Search space with sliding windows

In [None]:
import sys
sys.path.append('..')

import h5py
import numpy as np
import pandas as pd

from np_segments_ops import iou as segment_iou

# Frame rate used to turn a frame count into a duration (num_frames / FPS)
FPS = 5
# temporal-IOU thresholds at which the recall upper bound is reported
IOU_THRESHOLDS = [0.5, 0.7]

def generate_windows(length, scale, linear=True):
    """Create the canonical multi-scale (duration) windows

    All windows start at 0; the k-th one ends at `k * length` for
    k = 1, ..., scale.

    Args:
        length : duration of the shortest window.
        scale (int) : number of windows to generate.
        linear (bool, optional) : only the linear progression of durations
            is implemented.
    Returns:
        windows : float array of shape (scale, 2) with a `[start, end]`
            pair on each row.
    Raises:
        NotImplementedError : if `linear` is False.
    """
    if linear:
        ends = length * np.arange(1, scale + 1)
        return np.column_stack([np.zeros(scale), ends])
    raise NotImplementedError('WIP')

def sliding_window(length, scale, stride, t_end,
                   t_start=0, linear=True):
    """Slide the canonical multi-scale windows over a time interval

    The windows of function::generate_windows are replicated at every
    position `t_start, t_start + stride, ...` strictly before `t_end`.

    Args:
        length, scale, linear : forwarded to function::generate_windows.
        stride : distance between two consecutive positions.
        t_end : end of the interval (exclusive for the positions).
        t_start (optional) : beginning of the interval.
    Returns:
        windows : array with a `[start, end]` pair on each row, grouped
            by position.

    Note: windows are neither dropped nor trimmed when they end after
    `t_end`. The original author considered two alternatives:
      - keeping only windows with `end <= t_end`, which is clean but
        changes the numbers;
      - clamping `end` to `t_end`, which is hacky (the length of some
        windows is no longer an integer multiple of `length`) and gives
        R@0.5=1.
    """
    base_windows = generate_windows(length, scale, linear)
    positions = np.arange(t_start, t_end, stride)
    # adding the position yields a shifted copy, the base stays untouched
    shifted = [base_windows + position for position in positions]
    return np.vstack(shifted)

def recall_bound_and_search_space(videos_df, instances_df,
                                  stride, length, scale, linear=True,
                                  slidding_window_fn=sliding_window,
                                  iou_thresholds=IOU_THRESHOLDS, fps=FPS):
    """Compute recall upper-bound and search-space size for a given stride

    For every video, candidate windows are generated with
    `slidding_window_fn` and compared against the ground-truth moments. A
    moment counts as recalled at a given threshold when the window that
    overlaps it the most reaches that tIOU.

    Args:
        videos_df : DataFrame with video info, required `video` and
                    `num_frames`.
        instances_df : DataFrame with instance info, required `video`,
                       `t_start` and `t_end`.
        stride, length, scale, linear : forwarded to `slidding_window_fn`
            as `(length, scale, stride, t_end, linear=linear)`.
        slidding_window_fn : callable returning an array of windows with
            a `[start, end]` pair on each row.
        iou_thresholds : sequence of tIOU thresholds.
        fps : frame rate used to turn `num_frames` into the video duration.
    Returns:
        recall_ious : array with the recall for each threshold. NaN if
            there was not a single valid ground-truth moment.
        search_space_median_std : array `[median, std]` of the number of
            windows per video, over the videos with at least one valid
            moment. NaN if there was none.
    Raises:
        KeyError : if a video of `instances_df` is missing in `videos_df`.

    Note: this takes ~5s
    """
    videos_gbv = videos_df.groupby('video')

    matched_gt_per_iou = [[] for _ in iou_thresholds]
    # Grown as videos are processed. A pre-allocated buffer would keep
    # uninitialized entries for the skipped videos and corrupt the stats.
    search_space_card_per_video = []
    for video_id, gt_instances in instances_df.groupby('video'):
        # Get ground-truth segments as a fresh array, such that the
        # sanitization below never touches the DataFrame
        gt_segments = np.column_stack(
            [gt_instances['t_start'].values, gt_instances['t_end'].values]
        ).astype(float)

        # Estimate video duration, honoring the `fps` argument
        num_frames = videos_gbv.get_group(
            video_id)['num_frames'].values[0]
        t_end = num_frames / fps

        # sanitize
        # i) clamp moments inside video
        gt_segments = np.minimum(gt_segments, t_end)
        # ii) remove moments with duration <= 0
        valid = (gt_segments[:, 1] - gt_segments[:, 0]) > 0
        if not valid.any():
            continue
        gt_segments = gt_segments[valid, :]

        # Generate search space
        windows = slidding_window_fn(length, scale, stride, t_end,
                                     linear=linear)
        search_space_card_per_video.append(len(windows))

        # IOU between windows and gt_segments
        # NOTE(review): the reduction over axis 0 assumes that segment_iou
        # returns one row per window and one column per ground-truth
        # segment -- confirm against np_segments_ops
        iou_pred_vs_gt = segment_iou(windows, gt_segments)
        # Computing upper-bound of recall, given that we don't have
        # more info to do assignments
        iou_per_gt_i = iou_pred_vs_gt.max(axis=0)

        # Book-keeping of matched moments for each threshold
        for matched, iou_thr in zip(matched_gt_per_iou, iou_thresholds):
            matched.append(iou_per_gt_i >= iou_thr)

    recall_ious = np.full(len(iou_thresholds), np.nan)
    for j, list_of_arrays in enumerate(matched_gt_per_iou):
        if list_of_arrays:
            # mean of booleans == matched / total
            recall_ious[j] = np.concatenate(list_of_arrays).mean()

    if search_space_card_per_video:
        search_space_median_std = np.array(
            [np.median(search_space_card_per_video),
             np.std(search_space_card_per_video)]
        )
    else:
        search_space_median_std = np.full(2, np.nan)

    return recall_ious, search_space_median_std

On the training set, the results look as follows:

In [None]:
# Variables to edit
# Passed as `length` to recall_bound_and_search_space
min_length = 3
# NOTE(review): not referenced by the analysis of this cell, where the
# longest window is min_length * num_scales -- confirm it can be dropped
max_length = 80
# Passed as `scale` to recall_bound_and_search_space
num_scales = 8
# Strides to sweep, one evaluation per value
strides = [1, 2, 3, 4, 5]

annotation_file = '../data/raw/charades/charades_sta_train.txt'
# HDF5 file used by make_annotations_df to get the number of frames per video
features_file = '/home/escorciav/datasets/charades/features/resnet101-openimages_5fps_320x240.h5'

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
font_size = 12
COLOR_2ND_AXIS = 'red'
IOU_COLORS = ['blue', 'orange', 'green']
iou_thresholds = IOU_THRESHOLDS
# One colour per tIOU threshold plus one for their average
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = parse_charades_sta(annotation_file)
videos_df, instances_df = make_annotations_df(instances, features_file)

# Recall upper-bound and search-space size for each stride
recalls = []
search_space = []
for stride in tqdm(strides):
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
        # keep the evaluation in sync with the thresholds plotted below
        iou_thresholds=iou_thresholds,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
# Extra last column: recall averaged over the tIOU thresholds
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
# The curves carry labels, thus show them
ax1.legend(fontsize=font_size)
# tick_params sets the size of whichever tick labels are visible. The
# former per-tick loop used `tick.label`, which is deprecated in newer
# Matplotlib and refers to the left label, hidden on the twin axis.
ax1.tick_params(axis='both', labelsize=font_size)
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params(axis='y', colors=COLOR_2ND_AXIS, labelsize=font_size)

To get an idea of the values

In [None]:
# Table with one row per stride: recall per tIOU threshold, their average
# and the statistics of the search-space size
num_thresholds = len(iou_thresholds)
info = {'Stride': np.array(strides)}
for column, values in enumerate(recalls.T):
    if column < num_thresholds:
        tag = iou_thresholds[column]
    else:
        # columns beyond the thresholds hold the average recall
        tag = f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})'
    info[f'R@{tag}'] = values
info['Search space size (median)'] = search_space[:, 0]
info['Search space size (std)'] = search_space[:, 1]
display(pd.DataFrame(info))

Let's make sure that the results look similar in testing, a.k.a. there is _NOT_ a distribution (training vs. testing) mismatch.

In [None]:
# Switch to the testing annotations; the remaining settings (strides,
# min_length, num_scales, features_file) are whatever the notebook holds
annotation_file = '../data/raw/charades/charades_sta_test.txt'

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
font_size = 12
COLOR_2ND_AXIS = 'red'
IOU_COLORS = ['blue', 'orange', 'green']
iou_thresholds = IOU_THRESHOLDS
# One colour per tIOU threshold plus one for their average
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = parse_charades_sta(annotation_file)
videos_df, instances_df = make_annotations_df(instances, features_file)

# Recall upper-bound and search-space size for each stride
recalls = []
search_space = []
for stride in tqdm(strides):
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
        # keep the evaluation in sync with the thresholds plotted below
        iou_thresholds=iou_thresholds,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
# Extra last column: recall averaged over the tIOU thresholds
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
# The curves carry labels, thus show them
ax1.legend(fontsize=font_size)
# tick_params sets the size of whichever tick labels are visible. The
# former per-tick loop used `tick.label`, which is deprecated in newer
# Matplotlib and refers to the left label, hidden on the twin axis.
ax1.tick_params(axis='both', labelsize=font_size)
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params(axis='y', colors=COLOR_2ND_AXIS, labelsize=font_size)

To get an idea of the values

In [None]:
# Table with one row per stride: recall per tIOU threshold, their average
# and the statistics of the search-space size
num_thresholds = len(iou_thresholds)
info = {'Stride': np.array(strides)}
for column, values in enumerate(recalls.T):
    if column < num_thresholds:
        tag = iou_thresholds[column]
    else:
        # columns beyond the thresholds hold the average recall
        tag = f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})'
    info[f'R@{tag}'] = values
info['Search space size (median)'] = search_space[:, 0]
info['Search space size (std)'] = search_space[:, 1]
display(pd.DataFrame(info))

__Conclusions__

- With a 1s stride, there is small gap due to _not_ regressing the exact boundaries for IOU=0.7.

- However, with such a low stride, the median size of the search space is an order of magnitude greater than DiDeMo. Note that the median duration of the videos is about the same as in DiDeMo (see section [Video duration](#Video-duration)).

- In case we wanna reduce the search space, good values to consider are:
  - _min moment duration_: 3 seconds
  - _max moment duration_: 24 seconds
  - _scale from min to max_: linear, with unit slope, from min to max moment duration.
  - _stride_: 3 seconds
  - _clip/chunk size_: 3 seconds
  
- What would happen if min/max moment duration or stride are not a multiple of clip/chunk size?

  We need to do rounding or interpolation of features.
  
In case you are interested in the relationship between the size of the search space, $|\mathcal{S}|$, and the duration of the video $d$, you can use this formula:

$|\mathcal{S}| = \sum_{w_l \in \mathcal{W}} (\frac{d}{s} + 1) = |\mathcal{W}| (\frac{d}{s} + 1) $

i.e. $|\mathcal{S}| \propto |\mathcal{W}| \frac{d}{s} $

where:

- $\mathcal{W}$ is the set of all possible durations. 

- $s$ stride of the window. We use the same stride for all the different $w_l$.

_TLDR_ More details. Skip if you are in a rush.

The above formula comes from the generic:

$|S| \propto \sum_{w_l \in \mathcal{W}} (\frac{d - w_l + 2p}{s} + 1)  $

where $2p$ is the amount of padding. In our case, $2p = w_l$ because we kept windows ending at a time longer than $d$.

We can get a better upper-bound by clamping ending time to $d$. However, that's tricky as some windows may have length which is not inside $\mathcal{W}$.

### Video duration

The previous analysis only considers the duration of the moments. What about the video duration?

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Quantile highlighted in the plot titles. It must be one of the values in
# QUANTILES because the titles look it up with `QUANTILES == Q`.
Q = 95
# Frame rate used to turn the number of frames into a duration
FPS = 5
# Percentiles to evaluate: 25, 27.5, ..., 100
QUANTILES = np.arange(25, 101, 2.5)
# One colour per subset (train, test) plus one for the pooled curve
COLOR = ['blue', 'orange', 'green']
# HDF5 file read by make_annotations_df to get `num_frames` of each video
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/charades/features/resnet152-imagenet_5fps_320x240.hdf5'

# One Series of video durations per subset
all_duration = []
# One panel per subset; the last panel overlays every curve
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
for i, subset in enumerate(['train', 'test']):
    filename = f'../data/raw/charades/charades_sta_{subset}.txt'
    data = parse_charades_sta(filename)
    videos_df, _ = make_annotations_df(data, H5_FILE_FEAT_PER_FRAME)
    # Estimated duration of each video
    duration = videos_df['num_frames'] / FPS
    all_duration.append(duration)

    # The count includes zero-length videos (the test is `<= 0`)
    print('Negative durations: ', sum(duration <= 0))
    percentiles = np.percentile(duration, QUANTILES)
    axs[i].plot(percentiles, QUANTILES, color=COLOR[i])
    axs[-1].plot(percentiles, QUANTILES, color=COLOR[i])
    axs[i].set_xlabel('Duration')
    axs[i].set_ylabel('Quantile')
    # `Min` ignores non-positive durations; the other stats use all of them
    axs[i].set_title(f'Duration stats {subset}\n'
                     f'Min: {np.min(duration[duration > 0]):.2f}, '
                     f'Median: {np.median(duration):.2f}, '
                     f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
                     f'Max: {np.max(duration):.2f}')
# Same statistics over the pooled (train + test) durations
duration = pd.concat(all_duration, axis=0, ignore_index=True)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1])
axs[-1].set_xlabel('Duration')
axs[-1].set_ylabel('Quantile')
# Assigning to `_` keeps the notebook from echoing the returned Text object
_ = axs[-1].set_title('Duration stats (train+test)\n'
                      f'Min: {np.min(duration[duration > 0]):.2f}, '
                      f'Median: {np.median(duration):.2f}, '
                      f'{Q}Q: {percentiles[QUANTILES == Q][0]:.2f} '
                      f'Max: {np.max(duration):.2f}')

## 1b. Alternatives

__SKIP THIS__

_TLDR_ Skip this in the interest of time. The thoughts below came to our mind when we were implementing the idea above.


## 1b.1 Search space with DiDeMo style segments

In this section, we consider the setup of generating segments as in DiDeMo. This setup only changes the arrangement of the segments over a window of longer duration ($w_{\text{max}}$). In particular, the configuration would look similar to this:

TODO: missing figure

Given that videos may be longer, or shorter, than $w_{\text{max}}$, we must stride this arrangement to cover the entire video.

In this case, the search space is computed as follow:

$|S| \propto \frac{d}{w_{\text{max}}} (\frac{w_{\text{max}}}{c} + \sum_{i=1}^{w_{\text{max}}/w_{\text{min}}} (\frac{w_{\text{max}} - i \; w_{\text{min}}}{c} + 1)) $

- $w_{\text{max}}$ is the maximum moment duration.

- $w_{\text{min}}$ is the minimum moment duration.

- $c$ is the length of the clip or chunk.

- Note that this assumes a stride equal to $w_{\text{max}}$, which is reflected in the denominator of the first fraction.

__Note__: the formula may break if $w_{\text{max}}$ is not multiple of $w_{\text{min}}$ and $w_{\text{min}}$ is not divisible by $c$.

For fair comparison with [1a](), we also clamped the segments longer than the video duration.

In [None]:
# Generate windows with DiDeMo arragement

def didemo_arragement(w_min, w_max, c=None):
    """Canonical DiDeMo-style arrangement of windows

    For every duration `w_min, w_min + c, ...` up to `w_max`, windows are
    placed every `c` units as long as they fit inside `[0, w_max]`.

    Args:
        w_min : duration of the shortest window.
        w_max : duration of the longest window.
        c (optional) : size of the clip/chunk. Only `c == w_min` is
            supported, which is also the default.
    Returns:
        windows : array with a `[start, end]` pair on each row, grouped by
            duration from shortest to longest.
    Raises:
        AssertionError : if `c` differs from `w_min`.
    """
    c = w_min if c is None else c
    assert c == w_min
    # slack to make the end of both ranges inclusive
    eps = 1e-6
    per_duration = []
    for w in np.arange(w_min, w_max + eps, c):
        starts = np.arange(0, w_max - w + eps, c)
        per_duration.append(np.stack([starts, starts + w], axis=1))
    return np.vstack(per_duration)

def sliding_window3(w_min, w_max, stride, t_end,
                    t_start=0, linear=True):
    """Slide the DiDeMo-style arrangement over a time interval

    The windows of function::didemo_arragement are replicated at every
    position `t_start, t_start + stride, ...` strictly before `t_end`.

    Args:
        w_min, w_max : forwarded to function::didemo_arragement.
        stride : distance between two consecutive positions.
        t_end : end of the interval (exclusive for the positions).
        t_start (optional) : beginning of the interval.
        linear (optional) : unused, kept to share the signature of
            function::sliding_window.
    Returns:
        windows : array with a `[start, end]` pair on each row, grouped
            by position.

    Note: windows ending after `t_end` are neither dropped nor trimmed.
    Both options are still TBD according to the original author. Trimming
    would also require removing the windows left with `end <= start`, as
    this arrangement does not guarantee that clamping yields correct
    segments.
    """
    base_windows = didemo_arragement(w_min, w_max)
    positions = np.arange(t_start, t_end, stride)
    # adding the position yields a shifted copy, the base stays untouched
    shifted = [base_windows + position for position in positions]
    return np.vstack(shifted)

Running analysis only in testing set because we already know that we can reach Recall of 1 there.

In [None]:
# Variables to edit
# Passed as `length` to recall_bound_and_search_space, which hands it to
# sliding_window3 as `w_min`
min_length = 3
# Passed as `scale` to recall_bound_and_search_space, which hands it to
# sliding_window3 as `w_max`
max_length = 24
# Strides to sweep, one evaluation per value
strides = [1, 3, 12, 24]

annotation_file = '../data/raw/charades/charades_sta_test.txt'
# HDF5 file used by make_annotations_df to get the number of frames per video
features_file = '/home/escorciav/datasets/charades/features/resnet101-openimages_5fps_320x240.h5'

import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline
font_size = 12
COLOR_2ND_AXIS = 'red'
IOU_COLORS = ['blue', 'orange', 'green']
iou_thresholds = IOU_THRESHOLDS
# One colour per tIOU threshold plus one for their average
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = parse_charades_sta(annotation_file)
videos_df, instances_df = make_annotations_df(instances, features_file)

# Recall upper-bound and search-space size for each stride
recalls = []
search_space = []
for stride in tqdm(strides):
    # this was a hack to evaluate didemo-style arragement of windows:
    # `length` and `scale` reach sliding_window3 as `w_min` and `w_max`
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=max_length,
        slidding_window_fn=sliding_window3,
        # keep the evaluation in sync with the thresholds plotted below
        iou_thresholds=iou_thresholds,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
# Extra last column: recall averaged over the tIOU thresholds
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
# The curves carry labels, thus show them
ax1.legend(fontsize=font_size)
# tick_params sets the size of whichever tick labels are visible. The
# former per-tick loop used `tick.label`, which is deprecated in newer
# Matplotlib and refers to the left label, hidden on the twin axis.
ax1.tick_params(axis='both', labelsize=font_size)
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params(axis='y', colors=COLOR_2ND_AXIS, labelsize=font_size)

In [None]:
# Table with one row per stride: recall per tIOU threshold, their average
# and the statistics of the search-space size
num_thresholds = len(iou_thresholds)
info = {'Stride': np.array(strides)}
for column, values in enumerate(recalls.T):
    if column < num_thresholds:
        tag = iou_thresholds[column]
    else:
        # columns beyond the thresholds hold the average recall
        tag = f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})'
    info[f'R@{tag}'] = values
info['Search space size (median)'] = search_space[:, 0]
info['Search space size (std)'] = search_space[:, 1]
display(pd.DataFrame(info))

_Conclusion_: comparing the two window arrangements, [1a](#1a.-Search-space-with-sliding-windows) vs [1b.1](#1b.1-Search-space-with-DiDeMo-style-segments), we can conclude:

- For 1s stride, 1b generates more windows than 1a.

  In case we are open to explore the search space in a different way, it seems [1a](#1a.-Search-space-with-sliding-windows) reduces the amount of moments to explore.

### 1b.2 Other consideration

Thoughts regarding the selection of the parameters mentioned at the beginning of section 1. 

- Minimum length of moment.

  We can pick the minimum or probably something around
  
  a. $\text{min} ( \text{duration} ) (1 + \text{tIOU}_\text{ref})$
  
  b. $\text{min} ( \text{duration} ) (1 + \text{tIOU}_\text{ref}) \pm \sigma_{\text{annotators consensus}}$
  
     $\sigma_{\text{annotators consensus}}$ could be taken from [Sigurdsson et. al ICCV-2017](https://arxiv.org/abs/1708.02696)

- Maximum length of moment.

  Picking the maximum duration is trickier than the minimum. Indeed, this could become a chicken or egg problem. Anyways, the practical thoughts are:
  
  a. $\text{max} ( \text{duration} ) (1 - \text{tIOU}_\text{ref})$
  
  b. $\text{max} ( \text{duration} ) (1 - \text{tIOU}_\text{ref}) \pm \sigma_{\text{annotators consensus}}$
  
     $\sigma_{\text{annotators consensus}}$ could be taken from [Sigurdsson et. al ICCV-2017](https://arxiv.org/abs/1708.02696)
     
  Those assume that your data is uniformly distributed, which is not the case based on the above plots. The distribution is skewed towards smaller durations.
  
  c. If you aren't interested on working on the long tail, a hard cut-off is OK. Why? the [paper](https://arxiv.org/pdf/1705.02101.pdf) didn't disclose the UI to collect annotation, nor mechanism to enforce consensus. thus, it is fine to consider them outliers or out of the scope of this work if you wanna be politically correct.
  
  d. We must tackle the long tail! OK, the first step is to study multiple annotators; later we can come up with scalable methods for tackling the long tail. This is still an open problem: people who have done [it](https://cs.stanford.edu/people/ranjaykrishna/densevid/) are unsure how to set up and scale the study. Once we understand the nature of the long tail, designing the methods will be more interesting. It's like throwing darts at the actual target rather than at the wall.

- Explore range from minimum to maximum moment length.

  - We still haven't made up our minds about this. Sounds like a nice assignment in a computer vision class. Thus, we won't put it in the appendix.

### 1b.3 Other thoughts

_Note_ this was written before implementing section 1b.

The previous analysis studies how to recover as many moments as possible, focusing only on their duration. This perspective assumes that the search space will consider many candidates. The candidates could come from a proposals method or a sliding window approach.

Another perspective is to define a canonical video length. To scale this perspective to videos of multiple length, we could consider the following strategies:

(i) sliding the model over time (convolution kinda).

(ii) rescaling the temporal dimension (temporal pyramids) similar to how we re-scale images.

  > sounds a bit like a variable clip/chunk-size.

## 2. Dump data for training and evaluation

### 2a. JSON files

File to dump
```json
{
     'moments': [moment_dict, ...],
     'videos': {video_id: video_dict, ...},
     'time_unit': 3,
     'date': str,
     'responsible': str,
 }
```

time_unit := size of chunk/clip in seconds.

video_dict
```json
{
    'duration': float,
    'num_clips': int
}
```
duration := approximate video durations

num_clips := number of chunks of `time_unit` length in the video

moment_dict
```json
{
    'description': str,
    'annotation_id': int,
    'video': str,
    'time': [float, float],
    'times': [[float, float]]
}
```

description := description provided by the annotators

times := list with all segments associated with a given description. Why a list? it is inherited from DiDeMo where you have multiple segments for a given description.

time := first item in `times`. Given that moments in Charades-STA only have a single segment, it's easier to create a new attribute with it. We keep both to not break our dashboards.

video := unique video-id to refer to the video. Make sure that this matches the HDF5 with the features.

__Note__: It requires to run 1st cell with `function:parse_charades_sta ` and `function:make_annotations_df`

In [None]:
# Subsets to dump, in the order used to assign annotation ids
SUBSETS = ['train', 'test']
# Size of the chunk/clip in seconds, stored as `time_unit` in the JSON
TIME_UNIT = 3
# Mode used to open the output JSON files; 'w' overwrites existing files
MODE = 'w'
# Frame rate used to turn the number of frames into a duration
FPS = 5
# Stored as `responsible` in the JSON
CREATOR = 'EscorciaSSGR'
# HDF5 file from which the number of clips per video is read
H5_FILE = '/home/escorciav/datasets/charades/features/resnet152_max.h5'
# HDF5 file from which the number of frames per video is read
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/charades/features/resnet152-imagenet_5fps_320x240.hdf5'
# Safety guard: running this cell as-is must not overwrite the JSON files
if MODE == 'w':
    print('are you sure you wanna do this? comment these 3 lines!')
    # Explicit exception instead of a bare `raise`, which fails with the
    # unrelated message "No active exception to reraise"
    raise RuntimeError(
        "MODE == 'w' would overwrite existing JSON files; "
        'comment out this guard to proceed')
assert SUBSETS == ['train', 'test']

import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import h5py
import sys
sys.path.append('..')
from utils import get_git_revision_hash

def extend_metadata(list_of_moments, videos_gbv, filename, offset=0, fps=FPS):
    """Augment moments' (in-place) metadata and create video metadata

    Moments are clamped to the estimated video duration. Those left with a
    duration <= 0 are dropped from the output, although their `times` are
    still clamped in-place.

    Args:
        list_of_moments (list of dicts) : the output of
            function::parse_charades_sta.
        videos_gbv (DataFrame groupedby) : DataFrame grouped by `video_id`. It
            is mandatory that the DataFrame has a column `num_frames` such that
            we can estimate the duration of the video.
        filename (str) : path to HDF5 with all features of the dataset.
        offset (int, optional) : ensure annotation-id accross moments is unique
        fps (optional) : frame rate used to turn `num_frames` into the video
            duration.
    Returns:
        videos (dict) : video-id -> dict with `duration`, `num_clips` (size
            of the first dimension of the video's dataset in `filename`)
            and `num_moments` (number of kept moments).
        clean_list_of_moments (list of dicts) : the kept moments, in their
            original order.
    Raises:
        AssertionError : if a moment does not have exactly one segment.
        KeyError : if a video is missing in `videos_gbv` or in `filename`.

    Adds `annotation_id` and `time` to each moment
    """
    videos = {}
    clean_list_of_moments = []
    # Explicit read-only mode: nothing is written, and the default mode of
    # h5py.File depends on the h5py version
    with h5py.File(filename, 'r') as fid:
        for i, moment in enumerate(list_of_moments):
            assert len(moment['times']) == 1
            video_id = moment['video']
            # Get estimated video duration
            num_frames = videos_gbv.get_group(
                video_id)['num_frames'].values[0]
            video_duration = num_frames / fps

            # Sanitize
            # i) clamp moments inside video
            segment = moment['times'][0]
            segment[0] = min(segment[0], video_duration)
            segment[1] = min(segment[1], video_duration)
            # ii) remove moments with duration <= 0
            if segment[1] <= segment[0]:
                continue

            # `time` refers to the same list object as `times[0]`
            moment['time'] = segment
            # we use the row index of the original CSV as unique identifier
            # for the moment. Of course, 0-indexed.
            moment['annotation_id'] = i + offset

            # Update dict with video info
            if video_id not in videos:
                videos[video_id] = {'duration': video_duration,
                                    'num_clips': fid[video_id].shape[0],
                                    'num_moments': 0}
            videos[video_id]['num_moments'] += 1
            clean_list_of_moments.append(moment)
    return videos, clean_list_of_moments

# Running count of raw annotations seen so far; it keeps `annotation_id`
# unique across subsets
offset = 0
for subset in SUBSETS:
    FILENAME = Path(f'../data/raw/charades/charades_sta_{subset}.txt')
    OUTPUT_FILE = Path(f'../data/interim/charades-sta/{subset}.json')

    instances = parse_charades_sta(FILENAME)
    # Only the per-video DataFrame is needed (for `num_frames`)
    videos_df, _ = make_annotations_df(instances, H5_FILE_FEAT_PER_FRAME)
    videos_gbv = videos_df.groupby('video')
    videos, cleaned_instances = extend_metadata(
        instances, videos_gbv, H5_FILE, offset=offset)
    # Advance by the number of raw instances, not by the number of kept
    # ones, as ids come from the row index in the raw file
    offset += len(instances)

    if not OUTPUT_FILE.parent.is_dir():
        dirname = OUTPUT_FILE.parent
        dirname.mkdir(parents=True)
        print(f'Create dir: {dirname}')

    # The file is opened before the payload is built, so with MODE 'w' an
    # existing file is already truncated if building the payload fails
    with open(OUTPUT_FILE, MODE) as fid:
        json.dump({'videos': videos,
                   'moments': cleaned_instances,
                   'time_unit': TIME_UNIT,
                   'date': datetime.now().isoformat(),
                   'git_hash': get_git_revision_hash(),
                   'responsible': CREATOR,
                  },
                  fid)

### 2b. Chunked features

Go to notebook `4-feature-extraction.ipynb` section `#Varied-length-videos` (remove the # if you use your browser string matching).

_TODO_ add procedure here to avoid jumping over the place.

### 2c. Train-val split

_TODO_

Motivation: create two disjoint partitions of the train set.

1. get action categories for each video in Charades-STA from the [annotations](http://ai2-website.s3.amazonaws.com/data/Charades.zip)([source](https://allenai.org/plato/charades/)).

2. The partition must be randomly generated, ideally with a [pseudo-random number generator](https://docs.python.org/3.6/library/random.html#random.seed), and by videos. That means that a given video $v_i$ cannot appear in both subsets.

  Requirements:
  
  - In one of the subsets the percentage of examples for a given action category  must be ~$p$% of the total number of videos associated with that category.

  - (try) make that all the categories in the $p$% subset have the same number of samples.
  
  Outcome:
  
  - Bar plot with the number of Charades-STA videos associate with each action category on both subsets.
  
  - Add the median number of videos per category in the title of the plots.

3. Dump splits with the format described above.

In [None]:
import json
from copy import deepcopy

# Suffix of the output files, to tell apart different trials of the split
trial = '01'

with open('data/processed/charades-sta/train.json', 'r') as fid:
    data_train = json.load(fid)
# annotation-id -> position of the moment inside data_train['moments']
id2ind = {
    moment['annotation_id']: position
    for position, moment in enumerate(data_train['moments'])
}

for subset, subset_ in (('train', 'training'), ('val', 'validation')):
    filename = f'data/interim/charades-sta/{subset_}_set_split_75-25_threshold_8.json'
    with open(filename, 'r') as fid:
        indices = json.load(fid)

    # Start from a copy of the training data to inherit all other fields
    data_subset = deepcopy(data_train)
    # annotation ids unknown to train.json are silently ignored
    moments = [data_train['moments'][id2ind[annotation_id]]
               for annotation_id in indices
               if annotation_id in id2ind]
    data_subset['moments'] = moments
    # Only keep the videos with at least one moment in this subset
    data_subset['videos'] = {}
    for moment in moments:
        video_id = moment['video']
        data_subset['videos'].setdefault(
            video_id, data_train['videos'][video_id])

    # mode 'x' refuses to overwrite an existing file
    with open(f'data/processed/charades-sta/{subset}-{trial}.json', 'x') as fid:
        json.dump(data_subset, fid)

## 3. Baselines single video retrieval

### 3.1 CTRL

| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: | 
| CTRL (aln)       |   17.69     |    5.91     |    55.54    |     23.79   |
| CTRL (reg-p)     |   19.22     |    6.64     |    57.98    |     25.22   |
| CTRL (reg-np)    |   21.42     |    7.15     |    59.11    |     26.91   |

### 3.2 Moment Frequency Prior

Results in a train-val split from the train set for our search space (sliding windows between length 3s (seconds) and max length 24s with steps of 3s, stride 3s) and with `NMS = 0.6`. Please don't fool yourself and update the baseline according to your search strategy.

In [None]:
# Name of each column in `results`
metrics = 'r@1,0.5', 'r@5,0.5', 'r@1,0.7', 'r@5,0.7'
# Number of bins that were tried; note that the list is not sorted
# NOTE(review): presumably results[k] was obtained with bins[k] -- confirm,
# given that the order of `bins` is not monotonic
bins = [5, 10, 15, 30, 20, 50, 100, 75, 1000, 500]
# One row per entry in `bins`, one column per entry in `metrics`
results = [
    [0.0678, 0.5051, 0.0245, 0.2522],
    [0.0682, 0.5191, 0.0248, 0.2978],
    [0.1729, 0.5815, 0.0841, 0.3755],
    [0.1758, 0.5879, 0.0834, 0.3834], 
    [0.1019, 0.5971, 0.0449, 0.3920],
    [0.1825, 0.6019, 0.0904, 0.4013],
    [0.2051, 0.5939, 0.1057, 0.3758],
    [0.1825, 0.6032, 0.0933, 0.3831],
    [0.1866, 0.6061, 0.0946, 0.3739],
    [0.2111, 0.6048, 0.1025, 0.3857]
]

We chose 75 bins as it's a good compromise for all the four metrics. The rationale is similar to the [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion).

For a given number of bins, we proceed to compute the prior using the entire training set, and evaluate on the entire testing set.