# Exploring ActivityNet-Captions

In [None]:
import sys
import json
import glob
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from nb_utils import make_annotations_df
from nb_utils import recall_bound_and_search_space
from nb_utils import sliding_window

def parse_activitynet_captions(filename):
    """Parse raw ActivityNet Captions annotations.

    Args:
        filename (str): path to a JSON file that maps every video id to a
            dict holding the parallel lists "timestamps" and "sentences".
    Returns:
        instances (list of dicts): one dict per (timestamp, sentence) pair
            with the keys 'video', 'times' (the interval wrapped in a
            single-element list) and 'description'.
    """
    with open(filename) as fid:
        annotations = json.load(fid)

    # zip() pairs each interval with its sentence and stops at the shorter
    # of the two lists.
    return [
        {'video': video_id,
         'times': [interval],
         'description': description}
        for video_id in annotations
        for interval, description in zip(
            annotations[video_id]["timestamps"],
            annotations[video_id]["sentences"])
    ]

## 1. Moments duration analysis

Similar to notebook 11.

We need to set a couple of parameters:

(i) _minimum_ moment length

(ii) _maximum_ moment length

(iii) _type of range_, how to explore minimum -> maximum

(iv) _striding_.

Those parameters will define the search space, and will set the stage to define the size of the chunk/clip. 

_Note:_ following [Xu et al., arXiv 2018](https://arxiv.org/pdf/1804.05113.pdf), we fuse the two annotations in the validation set.

The first step is to get an idea of the duration of the moments in the dataset.


In [None]:
# Percentile curves of the moment duration for train, val (val_1 + val_2
# merged) and the union of both, one panel each.
Q = 95
QUANTILES = np.arange(25, 101, 5)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
lw = 3 # linewidth

all_duration = []
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
# we use a list of list to merge val_1 and val_2 ;)
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        data.extend(parse_activitynet_captions(filename))
    # 'val_1' / 'val_2' are reported together as 'val'
    if subset.startswith('val'):
        subset = subset[:-2]

    raw_duration = np.array([moment['times'][0][1] - moment['times'][0][0]
                             for moment in data])
    # Count the invalid (non-positive) durations BEFORE dropping them;
    # counting after the filter would always report zero.
    print('Negative durations in {}: {}'.format(
        subset, int(np.sum(raw_duration <= 0))))
    duration = raw_duration[raw_duration > 0]
    all_duration.append(duration)

    percentiles = np.percentile(duration, QUANTILES)
    axs[i].plot(percentiles, QUANTILES, color=COLOR[i], lw=lw)
    # the last panel overlays every subset
    axs[-1].plot(percentiles, QUANTILES, color=COLOR[i], lw=lw)
    axs[i].set_xlabel('Duration', fontsize=fontsize)
    axs[i].set_ylabel('Percentile', fontsize=fontsize)
    axs[i].tick_params(labelsize=fontsize)
    axs[i].set_title('Duration stats {}\nMin: {:.2f}, Median: {:.2f}, {}Q: {:.2f} Max: {:.2f}'
                     .format(subset, np.min(duration), np.median(duration), Q,
                             percentiles[QUANTILES == Q][0], np.max(duration)),
                     fontsize=fontsize)

# Statistics over train + val_1 + val_2
duration = np.concatenate(all_duration)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1], lw=lw)
axs[-1].set_xlabel('Duration', fontsize=fontsize)
# same quantity as the other panels, so use the same label
axs[-1].set_ylabel('Percentile', fontsize=fontsize)
axs[-1].tick_params(labelsize=fontsize)
_ = axs[-1].set_title('Duration stats (train+val1+val2)\nMin: {:.2f}, Median: {:.2f}, {}Q: {:.2f} Max: {:.2f}'
                      .format(np.min(duration), np.median(duration), Q,
                              percentiles[QUANTILES == Q][0], np.max(duration)),
                      fontsize=fontsize)
#fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_percentile-moment-duration.pdf', bbox_inches='tight')

Given that the distribution is quite particular, we decided to analyze the PDF and CDF closer

In [None]:
# PDF (top row) and CDF (bottom row) of the moment duration with uniform
# bins, for train, val (val_1 + val_2 merged) and the union of both.
duration_step = 5 # seconds
duration_edges = np.arange(0, 400 + duration_step - 0.1, duration_step)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
rwidth = 0.75

all_duration = []
fig, axs = plt.subplots(2, 3, figsize=(21, 14), sharex=True)
# we use a list of list to merge val_1 and val_2 ;)
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        data.extend(parse_activitynet_captions(filename))
    # 'val_1' / 'val_2' are reported together as 'val'
    if subset.startswith('val'):
        subset = subset[:-2]

    raw_duration = np.array([moment['times'][0][1] - moment['times'][0][0]
                             for moment in data])
    # Count the invalid (non-positive) durations BEFORE dropping them;
    # counting after the filter would always report zero.
    print('Negative durations in {}: {}'.format(
        subset, int(np.sum(raw_duration <= 0))))
    duration = raw_duration[raw_duration > 0]
    all_duration.append(duration)

    axs[0, i].hist(duration, duration_edges, color=COLOR[i], density=True,
                   rwidth=rwidth)
    axs[1, i].hist(duration, duration_edges, color=COLOR[i], density=True,
                   cumulative=True, rwidth=rwidth)
    axs[1, i].set_xlabel('Duration', fontsize=fontsize)
    axs[0, i].tick_params(labelsize=fontsize)
    axs[1, i].tick_params(labelsize=fontsize)
    axs[0, i].set_title(subset, fontsize=fontsize)

# Last column: train + val_1 + val_2
duration = np.concatenate(all_duration)
axs[0, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, rwidth=rwidth)
axs[1, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, cumulative=True, rwidth=rwidth)
axs[0, 0].set_ylabel('Norm frequency', fontsize=fontsize)
axs[1, 0].set_ylabel('Cum frequency', fontsize=fontsize)
axs[1, -1].set_xlabel('Duration', fontsize=fontsize)
axs[0, -1].set_title('train+val', fontsize=fontsize)
axs[0, -1].tick_params(labelsize=fontsize)
axs[1, -1].tick_params(labelsize=fontsize)
# NOTE: user-specific absolute path; adjust (or comment out) on other machines
fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_cdf-pdf_moments-duration.pdf', bbox_inches='tight')

OK, the search space is gonna be humongous doing a linear search over a minimum length of 5s. Maybe, 10s is a better choice.

Why not do a non-linear exploration?

In particular, a piece-wise linear one (_irrelevant joke, ignore it:_ who does not like PWL functions these days? Before you criticize us, make sure that you did not use a conv+relu combo in the last 10 years 😉).

In [None]:
# PDF (top row) and CDF (bottom row) of the moment duration with
# piece-wise uniform bins: bin width `widths[k]` inside
# [times[k], times[k + 1]).
times = [0, 25, 55, 125]
widths = [5, 10, 25]
duration_edges = [np.arange(times[i], times[i+1], width)
                  for i, width in enumerate(widths)]
# NOTE(review): np.arange excludes times[-1], so the last edge is 105 and
# durations above it fall outside every bin -- confirm whether 125 should
# be appended as the closing edge.
duration_edges = np.concatenate(duration_edges)
# plot stuff
# defined here so the cell does not depend on a previously executed one
COLOR = ['blue', 'orange', 'green']
fontsize = 14
rwidth = 0.75

all_duration = []
fig, axs = plt.subplots(2, 3, figsize=(21, 14), sharex=True)
# we use a list of list to merge val_1 and val_2 ;)
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        data.extend(parse_activitynet_captions(filename))
    # 'val_1' / 'val_2' are reported together as 'val'
    if subset.startswith('val'):
        subset = subset[:-2]

    raw_duration = np.array([moment['times'][0][1] - moment['times'][0][0]
                             for moment in data])
    # Count the invalid (non-positive) durations BEFORE dropping them;
    # counting after the filter would always report zero.
    print('Negative durations in {}: {}'.format(
        subset, int(np.sum(raw_duration <= 0))))
    duration = raw_duration[raw_duration > 0]
    all_duration.append(duration)

    axs[0, i].hist(duration, duration_edges, color=COLOR[i], density=True, rwidth=rwidth)
    axs[1, i].hist(duration, duration_edges, color=COLOR[i], density=True, cumulative=True, rwidth=rwidth)
    axs[1, i].set_xlabel('Duration', fontsize=fontsize)
    axs[0, i].tick_params(labelsize=fontsize)
    axs[1, i].tick_params(labelsize=fontsize)
    axs[0, i].set_title(subset, fontsize=fontsize)

# Last column: train + val_1 + val_2
duration = np.concatenate(all_duration)
axs[0, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, rwidth=rwidth)
axs[1, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, cumulative=True, rwidth=rwidth)
axs[0, 0].set_ylabel('Norm frequency', fontsize=fontsize)
axs[1, 0].set_ylabel('Cum frequency', fontsize=fontsize)
axs[1, -1].set_xlabel('Duration', fontsize=fontsize)
axs[0, -1].set_title('train+val', fontsize=fontsize)
axs[0, -1].tick_params(labelsize=fontsize)
axs[1, -1].tick_params(labelsize=fontsize)
# NOTE: user-specific absolute path; adjust (or comment out) on other machines
fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_cdf-pdf_moments-duration_non-uniform-bins.pdf', bbox_inches='tight')

The plot looks sparse and piece-wise linear :), but how did we pick the values?

1. The minimum length of 5s was picked based on the fact that it covers at least 10% of the annotations (check the cum freq plot). We liked multiples, thus the other windows are a scaled version of 5s ;).

1. What about the 25s (in `times`)? it's close to the median duration, thus we explore the range 0-50% precisely.

1. The value of 55s (in `times`) was conveniently chosen to cover around 75% of the annotations.

1. The value of 125s (in `times`) was picked because it matches the 95% coverage of the annotations.

1. There was no science picking the `widths` 10 and 25s. @escorcia wanted to move forward quickly.

  In future work, we may repeat the reasoning of (2) (explore 10% to 50%) in those ranges to select them appropriately. A more interesting strategy is to make duration edges that give you a linear CDF, i.e. you force your data to be uniformly distributed w.r.t. duration 😉. That approach should be constrained to produce a small number of widths and take into account the IOU threshold during evaluation.

### Video duration

Let's take a look at the duration of the videos

In [None]:
# PDF (top row) and CDF (bottom row) of the *video* duration for train,
# val (val_1 + val_2 merged) and the union of both.
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/activitynet/features/resnet152-imagenet_5fps_320x240.hdf5'
FPS = 5

duration_edges = np.arange(0, 211, 30)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
rwidth = 0.75
hist_style = dict(density=True, rwidth=rwidth)

all_duration = []
fig, axs = plt.subplots(2, 3, figsize=(21, 14), sharex=True)
# each inner list is a group of annotation files merged into one subset
for col, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        data.extend(parse_activitynet_captions(filename))

    videos_df, _ = make_annotations_df(data, H5_FILE_FEAT_PER_FRAME)
    # number of frames divided by the frame rate -> seconds
    duration = videos_df['num_frames'].values / FPS
    all_duration.append(duration)

    # 'val_1' / 'val_2' share the title 'val'
    if subset.startswith('val'):
        subset = subset[:-2]
    for row, cumulative in enumerate([False, True]):
        axs[row, col].hist(duration, duration_edges, color=COLOR[col],
                           cumulative=cumulative, **hist_style)
        axs[row, col].tick_params(labelsize=fontsize)
    axs[1, col].set_xlabel('Duration', fontsize=fontsize)
    axs[0, col].set_title(subset, fontsize=fontsize)

# Last column: every video of train + val
duration = np.concatenate(all_duration)
for row, cumulative in enumerate([False, True]):
    axs[row, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1],
                      cumulative=cumulative, **hist_style)
    axs[row, -1].tick_params(labelsize=fontsize)
axs[0, 0].set_ylabel('Norm frequency', fontsize=fontsize)
axs[1, 0].set_ylabel('Cum frequency', fontsize=fontsize)
axs[1, -1].set_xlabel('Duration', fontsize=fontsize)
axs[0, -1].set_title('train_val', fontsize=fontsize)
# fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_cdf-pdf_video-duration_uniform-bins-30s.pdf', bbox_inches='tight')

### 1a. Search space with sliding windows

#### 1a.1 Linear

Durations chosen between 5-50s, strides that are multiples of 5s, plus sublinear strides such as 1s and 2.5s.

In [None]:
# Variables to edit for analysis
min_length = 5
num_scales = 10
strides = [1, 2.5, 5, 10]

# Eval stuff
IOU_THRESHOLDS = [0.5, 0.7]

# Dataset stuff
annotation_files = ['../data/raw/activitynet/val_1.json',
                    '../data/raw/activitynet/val_2.json']
features_file = '/home/escorciav/datasets/activitynet/features/resnet152-imagenet_5fps_320x240.hdf5'

# plot stuff
figsize = (10, 7) # (21, 7)
fontsize = 14
lw = 3  # linewidth
COLOR_2ND_AXIS = 'red' 
IOU_COLORS = ['blue', 'orange', 'green']
iou_thresholds = IOU_THRESHOLDS
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = sum(
    [parse_activitynet_captions(i) for i in annotation_files],
    [])
videos_df, instances_df = make_annotations_df(instances, features_file)

recalls = []
search_space = []
for stride in tqdm(strides):
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

fig, ax1 = plt.subplots(figsize=figsize)
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls, lw=lw,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=fontsize)
ax1.set_ylabel('Recall', fontsize=fontsize)
ax1.tick_params('y')
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', lw=lw,
         color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=fontsize)
ax2.tick_params('y', colors='r', labelsize=fontsize)
ax1.tick_params(labelsize=fontsize)
ax1.legend(fontsize=fontsize)
# fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_recall-and-ss-vs-stride_sw-5-50-10.pdf', bbox_inches='tight')

check this

In [None]:
# Tabulate recall and search-space statistics, one row per stride.
info = {'Stride': np.array(strides)}
num_thresholds = len(iou_thresholds)
for col in range(recalls.shape[1]):
    if col < num_thresholds:
        iou = iou_thresholds[col]
    else:
        # trailing column: average over the thresholds
        iou = f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})'
    info[f'R@{iou}'] = recalls[:, col]
for col, label in enumerate(['(median)', '(std)']):
    info[f'S3 {label}'] = search_space[:, col]
display(pd.DataFrame(info))

#### 1a.2 Linear

Durations chosen between 10-120s, strides that are multiples of 10s, plus one sublinear stride of 5s.

In [None]:
# Variables to edit for analysis
min_length = 10
num_scales = 12
strides = [5, 10, 20, 30]

# Eval stuff
IOU_THRESHOLDS = [0.5, 0.7]

# Dataset stuff
annotation_files = ['../data/raw/activitynet/val_1.json',
                    '../data/raw/activitynet/val_2.json']
features_file = '/home/escorciav/datasets/activitynet/features/resnet152-imagenet_5fps_320x240.hdf5'

# plot stuff
font_size = 14
COLOR_2ND_AXIS = 'red' 
IOU_COLORS = ['blue', 'orange', 'green']
iou_thresholds = IOU_THRESHOLDS
assert len(IOU_COLORS) - 1 == len(iou_thresholds)

instances = sum(
    [parse_activitynet_captions(i) for i in annotation_files],
    [])
videos_df, instances_df = make_annotations_df(instances, features_file)

recalls = []
search_space = []
for stride in tqdm(strides):
    recall_iou, search_space_stats = recall_bound_and_search_space(
        videos_df, instances_df, stride,
        length=min_length, scale=num_scales,
        slidding_window_fn=sliding_window,
    )
    recalls.append(recall_iou)
    search_space.append(search_space_stats)
search_space = np.vstack(search_space)
recalls = np.vstack(recalls)
recalls = np.column_stack([recalls, recalls.mean(axis=1)])

fig, ax1 = plt.subplots(figsize=(21, 7))
for i, iou in enumerate(iou_thresholds + [None]):
    ls, label, color = '-', f'tIOU={iou}', IOU_COLORS[i]
    if i == len(IOU_COLORS) - 1:
        ls, label = '-.', 'avg tIOU'
    ax1.plot(strides, recalls[:, i], ls=ls,
             color=color, label=label)
ax1.set_xlabel('stride', fontsize=font_size)
ax1.set_ylabel('Recall', fontsize=font_size)
ax1.tick_params('y')
ax2 = ax1.twinx()
ax2.plot(strides, search_space[:, 0], ls='--', color=COLOR_2ND_AXIS)
ax2.set_ylabel('Median size of search space',
               color=COLOR_2ND_AXIS, fontsize=font_size)
ax2.tick_params('y', colors='r', labelsize=fontsize)
ax1.tick_params(labelsize=fontsize)

In [None]:
# Tabulate recall and search-space statistics, one row per stride.
info = {'Stride': np.array(strides)}
num_thresholds = len(iou_thresholds)
for col in range(recalls.shape[1]):
    if col < num_thresholds:
        iou = iou_thresholds[col]
    else:
        # trailing column: average over the thresholds
        iou = f'Avg({iou_thresholds[0]}, {iou_thresholds[-1]})'
    info[f'R@{iou}'] = recalls[:, col]
for col, label in enumerate(['(median)', '(std)']):
    info[f'Search space size {label}'] = search_space[:, col]
display(pd.DataFrame(info))

#### 1a.3 Piece-wise linear exploration

_todo_ non-linear search space with piece-wise linear function:

durations $\in \{5, 10, 25\}$

time range for those durations t = $\{0, 25, 55, 125\}$

TODO: get all possible durations from a linear function over the corresponding pairs of $t$, e.g. from $t = [25, 55] \rightarrow D_t = \{25, 35, 45\}$

TODO: write down a piece-wise function of $t$ that returns all possible durations.

strides TBD according to overlap threshold, around 0.6.

Try this config. BTW, the last two lines can span $D$ as piece-wise linear function inside time range.

```python
times = [0, 5, 15, 25, 45, 100]
widths = [2.5, 5, 10, 20, 50]
duration_edges = [np.arange(times[i], times[i+1], width)
                  for i, width in enumerate(widths)]
duration_edges = np.concatenate(duration_edges)
```

## 2. Dump data for training and evaluation

### 2a. Chunked features

Go to notebook `4-feature-extraction.ipynb` section `#Varied-length-videos` (remove the # if you use your browser string matching).

_TODO_ add procedure here to avoid jumping over the place.

### 2b. JSON files

The same as in notebook `11-charades-sta.ipynb` section 2a.

Following [this paper](https://arxiv.org/abs/1804.05113), we merge the two validation sets into a single validation set.

__Note__: It requires to run 1st cell with function `parse_activitynet_captions`.

_minor details_

To avoid spaghetti code, we copy the function `extend_metadata` into the module `nb_utils.py`.

_comment:_ in principle we could automate this. However, there is a well-known principle that says: 

1. do it manually.

2. repeat 1 again.

3. automate it.

we are in the step 2 😉

In [None]:
%%time
SUBSETS = [['train'], ['val_1', 'val_2']]
TIME_UNIT = 5
MODE = 'x'
FPS = 5
CREATOR = 'EscorciaSSGR'
H5_FILE = f'/home/escorciav/datasets/activitynet/features/rgb_resnet152_max_cs-{TIME_UNIT}.h5'
H5_FILE_FEAT_PER_FRAME = f'/home/escorciav/datasets/activitynet/features/resnet152-imagenet_{FPS}fps_320x240.hdf5'
if MODE == 'w':
    print('are you sure you wanna do this? comment these 3 lines!')
    raise
assert SUBSETS == [['train'], ['val_1', 'val_2']]

import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import h5py

from nb_utils import extend_metadata
import sys
sys.path.append('..')
from utils import get_git_revision_hash 

offset = 0
for subset in SUBSETS:
    FILENAMES = [Path(f'../data/raw/activitynet/{i}.json') for i in subset]
    subset = subset[0]
    if subset.startswith('val'):
        subset = 'val'
    OUTPUT_FILE = Path(f'../data/interim/activitynet/{subset}.json')
    
    # trick to aggregate val_1 and val_2
    instances = sum([parse_activitynet_captions(i) for i in FILENAMES],
                    [])
    videos_df, _ = make_annotations_df(instances, H5_FILE_FEAT_PER_FRAME)
    videos_gbv = videos_df.groupby('video')
    videos, cleaned_instances = extend_metadata(
        instances, videos_gbv, H5_FILE, offset=offset)
    offset += len(instances)
    
    if not OUTPUT_FILE.parent.is_dir():
        dirname = OUTPUT_FILE.parent
        dirname.mkdir(parents=True)
        print(f'Create dir: {dirname}')
    
    print('Subset:', subset)
    print('\tNum videos:', len(videos))
    print('\tNum instances:', len(instances))
    print('\tNum dumped instances:', len(cleaned_instances))
    with open(OUTPUT_FILE, MODE) as fid:
        json.dump({'videos': videos,
                   'moments': cleaned_instances,
                   'time_unit': TIME_UNIT,
                   'date': datetime.now().isoformat(),
                   'git_hash': get_git_revision_hash(),
                   'responsible': CREATOR,
                  },
                  fid)
    print('\tDumped file:', OUTPUT_FILE)

### 2c. Random partition

Train/val partition out of training set. In the interest of time, we didn't take into account the action level information as we did for Charades-STA.

In [None]:
%%time
import json
import random
from copy import deepcopy

# Identifier of this random partition; it is part of the output filenames
trial = '01'
# Seed for `random`, so that the shuffle of the videos is reproducible
seed = 1701
# JSON file with the full training set that gets partitioned
filename = '../data/processed/activitynet-captions/train.json'

def create_subset(x):
    """Return an empty split that carries the metadata of `x`.

    Every top-level entry of `x` is deep-copied, except 'videos' and
    'moments', which are replaced by a fresh empty dict and a fresh empty
    list. Those two (potentially large) payloads are never copied, as they
    would be discarded right away. The key order of `x` is preserved and
    `x` itself is left untouched.

    Args:
        x (dict): dataset with (at least) metadata entries.
    Returns:
        dict: independent copy of `x` with 'videos' == {} and
            'moments' == [].
    """
    emptied = {'videos': dict, 'moments': list}
    split = {}
    for key, value in x.items():
        factory = emptied.get(key)
        split[key] = deepcopy(value) if factory is None else factory()
    # make sure both containers exist even if `x` lacked them
    for key, factory in emptied.items():
        split.setdefault(key, factory())
    return split

# Randomly assign 75% of the videos (with all their moments) to the train
# split and the remaining ones to the val split, then dump both.
random.seed(seed)
with open(filename, 'r') as fid:
    data = json.load(fid)

# video id -> indices of its moments in data['moments']
video2moment_ind = {}
for ind, moment in enumerate(data['moments']):
    video2moment_ind.setdefault(moment['video'], []).append(ind)

train_split = create_subset(data)
val_split = create_subset(data)

videos = list(data['videos'].keys())
cut = int(len(videos) * 0.75)
random.shuffle(videos)

# NOTE: the train split gets exactly `cut` videos. The previous test
# `i > cut` put `cut + 1` videos there, so a split regenerated with this
# cell differs by one video from one created before the fix.
for repo, repo_videos in ((train_split, videos[:cut]),
                          (val_split, videos[cut:])):
    for video_id in repo_videos:
        repo['videos'][video_id] = data['videos'][video_id]
        # a video without any moment contributes no moments
        repo['moments'].extend(
            data['moments'][j] for j in video2moment_ind.get(video_id, []))

# mode 'x' refuses to overwrite an existing partition
with open(f'../data/processed/activitynet-captions/train-{trial}.json', 'x') as fid:
    json.dump(train_split, fid)
with open(f'../data/processed/activitynet-captions/val-{trial}.json', 'x') as fid:
    json.dump(val_split, fid)

## 3. Baselines 

### 3.1 Single video moment retrieval

Concurrent [work](https://arxiv.org/abs/1804.05113)

| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: | 
| Random           |   0.025     |   0.008     |   0.113     |     0.04    |
| Vector Embedding |   0.237     |    0.11     |    0.52     |    0.321    |
| LSTM+QSPN+Cap    |   0.277     |   0.136     |   0.592     |    0.383    |

The values above look nice, but these are easy to copy and paste 😉

Random
R@{1,5,10},0.5: 2.5  11.3  21.6
R@{1,5,10},0.7: 0.8  4.0  8.1

VE
R@{1,5,10},0.5: 23.7  52.0  62.2
R@{1,5,10},0.7: 11.0  32.1  42.1

LSTM+QSPN+Cap
R@{1,5,10},0.5: 27.7   59.2   69.3
R@{1,5,10},0.7: 13.6   38.3   49.1

### 3.2 Moment frequency prior

Results on a train-val split from the train set for our search space (sliding windows with lengths between 10s and 120s in steps of 10s, stride 10s) and with `NMS = 0.5`. Please don't fool yourself and update the baseline according to your search strategy.

```bash
for i in 10 50 75 100 250 500 1000 2500 5000; do
  python moment_freq_prior.py \
    --train-list data/processed/activitynet-captions/train-01.json \
    --test-list data/processed/activitynet-captions/val-01.json \
    --bins $i \
    --proposal-interface SlidingWindowMSFS \
    --min-length 10 --num-scales 12 --stride 10 --nms-threshold 0.5 \
    --logfile data/processed/activitynet-captions/mfp-$i.log;
done
```

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

metric = ['r@1,0.5', 'r@5,0.5', 'r@1,0.7', 'r@5,0.7']
files= !ls ../data/processed/activitynet-captions/mfp-*.log
bins = []
results = []
for file in files:
    bins.append(int(file.split('-')[-1].split('.')[0]))
    with open(file, 'r') as fid:
        for line in fid:
            line = line.strip()
            if 'r@1,0.5' in line:
                blocks= line.split('\t')
                metrics = []
                for i, content in enumerate(blocks):
                    metrics.append(
                        float(content.split()[-1]))
                results.append(metrics)
results = [x for _, x in sorted(zip(bins, results))]
bins.sort()
bins = np.array(bins)
results = np.array(results)
plt.figure(figsize=(8, 6))
for i in range(results.shape[1]):
    plt.plot(bins, results[:, i], label=metric[i], lw=4,
             marker='o')
plt.xlabel('Number of bins')
plt.ylabel('R@k,IoU')
plt.legend(loc='upper right')

We chose 500 bins as it's a good compromise for all the four metrics. The rationale is similar to the [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion).

For a given number of bins, we proceed to compute the prior using the entire training set, and evaluate it on the entire testing set.

```bash
python moment_freq_prior.py \
  --train-list data/processed/activitynet-captions/train.json \
  --test-list data/processed/activitynet-captions/val.json \
  --bins 500 \
  --proposal-interface SlidingWindowMSFS \
  --min-length 10 --num-scales 12 --stride 10 --nms-threshold 0.5 \
  --logfile data/processed/activitynet-captions/mfp.log
```