# Exploring ActivityNet-Captions

In [None]:
import sys
import json
import glob
import time

import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from nb_utils import make_annotations_df
from nb_utils import recall_bound_and_search_space
from nb_utils import sliding_window

def parse_activitynet_captions(filename):
    """Parse raw ActivityNet Captions annotations.

    The file must contain a JSON object that maps each video id to a dict
    holding the parallel lists "timestamps" and "sentences". Pairs are built
    with `zip`, so surplus entries in the longer list are ignored.

    Args:
        filename (str): path to a raw annotation file.
    Returns:
        instances (list of dicts): one dict per (interval, sentence) pair
            with keys 'video', 'times' (a one-element list wrapping the
            interval) and 'description'.
    """
    # JSON text is UTF-8; being explicit keeps the parsing independent of
    # the default encoding of the platform.
    with open(filename, encoding='utf-8') as f:
        dataset = json.load(f)

    instances = []
    for video_id, annotation in dataset.items():
        time_and_descriptions = zip(
            annotation["timestamps"], annotation["sentences"])
        for interval, description in time_and_descriptions:
            instances.append(
                {'video': video_id,
                 'times': [interval],
                 'description': description}
            )
    return instances

## 1. Moments duration analysis

Similar to notebook 11.

We need to set a couple of parameters:

(i) _minimum_ moment length

(ii) _maximum_ moment length

(iii) _type of range_, how to explore minimum -> maximum

(iv) _stride_.

_Note:_ following [Xu et al., arXiv 2018](https://arxiv.org/pdf/1804.05113.pdf), we fuse the two annotations in the validation set.

The first step is to get an idea of the duration of the moments in the dataset.

In [None]:
# Percentile curves of the moment durations for train, val (val_1 + val_2)
# and all of them together.
Q = 95
QUANTILES = np.arange(25, 101, 5)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
lw = 3 # linewidth

all_duration = []
fig, axs = plt.subplots(1, 3, figsize=(21, 7))
# we use a list of list to merge val_1 and val_2 ;)
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        # extending a single list merges the instances of all the files
        data.extend(parse_activitynet_captions(filename))

    # name used for reporting: 'val_1'/'val_2' are shown together as 'val'
    subset = subsets[0]
    if subset.startswith('val'):
        subset = subset[:-2]

    raw_duration = np.array([moment['times'][0][1] - moment['times'][0][0]
                             for moment in data])
    # Count the non-positive durations *before* dropping them; counting on
    # the filtered array would always report zero.
    print('Negative durations in {}: {}'.format(
        subset, int(np.sum(raw_duration <= 0))))
    # ignore negative (and zero) duration
    duration = raw_duration[raw_duration > 0]
    all_duration += duration.tolist()

    percentiles = np.percentile(duration, QUANTILES)
    axs[i].plot(percentiles, QUANTILES, color=COLOR[i], lw=lw)
    # the last panel overlays the curves of every subset
    axs[-1].plot(percentiles, QUANTILES, color=COLOR[i], lw=lw)
    axs[i].set_xlabel('Duration', fontsize=fontsize)
    axs[i].set_ylabel('Percentile', fontsize=fontsize)
    axs[i].tick_params(labelsize=fontsize)
    axs[i].set_title('Duration stats {}\nMin: {:.2f}, Median: {:.2f}, {}Q: {:.2f} Max: {:.2f}'
                     .format(subset, np.min(duration), np.median(duration), Q,
                             percentiles[QUANTILES == Q][0], np.max(duration)),
                     fontsize=fontsize)

# dashed curve: every (positive) duration of train + val_1 + val_2
duration = np.array(all_duration)
percentiles = np.percentile(duration, QUANTILES)
axs[-1].plot(percentiles, QUANTILES, ls='--', color=COLOR[-1], lw=lw)
axs[-1].set_xlabel('Duration', fontsize=fontsize)
axs[-1].set_ylabel('Quantile', fontsize=fontsize)
axs[-1].tick_params(labelsize=fontsize)
_ = axs[-1].set_title('Duration stats (train+val1+val2)\nMin: {:.2f}, Median: {:.2f}, {}Q: {:.2f} Max: {:.2f}'
                      .format(np.min(duration), np.median(duration), Q,
                              percentiles[QUANTILES == Q][0], np.max(duration)),
                     fontsize=fontsize)
#fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_percentile-moment-duration.pdf', bbox_inches='tight')

Given that the distribution is quite particular, we decided to analyze the PDF and CDF more closely.

In [None]:
# Normalized histogram (top row) and cumulative histogram (bottom row) of the
# moment durations for train, val (val_1 + val_2) and all of them together.
duration_step = 5 # seconds
duration_edges = np.arange(0, 400 + duration_step - 0.1, duration_step)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
rwidth = 0.75

all_duration = []
fig, axs = plt.subplots(2, 3, figsize=(21, 14), sharex=True)
# NOTE(review): not read anywhere in this cell; kept in case another cell does
cdf_val, edges_val = None, None
# we use a list of list to merge val_1 and val_2 ;)
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        # extending a single list merges the instances of all the files
        data.extend(parse_activitynet_captions(filename))

    # name used for reporting: 'val_1'/'val_2' are shown together as 'val'
    subset = subsets[0]
    if subset.startswith('val'):
        subset = subset[:-2]

    raw_duration = np.array([moment['times'][0][1] - moment['times'][0][0]
                             for moment in data])
    # Count the non-positive durations *before* dropping them; counting on
    # the filtered array would always report zero.
    print('Negative durations in {}: {}'.format(
        subset, int(np.sum(raw_duration <= 0))))
    # ignore negative (and zero) duration
    duration = raw_duration[raw_duration > 0]
    all_duration += duration.tolist()

    axs[0, i].hist(duration, duration_edges, color=COLOR[i], density=True,
                   rwidth=rwidth)
    cdf, edges, *_ = axs[1, i].hist(duration, duration_edges, color=COLOR[i], density=True,
                                    cumulative=True, rwidth=rwidth)
    axs[1, i].set_xlabel('Duration', fontsize=fontsize)
    axs[0, i].tick_params(labelsize=fontsize)
    axs[1, i].tick_params(labelsize=fontsize)
    axs[0, i].set_title(subset, fontsize=fontsize)

# last column: every (positive) duration of train + val_1 + val_2
duration = np.array(all_duration)
axs[0, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, rwidth=rwidth)
axs[1, -1].hist(duration, duration_edges, ls='--', color=COLOR[-1], density=True, cumulative=True, rwidth=rwidth)
axs[0, 0].set_ylabel('Norm frequency', fontsize=fontsize)
axs[1, 0].set_ylabel('Cum frequency', fontsize=fontsize)
axs[1, -1].set_xlabel('Duration', fontsize=fontsize)
axs[0, -1].set_title('train+val', fontsize=fontsize)
axs[0, -1].tick_params(labelsize=fontsize)
axs[1, -1].tick_params(labelsize=fontsize)
# NOTE(review): absolute, user-specific output path; adjust it before running
fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_cdf-pdf_moments-duration.pdf', bbox_inches='tight')

### Video duration

Let's take a look at the duration of the videos

In [None]:
# Normalized histogram (top row) and cumulative histogram (bottom row) of the
# video durations for train, val (val_1 + val_2) and both together.
H5_FILE_FEAT_PER_FRAME = '/home/escorciav/datasets/activitynet/features/resnet152-imagenet_5fps_320x240.hdf5'
FPS = 5

duration_edges = np.arange(0, 211, 30)

# plot stuff
COLOR = ['blue', 'orange', 'green']
fontsize = 14
rwidth = 0.75

fig, axs = plt.subplots(2, 3, figsize=(21, 14), sharex=True)
all_duration = []
# one column per group of raw files; val_1 and val_2 share a column
for i, subsets in enumerate([['train'], ['val_1', 'val_2']]):
    data = []
    for subset in subsets:
        filename = '../data/raw/activitynet/{}.json'.format(subset)
        data.extend(parse_activitynet_captions(filename))

    videos_df, _ = make_annotations_df(data, H5_FILE_FEAT_PER_FRAME)
    # presumably `num_frames` counts frames sampled at FPS, which makes this
    # a duration in seconds -- TODO confirm against `make_annotations_df`
    duration = videos_df['num_frames'].values / FPS
    all_duration.append(duration)

    # `subset` still holds the last file name of the group
    if subset.startswith('val'):
        subset = subset[:-2]

    pdf_ax, cdf_ax = axs[0, i], axs[1, i]
    hist_kwargs = dict(color=COLOR[i], density=True, rwidth=rwidth)
    pdf_ax.hist(duration, duration_edges, **hist_kwargs)
    cdf_ax.hist(duration, duration_edges, cumulative=True, **hist_kwargs)
    pdf_ax.set_title(subset, fontsize=fontsize)
    cdf_ax.set_xlabel('Duration', fontsize=fontsize)
    for ax in (pdf_ax, cdf_ax):
        ax.tick_params(labelsize=fontsize)

# last column: videos of every subset together
duration = np.concatenate(all_duration)
merged_kwargs = dict(ls='--', color=COLOR[-1], density=True, rwidth=rwidth)
axs[0, -1].hist(duration, duration_edges, **merged_kwargs)
axs[1, -1].hist(duration, duration_edges, cumulative=True, **merged_kwargs)
axs[0, -1].set_title('train_val', fontsize=fontsize)
axs[1, -1].set_xlabel('Duration', fontsize=fontsize)
for ax in (axs[0, -1], axs[1, -1]):
    ax.tick_params(labelsize=fontsize)
axs[0, 0].set_ylabel('Norm frequency', fontsize=fontsize)
axs[1, 0].set_ylabel('Cum frequency', fontsize=fontsize)
# fig.savefig('/home/escorciav/Downloads/adobe-prj/anet_cdf-pdf_video-duration_uniform-bins-30s.pdf', bbox_inches='tight')

### 1a. Search space with sliding windows

Window durations are chosen between 10s and 120s, with strides that are multiples of 10s plus one shorter stride such as 5s.

TODO

In [None]:
import sys
sys.path.append('..')
import numpy as np
from proposals import SlidingWindowMSRSS
from nb_utils import parse_moments
from nb_utils import recall_bound_and_search_space

# Recall upper bound and size of the search space obtained with sliding
# windows over the validation set.
filename = '../data/processed/activitynet-captions/val.json'
clip_length = 5  # presumably seconds -- TODO confirm against the features
proposals_prm = dict(
    length=clip_length,
    # even scales 2, 4, ..., 26
    scales=list(range(2, 27, 2)),
    stride=0.3
)

# NOTE(review): `dataset` is not read again in this cell
dataset = parse_moments(filename)
proposals_fn = SlidingWindowMSRSS(**proposals_prm)
# NOTE(review): named `train_results` although `filename` points to val.json
train_results = recall_bound_and_search_space(
    filename, proposals_fn)
recall_ious, search_space, durations = train_results
# total number of clips: ceil(duration / clip_length) summed over `durations`
num_clips = np.ceil(durations / clip_length).sum()
# the last entry of `search_space` is normalized (in place) by that total
search_space[-1] /= num_clips
print(recall_ious)
print(search_space)

## 2. Dump data for training and evaluation

### 2a. Chunked features

Go to notebook `4-feature-extraction.ipynb` section `#Varied-length-videos` (remove the # if you use your browser string matching).

_TODO_ add procedure here to avoid jumping over the place.

### 2b. JSON files

The same as in notebook `11-charades-sta.ipynb` section 2a.

Following [this paper](https://arxiv.org/abs/1804.05113), we merge the two validation sets into a single validation set.

__Note__: This requires running the 1st cell, which defines the function `parse_activitynet_captions`.

_minor details_

To avoid spaghetti code, we copy the function `extend_metadata` into the module `nb_utils.py`.

In [None]:
%%time
# Configuration of the JSON dump.
# Groups of raw annotation files; each group is dumped as a single subset.
SUBSETS = [['train'], ['val_1', 'val_2']]
TIME_UNIT = 5
# 'x' = exclusive creation: opening an already existing output file fails
MODE = 'x'
FPS = 5
CREATOR = 'EscorciaSSGR'
H5_FILE = f'/home/escorciav/datasets/activitynet/features/rgb_resnet152_max_cs-{TIME_UNIT}.h5'
H5_FILE_FEAT_PER_FRAME = f'/home/escorciav/datasets/activitynet/features/resnet152-imagenet_{FPS}fps_320x240.hdf5'
if MODE == 'w':
    print('are you sure you wanna do this? comment these 3 lines!')
    # explicit error instead of a bare `raise`, which outside an `except`
    # block only yields "No active exception to reraise"
    raise RuntimeError("MODE == 'w' would overwrite previously dumped files")
assert SUBSETS == [['train'], ['val_1', 'val_2']]

import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import h5py

from nb_utils import extend_metadata
import sys
sys.path.append('..')
from utils import get_git_revision_hash 

# Dump one JSON file per group of raw annotation files.
# `offset` grows by the number of parsed instances of each group and is
# handed to `extend_metadata` for the next one.
offset = 0
for subset in SUBSETS:
    FILENAMES = [Path(f'../data/raw/activitynet/{name}.json') for name in subset]
    # the first raw file names the subset; val_1/val_2 collapse into 'val'
    subset = 'val' if subset[0].startswith('val') else subset[0]
    OUTPUT_FILE = Path(f'../data/interim/activitynet/{subset}.json')

    # flatten the instances of every raw file (aggregates val_1 and val_2)
    instances = [instance
                 for raw_file in FILENAMES
                 for instance in parse_activitynet_captions(raw_file)]
    videos_df, _ = make_annotations_df(instances, H5_FILE_FEAT_PER_FRAME)
    videos_gbv = videos_df.groupby('video')
    videos, cleaned_instances = extend_metadata(
        instances, videos_gbv, H5_FILE, offset=offset)
    offset += len(instances)

    dirname = OUTPUT_FILE.parent
    if not dirname.is_dir():
        dirname.mkdir(parents=True)
        print(f'Create dir: {dirname}')

    print('Subset:', subset)
    print('\tNum videos:', len(videos))
    print('\tNum instances:', len(instances))
    print('\tNum dumped instances:', len(cleaned_instances))
    with open(OUTPUT_FILE, MODE) as fid:
        # annotations plus provenance (date, git revision, author)
        payload = {'videos': videos,
                   'moments': cleaned_instances,
                   'time_unit': TIME_UNIT,
                   'date': datetime.now().isoformat(),
                   'git_hash': get_git_revision_hash(),
                   'responsible': CREATOR,
                  }
        json.dump(payload, fid)
    print('\tDumped file:', OUTPUT_FILE)

#### 2.b.1 Untied JSON and HDF5 inputs

TLDR; reference: minor-detail. Safe to skip unless you have problems loading data for dispatching training.

At some point, there was an undesired coupling between the JSON and HDF5 files (inputs) required by our implementation.

- root `time_unit`. This is a property of the features; as such, it should reside in the HDF5 and not in the JSON.

- `videos/ith-video/num_clips`. This is a property of the ith-video; as such, we should grab it from the HDF5 instead of placing it in the JSON.

The following script was used to update the `*.json` files with metadata for training and evaluation.

```python
import json
from datetime import datetime

import sys
sys.path.append('..')
from utils import get_git_revision_hash

# Strip the feature-related fields from every processed JSON file and refresh
# its provenance fields. Each file is rewritten in place.
subsets = ['train', 'val', 'train-01', 'val-01', 'train-02', 'val-02']

for subset in subsets:
    filename = f'../data/processed/activitynet-captions/{subset}.json'
    with open(filename, 'r') as fr:
        data = json.load(fr)
    # `del` raises KeyError when the field is already gone, e.g. if the
    # script is run a second time over the same file
    del data['time_unit']
    for video_id in data['videos']:
        del data['videos'][video_id]['num_clips']
    # record when, and with which revision of the code, the file was updated
    data['date'] = datetime.now().isoformat()
    data['git_hash'] = get_git_revision_hash()
    # mode 'w' overwrites the file that was just read
    with open(filename, 'w') as fw:
        json.dump(data, fw)
```

We also update the HDF5 such that it contains `metadata` [Group/Folder](http://docs.h5py.org/en/latest/high/group.html).

```bash
!h5ls /home/escorciav/datasets/activitynet-captions/features/resnet152_max_cs-5.h5 | grep metadata
```

If the previous command doesn't return anything, it means that you are using an old version of the data.
If you know the `FPS`, `CLIP_LENGTH` and `POOL`ing operation used to get those features, the following snippet will add the metadata required by the most recent version of our code.

```python
# Add a `metadata` group describing the features to an existing HDF5 file.
FPS = 5
CLIP_LENGTH = 5  # seconds
POOL = 'max'  # pooling operation over time
# verbose
COMMENTS = (f'ResNet152 trained on Imagenet-ILSVRC12, Pytorch model. '
            f'Extracted at {FPS} FPS with an image resolution of 320x240, '
            f'and {POOL} pooled over time every {CLIP_LENGTH} seconds.')
CREATOR = 'EscorciaSSGR'  # please add your name here to sign the file i.e. assign yourself as responsible
filename = f'/home/escorciav/datasets/activitynet/features/resnet152_rgb_{POOL}_cl-{CLIP_LENGTH}.h5'
from datetime import datetime
import h5py

# every clip must span at least one frame
assert CLIP_LENGTH * FPS >= 1
# mode 'a': read/write, the file is created if it does not exist
with h5py.File(filename, 'a') as fw:
    grp = fw.create_group('metadata')
    # `time_unit` holds the clip length in seconds
    grp.create_dataset('time_unit', data=CLIP_LENGTH)
    # text fields are stored as variable-length strings
    grp.create_dataset('date', data=datetime.now().isoformat(),
                       dtype=h5py.special_dtype(vlen=str))
    grp.create_dataset('responsible', data=CREATOR,
                       dtype=h5py.special_dtype(vlen=str))
    grp.create_dataset('comments', data=COMMENTS,
                       dtype=h5py.special_dtype(vlen=str))
```

### 2c. Random partition

Train/val partition out of training set. In the interest of time, we didn't take into account the action level information as we did for Charades-STA.

In [None]:
%%time
import json
import random
from copy import deepcopy

# Parameters of the random train/val partition.
trial = '01'  # suffix of the dumped files: train-{trial}.json, val-{trial}.json
seed = 1701  # given to `random.seed` before the videos are shuffled
filename = '../data/processed/activitynet-captions/train.json'  # set to split

def create_subset(x):
    """Return an empty split that keeps the metadata of `x`.

    Every top-level entry of `x` other than 'videos' and 'moments' is
    deep-copied. Those two entries are reset to an empty dict and an empty
    list, respectively, so that the caller can fill them in. `x` is left
    untouched and the order of its keys is preserved.

    Args:
        x (dict): dataset with (at least) the bulky entries 'videos' and
            'moments'.
    Returns:
        split (dict): copy of the metadata with empty 'videos' and 'moments'.
    """
    split = {}
    for key, value in x.items():
        # skip copying the bulky payload, it would be thrown away right after
        if key == 'videos':
            split[key] = {}
        elif key == 'moments':
            split[key] = []
        else:
            split[key] = deepcopy(value)
    split.setdefault('videos', {})
    split.setdefault('moments', [])
    return split

# Randomly assign 75% of the videos (and all their moments) to a new train
# split and the remaining ones to a new val split.
random.seed(seed)
with open(filename, 'r') as fid:
    data = json.load(fid)

# indices of the moments of each video
video2moment_ind = {}
for i, moment in enumerate(data['moments']):
    video2moment_ind.setdefault(moment['video'], []).append(i)

train_split = create_subset(data)
val_split = create_subset(data)

videos = list(data['videos'].keys())
cut = int(len(videos) * 0.75)
random.shuffle(videos)

for i, video_id in enumerate(videos):
    # The first `cut` shuffled videos form the train split.
    # NOTE: the boundary used to be `i > cut`, which sent `cut + 1` videos to
    # train; files dumped that way hold one more training video.
    repo = train_split if i < cut else val_split
    repo['videos'][video_id] = data['videos'][video_id]
    # a video without any moment contributes no moments, instead of failing
    for j in video2moment_ind.get(video_id, []):
        repo['moments'].append(data['moments'][j])
# mode 'x' refuses to overwrite a previously dumped split
with open(f'../data/processed/activitynet-captions/train-{trial}.json', 'x') as fid:
    json.dump(train_split, fid)
with open(f'../data/processed/activitynet-captions/val-{trial}.json', 'x') as fid:
    json.dump(val_split, fid)

## 3. Baselines 

### 3.1 Single video moment retrieval

Concurrent [work](https://arxiv.org/abs/1804.05113)

| Model            | R@1,IoU=0.5 | R@1,IoU=0.7 | R@5,IoU=0.5 | R@5,IoU=0.7 |
| :--------------- | ----------: | ----------: | ----------: | ----------: | 
| Random           |   0.025     |   0.008     |   0.113     |     0.04    |
| Vector Embedding |   0.237     |    0.11     |    0.52     |    0.321    |
| LSTM+QSPN+Cap    |   0.277     |   0.136     |   0.592     |    0.383    |

The values above look nice, but these are easy to copy and paste 😉

Random
R@{1,5,10},0.5: 2.5  11.3  21.6
R@{1,5,10},0.7: 0.8  4.0  8.1

VE
R@{1,5,10},0.5: 23.7  52.0  62.2
R@{1,5,10},0.7: 11.0  32.1  42.1

LSTM+QSPN+Cap
R@{1,5,10},0.5: 27.7   59.2   69.3
R@{1,5,10},0.7: 13.6   38.3   49.1

### 3.2 Moment frequency prior

Results on a train-val split from the train set for our search space (sliding windows with lengths between 10s (seconds) and 120s in steps of 10s, stride 10s) and with `NMS = 0.5`. Please don't fool yourself and update the baseline according to your search strategy.

```bash
for i in 10 50 75 100 250 500 1000 2500 5000; do
  python moment_freq_prior.py \
    --train-list data/processed/activitynet-captions/train-01.json \
    --test-list data/processed/activitynet-captions/val-01.json \
    --bins $i \
    --proposal-interface SlidingWindowMSFS \
    --min-length 10 --num-scales 12 --stride 10 --nms-threshold 0.5 \
    --logfile data/processed/activitynet-captions/mfp-$i.log;
done
```

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Plot the recall of the moment-frequency prior against the number of bins,
# reading one log file per number of bins.
import glob

metric = ['r@1,0.5', 'r@5,0.5', 'r@1,0.7', 'r@5,0.7']
# plain `glob` instead of the IPython-only `!ls`
files = sorted(glob.glob('../data/processed/activitynet-captions/mfp-*.log'))
bins = []
results = []
for file in files:
    # the number of bins is encoded in the file name: mfp-{bins}.log
    num_bins = int(file.split('-')[-1].split('.')[0])
    with open(file, 'r') as fid:
        for line in fid:
            line = line.strip()
            if 'r@1,0.5' in line:
                # tab-separated blocks, the value is the last token of each
                blocks = line.split('\t')
                metrics = [float(content.split()[-1]) for content in blocks]
                # store the bin with its metrics so both lists stay aligned
                # even if a log holds no (or more than one) metrics line
                bins.append(num_bins)
                results.append(metrics)
if not results:
    raise RuntimeError('No metrics found in ../data/processed/activitynet-captions/mfp-*.log')
# sort the rows by number of bins
order = np.argsort(bins, kind='stable')
bins = np.array(bins)[order]
results = np.array(results)[order]
plt.figure(figsize=(8, 6))
for i in range(results.shape[1]):
    plt.plot(bins, results[:, i], label=metric[i], lw=4,
             marker='o')
plt.xlabel('Number of bins')
plt.ylabel('R@k,IoU')
plt.legend(loc='upper right')

We chose 500 bins as it's a good compromise for all four metrics. The rationale is similar to the [BIC](https://en.wikipedia.org/wiki/Bayesian_information_criterion).

For a given number of bins, we proceed to compute the prior using the entire training set, and evaluate it on the entire testing set.

```bash
python moment_freq_prior.py \
  --train-list data/processed/activitynet-captions/train.json \
  --test-list data/processed/activitynet-captions/val.json \
  --bins 500 \
  --proposal-interface SlidingWindowMSFS \
  --min-length 10 --num-scales 12 --stride 10 --nms-threshold 0.5 \
  --logfile data/processed/activitynet-captions/mfp.log
```