## 1. Dump data for training and evaluation

### 1a. Chunked features

In case you haven't dumped the features yet, go to the notebook [old feature extraction](4-feature-extraction.ipynb), section `#Varied-length-videos` (remove the `#` if you search with your browser's string matching).

_TODO_: add the procedure here to avoid jumping all over the place.

### 1b. JSON files

The format is the same as in notebook [charades notebook](11-charades-sta.ipynb).

We added the field:

- `annotation_id_original`: the annotation id provided by DiDeMo, kept because it is only unique inside a subset.

In [None]:
import json
from copy import deepcopy
from datetime import datetime
from pathlib import Path
import h5py
import numpy as np

import sys
sys.path.append('..')
from utils import get_git_revision_hash

DIDEMO_TIME_UNIT = 5

def update_instances_make_videos_dict(moments, offset=0):
    """Convert raw DiDeMo moments (in-place) and summarize their videos.

    Every moment in `moments` is edited as follows:

    1. `times` goes from chunk indices to time: each pair is scaled by
       `DIDEMO_TIME_UNIT` and its second entry is pushed one extra unit.
    2. The original `annotation_id` is backed up in
       `annotation_id_original`, and `annotation_id` is replaced by a
       counter that starts at `offset`.
    3. Unneeded fields `num_segments` and `dl_link` are removed. Note that
       we can go back to them because the original annotation id is
       preserved.
    4. Field `time` is added (set to None) 'cause we weren't planning to
       merge both domains, untrimmed & trimmed videos.

    Args:
        moments (list of dict): raw data from DiDeMo.
        offset (int, optional): `annotation_id` given to the first moment;
            the following moments get consecutive values.

    Returns:
        videos (dict): maps each video-id in `moments` to a dict with
            `num_instances`, `num_clips` and `num_frames`.
    """
    videos = {}
    next_id = offset
    for moment in moments:
        # Chunk indices -> time, with the end of each pair shifted by one
        # unit.
        segments = DIDEMO_TIME_UNIT * np.array(moment['times'])
        segments[:, 1] += DIDEMO_TIME_UNIT
        moment['times'] = segments.tolist()
        # Annotators watched a time-span of 30s, i.e. 6 * DIDEMO_TIME_UNIT,
        # thus nothing should go beyond that.
        assert np.all(segments <= 6 * DIDEMO_TIME_UNIT)

        # Keep the id provided by DiDeMo and assign the new one.
        moment['annotation_id_original'] = moment['annotation_id']
        moment['annotation_id'] = next_id
        next_id += 1

        for unused_key in ('num_segments', 'dl_link'):
            del moment[unused_key]
        moment['time'] = None

        video_id = moment['video']
        if video_id not in videos:
            videos[video_id] = {
                'num_instances': 1,
                # This is incorrect, but we follow the ICCV17 recipe for
                # fair comparison. Note that we dumped features accordingly.
                'num_clips': 6,
                # MAX_TIME is a global that must be defined before calling
                # this function.
                'num_frames': MAX_TIME * DIDEMO_TIME_UNIT
            }
        else:
            videos[video_id]['num_instances'] += 1
    return videos

In [None]:
%%time
# Subsets to convert. Annotation ids are assigned consecutively following
# this order.
SUBSETS = ['train', 'val', 'test']
# 'x' (exclusive creation) makes `open` fail when the output file already
# exists, so a previous dump is never overwritten silently.
MODE = 'x'
# Read as a global by `update_instances_make_videos_dict` to set
# `num_frames`.
MAX_TIME = 30
CREATOR = 'EscorciaSSGR'
RAW_DATA_FMT = '../data/raw/{}_data.json'
OUTPUT_FMT = '../data/interim/didemo/{}.json'
# NOTE(review): unused in this cell and the path points to charades
# features -- confirm whether it is still needed.
H5_FILE = '/home/escorciav/datasets/charades/features/resnet152_max.h5'
if MODE == 'w':
    print('are you sure you wanna do this? comment these 3 lines!')
    # Explicit error instead of a bare `raise` without an active exception.
    raise RuntimeError("MODE == 'w' would overwrite previously dumped files")
assert SUBSETS == ['train', 'val', 'test']

# Running annotation id, such that ids are unique across subsets.
offset = 0
for subset in SUBSETS:
    filename = Path(RAW_DATA_FMT.format(subset))
    output_file = Path(OUTPUT_FMT.format(subset))
    with open(filename, 'r') as fid:
        instances = json.load(fid)
    # `instances` is updated in-place; `videos` summarizes the videos of
    # this subset.
    videos = update_instances_make_videos_dict(instances, offset)
    offset += len(instances)

    if not output_file.parent.is_dir():
        dirname = output_file.parent
        dirname.mkdir(parents=True)
        print(f'Create dir: {dirname}')

    print('Subset:', subset)
    print('\tNum videos:', len(videos))
    print('\tNum instances:', len(instances))
    with open(output_file, MODE) as fid:
        # Besides the data, dump provenance info: date, code revision and
        # responsible.
        json.dump({'videos': videos,
                   'moments': instances,
                   'time_unit': DIDEMO_TIME_UNIT,
                   'date': datetime.now().isoformat(),
                   'git_hash': get_git_revision_hash(),
                   'responsible': CREATOR,
                  },
                  fid)
    print('\tDumped file:', output_file)