In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import os.path as osp
import json
import matplotlib.pyplot as plt
import argparse
import datetime
import os
from collections import defaultdict

In [2]:
# Video-level metadata file of the dataset
meta_data_file_path = "/fb-agios-acai-efs/Ego4D/ego4d_data/ego4d.json"

# Long-term-anticipation (LTA) annotation files, one per split
annotation_file_dir = "/fb-agios-acai-efs/Ego4D/ego4d_data/v1/annotations"
annotation_file_names = dict(
    train="fho_lta_train.json",
    val="fho_lta_val.json",
    test="fho_lta_test_unannotated.json",
)

# Only the train and val splits get a full path here
train_annotation_file, val_annotation_file = (
    osp.join(annotation_file_dir, annotation_file_names[split_name])
    for split_name in ("train", "val")
)

In [3]:
class ArgumentsWrapper:
    """Empty attribute container; settings are attached to an instance after creation."""

    def __init__(self):
        pass


# Notebook-level settings, attached as plain attributes on one wrapper instance.
args = ArgumentsWrapper()
vars(args).update(
    seed=0,
    sort_by_col="sum_clip_duration_min",  # column name used when sorting the per-user summary
    nb_users_train=40,
    nb_users_thresh=50,
)

In [4]:
def summarize_clips_by_user(joined_df):
    """Collapse per-action annotation rows into one row per clip, then one row per user.

    Args:
        joined_df: DataFrame with one row per annotation entry. It must contain the
            columns 'clip_uid', 'fb_participant_id', 'scenarios', 'verb', 'noun',
            'verb_label', 'noun_label', 'action_idx', 'clip_id',
            'clip_parent_start_sec' and 'clip_parent_end_sec'.

    Returns:
        DataFrame with one row per 'fb_participant_id'. The annotation columns hold a
        list with one entry per clip of that user; 'clip_duration_sec' holds the
        per-clip durations, and 'sum_clip_duration_sec' / 'sum_clip_duration_min'
        hold their total. 'scenarios' is renamed to 'possible_clip_scenarios'.

    Raises:
        AssertionError: if a clip maps to more than one participant, start time or
            end time, or if a NaN participant id ends up in the result.
    """

    def _unique_values(values):
        # Distinct values of one group as a plain Python list
        return np.unique(values).tolist()

    # Step 1: one row per clip_uid, gathering its annotation entries into lists
    clip_df = joined_df.groupby('clip_uid', as_index=False).agg(
        {'fb_participant_id': _unique_values,
         'scenarios': list,
         'verb': list, 'noun': list, 'verb_label': list, 'noun_label': list, 'action_idx': list,
         # 'video_uid' / 'duration_sec' describe the raw uncut video and are not needed here
         'clip_id': list,
         'clip_parent_start_sec': _unique_values,
         'clip_parent_end_sec': _unique_values})

    # Each clip must belong to exactly one user and have exactly one start and end
    single_valued_cols = ['fb_participant_id', 'clip_parent_start_sec', 'clip_parent_end_sec']
    for col_name in single_valued_cols:
        assert (clip_df[col_name].apply(len) == 1).all()

    # Unpack the single-element lists into scalars
    for col_name in single_valued_cols:
        clip_df[col_name] = clip_df[col_name].apply(lambda x: x[0])

    # Actual clip lengths in seconds (~5min=300s). Plain column arithmetic avoids
    # integer-position lookups on a label-indexed row, which pandas has deprecated.
    clip_df['clip_duration_sec'] = clip_df['clip_parent_end_sec'] - clip_df['clip_parent_start_sec']

    # Step 2: one row per fb_participant_id, each of which owns several clips.
    # NOTE: groupby drops NaN keys by default, so clips without a participant id
    # are not carried into user_df.
    user_df = clip_df.groupby('fb_participant_id', as_index=False).agg(
        {
            'scenarios': list,
            'verb': list, 'noun': list, 'verb_label': list, 'noun_label': list, 'action_idx': list,
            'clip_id': list, 'clip_parent_start_sec': list, 'clip_parent_end_sec': list,
            'clip_duration_sec': list}
    )

    # Total clip time per user, in seconds and in minutes
    user_df['sum_clip_duration_sec'] = user_df['clip_duration_sec'].apply(sum)
    user_df['sum_clip_duration_min'] = user_df['sum_clip_duration_sec'] / 60

    # The scenarios only apply to the raw uncut video, not the 5min clips
    user_df = user_df.rename(columns={"scenarios": "possible_clip_scenarios"})

    # Check that no NaN user
    assert not (user_df['fb_participant_id'].isna().any())

    return user_df

In [5]:
    # Read the video-level metadata and flatten its 'videos' entries into a table
    with open(meta_data_file_path, 'r') as meta_data_file:
        meta_data_obj = json.load(meta_data_file)
    meta_df = pd.json_normalize(meta_data_obj['videos'])
    print(f"meta_data.shape={meta_df.shape}")

    # Read the train and val annotation files (both are opened before either is parsed)
    with open(train_annotation_file, 'r') as train_file:
        with open(val_annotation_file, 'r') as val_file:
            train_clips = json.load(train_file)['clips']
            val_clips = json.load(val_file)['clips']

    train_clips_df = pd.json_normalize(train_clips)
    val_clips_df = pd.json_normalize(val_clips)
    print(f"trainshape={train_clips_df.shape}, valshape={val_clips_df.shape}")

    # Report which column names the metadata and the annotations have in common
    print(f"Meta colnames={list(meta_df)}")
    print(f"Annotation colnames={list(train_clips_df)}")
    overlapping_colnames = [col for col in meta_df.columns if col in train_clips_df.columns]
    print(f"Overlapping colnames={overlapping_colnames}")

    # Right join on video_uid: every annotation entry is kept and gains the metadata
    # columns of its video; one metadata row may match many annotation rows.
    train_joined_df = meta_df.merge(
        train_clips_df, how="right", on="video_uid", validate="one_to_many")
    val_joined_df = meta_df.merge(
        val_clips_df, how="right", on="video_uid", validate="one_to_many")
    print(f"train_joined_df={train_joined_df.shape}, val_joined_df={val_joined_df.shape}")

    # Stack train and val into a single frame with a fresh 0..n-1 index
    trainval_joined_df = pd.concat([train_joined_df, val_joined_df], ignore_index=True, sort=False)

#     # FIND USERS that satisfy video-length threshold
#     # Note: video_uid relates to the entire uncut raw video,
#     # these are split into ~5-min clips, denoted with clip_id for the annotations.
#     trainval_user_df = summarize_clips_by_user(trainval_joined_df)

#     # Sort users on sum_length
#     trainval_user_df = trainval_user_df.sort_values(by=[args.sort_by_col], ascending=False)

meta_data.shape=(9645, 54)
trainshape=(23610, 20), valshape=(15587, 20)
Meta colnames=['video_uid', 'duration_sec', 'scenarios', 'split_em', 'split_av', 'split_fho', 's3_path', 'manifold_path', 'origin_video_id', 'video_source', 'device', 'physical_setting_name', 'fb_participant_id', 'is_stereo', 'has_imu', 'has_gaze', 'imu_metadata', 'gaze_metadata', 'video_components', 'concurrent_sets', 'has_redacted_regions', 'redacted_intervals', 'gaps', 'video_metadata.fps', 'video_metadata.num_frames', 'video_metadata.video_codec', 'video_metadata.audio_codec', 'video_metadata.display_resolution_width', 'video_metadata.display_resolution_height', 'video_metadata.sample_resolution_width', 'video_metadata.sample_resolution_height', 'video_metadata.mp4_duration_sec', 'video_metadata.video_start_sec', 'video_metadata.video_duration_sec', 'video_metadata.audio_start_sec', 'video_metadata.audio_duration_sec', 'video_metadata.video_start_pts', 'video_metadata.video_duration_pts', 'video_metadata.video_

In [11]:
# Inspect a single user, which is how it's going to be in-code
# (each user has separate annotation entries).
pd.options.display.max_rows = 500

# Participant whose annotation entries are inspected below
single_user_id = 30.0

# 'video_components' has no timestamps available either (one user can have multiple
# videos), so 'origin_video_id' might still have the best shot for chronological ordering.
single_user_df = trainval_joined_df.loc[trainval_joined_df['fb_participant_id'] == single_user_id]

# Order by source video, then clip start within the video, then action index within the clip
sorted_single_user_df = single_user_df.sort_values(
    ['origin_video_id', 'clip_parent_start_sec', 'action_idx'], ascending=True)

# Last expression of the cell: this is the table that gets rendered
sorted_single_user_df[['origin_video_id', 'clip_parent_start_sec', 'clip_uid', 'action_idx']].head(n=100)

Unnamed: 0,origin_video_id,clip_parent_start_sec,clip_uid,action_idx
38736,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,0
38737,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,1
38738,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,2
38739,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,3
38740,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,4
38741,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,5
38742,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,6
38743,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,7
38744,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,8
38745,unict_013_4865,0.0,f47e0ac5-3d16-4f7b-be00-1363945e07b6,9
