This notebook contains sample code to explore the data annotations in DVD. The notebook runs on an example video and dialogue from the DVD benchmark. 

In [1]:
import json
import pickle as pkl
from IPython.display import Video

# Helper functions
The following functions are defined to extract question types/subtypes, turn distances of object references in a question, video input size, etc.

In [2]:
def get_question_type(template, prior_template):
    """Classify a question template into its coarse question type.

    The type is decided by key phrases found in the first text variant of the
    template (``template['text'][0]``, compared case-insensitively). Phrases
    are tested in a fixed order, so e.g. 'how many times' (action count) wins
    over the more general 'how many' (obj count).

    Args:
        template: dict with a 'text' list whose first entry is the question
            text of the current turn.
        prior_template: template of the previous turn, or None. It is only
            consulted for "what about ..." follow-up questions, which take
            the type of the previous turn.

    Returns:
        One of 'compare action set', 'compare action sequence', 'compare int',
        'action count', 'obj count', 'obj exist', 'attr query', 'action query'.

    Raises:
        AssertionError: if the text matches no known phrase and is not a
            "what about" follow-up.
        ValueError: if the text is a "what about" follow-up but no
            prior_template was given to resolve it.
    """
    text = template['text'][0].lower()

    # (phrases, question type) pairs; the first rule with any phrase present wins.
    phrase_rules = (
        (('same set of activities',), 'compare action set'),
        (('same sequence of activities',), 'compare action sequence'),
        (('frequently',), 'compare int'),
        (('how many times',), 'action count'),
        (('how many', 'what number'), 'obj count'),
        (('is there',), 'obj exist'),
        (('what color', 'what material', 'what shape', 'what size'), 'attr query'),
        (('what type of action', 'what is the', 'what types of action'), 'action query'),
    )
    for phrases, qtype in phrase_rules:
        if any(phrase in text for phrase in phrases):
            return qtype

    # Anything left must be a topic-transfer question that reuses the
    # question type of the previous turn.
    assert 'what about' in text, "unrecognised question template: {!r}".format(text)
    if prior_template is None:
        raise ValueError(
            "cannot resolve follow-up question {!r} without a prior template".format(text))
    return get_question_type(prior_template, None)

def get_question_subtype(template, prior_template):
    """Classify a question template into its fine-grained question subtype.

    Works like get_question_type but refines several types, e.g. which
    attribute is queried or whether a comparison asks for a count or for
    existence. The decision is based on key phrases in the first text variant
    of the template (``template['text'][0]``, compared case-insensitively).

    Args:
        template: dict with a 'text' list whose first entry is the question
            text of the current turn.
        prior_template: template of the previous turn, or None. It is only
            consulted for "what about ..." follow-ups that do not name an
            attribute; those take the subtype of the previous turn.

    Returns:
        The subtype label, e.g. 'compare action seq (count)',
        'compare int (less)', 'attr query (color)', 'action query (by freq)'.

    Raises:
        AssertionError: if the text matches no known phrase and is not a
            plain "what about" follow-up.
        ValueError: if a 'frequently' question has no as/less/more qualifier,
            or if a follow-up has no prior_template to resolve it.
    """
    text = template['text'][0].lower()
    attributes = ('color', 'material', 'shape', 'size')

    if 'same set of activities' in text:
        return 'compare action set (count)' if 'how many' in text else 'compare action set (exist)'
    if 'same sequence of activities' in text:
        return 'compare action seq (count)' if 'how many' in text else 'compare action seq (exist)'
    if 'frequently' in text:
        for marker, label in (('as frequently', 'equal'),
                              ('less frequently', 'less'),
                              ('more frequently', 'more')):
            if marker in text:
                return 'compare int ({})'.format(label)
        # Previously this fell through with the result variable unset.
        raise ValueError("unrecognised frequency comparison: {!r}".format(text))
    # 'how many times' must be tested before the more general 'how many'.
    if 'how many times' in text:
        return 'action count'
    if 'how many' in text or 'what number' in text:
        return 'obj count'
    if 'is there' in text:
        return 'obj exist'
    # Attribute queries, asked directly or as a "what about its <attr>" follow-up.
    for attr in attributes:
        if 'what {}'.format(attr) in text or 'what about its {}'.format(attr) in text:
            return 'attr query ({})'.format(attr)
    if 'what type of action' in text or 'what is the' in text or 'what types of action' in text:
        # The text is lower-cased, so the placeholders appear as <o> / <f>.
        if '<o>' in text:
            return 'action query (by order)'
        if '<f>' in text:
            return 'action query (by freq)'
        return 'action query (all actions)'

    # Anything left must be a topic-transfer question that reuses the subtype
    # of the previous turn; attribute follow-ups were handled above.
    assert 'what about' in text, "unrecognised question template: {!r}".format(text)
    assert not any(attr in text for attr in attributes), \
        "unexpected attribute follow-up: {!r}".format(text)
    if prior_template is None:
        raise ValueError(
            "cannot resolve follow-up question {!r} without a prior template".format(text))
    return get_question_subtype(prior_template, None)

def get_interval_type(turn):
    """Return the interval type label of a dialogue turn.

    Reads ``turn['template']['interval_type']`` and, for atomic intervals,
    uses the template filename to tell spatial templates (filename contains
    'one_hop') from non-spatial ones.

    Returns:
        'none', 'compositional', 'atomic (spatial)' or 'atomic (non-spatial)';
        None for any other interval type.
    """
    question_template = turn['template']
    filename = turn['template_filename']
    kind = question_template['interval_type']
    if kind == 'atomic':
        return 'atomic (spatial)' if 'one_hop' in filename else 'atomic (non-spatial)'
    if kind in ('none', 'compositional'):
        # These two interval types are reported under their own name.
        return kind
    return None

In [3]:
def compute_long_term_or_turn_dist(turn_idx, turn):
    """Compute the turn distance of the long-term object references in a turn.

    A turn can carry two long-term references: one in the question itself
    (``turn_dependencies['object'] == 'earlier_unique'``) and one in its
    temporal interval (``turn_dependencies['temporal']`` contains
    'earlier_unique'). If there is more than one object reference, the highest
    turn distance is returned.

    Args:
        turn_idx: index of the current turn in the dialogue.
        turn: the turn dict, with 'template' and 'turn_dependencies' entries.

    Returns:
        The largest ``turn_idx - original_turn + 1`` over the referenced
        objects, or 0 when the turn has no long-term object reference.

    Raises:
        KeyError: if a referenced object id is not in
            ``template['used_objects']``.
    """
    template = turn['template']
    used_objects = template['used_objects']
    dependencies = turn['turn_dependencies']

    def _turn_distance(raw_obj_id, source_field):
        # used_objects is keyed by the string form of the object id.
        obj_id = str(raw_obj_id)
        if obj_id not in used_objects:
            raise KeyError(
                "object {} (from template['{}']) is not in template['used_objects']"
                .format(obj_id, source_field))
        return turn_idx - used_objects[obj_id]['original_turn'] + 1

    all_dists = [0]
    if dependencies['object'] == 'earlier_unique':
        all_dists.append(
            _turn_distance(template['earlier_unique_obj'], 'earlier_unique_obj'))
    if 'earlier_unique' in dependencies['temporal']:
        all_dists.append(
            _turn_distance(template['temporal_obj_id'], 'temporal_obj_id'))
    return max(all_dists)

def get_long_term_or_turn_dist(turn_idx, turn, effective_or_only=True):
    """Obtain the turn distance of long-term object references (OR) in a turn.

    With effective_or_only=True, only objects that cannot be uniquely
    identified from the video alone are tracked, e.g. when there is more than
    one red cube in the video, 'aforementioned red cube' is effective.

    Returns:
        0 for the first turn or when the turn has no long-term reference,
        -1 when effective_or_only is set and the reference is not flagged as
        effective on the turn, otherwise the turn distance.
    """
    if not turn_idx > 0:
        # The first turn has no dialogue context to refer back to.
        return 0

    distance = compute_long_term_or_turn_dist(turn_idx, turn)
    if distance == 0 or not effective_or_only:
        return distance

    tracking_flags = ('effective_obj_tracking', 'effective_temporal_obj_tracking')
    is_effective = any(flag in turn and turn[flag] for flag in tracking_flags)
    return distance if is_effective else -1

In [4]:
def get_start_end_time(period):
    """Convert a (start event, end event) period into (start, end) frame ids.

    The frame id is taken from the last element of each event. A missing
    (None) start event maps to frame 0 and a missing end event to frame 301.
    """
    start_event, end_event = period
    first_frame = 0 if start_event is None else start_event[-1]
    last_frame = 301 if end_event is None else end_event[-1]
    return first_frame, last_frame

def get_interval_size(period):
    """Return the length of a period in frames (end frame minus start frame).

    A period of None has size 0.
    """
    if period is not None:
        first_frame, last_frame = get_start_end_time(period)
        return last_frame - first_frame
    return 0

def get_interval_relative_size(template, effective_local_interval_only=True, turn=None):
    """Obtain the interval size as a ratio of the whole video input size.

    With effective_local_interval_only=True, the size is returned only when
    the question cannot be solved with the WHOLE video, e.g. when the current
    interval has only 1 moving red cube but the whole video has 2 moving red
    cubes, 'during <interval>, how many moving red cubes are there' is
    effective.

    Args:
        template: the turn's template; its last 'used_periods' entry is the
            interval of the turn and 'cutoff' is the end event of the video
            input.
        effective_local_interval_only: when True, return -1 for turns whose
            'effective_temporal_localizing' flag is falsy.
        turn: the turn dict that owns `template`. Optional for backward
            compatibility: when omitted, the module-level variable `turn`
            is used, which is what this function always read before.

    Returns:
        -1 for a non-effective interval (see above), otherwise the interval
        size divided by the size of the whole video input.

    Raises:
        NameError: if the effectiveness check is requested, `turn` is not
            passed and no module-level `turn` exists.
    """
    period = template['used_periods'][-1]
    whole_video_period = (None, template['cutoff'])
    if effective_local_interval_only:
        if turn is None:
            # Legacy path: fall back to the notebook-global loop variable.
            # Prefer passing `turn` explicitly to avoid hidden state.
            turn = globals().get('turn')
            if turn is None:
                raise NameError(
                    "name 'turn' is not defined; pass turn=... to "
                    "get_interval_relative_size")
        if not turn['effective_temporal_localizing']:
            return -1
    return get_interval_size(period) / get_interval_size(whole_video_period)

# CATER Video
- Originally, each CATER video file contains the annotations of objects (size, color, material, shape) and their actions (start/end time). 
- After following the preprocessing code for CATER videos, each video file is extended with a list of possible intervals (atomic/compositional) and precomputed object actions and spatial relations in each interval. 
- The object ID (0 to 9) and time frame ID (1 to 301) are used to process the functional programs in questions in DVD.

In [5]:
# Original CATER video, rendered inline with IPython.display.Video.
# The path is relative, so the .mp4 is looked up relative to the notebook's working directory.
Video("CATER_new_000002.mp4")

In [6]:
# Load the annotations of the example video.
# NOTE: unpickling can execute arbitrary code; only load annotation files
# that come from a trusted source.
with open('CATER_new_000002.pkl', 'rb') as vid_file:
    vid = pkl.load(vid_file)

In [7]:
# List the video's file name and the attributes of every annotated object;
# the position of an object in vid['objects'] is printed as its object ID.
print("Video: {}".format(vid['image_filename']))
for obj_idx, obj in enumerate(vid['objects']):
    attributes = (obj['size'], obj['color'], obj['material'], obj['shape'])
    print("Obj ID {}: {} {} {} {}".format(obj_idx, *attributes))

Video: CLEVR_new_000002.avi
Obj ID 0: small gold metal spl
Obj ID 1: medium purple metal cone
Obj ID 2: large green metal cone
Obj ID 3: large yellow rubber cylinder
Obj ID 4: small cyan metal cylinder
Obj ID 5: large green rubber cube
Obj ID 6: large blue metal cylinder
Obj ID 7: large red rubber sphere
Obj ID 8: medium gray metal cone
Obj ID 9: small purple rubber cylinder


In [8]:
# The JSON file holds a list; its first element is the example dialogue
# (itself a list of turns). The file is closed as soon as it has been parsed.
with open('CLEVR_new_000002.json', 'r') as dial_file:
    dial = json.load(dial_file)[0]

# DVDialogue
- Each json file contains one dialogue for a specific video from CATER. All object ids/actions and frame ids are referenced from the annotations of the CATER video. 
- Each dialogue has 10 turns. In each turn, the data is a dictionary with the following attributes: 
    - `question`: a question about the video
    - `answer`: an answer to the above question based on the visual content of the video 
    - `turn_dependencies`: the cross-turn dependencies that are embedded in this turn. The 1st turn of each dialogue always has `none` type dependencies (no cross-turn relations) 
        - `temporal`: relations that determine the video interval of the current turn, including: 
            - `<1/2/3/4>_<flying/sliding/rotating>_among_<before/after/during>`: action reference to a set of actions in the previous turn e.g. "among them, after the third slide" 
            - `prior_<flying/sliding/rotating>_<before/after/during>`: action reference to a unique action in the previous turn e.g. "during this slide"
            - `after`/`before`/`during`: interval references to the interval of the previous turn e.g. "after this period" 
            - `video_update`: topic transfer (temporal) with incremental video input to the video input of the previous turn e.g. "what about up until now"
            - `earlier_unique_obj_none`: interval with long-term object references e.g. "during the aforementioned yellow thing 's first rotation"
            - `last_unique_obj_none`: interval with short-term object references e.g. "before its third rotation"
        - `spatial`: topic transfer (spatial) from the previous turn, including: 
            - `left` / `right`/ `front`/ `behind` e.g. "what about to the left of it?"
        - `attribute`: topic transfer (attribute) from the previous turn, including: 
            - `query_color`/ `query_shape`/ `query_size`/ `query_material` e.g. "what about its color?" 
        - `object`: object references to objects mentioned in the dialogue context, including:  
            - `earlier_unique`: long-term object references (> 1 turn distance) e.g. "the earlier mentioned red sphere"
            - `last_unique`: short-term object references (1-turn distance) e.g. "them", "it"
    - `program`: the functional program that is used to solve the question in a multi-step reasoning process. This is a sequence of nodes, each node including the following attributes: 
        - `type`: type of nodes e.g. `filter_color`, `count_object`, etc. 
        - `inputs`: indices of the preceding nodes; their outputs are inputs to the current node 
        - `side_inputs`: parameters of the current node e.g. "green", "yellow", "rubber", "before", "after", etc. 
        - `_output`: the output of the current node e.g. object count, object ids, interval period by start/end frame id 
        - Please refer to the Appendix in the paper for more details of functional program types and data types 
    - `template`: template of the question, containing the information to determine the question interval type and question type/subtype. Other information includes: 
        - `cutoff`: the cutoff event from the original CATER video. The input video of this turn will be from frame #0 to the cutoff event
        - `used_periods`: contains all time periods up to the current turn. Each period is determined by a start event and an end event 
        - event: each cutoff event or start/end event is defined by the start/end time of an object action. An event is in the form of `[<object_id>,start/end_rotating/sliding/flying, <order>, <frame id>]`
        - if an event is `None`, it is either the start or the end of the original CATER video 
        - `used_objects`: all unique objects that are mentioned up to the previous turn. This is used to solve any long-term object references in the question of the current turn. This is a dictionary with the object id as key, and the values are:
            - `original_turn`: the original turn id the object was mentioned 
            - object attributes mentioned in the dialogue so far: `<Z>`: size, `<C>`: color, `<M>`: material, `<S>`: shape 
- The code in the next cell shows how these information fields can be extracted from a sample dialogue 

In [9]:
# Walk through the example dialogue and print, for every turn, the fields
# described in the markdown above together with the values derived by the
# helper functions defined earlier in this notebook.
print("Number of turns: {}".format(len(dial)))
# The video ID combines the split name and the video file name of the first turn.
video = '{}-{}'.format(dial[0]['split'], dial[0]['image_filename'])
print("Video ID: {}".format(video))
print()
# NOTE: get_interval_relative_size reads the module-level `turn` set by this
# loop when asked for the effective-only size, so the loop variable must keep
# this name.
for turn_idx, turn in enumerate(dial):
    print("Turn #{}".format(turn_idx+1))
    template = turn['template']
    # "what about ..." follow-ups take their type from the previous turn's
    # template; the first turn has no previous turn.
    if turn_idx>0:
        prior_template = dial[turn_idx-1]['template']
    else:
        prior_template = None
    question_type = get_question_type(template, prior_template)
    question_subtype = get_question_subtype(template, prior_template)
    interval_type = get_interval_type(turn)
    print("Video input: [0,{}]".format(template['cutoff']))
    print("Question: {}".format(turn['question']))
    print("Question Type: {} & Subtype: {}".format(question_type, question_subtype))
    # The last entry of used_periods is the period of the current turn.
    print("Interval type: {} & Interval temporal period: {}".format(interval_type, template['used_periods'][-1]))
    print("Interval relative size: {}".format(get_interval_relative_size(template, False)))
    print("Interval relative size (Effective only): {}".format(get_interval_relative_size(template, True)))
    print("Program:")
    # Print the program nodes one per line, prefixed with the node index.
    for node_idx, node in enumerate(turn['final_all_program']): 
        print(node_idx, node)
    print("Answer: {}".format(turn['answer']))
    print("Turn dependencies: {}".format(turn['turn_dependencies']))
    print("Tracked objects:")
    for k,v in template['used_objects'].items():
        print('{}:{}'.format(k,v))
    # Turn distance over all long-term references first, then restricted to
    # the references flagged as effective on the turn.
    print("Long-term object reference turn distance: {}".format(get_long_term_or_turn_dist(turn_idx, turn, False)))
    print("Long-term object reference turn distance (Effective only): {}".
          format(get_long_term_or_turn_dist(turn_idx, turn, True)))
    print("Number of contained objects in question: {} IDs: {}".
          format(len(turn['contained_objs']), turn['contained_objs']))
    print("=======================================================")

Number of turns: 10
Video ID: all_actions-CLEVR_new_000002.avi

Turn #1
Video input: [0,[5, 'start_rotating', 4, 123]]
Question: during the whole video , how many other objects perform the same sequence of activities as the yellow object ?
Question Type: compare action sequence & Subtype: compare action seq (count)
Interval type: compositional & Interval temporal period: [None, [5, 'start_rotating', 4, 123]]
Interval relative size: 1.0
Interval relative size (Effective only): -1
Program:
0 {'type': 'scene', 'inputs': [], '_output': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}
1 {'type': 'filter_color', 'inputs': [0], 'side_inputs': ['yellow'], '_output': [3]}
2 {'type': 'unique', 'inputs': [1], '_output': 3}
3 {'type': 'same_action_seq', 'inputs': [2], '_output': [4]}
4 {'type': 'count_object', 'inputs': [3], '_output': 1}
Answer: 1
Turn dependencies: {'temporal': 'none', 'object': 'none', 'attribute': 'none', 'spatial': 'none'}
Tracked objects:
Long-term object reference turn distance: 0
Long-term