In [1]:
 cd /data/p300488/lang2prog


/data/p300488/lang2prog


# Exploring GQA questions dataset

In [2]:
import os

# Root directory of the local GQA copy.
gqa_path = '/data/p300488/datasets/gqa'
# Locations of the balanced training questions and the training scene graphs,
# both resolved relative to the dataset root.
train_questions_path, train_scenes_path = (
    os.path.join(gqa_path, relative)
    for relative in ('questions/train_balanced_questions.json',
                     'scenes/train_sceneGraphs.json')
)

Read the GQA questions dataset. How large is the training set?

In [3]:
import json

# Parse the whole questions file. The context manager closes the handle as
# soon as parsing is done instead of leaving it open until garbage collection.
with open(train_questions_path, encoding='utf-8') as questions_file:
    ds = json.load(questions_file)
# `ds` is kept as the mapping returned by json.load (not flattened to a list
# of its values); the cells below iterate it with .items() / .values().
print(len(ds))

943000


Let's see the annotation of a training example

In [29]:
from pprint import pprint

# Peek at one annotated example. next(iter(...)) fetches the first value
# lazily instead of copying every value of the dataset into a temporary list.
pprint(next(iter(ds.values())))

{'annotations': {'answer': {},
                 'fullAnswer': {'2': '2486325'},
                 'question': {'2': '2486325'}},
 'answer': 'yes',
 'entailed': ['02930160',
              '02930158',
              '02930159',
              '02930154',
              '02930155',
              '02930156',
              '02930153'],
 'equivalent': ['02930152'],
 'fullAnswer': 'Yes, the sky is dark.',
 'groups': {'global': None, 'local': '06-sky_dark'},
 'imageId': '2354786',
 'isBalanced': True,
 'question': 'Is the sky dark?',
 'semantic': [{'argument': 'sky (2486325)',
               'dependencies': [],
               'operation': 'select'},
              {'argument': 'dark',
               'dependencies': [0],
               'operation': 'verify color'}],
 'semanticStr': 'select: sky (2486325)->verify color: dark [0]',
 'types': {'detailed': 'verifyAttr',
           'semantic': 'attr',
           'structural': 'verify'}}


Let's see the structure and type of the questions.

In [4]:
# For each semantic question type, collect the structural types seen with it.
q_struc_types = {}
for sample in ds.values():
    kinds = sample['types']
    q_struc_types.setdefault(kinds['semantic'], set()).add(kinds['structural'])

pprint(q_struc_types)

{'attr': {'choose', 'query', 'logical', 'compare', 'verify'},
 'cat': {'query', 'choose'},
 'global': {'query', 'verify', 'choose'},
 'obj': {'logical', 'verify'},
 'rel': {'query', 'verify', 'choose'}}


Structure means what is the final reasoning step

 - **verify**:
  yes/no (e.g.  Is the bowl red? 
          ==> select: bowl -> verify color: red [0] ) 

 - **choose**: 
  multiple choice A or B (e.g.  Is it rainy or sunny? 
                          ==> select: it -> choose: rainy|sunny [0])   

 - **compare**: 
  compare attributes of two entities (e.g    Are the bowl and the cup the same color?   
                                      ==>    select: bowl -> select: cup -> same color: [0,1])

 - **logical**:
   two verify branches connected with and/or (e.g.   Is there either a large bowl or a cup?
                                              ==>    select: bowl -> filter size: large [0] -> exist: ? [1] 
                                                  -> select: cup -> exist: ? [3] -> or: [2,4] )

 - **query**:
   ask for the name, an attribute or the position of an entity 
      (e.g.  What is the cup made of?  
       ==>  select: cup -> query: material [0] )

Question type means what the question is about:
   - **attr** : about an attribute of an entity (e.g. Is the bowl red?, Is the glass broken?)
   - **cat**: about the name/category of an entity (e.g.  What soda drink is this?)
   - **global**: about the weather, the location or the background of the scene
   - **obj**: about the existence of certain objects  (e.g. Is there a big dog?)
   - **rel**: contains relationship between two entities  (What is the man sitting in the chair eating?) 

Let's see what are all the different reasoning primitives

In [5]:
# Every distinct operation name that occurs in any question's program.
all_primitives = set()
for sample in ds.values():
    all_primitives.update(step['operation'] for step in sample['semantic'])

pprint(all_primitives)

{'and',
 'choose',
 'choose activity',
 'choose age',
 'choose brightness',
 'choose cleanliness',
 'choose color',
 'choose company',
 'choose depth',
 'choose face expression',
 'choose fatness',
 'choose flavor',
 'choose gender',
 'choose hardness',
 'choose healthier',
 'choose height',
 'choose hposition',
 'choose larger',
 'choose length',
 'choose less healthy',
 'choose location',
 'choose lower',
 'choose material',
 'choose name',
 'choose older',
 'choose opaqness',
 'choose pattern',
 'choose place',
 'choose pose',
 'choose race',
 'choose realism',
 'choose rel',
 'choose shape',
 'choose shorter',
 'choose size',
 'choose smaller',
 'choose sportActivity',
 'choose state',
 'choose taller',
 'choose thickness',
 'choose tone',
 'choose vposition',
 'choose weather',
 'choose weight',
 'choose width',
 'choose younger',
 'common',
 'different',
 'different color',
 'different shape',
 'exist',
 'filter',
 'filter activity',
 'filter age',
 'filter brightness',
 'filter 

These primitives are attribute-aware. Let's see them if we treat attribute as argument:

In [6]:
# Split each attribute-aware operation name into its leading primitive and,
# when present, the attribute token that follows it.
all_attributes = set()
primitives = set()
primitives_and_attributes = {}
for operation in all_primitives:
    primitive, *rest = operation.split(' ')
    primitives.add(primitive)
    bucket = primitives_and_attributes.setdefault(primitive, set())
    if rest:
        # Only the token right after the primitive is recorded, so a
        # multi-word suffix such as 'face expression' is stored as 'face'.
        attribute = rest[0]
        all_attributes.add(attribute)
        bucket.add(attribute)

pprint(primitives)

{'and',
 'choose',
 'common',
 'different',
 'exist',
 'filter',
 'or',
 'query',
 'relate',
 'same',
 'select',
 'verify'}


From these ```verify, filter, query, same/different``` and ```choose``` are associated with attribute concepts. Do all attributes appear in all primitives? Let's see them

In [7]:
# Per primitive: number of attributes and their sorted names
# (primitives without any attribute are skipped).
for primitive, attributes in primitives_and_attributes.items():
    if not attributes:
        continue
    names = ','.join(sorted(attributes))
    print(f"{primitive} ({len(attributes)})\n{names}")
    print()

choose (44)
activity,age,brightness,cleanliness,color,company,depth,face,fatness,flavor,gender,hardness,healthier,height,hposition,larger,length,less,location,lower,material,name,older,opaqness,pattern,place,pose,race,realism,rel,shape,shorter,size,smaller,sportActivity,state,taller,thickness,tone,vposition,weather,weight,width,younger

verify (37)
activity,age,brightness,cleanliness,color,company,depth,face,fatness,flavor,gender,hardness,height,hposition,length,location,material,opaqness,pattern,place,pose,race,realism,rel,room,shape,size,sportActivity,state,texture,thickness,tone,type,vposition,weather,weight,width

filter (37)
activity,age,brightness,cleanliness,color,company,depth,event,face,fatness,flavor,gender,hardness,height,hposition,length,liquid,material,opaqness,orientation,pattern,pose,race,realism,room,shape,size,sport,sportActivity,state,texture,thickness,tone,vposition,weather,weight,width

same (3)
color,material,shape

different (2)
color,shape



For some reason only in the ```query``` primitive attribute concepts are defined as arguments. Same for some cases of ```same/different``` primitives. Let's retrieve them and compare with other attributes.

In [8]:
# For the primitives that may carry their attribute as an argument instead of
# in the operation name, gather the distinct arguments found in the data.
# NOTE: only the first node with a given operation is inspected per program.
temp = {'query': set(), 'same': set(), 'different': set()}
for sample in ds.values():
    program = sample['semantic']
    ops = [node['operation'] for node in program]
    for fn_name, seen in temp.items():
        if fn_name not in ops:
            continue
        arg = program[ops.index(fn_name)]['argument']
        # the literal string 'None' marks a missing argument
        if arg != 'None':
            seen.add(arg)

pprint(temp)

{'different': {'type', 'gender'},
 'query': {'15',
           '16',
           '18',
           '24',
           '25',
           '27',
           '31',
           'activity',
           'age',
           'cleanliness',
           'color',
           'company',
           'depth',
           'face expression',
           'fatness',
           'flavor',
           'gender',
           'hardness',
           'height',
           'hposition',
           'length',
           'material',
           'name',
           'pattern',
           'place',
           'pose',
           'race',
           'realism',
           'room',
           'shape',
           'size',
           'sport',
           'sportActivity',
           'state',
           'texture',
           'thickness',
           'tone',
           'weather',
           'weight',
           'width'},
 'same': {'type', 'gender'}}


What are the numbers?

In [9]:
def print_qa(sample):
    """Print a sample's question (Q), program string (P) and answer (A), followed by a blank line."""
    for tag, key in (('Q', 'question'), ('P', 'semanticStr'), ('A', 'answer')):
        print(f"{tag}: {sample[key]}")
    print()

    
# Show every question whose first `query` node carries one of the odd
# numeric arguments found above.
numeric_query_args = ('15', '16', '18', '24', '25', '27', '31')
for sample in ds.values():
    program = sample['semantic']
    ops = [node['operation'] for node in program]
    if 'query' not in ops:
        continue
    if program[ops.index('query')]['argument'] in numeric_query_args:
        print_qa(sample)

Q: What's the man doing?
P: select: man (758399)->query: 27 [0]
A: taking pictures

Q: What is the person doing?
P: select: person (875671)->query: 15 [0]
A: taking a picture

Q: What is he doing?
P: select: he (2580846)->query: 18 [0]
A: brushing teeth

Q: What's he doing?
P: select: he (1620687)->query: 31 [0]
A: brushing teeth

Q: What are the people to the left of the chair doing?
P: select: chair (653821)->relate: people,to the left of,s (653812) [0]->query: 16 [1]
A: taking pictures

Q: What is the man to the left of the people doing?
P: select: people (2424517)->relate: man,to the left of,s (3867654) [0]->query: 24 [1]
A: making a face

Q: What is the woman doing?
P: select: woman (317132)->query: 25 [0]
A: brushing teeth



Apparently it's noise: the numbers show up in questions of the form "What is/are the __ doing?", which refer to the ``Activity`` attribute.

In [10]:
# Fold the stray numeric arguments into 'activity'; keep every other
# query argument unchanged.
query_atts = {
    'activity' if att in ('15', '16', '18', '24', '25', '27', '31') else att
    for att in temp['query']
}

In [11]:
# Register the query attributes, printing those that no other primitive
# had introduced so far.
query_bucket = primitives_and_attributes['query']
for attribute in query_atts:
    query_bucket.add(attribute)
    if attribute in all_attributes:
        continue
    print(attribute)
    all_attributes.add(attribute)

# Also register the attributes that same/different received as arguments.
for op_name in ('same', 'different'):
    primitives_and_attributes[op_name].update(temp[op_name])

face expression


This one didn't exist in the other primitives, so we added it, together with the missing ```same/different``` concepts. Let's see everything again now:

In [12]:
# Same listing as before, now including the attributes merged in above:
# one entry per primitive that has at least one attribute.
for primitive, attributes in primitives_and_attributes.items():
    if not attributes:
        continue
    listing = ','.join(sorted(attributes))
    print(f"{primitive} ({len(attributes)})\n{listing}", end='\n\n')

choose (44)
activity,age,brightness,cleanliness,color,company,depth,face,fatness,flavor,gender,hardness,healthier,height,hposition,larger,length,less,location,lower,material,name,older,opaqness,pattern,place,pose,race,realism,rel,shape,shorter,size,smaller,sportActivity,state,taller,thickness,tone,vposition,weather,weight,width,younger

verify (37)
activity,age,brightness,cleanliness,color,company,depth,face,fatness,flavor,gender,hardness,height,hposition,length,location,material,opaqness,pattern,place,pose,race,realism,rel,room,shape,size,sportActivity,state,texture,thickness,tone,type,vposition,weather,weight,width

filter (37)
activity,age,brightness,cleanliness,color,company,depth,event,face,fatness,flavor,gender,hardness,height,hposition,length,liquid,material,opaqness,orientation,pattern,pose,race,realism,room,shape,size,sport,sportActivity,state,texture,thickness,tone,vposition,weather,weight,width

same (5)
color,gender,material,shape,type

different (4)
color,gender,shape,type

Let's check out the ```relate``` primitive. It's also associated with some type of relational concepts.

In [13]:
# Collect the relation name used by `relate` nodes. Looking the operation up
# with list.index() would only ever reach the first `relate` node of a
# program, so every node is checked instead.
relate_concepts = set()
for sample in ds.values():
    for node in sample['semantic']:
        if node['operation'] != 'relate':
            continue
        # avoid ground truth tag for image: drop the trailing " (<id>)" part
        arg = node['argument'].split(' (')[0]
        # The first field corresponds to the subject (used for verifying
        # relations); the second one is the relation concept we want.
        fields = arg.split(',')
        if len(fields) != 3:
            raise ValueError(f"unexpected relate argument: {node['argument']!r}")
        relate_concepts.add(fields[1])

pprint(relate_concepts)

{'about to hit',
 'above',
 'across',
 'adjusting',
 'along',
 'approaching',
 'around',
 'at',
 'at the edge of',
 'attached to',
 'balancing on',
 'behind',
 'below',
 'beneath',
 'beside',
 'between',
 'beyond',
 'biting',
 'blowing out',
 'boarding',
 'brushing',
 'buying',
 'by',
 'carrying',
 'catching',
 'chained to',
 'chasing',
 'chewing',
 'cleaning',
 'climbing',
 'close to',
 'coming down',
 'coming from',
 'coming out of',
 'connected to',
 'contain',
 'cooked in',
 'cooking',
 'covered by',
 'covered in',
 'covered with',
 'covering',
 'crossing',
 'cutting',
 'decorated by',
 'decorated with',
 'decorating',
 'displayed in',
 'displayed on',
 'down',
 'dragging',
 'draped over',
 'drawn on',
 'drinking',
 'drinking from',
 'driving',
 'driving down',
 'driving on',
 'eating',
 'eating at',
 'eating from',
 'eating in',
 'enclosing',
 'entering',
 'exiting',
 'facing',
 'falling off',
 'feeding',
 'filled with',
 'floating in',
 'floating on',
 'flying',
 'flying above',


These are pretty generic. Let's consider them as concept values of the ``rel`` concept also included as attribute. 

What are the concept values for all other attribute concepts? 

In [16]:
# Gather the concept values seen for every attribute. Values of operations
# that do not name an attribute are pooled under the extra 'generic' key.
attribute_values = {**{a: set() for a in all_attributes}, 'generic': set()}

for sample in ds.values():
    for node in sample['semantic']:
        fn = node['operation']
        arg = node['argument']
        op_tokens = fn.split()

        # adapt arg for rels: drop the " (<id>)" tag and keep the second
        # comma-separated field, i.e. the relation itself
        if len(op_tokens) == 2 and op_tokens[1] == "rel":
            arg = arg.split(' (')[0].split(',')[1]

        # not(x) -> x. Only unwrap arguments that really have the not(...)
        # form; a value that merely begins with "not" is left untouched.
        candidate = arg.strip()
        if candidate.startswith('not(') and candidate.endswith(')'):
            arg = candidate[4:-1]

        if fn in ['filter', 'verify', 'same', 'different']:
            # bare primitive without an attribute
            attribute_values['generic'].add(arg.strip())

        elif fn.startswith(('filter', 'verify', 'same', 'different')):
            # Only the second token of the operation name identifies the
            # attribute, mirroring how all_attributes was built.
            attribute = op_tokens[1]
            assert attribute in all_attributes
            attribute_values[attribute].add(arg.strip())

        elif fn.startswith('choose'):
            # choose arguments list the alternatives separated by '|'
            args = set(x.strip() for x in arg.split('|'))
            if len(op_tokens) == 1:
                attribute_values['generic'].update(args)
            else:
                attribute = op_tokens[1]
                assert attribute in all_attributes
                attribute_values[attribute].update(args)

pprint(attribute_values)
# making sure our attributes as computed here agree with before
print(len(attribute_values), len(all_attributes))
print((set(attribute_values.keys()).difference(all_attributes)))

{'activity': {'cooking',
              'drinking',
              'driving',
              'eating',
              'having meeting',
              'looking down',
              'looking up',
              'playing',
              'posing',
              'reading',
              'resting',
              'sleeping',
              'staring',
              'talking',
              'waiting'},
 'age': {'young', 'old', 'little'},
 'brightness': {'dark', 'bright'},
 'cleanliness': {'stained', 'tinted', 'dirty', 'clean'},
 'color': {'',
           'beige',
           'black',
           'blond',
           'blue',
           'brown',
           'brunette',
           'cream colored',
           'dark',
           'dark blue',
           'dark brown',
           'gold',
           'gray',
           'green',
           'khaki',
           'light blue',
           'light brown',
           'maroon',
           'orange',
           'pink',
           'purple',
           'red',
           'silver'

The only difference is the ``generic`` concept, which we defined ourselves above. This covers all cases where primitives (e.g. ``filter, verify, same`` etc.) were not followed by a specific attribute, so we organize them together. The ``generic`` attribute values are in the long-tail:

In [21]:
# Show the pooled values of operations that name no specific attribute.
pprint(attribute_values['generic'])

{'abandoned',
 'abundant',
 'adult',
 'alert',
 'american',
 'analog',
 'angled',
 'antique',
 'apple',
 'arched',
 'artificial',
 'assorted',
 'athletic',
 'attached',
 'baby',
 'baked',
 'bald',
 'balding',
 'banana',
 'bare',
 'barefoot',
 'barren',
 'baseball',
 'beautiful',
 'bent',
 'black and white',
 'blank',
 'blooming',
 'blowing',
 'blurry',
 'boiled',
 'breaking',
 'bright',
 'broken',
 'browned',
 'brushing teeth',
 'bunched',
 'burning',
 'burnt',
 'bushy',
 'busy',
 'calico',
 'carved',
 'cast on shadow',
 'ceramic',
 'chinese',
 'chipped',
 'chopped',
 'chubby',
 'clear',
 'closed',
 'clumped',
 'cluttered',
 'collared',
 'colorful',
 'comfortable',
 'commercial',
 'complete',
 'computer',
 'connected',
 'cooked',
 'cordless',
 'covered',
 'cracked',
 'creamy',
 'crisp',
 'crispy',
 'crooked',
 'crossed',
 'crowded',
 'crumbled',
 'crumpled',
 'crusty',
 'curious',
 'curled',
 'curly',
 'curved',
 'curvy',
 'cushioned',
 'cut',
 'cylindrical',
 'damaged',
 'dark',
 'dar

Let's compare concept values for the attribute ``rel`` and relation values we extracted before

In [22]:
# Differences between the values seen for the `rel` attribute and the relation
# names taken from `relate` nodes. Sorting keeps the listing stable across
# runs; joining a set directly yields an arbitrary order.
pprint(', '.join(sorted(attribute_values['rel'].difference(relate_concepts))))
print()
pprint(', '.join(sorted(relate_concepts.difference(attribute_values['rel']))))

# Merge both vocabularies. This mutates attribute_values['rel'] in place, so
# re-running the cell afterwards shows an empty second difference.
attribute_values['rel'].update(relate_concepts)

'looking'

('walking across, smaller than, on the front of, on the side of, parked along, '
 'splashing, eating in, grazing in, beside, same color, in between, standing '
 'around, walking into, sewn on, down, growing near, wading in, traveling '
 'down, at, on the other side of, parked alongside, skiing down, placed on, '
 'next to, coming down, growing from, displayed in, playing at, sticking out '
 'of, walking through, beyond, longer than, larger than, walking up, higher '
 'than, of, on the bottom of, across, with, in the center of, on the back of, '
 'near, growing next to, close to, driving down, outside, chained to, along, '
 'same shape, at the edge of, growing in, growing along, in the middle of, '
 'same material, by, blowing out, looking toward, drawn on, sitting around, '
 'over, looking for, pointing to')


They're pretty much the same; a few concept values that exist when using the ``relate`` primitive don't exist when relation is treated as an attribute concept for ``filter, verify`` etc. We included everything in the vocabulary.

But some attributes don't have values. Let's check them out:

In [23]:
# Attributes whose only recorded value is the empty string.
no_value_attributes = []
for name, values in attribute_values.items():
    if values == {''}:
        no_value_attributes.append(name)
pprint(no_value_attributes)

['healthier',
 'younger',
 'smaller',
 'shorter',
 'less',
 'older',
 'larger',
 'lower',
 'taller']


They are all *comparatives*, meaning they compare a certain entity with other entities of the same semantic category. E.g. *''larger bowl''* -> *''bowl larger than other bowls''*. We'll probably need to treat relations with these arguments as separate primitives (e.g. ``relateSuper``).

# Exploring GQA scene graphs

In [24]:
# Load the training scene graphs (a mapping keyed by image id). The context
# manager closes the file handle as soon as parsing is done.
with open(train_scenes_path, encoding='utf-8') as scenes_file:
    scenes = json.load(scenes_file)
print(f"Total scenes: {len(scenes)}")
print(f'Some image ids: {",".join(list(scenes.keys())[:10])}')

Total scenes: 74942
Some image ids: 2386621,2373554,2370799,2370791,2370790,2332650,2373556,2414608,2373557,2413658


Let's inspect a scene graph, and all questions about it

In [25]:
image_id = '2370799'
scene = scenes[image_id]

# scene location and objects, attributes and relations of scene graph
objects = scene['objects']
# The location belongs to the selected scene. Looking it up in the top-level
# `scenes` mapping (whose keys are image ids) could never match.
if 'location' in scene:
    print(f"I see {scene['location']}")
print()
for oid, obj in objects.items():
    attributes = ','.join(obj['attributes'])
    # each relation points at another object of the same scene by its id
    relations = ', '.join(
        f"{rel['name']} {objects[rel['object']]['name']}" for rel in obj['relations']
    )
    print(f"I see {attributes} {obj['name']} ({oid}), {relations}")
    print()

print('==' * 48)

# all questions about that scene
for qid, sample in ds.items():
    if sample['imageId'] == image_id:
        print_qa(sample)


I see  brush (2588934), to the right of mud, to the right of water, to the left of branch

I see  water (2071415), to the left of grass, to the left of brush

I see  branch (1884387), to the right of brush

I see  men (2375807), to the left of bike, to the right of bag, to the left of helmet, to the left of man, riding bike

I see  man (3856641), to the right of men, wearing helmet, to the right of bike, riding bike, to the right of bag

I see  mud (2305851), to the left of grass, to the left of brush

I see  dirt (2198303), to the left of bike, to the left of tires

I see  boot (2205964), 

I see  helmet (1974896), to the right of men

I see blue bike (3856635), to the right of dirt, to the left of bike, to the left of man

I see  tires (3856636), to the right of dirt

I see orange bike (1828300), to the right of bike, to the right of men

I see black bag (2129762), to the left of men, to the left of man

I see old tree (2198302), 

I see blue helmet (2504852), 

I see tall grass (18

As we see, all questions can be answered by the attributes and relations listed in the scene graph

# Executing GQA Programs