In [1]:
 cd /data/p300488/lang2prog

/data/p300488/lang2prog


# Exploring CLEVR questions dataset

In [2]:
import os
import json
from pprint import pprint 
import numpy as np

root = '/data/p300488/lang2prog'

# dataset
clevr_path = '/data/p300488/datasets/clevr/CLEVR_v1.0'
train_questions_path = os.path.join(clevr_path, 'questions/CLEVR_train_questions.json')
val_questions_path = os.path.join(clevr_path, 'questions/CLEVR_val_questions.json')
test_questions_path = os.path.join(clevr_path, 'questions/CLEVR_test_questions.json')

# generalization split
cogent_path = '/data/p300488/datasets/clevr/CLEVR_CoGenT_v1.0'
gen_trainA_questions_path = os.path.join(cogent_path, 'questions/CLEVR_trainA_questions.json')
gen_valA_questions_path = os.path.join(cogent_path, 'questions/CLEVR_valA_questions.json')
gen_valB_questions_path = os.path.join(cogent_path, 'questions/CLEVR_valB_questions.json')

Read CLEVR questions dataset. Size of training set?

In [3]:
ds = json.load(open(train_questions_path))['questions']
print(len(ds))

699989


See the structure of a sample, it contains a program annotation for the question.

In [4]:
pprint(ds[0])

{'answer': 'yes',
 'image_filename': 'CLEVR_train_000000.png',
 'image_index': 0,
 'program': [{'function': 'scene', 'inputs': [], 'value_inputs': []},
             {'function': 'filter_size',
              'inputs': [0],
              'value_inputs': ['large']},
             {'function': 'filter_color',
              'inputs': [1],
              'value_inputs': ['green']},
             {'function': 'count', 'inputs': [2], 'value_inputs': []},
             {'function': 'scene', 'inputs': [], 'value_inputs': []},
             {'function': 'filter_size',
              'inputs': [4],
              'value_inputs': ['large']},
             {'function': 'filter_color',
              'inputs': [5],
              'value_inputs': ['purple']},
             {'function': 'filter_material',
              'inputs': [6],
              'value_inputs': ['metal']},
             {'function': 'filter_shape',
              'inputs': [7],
              'value_inputs': ['cube']},
             {'function': 'c

Let'see all the different reasoning primitives and their related concept values

In [5]:
all_primitives = set()
for sample in ds:
    for node in sample['program']:
        _fn = node['function']
        _side_input =  '[' + node['value_inputs'][0] + ']' if node['value_inputs'] else ''
        all_primitives.add(_fn + _side_input)

pprint(all_primitives)

{'count',
 'equal_color',
 'equal_integer',
 'equal_material',
 'equal_shape',
 'equal_size',
 'exist',
 'filter_color[blue]',
 'filter_color[brown]',
 'filter_color[cyan]',
 'filter_color[gray]',
 'filter_color[green]',
 'filter_color[purple]',
 'filter_color[red]',
 'filter_color[yellow]',
 'filter_material[metal]',
 'filter_material[rubber]',
 'filter_shape[cube]',
 'filter_shape[cylinder]',
 'filter_shape[sphere]',
 'filter_size[large]',
 'filter_size[small]',
 'greater_than',
 'intersect',
 'less_than',
 'query_color',
 'query_material',
 'query_shape',
 'query_size',
 'relate[behind]',
 'relate[front]',
 'relate[left]',
 'relate[right]',
 'same_color',
 'same_material',
 'same_shape',
 'same_size',
 'scene',
 'union',
 'unique'}


In this formalism, the primitives are both concept-aware (``filter_color, filter_size`` etc.), as well as vocabulary-aware (``filter_color[red], filter_color[blue]``, etc ). Let's create a version which decouples specific concept values from the primitives (*vocabulary-agnostic*):

In [19]:
vocab_agnostic_primitives = set()
concept_agnostic_primitives = set()
for fn in all_primitives:
    f = fn.split('[')[0]
    vocab_agnostic_primitives.add(f)
    if len(f.split('_')) > 1:
        f = f.split('_')[0] if f.split('_')[1] not in ['than', 'integer'] else f
    concept_agnostic_primitives.add(f)

pprint(vocab_agnostic_primitives)

{'count',
 'equal_color',
 'equal_integer',
 'equal_material',
 'equal_shape',
 'equal_size',
 'exist',
 'filter_color',
 'filter_material',
 'filter_shape',
 'filter_size',
 'greater_than',
 'intersect',
 'less_than',
 'query_color',
 'query_material',
 'query_shape',
 'query_size',
 'relate',
 'same_color',
 'same_material',
 'same_shape',
 'same_size',
 'scene',
 'union',
 'unique'}


And the most general formalism, without concept-awareness (*concept_agnostic*)\:

In [20]:
pprint(concept_agnostic_primitives)

{'count',
 'equal',
 'equal_integer',
 'exist',
 'filter',
 'greater_than',
 'intersect',
 'less_than',
 'query',
 'relate',
 'same',
 'scene',
 'union',
 'unique'}


Let's give some context on different primitive types:

   - **Operational** : ``scene``: Initializes a set of objects given RGB image, ``unique``: {n} -> n
   
   - **Logical**: ``union/intersection``: union / intersection of two sets (outputs of two reasoning branches),
   
   - **Enumeration**: ``exist``: is a set non-empty?, ``count``: size of set, ``less_than/greater_than/equal_integer``: compares two integers
   
   - **Visual**: ``filter``: isolate object set based on attribute value, ``query``: ask for an attribute value, ``same``: object set which has same attribute value as given, ``equal``: whether two objects have equal attribute value
   
   - **Spatial**: ``relate``: object set which has certain spatial relation to given object

# Building Language-to-Program datasets

## Tokenizing Programs with different formalisms

Let's convert the program annotations to a universal format, which also works for GQA dataset. The format breaks down each reasoning primitive as a ``ProgramNode`` object, consisting of a function call (*concept_agnostic*, e.g. ``filter``), a ``concept_input`` field (e.g. ``color``), a ``value_input`` field (e.g. ``[blue]``) and an ``inputs`` field, denoting the index of the reasoning steps whose output is input for the current step. 

This representation allows us to restructure the program annotations in whatever fashion (*concept_agnostic*, *vocab-agnostic* etc.) we want for different learning models.

In [3]:
from typings import *
from pprint import pprint

In [6]:
CLEVR_CONCEPTS = ["color", "material", "size", "shape"]

def formalize_program_annots(ds, concept_list=CLEVR_CONCEPTS):
    programs = []
    for sample in ds:
        _nodes = []
        for i, node in enumerate(sample['program']):
            _fn_toks = node['function'].split('_')
            if len(_fn_toks) > 1 and _fn_toks[1] in concept_list:
                _fn, _concept = _fn_toks
            else:
                _fn = '_'.join(_fn_toks)
                _concept = None
            _value = None if not node['value_inputs'] else node['value_inputs'][0]
            _nodes.append(ProgramNode(step=i,
                                     function=_fn,
                                     inputs=node['inputs'],
                                     concept_input=_concept,
                                     value_input=_value,
            ))
        programs.append(_nodes)
    return programs


# do for training set
train_progs = formalize_program_annots(ds)

In [7]:
# Alternatively, if you already have the checkpoint load from here
train_progs = json.load(open('./checkpoints/clevr_programs/train.json'))
train_progs = [[ProgramNode(**node) for node in p] for p in train_progs]

FileNotFoundError: [Errno 2] No such file or directory: './checkpoints/clevr_programs/CLEVR_train_programs.json'

Let's see some processed program annotations:

In [8]:
pprint(train_progs[:2])

NameError: name 'train_progs' is not defined

Repeat for val and test datasets as well and save them under ``checkpoints`` folder

In [9]:
save_dir = os.path.join(root, "checkpoints/clevr/trainval/programs")
if not os.path.isdir(save_dir):
    os.mkdir(save_dir)

with open(os.path.join(save_dir, 'train.json'), 'w') as f:
    json.dump([[p.__dict__ for p in ps] for ps in train_progs], f)
    
_ds = json.load(open(val_questions_path))['questions']
_progs = formalize_program_annots(_ds)
with open(os.path.join(save_dir, 'val.json'), 'w') as f:
    json.dump([[p.__dict__ for p in ps] for ps in _progs], f)

# TEST set has no program annots :P
# _ds = json.load(open(test_questions_path))['questions']
# _progs = formalize_program_annots(_ds)
# with open(os.path.join(save_dir, 'CLEVR_test_programs.json'), 'w') as f:
#     json.dump([[p.__dict__ for p in ps] for ps in _progs], f)

del _ds, _progs

We develop some tools to generate the program vocabulary and tokenize the datasets. Note that we convert all programs to chains and reverse them as the final annotation.

In [4]:
from program_tokenizer import ProgramTokenizer

### v.0) Domain-Specific Version 

Lets tokenize the training programs with all domain information (*concept-aware, vocab-aware*) and spectate the resulting vocabulary. It looks exactly like the ``all_primitives`` set we extracted before, with the adittion of the special tags for decoding:

In [12]:
tokenizer = ProgramTokenizer(version=0)
tokenizer.make_from_dataset(train_progs)
#Vocab = json.load(open('checkpoints/clevr_programs/CLEVR_vocabularies.json'))['prog2id']
#tokenizer.make_vocab(Vocab)

Vocab = tokenizer.vocab
pprint(Vocab)

0
{'<END>': 2,
 '<PAD>': 0,
 '<START>': 1,
 '<UNK>': 3,
 'count': 4,
 'equal_integer': 5,
 'equal{color}': 6,
 'equal{material}': 7,
 'equal{shape}': 8,
 'equal{size}': 9,
 'exist': 10,
 'filter{color}[blue]': 11,
 'filter{color}[brown]': 12,
 'filter{color}[cyan]': 13,
 'filter{color}[gray]': 14,
 'filter{color}[green]': 15,
 'filter{color}[purple]': 16,
 'filter{color}[red]': 17,
 'filter{color}[yellow]': 18,
 'filter{material}[metal]': 19,
 'filter{material}[rubber]': 20,
 'filter{shape}[cube]': 21,
 'filter{shape}[cylinder]': 22,
 'filter{shape}[sphere]': 23,
 'filter{size}[large]': 24,
 'filter{size}[small]': 25,
 'greater_than': 26,
 'intersect': 27,
 'less_than': 28,
 'query{color}': 29,
 'query{material}': 30,
 'query{shape}': 31,
 'query{size}': 32,
 'relate[behind]': 33,
 'relate[front]': 34,
 'relate[left]': 35,
 'relate[right]': 36,
 'same{color}': 37,
 'same{material}': 38,
 'same{shape}': 39,
 'same{size}': 40,
 'scene': 41,
 'union': 42,
 'unique': 43}


Let's tokenize some programs to see the structure of the tokens

In [13]:
example_programs = train_progs[:2]
pprint(example_programs)
print()

pprint(tokenizer.convert_programs_to_tokens(example_programs))

[[(0): scene(),
  (1): filter{size}[large](0),
  (2): filter{color}[green](1),
  (3): count(2),
  (4): scene(),
  (5): filter{size}[large](4),
  (6): filter{color}[purple](5),
  (7): filter{material}[metal](6),
  (8): filter{shape}[cube](7),
  (9): count(8),
  (10): greater_than(3,9)],
 [(0): scene(),
  (1): filter{size}[small](0),
  (2): filter{color}[cyan](1),
  (3): filter{material}[rubber](2),
  (4): unique(3),
  (5): same{shape}(4),
  (6): count(5)]]

[['scene',
  'filter{size}[large]',
  'filter{color}[green]',
  'count',
  'scene',
  'filter{size}[large]',
  'filter{color}[purple]',
  'filter{material}[metal]',
  'filter{shape}[cube]',
  'count',
  'greater_than'],
 ['scene',
  'filter{size}[small]',
  'filter{color}[cyan]',
  'filter{material}[rubber]',
  'unique',
  'same{shape}',
  'count']]


Tokens are identical to the program representation, with the exception of dropping dependency inputs. These will be figured out by the program executor, as the program is structured as a chain

Let's manually encode-decode some program tokens to see the structure

In [14]:
print('Example program tokens:')
example_program = train_progs[1231]
example_tokens = tokenizer.tokenize(example_program)
print(example_tokens)
print()

print('Encoded ids:')
encoded = tokenizer.encode(example_tokens)
print(encoded)
print()
print('Raw conversion to tokens:')
print(tokenizer.convert_ids_to_tokens([encoded.tolist()])[0])
print()
print('Decoded tokens:')
decoded = tokenizer.decode(encoded)
print(decoded)


Example program tokens:
['scene', 'filter{size}[small]', 'filter{color}[yellow]', 'filter{material}[metal]', 'unique', 'same{shape}', 'exist']

Encoded ids:
tensor([ 1, 10, 39, 43, 19, 18, 25, 41,  2])

Raw conversion to tokens:
['<START>', 'exist', 'same{shape}', 'unique', 'filter{material}[metal]', 'filter{color}[yellow]', 'filter{size}[small]', 'scene', '<END>']

Decoded tokens:
['scene', 'filter{size}[small]', 'filter{color}[yellow]', 'filter{material}[metal]', 'unique', 'same{shape}', 'exist']


The same can be done for many samples at the same time with the use of ``batch_`` prefix

In [15]:
example_batch = tokenizer.convert_programs_to_tokens(train_progs[:32])
print(f'Batch size = {len(example_batch)}')
print([len(p) for p in example_batch])
print()

encoded = tokenizer.batch_encode(example_batch)
print(f'Encoded size = {encoded.shape}')
print()

decoded = tokenizer.batch_decode(encoded)
print(len(decoded))
print([len(p) for p in decoded])


Batch size = 32
[11, 7, 12, 15, 6, 16, 13, 8, 7, 10, 9, 7, 10, 9, 14, 16, 16, 12, 11, 17, 12, 15, 10, 14, 13, 8, 16, 14, 7, 7, 8, 6]

Encoded size = torch.Size([32, 19])

32
[11, 7, 12, 15, 6, 16, 13, 8, 7, 10, 9, 7, 10, 9, 14, 16, 16, 12, 11, 17, 12, 15, 10, 14, 13, 8, 16, 14, 7, 7, 8, 6]


We can do the same directly from ``ProgramNode`` representation by using the ``_program`` suffix

In [16]:
print('Example program:')
example_program = train_progs[1312]
pprint(example_program)

print('Encoded ids:')
encoded = tokenizer.encode_program(example_program)
print(encoded)
print()
print('Raw conversion to programs:')
print(tokenizer.convert_ids_to_programs([encoded.tolist()])[0])
print()
print('Decoded programs:')
decoded = tokenizer.decode_program(encoded)
print(decoded)
print()

# and same for multiple sample adding batch prefix
example_batch = train_progs[:32]
print(len(example_batch), [len(p) for p in example_batch])
encoded = tokenizer.batch_encode_program(example_batch)
decoded = tokenizer.batch_decode_program(encoded)
print(encoded.shape)
print(len(decoded), [len(p) for p in decoded])

Example program:
[(0): scene(),
 (1): filter{size}[large](0),
 (2): filter{color}[gray](1),
 (3): filter{shape}[cylinder](2),
 (4): unique(3),
 (5): relate[right](4),
 (6): filter{size}[large](5),
 (7): filter{material}[metal](6),
 (8): unique(7),
 (9): relate[left](8),
 (10): filter{color}[brown](9),
 (11): filter{shape}[cube](10),
 (12): count(11)]
Encoded ids:
tensor([ 1,  4, 21, 12, 35, 43, 19, 24, 36, 43, 22, 14, 24, 41,  2])

Raw conversion to programs:
[(0): <START>(-1), (1): count(0), (2): filter{shape}[cube](1), (3): filter{color}[brown](2), (4): relate[left](3), (5): unique(4), (6): filter{material}[metal](5), (7): filter{size}[large](6), (8): relate[right](7), (9): unique(8), (10): filter{shape}[cylinder](9), (11): filter{color}[gray](10), (12): filter{size}[large](11), (13): scene(), (14): <END>(13)]

Decoded programs:
[(0): scene(), (1): filter{size}[large](0), (2): filter{color}[gray](1), (3): filter{shape}[cylinder](2), (4): unique(3), (5): relate[right](4), (6): filter{

Encode the programs to token IDs and save the checkpoint datasets for both train and val set!

In [18]:
def save_checkpoints(train_progs, tokenizer, save_dir, val_path):
    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)

    with open(os.path.join(save_dir, 'vocab.json'), 'w') as f:
        json.dump(tokenizer.vocab, f)

    train_prog_ids = tokenizer.batch_encode_program(train_progs)
    np.save(os.path.join(save_dir,'train_ids.npy' ), train_prog_ids.numpy())

    _ds = json.load(open(val_questions_path))['questions']
    _progs = formalize_program_annots(_ds)
    _progs = tokenizer.batch_encode_program(_progs)
    np.save(os.path.join(save_dir,'val_ids.npy'), _progs.numpy())

    del _ds, _progs

    
save_dir = os.path.join(root, "checkpoints/clevr/trainval/programs/v0")
save_checkpoints(train_progs, tokenizer, save_dir)

### v.1) Vocab-agnostic Version

We repeat the same as before but use the special tokens ``[, ]`` to define specific concept values as arguments, untangled from primitives, in order to reach the *vocab-agnostic* version of the library that we mention above. For a learning system, the specific concept values will be chosen from the input query, therefore freeing the primitives library from specific vocabulary. This will hopefully enable generalization,

In [19]:
tokenizer = ProgramTokenizer(version=1)
tokenizer.make_from_dataset(train_progs)
#Vocab = json.load(open('checkpoints/clevr_programs/CLEVR_vocabularies.json'))['prog2id']
#tokenizer.make_vocab(Vocab)

Vocab = tokenizer.vocab
pprint(Vocab)

{'<END>': 2,
 '<PAD>': 0,
 '<START>': 1,
 '<UNK>': 3,
 '<[>': 6,
 '<]>': 7,
 'count': 8,
 'equal_integer': 9,
 'equal{color}': 10,
 'equal{material}': 11,
 'equal{shape}': 12,
 'equal{size}': 13,
 'exist': 14,
 'filter{color}': 15,
 'filter{material}': 16,
 'filter{shape}': 17,
 'filter{size}': 18,
 'greater_than': 19,
 'intersect': 20,
 'less_than': 21,
 'query{color}': 22,
 'query{material}': 23,
 'query{shape}': 24,
 'query{size}': 25,
 'relate': 26,
 'same{color}': 27,
 'same{material}': 28,
 'same{shape}': 29,
 'same{size}': 30,
 'scene': 31,
 'union': 32,
 'unique': 33}


We see the two extra special tokens, that signature the beginning and end of a concept value argument. Our primitives are now free of the domain vocabulary!

Let's do the same as before to see the structure of tokens, encode-decode some programs and save train-val checkpoints

In [20]:
print('Example program:')
example_program = train_progs[1231]
print(example_program)
print()

print('Example program tokens:')
example_tokens = tokenizer._tokenize(example_program)
print(example_tokens)
print()

print('Encoded ids:')
encoded = tokenizer.encode(example_tokens)
print(encoded)
print()
print('Raw conversion to tokens:')
print(tokenizer.convert_ids_to_tokens([encoded.tolist()])[0])
print()
print('Decoded tokens:')
decoded = tokenizer.decode(encoded)
print(decoded)

Example program:
[(0): scene(), (1): filter{size}[small](0), (2): filter{color}[yellow](1), (3): filter{material}[metal](2), (4): unique(3), (5): same{shape}(4), (6): exist(5)]

Example program tokens:
['scene', 'filter{size}', '<[>', 'small', '<]>', 'filter{color}', '<[>', 'yellow', '<]>', 'filter{material}', '<[>', 'metal', '<]>', 'unique', 'same{shape}', 'exist']

Encoded ids:
tensor([ 1, 14, 29, 33,  7,  3,  6, 16,  7,  3,  6, 15,  7,  3,  6, 18, 31,  2])

Raw conversion to tokens:
['<START>', 'exist', 'same{shape}', 'unique', '<]>', '<UNK>', '<[>', 'filter{material}', '<]>', '<UNK>', '<[>', 'filter{color}', '<]>', '<UNK>', '<[>', 'filter{size}', 'scene', '<END>']

Decoded tokens:
['scene', 'filter{size}', '<[>', '<UNK>', '<]>', 'filter{color}', '<[>', '<UNK>', '<]>', 'filter{material}', '<[>', '<UNK>', '<]>', 'unique', 'same{shape}', 'exist']


As this version is *vocab-agnostic* it doesn't have a token for each specific concept value. Instead, specific values are replaced by the unknown ``<UNK>`` token.

In [22]:
# Directly from ProgramNode representations

print('Example program:')
example_program = train_progs[1312]
print(example_program)
print()

print('Example program tokens:')
example_tokens = tokenizer._tokenize(example_program)
print(example_tokens)
print()

print('Encoded ids:')
encoded = tokenizer.encode_program(example_program)
print(encoded)
print()

print('Raw conversion to programs:')
print(tokenizer.convert_ids_to_programs([encoded.tolist()])[0])
print()

print('Decoded programs:')
decoded = tokenizer.decode_program(encoded)
print(decoded)
print()

# and same for multiple sample adding batch prefix
example_batch = train_progs[:32]
print(len(example_batch), [len(p) for p in example_batch])
encoded = tokenizer.batch_encode_program(example_batch)
decoded = tokenizer.batch_decode_program(encoded)
print(encoded.shape)
print(len(decoded), [len(p) for p in decoded])

Example program:
[(0): scene(), (1): filter{size}[large](0), (2): filter{color}[gray](1), (3): filter{shape}[cylinder](2), (4): unique(3), (5): relate[right](4), (6): filter{size}[large](5), (7): filter{material}[metal](6), (8): unique(7), (9): relate[left](8), (10): filter{color}[brown](9), (11): filter{shape}[cube](10), (12): count(11)]

Example program tokens:
['scene', 'filter{size}', '<[>', 'large', '<]>', 'filter{color}', '<[>', 'gray', '<]>', 'filter{shape}', '<[>', 'cylinder', '<]>', 'unique', 'relate', '<[>', 'right', '<]>', 'filter{size}', '<[>', 'large', '<]>', 'filter{material}', '<[>', 'metal', '<]>', 'unique', 'relate', '<[>', 'left', '<]>', 'filter{color}', '<[>', 'brown', '<]>', 'filter{shape}', '<[>', 'cube', '<]>', 'count']

Encoded ids:
tensor([ 1,  8,  7,  3,  6, 17,  7,  3,  6, 15,  7,  3,  6, 26, 33,  7,  3,  6,
        16,  7,  3,  6, 18,  7,  3,  6, 26, 33,  7,  3,  6, 17,  7,  3,  6, 15,
         7,  3,  6, 18, 31,  2])

Raw conversion to programs:
[(0): <END>(

Save checkpoints

In [23]:
save_dir = os.path.join(root, "checkpoints/clevr/trainval/programs/v1")
save_checkpoints(train_progs, tokenizer, save_dir)

### v.2) Concept-agnostic Version

We repeat the same as before but use the special tokens ``{, }, [, ]`` to define both concepts and concept values as arguments, untangled from primitives, in order to reach the *concept-agnostic* version of the library that we mention above. For a learning system, the specific concepts and concept values will be chosen from the input query and the domain specification, therefore freeing the primitives library from specific vocabulary, while enabling it to generalize to novel combinations of primitives and concepts.

In [24]:
tokenizer = ProgramTokenizer(version=2)
tokenizer.make_from_dataset(train_progs)
#Vocab = json.load(open('checkpoints/clevr_programs/CLEVR_vocabularies.json'))['prog2id']
#tokenizer.make_vocab(Vocab)

Vocab = tokenizer.vocab
pprint(Vocab)

{'<END>': 2,
 '<PAD>': 0,
 '<START>': 1,
 '<UNK>': 3,
 '<[>': 10,
 '<]>': 11,
 '<{>': 12,
 '<}>': 13,
 'color': 14,
 'count': 15,
 'equal': 16,
 'equal_integer': 17,
 'exist': 18,
 'filter': 19,
 'greater_than': 20,
 'intersect': 21,
 'less_than': 22,
 'material': 23,
 'query': 24,
 'relate': 25,
 'same': 26,
 'scene': 27,
 'shape': 28,
 'size': 29,
 'union': 30,
 'unique': 31}


It's as before, with the two extra special tokens ```{, }``` to denote beginning and end of concept arguments. The arguments themselves are also part of the program vocabulary.

Let's inspect them as before

In [25]:
print('Example program:')
example_program = train_progs[1231]
print(example_program)
print()

print('Example program tokens:')
example_tokens = tokenizer._tokenize(example_program)
print(example_tokens)
print()

print('Encoded ids:')
encoded = tokenizer.encode(example_tokens)
print(encoded)
print()
print('Raw conversion to tokens:')
print(tokenizer.convert_ids_to_tokens([encoded.tolist()])[0])
print()
print('Decoded tokens:')
decoded = tokenizer.decode(encoded)
print(decoded)

Example program:
[(0): scene(), (1): filter{size}[small](0), (2): filter{color}[yellow](1), (3): filter{material}[metal](2), (4): unique(3), (5): same{shape}(4), (6): exist(5)]

Example program tokens:
['scene', 'filter', '<{>', 'size', '<}>', '<[>', 'small', '<]>', 'filter', '<{>', 'color', '<}>', '<[>', 'yellow', '<]>', 'filter', '<{>', 'material', '<}>', '<[>', 'metal', '<]>', 'unique', 'same', '<{>', 'shape', '<}>', 'exist']

Encoded ids:
tensor([ 1, 18, 13, 28, 12, 26, 31, 11,  3, 10, 13, 23, 12, 19, 11,  3, 10, 13,
        14, 12, 19, 11,  3, 10, 13, 29, 12, 19, 27,  2])

Raw conversion to tokens:
['<START>', 'exist', '<}>', 'shape', '<{>', 'same', 'unique', '<]>', '<UNK>', '<[>', '<}>', 'material', '<{>', 'filter', '<]>', '<UNK>', '<[>', '<}>', 'color', '<{>', 'filter', '<]>', '<UNK>', '<[>', '<}>', 'size', '<{>', 'filter', 'scene', '<END>']

Decoded tokens:
['scene', 'filter', '<{>', 'size', '<}>', '<[>', '<UNK>', '<]>', 'filter', '<{>', 'color', '<}>', '<[>', '<UNK>', '<]>', '

In [27]:
# Directly from ProgramNode representations

print('Example program:')
example_program = train_progs[1312]
print(example_program)
print()

print('Example program tokens:')
example_tokens = tokenizer._tokenize(example_program)
print(example_tokens)
print()

print('Encoded ids:')
encoded = tokenizer.encode_program(example_program)
print(encoded)
print()

print('Raw conversion to programs:')
print(tokenizer.convert_ids_to_programs([encoded.tolist()])[0])
print()

print('Decoded programs:')
decoded = tokenizer.decode_program(encoded)
print(decoded)
print()

# and same for multiple sample adding batch prefix
example_batch = train_progs[:32]
print(len(example_batch), [len(p) for p in example_batch])
encoded = tokenizer.batch_encode_program(example_batch)
decoded = tokenizer.batch_decode_program(encoded)
print(encoded.shape)
print(len(decoded), [len(p) for p in decoded])

Example program:
[(0): scene(), (1): filter{size}[large](0), (2): filter{color}[gray](1), (3): filter{shape}[cylinder](2), (4): unique(3), (5): relate[right](4), (6): filter{size}[large](5), (7): filter{material}[metal](6), (8): unique(7), (9): relate[left](8), (10): filter{color}[brown](9), (11): filter{shape}[cube](10), (12): count(11)]

Example program tokens:
['scene', 'filter', '<{>', 'size', '<}>', '<[>', 'large', '<]>', 'filter', '<{>', 'color', '<}>', '<[>', 'gray', '<]>', 'filter', '<{>', 'shape', '<}>', '<[>', 'cylinder', '<]>', 'unique', 'relate', '<[>', 'right', '<]>', 'filter', '<{>', 'size', '<}>', '<[>', 'large', '<]>', 'filter', '<{>', 'material', '<}>', '<[>', 'metal', '<]>', 'unique', 'relate', '<[>', 'left', '<]>', 'filter', '<{>', 'color', '<}>', '<[>', 'brown', '<]>', 'filter', '<{>', 'shape', '<}>', '<[>', 'cube', '<]>', 'count']

Encoded ids:
tensor([ 1, 15, 11,  3, 10, 13, 28, 12, 19, 11,  3, 10, 13, 14, 12, 19, 11,  3,
        10, 25, 31, 11,  3, 10, 13, 23, 12

In [28]:
save_dir = os.path.join(root, "checkpoints/clevr/programs/v2")
save_checkpoints(train_progs, tokenizer, save_dir)

## Generating generalization splits

Besides evaluating in the validation split of CLEVR dataset, we wish to asses the generalization performance of a semantic parser that  maps language to programs. To that end, we will create 3 generalization test splits:

  - **Novel Combinations**: Test in unseen combinations of attributes, e.g. train in red cubes and blue spheres and evaluate for blue cubes and red spheres.
  - **Novel Vocabulary**: Test in unseen concept values, e.g. train in cubes and cylinders and evaluate for spheres.
  - **Novel Tasks**: Test in unseen tasks, i.e. unseen combinations of primitives and concepts, e.g. train for filtering shape and querying for color, and evaluate for filtering color and querying for shape.

### Test-A) Novel Combinations

The CLEVR dataset actually comes with a pre-generated dataset for this generalization test, it is caled CoGEN-T, and you can download it together with the original dataset. In this setup, there are two conditions: A and B. In condition A there are gray, blue, brown, or yellow cubes, red, green, purple, or cyan cylinders and any color spheres, while in condition B the colors between cubes and cylinders are reversed. The experiment involves training on condition A and then evaluating in condition B with no further training (*zero-shot*) or with fine-tuning in few examples (*few-shot*). We will use the zero-shot setup in order to evaluate the combinatorial generalization abilities of learning systems. We will use the training and validation sets of condition A to train the model and validation set of condition B to test it.

All we have to do then is repeat the above steps for the downloaded CoGEN-T dataset.

In [31]:
save_dir = os.path.join(root, "checkpoints/clevr/testA/programs")
if not os.path.isdir(save_dir):
    os.mkdir(save_dir)

_ds = json.load(open(gen_trainA_questions_path))['questions']
_progs = formalize_program_annots(_ds)
with open(os.path.join(save_dir, 'train.json'), 'w') as f:
    json.dump([[p.__dict__ for p in ps] for ps in train_progs], f)
    
_ds = json.load(open(gen_valA_questions_path))['questions']
_progs = formalize_program_annots(_ds)
with open(os.path.join(save_dir, 'val.json'), 'w') as f:
    json.dump([[p.__dict__ for p in ps] for ps in _progs], f)

_ds = json.load(open(gen_valB_questions_path))['questions']
_progs = formalize_program_annots(_ds)
with open(os.path.join(save_dir, 'test.json'), 'w') as f:
    json.dump([[p.__dict__ for p in ps] for ps in _progs], f)

del _ds, _progs

Save tokenized checkpoints

In [9]:
def unpad(X, eos_token_id=2):
    eos_mask = (torch.where(X == eos_token_id)[1] + 1).tolist()
    token_ids = [toks[:idx] for toks, idx in zip(X.tolist(), eos_mask)]
    return token_ids


def save_checkpoints(save_dir, path, name):
    def _save_checkpoints(progs, tokenizer, save_dir, name):
        if not os.path.isdir(save_dir):
            os.mkdir(save_dir)

        with open(os.path.join(save_dir, 'vocab.json'), 'w') as f:
            json.dump(tokenizer.vocab, f)

        name = f'{name}_ids.json'
        prog_ids = tokenizer.batch_encode_program(progs)
        prog_ids = unpad(prog_ids)
        #np.save(os.path.join(save_dir, name), prog_ids.numpy())
        with open(os.path.join(save_dir, name), 'w') as g:
            json.dump(prog_ids, g)
            
    _ds = json.load(open(path))['questions']
    _progs = formalize_program_annots(_ds)
    
    for version in [0, 1, 2]:
        tokenizer = ProgramTokenizer(version=version)
        _ = tokenizer.make_from_dataset(_progs)
        _save_checkpoints(_progs, tokenizer, os.path.join(save_dir, f'v{version}'), name=name)
        
    
save_dir = os.path.join(root, "checkpoints/clevr/testA/programs")
paths = [gen_trainA_questions_path, gen_valA_questions_path, gen_valB_questions_path]
names = ['train', 'val', 'test']
for p, n in zip(paths, names):
    save_checkpoints(save_dir, p, n)

NameError: name 'torch' is not defined

### Test-B) Novel Vocabulary

### Test-C) Novel Tasks

# Executing CLEVR programs