### Scrips for generating splits
This script assums you have the main ReaSCAN generated by the generate_ReaSCAN.py script. After that, you can use this file to generate/extrapolate different splits. In the future, we may consolidate two files.

In [25]:
from collections import namedtuple, OrderedDict
import os
from typing import List
from typing import Tuple
import logging
from collections import defaultdict
from collections import Counter
import json
import torch
import numpy as np

def isnotebook():
    try:
        shell = get_ipython().__class__.__name__
        if shell == 'ZMQInteractiveShell':
            return True   # Jupyter notebook or qtconsole
        elif shell == 'TerminalInteractiveShell':
            return False  # Terminal running IPython
        else:
            return False  # Other type (?)
    except NameError:
        return False      # Probably standard Python interpreter
if isnotebook():
    device = torch.device("cpu")
else:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(format=FORMAT, level=logging.INFO,
                    datefmt="%Y-%m-%d %H:%M")
logger = logging.getLogger(__name__)

from world import *
from vocabulary import Vocabulary as ReaSCANVocabulary
from object_vocabulary import *

#### P1: gSCAN Pattern

In [26]:
p1_path_to_data = "../../data-files/ReaSCAN-compositional-p1/data-train.txt"
logger.info(f"Reading dataset from file: {p1_path_to_data}...")
p1_data_json = json.load(open(p1_path_to_data, "r"))

p1_all_fake_train = p1_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p1_all_fake_train)

2021-05-19 17:01 Reading dataset from file: ../../data-files/ReaSCAN-compositional-p1/data-train.txt...


121500

In [27]:
# For generating the splits, we actually have to go through compositional splits first
# and then consider random splits like dev and test. Because, we don't want things mixed up
# in the dev and test. Dev and test should only contain commands that appear in the train,
# so a total random partition at the end should work.

# We do the splits step-by-step!
p1_id_example_map = OrderedDict({})
p1_id_splits_map = OrderedDict({})
index = 0
for example in p1_data_json["examples"]["train"]:
    p1_id_example_map[index] = example
    p1_id_splits_map[index] = set([]) # set of splits that this example belongs to.
    index += 1

In [31]:
p1_splits_distribution = OrderedDict({})
p1_splits_assignment = OrderedDict({})
for index, splits in p1_id_splits_map.items():
    if len(splits) == 0:
        split = "train" # let us split this up later!
        if split in p1_splits_distribution.keys():
            p1_splits_distribution[split] += 1
        else:
            p1_splits_distribution[split] = 1
        
        if split in p1_splits_assignment:
            p1_splits_assignment[split].append(index)
        else:
            p1_splits_assignment[split] = [index]
    else:   
        assert False

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p1_all_fake_train)*0.01)
gscan_test_size = int(len(p1_all_fake_train)*0.052)
p1_all_example_id = p1_splits_assignment["train"]
random.shuffle(p1_all_example_id)
p1_train_example_id = p1_all_example_id[:(-gscan_dev_size-gscan_test_size)]
p1_dev_example_id = p1_all_example_id[(-gscan_dev_size-gscan_test_size):-gscan_dev_size]
p1_test_example_id = p1_all_example_id[-gscan_dev_size:]
p1_splits_assignment["train"] = p1_train_example_id
p1_splits_assignment["dev"] = p1_dev_example_id
p1_splits_assignment["test"] = p1_test_example_id

In [39]:
for split, all_ids in p1_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 113967 examples.
for dev split, we have 6318 examples.
for test split, we have 1215 examples.


In [40]:
# remake our data file accordingly.
updated_examples = OrderedDict({})
for split, all_ids in p1_splits_assignment.items():
    updated_examples[split] = []
    for _id in all_ids:
        updated_examples[split].append(p1_id_example_map[_id])

In [41]:
# save it to the disk
p1_data_json["examples"] = updated_examples
with open("../../data-files/ReaSCAN-compositional-p1/data-compositional-splits.txt", "w") as fd:
    json.dump(p1_data_json, fd, indent=4)

#### P2: Single Clause

In [42]:
p2_path_to_data = "../../data-files/ReaSCAN-compositional-p2/data-train.txt"
logger.info(f"Reading dataset from file: {p2_path_to_data}...")
p2_data_json = json.load(open(p2_path_to_data, "r"))

p2_all_fake_train = p2_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p2_all_fake_train)

2021-05-19 18:38 Reading dataset from file: ../../data-files/ReaSCAN-compositional-p2/data-train.txt...


362233

In [43]:
# For generating the splits, we actually have to go through compositional splits first
# and then consider random splits like dev and test. Because, we don't want things mixed up
# in the dev and test. Dev and test should only contain commands that appear in the train,
# so a total random partition at the end should work.

# We do the splits step-by-step!
p2_id_example_map = OrderedDict({})
p2_id_splits_map = OrderedDict({})
index = 0
for example in p2_data_json["examples"]["train"]:
    p2_id_example_map[index] = example
    p2_id_splits_map[index] = set([]) # set of splits that this example belongs to.
    index += 1

In [44]:
p2_splits_distribution = OrderedDict({})
p2_splits_assignment = OrderedDict({})
for index, splits in p2_id_splits_map.items():
    if len(splits) == 0:
        split = "train" # let us split this up later!
        if split in p2_splits_distribution.keys():
            p2_splits_distribution[split] += 1
        else:
            p2_splits_distribution[split] = 1
        
        if split in p2_splits_assignment:
            p2_splits_assignment[split].append(index)
        else:
            p2_splits_assignment[split] = [index]
    else:   
        assert False

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p2_all_fake_train)*0.01)
gscan_test_size = int(len(p2_all_fake_train)*0.052)
p2_all_example_id = p2_splits_assignment["train"]
random.shuffle(p2_all_example_id)
p2_train_example_id = p2_all_example_id[:(-gscan_dev_size-gscan_test_size)]
p2_dev_example_id = p2_all_example_id[(-gscan_dev_size-gscan_test_size):-gscan_dev_size]
p2_test_example_id = p2_all_example_id[-gscan_dev_size:]
p2_splits_assignment["train"] = p2_train_example_id
p2_splits_assignment["dev"] = p2_dev_example_id
p2_splits_assignment["test"] = p2_test_example_id

In [45]:
for split, all_ids in p2_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 339775 examples.
for dev split, we have 18836 examples.
for test split, we have 3622 examples.


In [46]:
# remake our data file accordingly.
updated_examples = OrderedDict({})
for split, all_ids in p2_splits_assignment.items():
    updated_examples[split] = []
    for _id in all_ids:
        updated_examples[split].append(p2_id_example_map[_id])

In [47]:
# save it to the disk
p2_data_json["examples"] = updated_examples
with open("../../data-files/ReaSCAN-compositional-p2/data-compositional-splits.txt", "w") as fd:
    json.dump(p2_data_json, fd, indent=4)

#### P3: Double Clause

In [54]:
p3_path_to_data = "../../data-files/ReaSCAN-compositional-p3/data-train.txt"
logger.info(f"Reading dataset from file: {p3_path_to_data}...")
p3_data_json = json.load(open(p3_path_to_data, "r"))

2021-05-20 03:18 Reading dataset from file: ../../data-files/ReaSCAN-compositional-p3/data-train.txt...


In [58]:
p3_all_fake_train = data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p3_all_fake_train)

531406

In [59]:
# For generating the splits, we actually have to go through compositional splits first
# and then consider random splits like dev and test. Because, we don't want things mixed up
# in the dev and test. Dev and test should only contain commands that appear in the train,
# so a total random partition at the end should work.

# We do the splits step-by-step!
p3_id_example_map = OrderedDict({})
p3_id_splits_map = OrderedDict({})
index = 0
for example in p3_data_json["examples"]["train"]:
    p3_id_example_map[index] = example
    p3_id_splits_map[index] = set([]) # set of splits that this example belongs to.
    index += 1

In [60]:
p3_splits_distribution = OrderedDict({})
p3_splits_assignment = OrderedDict({})
for index, splits in p3_id_splits_map.items():
    if len(splits) == 0:
        split = "train" # let us split this up later!
        if split in p3_splits_distribution.keys():
            p3_splits_distribution[split] += 1
        else:
            p3_splits_distribution[split] = 1
        
        if split in p3_splits_assignment:
            p3_splits_assignment[split].append(index)
        else:
            p3_splits_assignment[split] = [index]
    else:   
        assert False

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_all_fake_train)*0.01)
gscan_test_size = int(len(p3_all_fake_train)*0.052)
p3_all_example_id = p3_splits_assignment["train"]
random.shuffle(p3_all_example_id)
p3_train_example_id = p3_all_example_id[:(-gscan_dev_size-gscan_test_size)]
p3_dev_example_id = p3_all_example_id[(-gscan_dev_size-gscan_test_size):-gscan_dev_size]
p3_test_example_id = p3_all_example_id[-gscan_dev_size:]
p3_splits_assignment["train"] = p3_train_example_id
p3_splits_assignment["dev"] = p3_dev_example_id
p3_splits_assignment["test"] = p3_test_example_id

In [61]:
for split, all_ids in p3_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 565253 examples.
for dev split, we have 27633 examples.
for test split, we have 5314 examples.


In [62]:
# remake our data file accordingly.
updated_examples = OrderedDict({})
for split, all_ids in p3_splits_assignment.items():
    updated_examples[split] = []
    for _id in all_ids:
        updated_examples[split].append(p3_id_example_map[_id])

In [63]:
# save it to the disk
p3_data_json["examples"] = updated_examples
with open("../../data-files/ReaSCAN-compositional-p3/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_data_json, fd, indent=4)

#### P3-RD: Double Clause with Only Random Distractors (and some contextual distractors, which are also random)

In [64]:
p3_rd_path_to_data = "../../data-files/ReaSCAN-compositional-p3-rd/data-train.txt"
logger.info(f"Reading dataset from file: {p3_rd_path_to_data}...")
p3_rd_data_json = json.load(open(p3_rd_path_to_data, "r"))

2021-05-20 13:12 Reading dataset from file: ../../data-files/ReaSCAN-compositional-p3-rd/data-train.txt...


In [65]:
p3_rd_all_fake_train = p3_rd_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p3_rd_all_fake_train)

607500

In [66]:
# For generating the splits, we actually have to go through compositional splits first
# and then consider random splits like dev and test. Because, we don't want things mixed up
# in the dev and test. Dev and test should only contain commands that appear in the train,
# so a total random partition at the end should work.

# We do the splits step-by-step!
p3_rd_id_example_map = OrderedDict({})
p3_rd_id_splits_map = OrderedDict({})
index = 0
for example in p3_rd_data_json["examples"]["train"]:
    p3_rd_id_example_map[index] = example
    p3_rd_id_splits_map[index] = set([]) # set of splits that this example belongs to.
    index += 1

In [67]:
p3_rd_splits_distribution = OrderedDict({})
p3_rd_splits_assignment = OrderedDict({})
for index, splits in p3_rd_id_splits_map.items():
    if len(splits) == 0:
        split = "train" # let us split this up later!
        if split in p3_rd_splits_distribution.keys():
            p3_rd_splits_distribution[split] += 1
        else:
            p3_rd_splits_distribution[split] = 1
        
        if split in p3_rd_splits_assignment:
            p3_rd_splits_assignment[split].append(index)
        else:
            p3_rd_splits_assignment[split] = [index]
    else:   
        assert False

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_rd_all_fake_train)*0.01)
gscan_test_size = int(len(p3_rd_all_fake_train)*0.052)
p3_rd_all_example_id = p3_rd_splits_assignment["train"]
random.shuffle(p3_rd_all_example_id)
p3_rd_train_example_id = p3_rd_all_example_id[:(-gscan_dev_size-gscan_test_size)]
p3_rd_dev_example_id = p3_rd_all_example_id[(-gscan_dev_size-gscan_test_size):-gscan_dev_size]
p3_rd_test_example_id = p3_rd_all_example_id[-gscan_dev_size:]
p3_rd_splits_assignment["train"] = p3_rd_train_example_id
p3_rd_splits_assignment["dev"] = p3_rd_dev_example_id
p3_rd_splits_assignment["test"] = p3_rd_test_example_id

In [68]:
for split, all_ids in p3_rd_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 569835 examples.
for dev split, we have 31590 examples.
for test split, we have 6075 examples.


In [69]:
# remake our data file accordingly.
updated_examples = OrderedDict({})
for split, all_ids in p3_rd_splits_assignment.items():
    updated_examples[split] = []
    for _id in all_ids:
        updated_examples[split].append(p3_rd_id_example_map[_id])

In [70]:
# save it to the disk
p3_rd_data_json["examples"] = updated_examples
with open("../../data-files/ReaSCAN-compositional-p3-rd/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_rd_data_json, fd, indent=4)

#### P1+P2+P3: Compositional Splits

In [4]:
# For generating the splits, we actually have to go through compositional splits first
# and then consider random splits like dev and test. Because, we don't want things mixed up
# in the dev and test. Dev and test should only contain commands that appear in the train,
# so a total random partition at the end should work.

# We do the splits step-by-step!
id_example_map = OrderedDict({})
id_splits_map = OrderedDict({})
index = 0
for example in data_json["examples"]["train"]:
    id_example_map[index] = example
    id_splits_map[index] = set([]) # set of splits that this example belongs to.
    
    # gscan_yellow_square_command_target_only
    if "yellow,square" in example['command'].split("that")[0]:
        id_splits_map[index].add("gscan_yellow_square_command_target_only")

    # gscan_yellow_square_command
    if "yellow,square" in example['command']:
        id_splits_map[index].add("gscan_yellow_square_command")
    
    # gscan_red_box_visual
    if "red,box" in example['command'] or \
        ((example['situation']['placed_objects']['1']['object']['shape'] == "box" and \
        example['situation']['placed_objects']['1']['object']['color'] == "red") or \
        (example['situation']['placed_objects']['2']['object']['shape'] == "box" and \
        example['situation']['placed_objects']['2']['object']['color'] == "red")):
        id_splits_map[index].add("gscan_red_box_visual")
    
    # gscan_small_cylinder_command_target_only
    if "small,cylinder" in example['command'].split("that")[0] or \
        "small,red,cylinder" in example['command'].split("that")[0] or \
        "small,blue,cylinder" in example['command'].split("that")[0] or \
        "small,yellow,cylinder" in example['command'].split("that")[0] or \
        "small,green,cylinder" in example['command'].split("that")[0]:
        id_splits_map[index].add("gscan_small_cylinder_command_target_only")
    
    # novel_yellow_square_blue_circle_coexist_shape
    if "yellow,square" in example['command'] and "blue,circle" in example['command']:
        id_splits_map[index].add("novel_yellow_square_blue_circle_coexist_shape")
    
    # novel_green_circle_box_coexist (must be down side objects)
    if "green,circle" not in example['command'].split("that")[0] and \
        "green,circle" in example['command'] and "box" in example['command']:
        id_splits_map[index].add("novel_green_circle_box_coexist_box_shape")

    # novel_same_shape_is_inside_coexist_relation
    if "same,shape" in example['command'] and "is,inside" in example['command']:
        id_splits_map[index].add("novel_same_shape_is_inside_coexist_relation")
        
    # novel_inside_of_as_yellow_box
    if "is,inside,of,a,yellow,box" in example['command'] or \
        "is,inside,of,the,yellow,box" in example['command'] or \
        "is,inside,of,a,small,yellow,box" in example['command'] or \
        "is,inside,of,the,small,yellow,box" in example['command']:
        id_splits_map[index].add("novel_inside_of_as_yellow_box")
    
    few_shot_single_clause_logic
    if example['grammer_pattern'] == "$OBJ_0 ^ $OBJ_1":
        id_splits_map[index].add("few_shot_single_clause_logic")
      
    few_shot_single_clause_logic
    if example['grammer_pattern'] == "$OBJ_0":
        id_splits_map[index].add("few_shot_none_clause_logic")
    
    index += 1

In [5]:
splits_distribution = OrderedDict({})
splits_assignment = OrderedDict({})
for index, splits in id_splits_map.items():
    if len(splits) == 0:
        split = "train" # let us split this up later!
        if split in splits_distribution.keys():
            splits_distribution[split] += 1
        else:
            splits_distribution[split] = 1
        
        if split in splits_assignment:
            splits_assignment[split].append(index)
        else:
            splits_assignment[split] = [index]
    else:   
        for split in splits:
            if split in splits_distribution.keys():
                splits_distribution[split] += 1
            else:
                splits_distribution[split] = 1
                
            if split in splits_assignment:
                splits_assignment[split].append(index)
            else:
                splits_assignment[split] = [index]

# Let us further segment train into dev and test!
gscan_dev_size = 3716
gscan_test_size = 19282
all_example_id = splits_assignment["train"]
random.shuffle(all_example_id)
train_example_id = all_example_id[:(-3716-19282)]
dev_example_id = all_example_id[(-3716-19282):-3716]
test_example_id = all_example_id[-3716:]
splits_assignment["train"] = train_example_id
splits_assignment["dev"] = dev_example_id
splits_assignment["test"] = test_example_id

In [6]:
for split, all_ids in splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 352392 examples.
for novel_inside_of_as_yellow_box split, we have 15950 examples.
for gscan_yellow_square_command_target_only split, we have 21801 examples.
for gscan_yellow_square_command split, we have 76979 examples.
for gscan_red_box_visual split, we have 52433 examples.
for novel_green_circle_box_coexist_box_shape split, we have 13700 examples.
for gscan_small_cylinder_command_target_only split, we have 31867 examples.
for novel_yellow_square_blue_circle_coexist_shape split, we have 8450 examples.
for novel_same_shape_is_inside_coexist_relation split, we have 4698 examples.
for dev split, we have 19282 examples.
for test split, we have 3716 examples.


In [7]:
# remake our data file accordingly.
updated_examples = OrderedDict({})
for split, all_ids in splits_assignment.items():
    updated_examples[split] = []
    for _id in all_ids:
        updated_examples[split].append(id_example_map[_id])

In [8]:
# save it to the disk
data_json["examples"] = updated_examples
with open("../../data-files/ReaSCAN-compositional/data-compositional-splits.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)