### Scripts for generating splits
This script assumes you have already generated the main ReaSCAN data with the generate_ReaSCAN.py script. After that, you can use this file to generate/extrapolate different splits. In the future, we may consolidate the two files.

In [66]:
from collections import namedtuple, OrderedDict
import os
from typing import List
from typing import Tuple
import logging
from collections import defaultdict
from collections import Counter
import json
import torch
import numpy as np

def isnotebook():
    """Tell whether the code runs inside a Jupyter notebook or qtconsole.

    Returns True only when the active IPython shell class is named
    ``ZMQInteractiveShell``. Terminal IPython, any other shell type, and a
    plain Python interpreter (where ``get_ipython`` is undefined) all
    yield False.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # get_ipython does not exist: probably the standard Python interpreter.
        return False
    # Every shell other than the ZMQ one is treated as "not a notebook".
    return shell_name == 'ZMQInteractiveShell'
# Inside a notebook always stay on the CPU; elsewhere use CUDA when available.
# The `or` short-circuits, so CUDA availability is not queried in a notebook.
device = torch.device(
    "cpu" if isnotebook() or not torch.cuda.is_available() else "cuda"
)

# Timestamped INFO-level logging with minute resolution.
FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(
    level=logging.INFO,
    format=FORMAT,
    datefmt="%Y-%m-%d %H:%M",
)
logger = logging.getLogger(__name__)

from world import *
from vocabulary import Vocabulary as ReaSCANVocabulary
from object_vocabulary import *

#### P1: gSCAN Pattern

In [67]:
# Load the P1 data file; all of its examples are stored under the "train" key.
p1_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p1/data-train.txt"
logger.info(f"Reading dataset from file: {p1_path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(p1_path_to_data, "r") as p1_fd:
    p1_data_json = json.load(p1_fd)

p1_all_fake_train = p1_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p1_all_fake_train)

2021-06-08 17:24 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p1/data-train.txt...


121500

In [68]:
# Compositional splits have to be decided before the random dev/test
# partition, so that dev and test only contain commands that also appear in
# train; a plain random partition at the very end is then sufficient.

# Step 1: give every example an integer id and an (initially empty) set of
# compositional splits it belongs to.
p1_id_example_map = OrderedDict()
p1_id_splits_map = OrderedDict()
for index, example in enumerate(p1_data_json["examples"]["train"]):
    p1_id_example_map[index] = example
    p1_id_splits_map[index] = set()  # set of splits that this example belongs to.
# Leave the counter at the number of examples processed.
index = len(p1_id_example_map)

In [69]:
# `random` is not imported explicitly anywhere above this cell; import it here
# so the cell does not depend on a star-import or on notebook execution order.
import random

# Count and collect example ids per split. No compositional split is tagged
# for P1, so every example is expected to land in "train".
p1_splits_distribution = OrderedDict({})
p1_splits_assignment = OrderedDict({})
for index, splits in p1_id_splits_map.items():
    if splits:
        # Explicit raise (instead of `assert False`) so the check survives -O.
        raise AssertionError(
            f"example {index} unexpectedly belongs to splits {splits}"
        )
    split = "train" # let us split this up later!
    p1_splits_distribution[split] = p1_splits_distribution.get(split, 0) + 1
    p1_splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p1_all_fake_train)*0.01)
gscan_test_size = int(len(p1_all_fake_train)*0.052)
p1_all_example_id = p1_splits_assignment.get("train", [])
random.shuffle(p1_all_example_id)
# Use non-negative slice boundaries. With negative indices a size of zero
# breaks the partition: lst[-0:] is lst[0:], i.e. the whole list, so "test"
# would contain every example (overlapping train) for small inputs.
# NOTE(review): the slice stored as "dev" has gscan_test_size items and the
# slice stored as "test" has gscan_dev_size items, i.e. the size names are
# swapped relative to the splits they produce. Kept unchanged so the resulting
# split sizes stay the same -- confirm whether this is intended.
p1_total = len(p1_all_example_id)
p1_test_start = max(p1_total - gscan_dev_size, 0)
p1_dev_start = max(p1_test_start - gscan_test_size, 0)
p1_train_example_id = p1_all_example_id[:p1_dev_start]
p1_dev_example_id = p1_all_example_id[p1_dev_start:p1_test_start]
p1_test_example_id = p1_all_example_id[p1_test_start:]
p1_splits_assignment["train"] = p1_train_example_id
p1_splits_assignment["dev"] = p1_dev_example_id
p1_splits_assignment["test"] = p1_test_example_id

In [70]:
# Report how many example ids each P1 split holds.
for split, all_ids in p1_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 113967 examples.
for dev split, we have 6318 examples.
for test split, we have 1215 examples.


In [71]:
# Rebuild the examples mapping: for each split, look every assigned id back up
# in the id -> example map.
updated_examples = OrderedDict()
for split, all_ids in p1_splits_assignment.items():
    updated_examples[split] = [p1_id_example_map[_id] for _id in all_ids]

In [72]:
# save it to the disk
# The "examples" entry is replaced by the per-split mapping and the whole
# object is written out as JSON with 4-space indentation.
p1_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p1/data-compositional-splits.txt", "w") as fd:
    json.dump(p1_data_json, fd, indent=4)

#### P2: Single Clause

In [73]:
# Load the P2 data file; all of its examples are stored under the "train" key.
p2_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p2/data-train.txt"
logger.info(f"Reading dataset from file: {p2_path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(p2_path_to_data, "r") as p2_fd:
    p2_data_json = json.load(p2_fd)

p2_all_fake_train = p2_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p2_all_fake_train)

2021-06-08 17:26 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p2/data-train.txt...


363523

In [74]:
# Compositional splits have to be decided before the random dev/test
# partition, so that dev and test only contain commands that also appear in
# train; a plain random partition at the very end is then sufficient.

# Step 1: give every example an integer id and an (initially empty) set of
# compositional splits it belongs to.
p2_id_example_map = OrderedDict()
p2_id_splits_map = OrderedDict()
for index, example in enumerate(p2_data_json["examples"]["train"]):
    p2_id_example_map[index] = example
    p2_id_splits_map[index] = set()  # set of splits that this example belongs to.
# Leave the counter at the number of examples processed.
index = len(p2_id_example_map)

In [75]:
# `random` is not imported explicitly anywhere above this cell; import it here
# so the cell does not depend on a star-import or on notebook execution order.
import random

# Count and collect example ids per split. No compositional split is tagged
# for P2, so every example is expected to land in "train".
p2_splits_distribution = OrderedDict({})
p2_splits_assignment = OrderedDict({})
for index, splits in p2_id_splits_map.items():
    if splits:
        # Explicit raise (instead of `assert False`) so the check survives -O.
        raise AssertionError(
            f"example {index} unexpectedly belongs to splits {splits}"
        )
    split = "train" # let us split this up later!
    p2_splits_distribution[split] = p2_splits_distribution.get(split, 0) + 1
    p2_splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p2_all_fake_train)*0.01)
gscan_test_size = int(len(p2_all_fake_train)*0.052)
p2_all_example_id = p2_splits_assignment.get("train", [])
random.shuffle(p2_all_example_id)
# Use non-negative slice boundaries. With negative indices a size of zero
# breaks the partition: lst[-0:] is lst[0:], i.e. the whole list, so "test"
# would contain every example (overlapping train) for small inputs.
# NOTE(review): the slice stored as "dev" has gscan_test_size items and the
# slice stored as "test" has gscan_dev_size items, i.e. the size names are
# swapped relative to the splits they produce. Kept unchanged so the resulting
# split sizes stay the same -- confirm whether this is intended.
p2_total = len(p2_all_example_id)
p2_test_start = max(p2_total - gscan_dev_size, 0)
p2_dev_start = max(p2_test_start - gscan_test_size, 0)
p2_train_example_id = p2_all_example_id[:p2_dev_start]
p2_dev_example_id = p2_all_example_id[p2_dev_start:p2_test_start]
p2_test_example_id = p2_all_example_id[p2_test_start:]
p2_splits_assignment["train"] = p2_train_example_id
p2_splits_assignment["dev"] = p2_dev_example_id
p2_splits_assignment["test"] = p2_test_example_id

In [76]:
# Report how many example ids each P2 split holds.
for split, all_ids in p2_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 340985 examples.
for dev split, we have 18903 examples.
for test split, we have 3635 examples.


In [77]:
# Rebuild the examples mapping: for each split, look every assigned id back up
# in the id -> example map.
updated_examples = OrderedDict()
for split, all_ids in p2_splits_assignment.items():
    updated_examples[split] = [p2_id_example_map[_id] for _id in all_ids]

In [78]:
# save it to the disk
# The "examples" entry is replaced by the per-split mapping and the whole
# object is written out as JSON with 4-space indentation.
p2_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p2/data-compositional-splits.txt", "w") as fd:
    json.dump(p2_data_json, fd, indent=4)

#### P3: Double Clause

In [79]:
# Load the P3 data file.
p3_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p3/data-train.txt"
logger.info(f"Reading dataset from file: {p3_path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(p3_path_to_data, "r") as p3_fd:
    p3_data_json = json.load(p3_fd)

2021-06-08 17:33 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p3/data-train.txt...


In [80]:
# All P3 examples are read from the "train" key; the trailing expression
# displays how many there are.
p3_all_fake_train = p3_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p3_all_fake_train)

585963

In [81]:
# Compositional splits have to be decided before the random dev/test
# partition, so that dev and test only contain commands that also appear in
# train; a plain random partition at the very end is then sufficient.

# Step 1: give every example an integer id and an (initially empty) set of
# compositional splits it belongs to.
p3_id_example_map = OrderedDict()
p3_id_splits_map = OrderedDict()
for index, example in enumerate(p3_data_json["examples"]["train"]):
    p3_id_example_map[index] = example
    p3_id_splits_map[index] = set()  # set of splits that this example belongs to.
# Leave the counter at the number of examples processed.
index = len(p3_id_example_map)

In [82]:
# `random` is not imported explicitly anywhere above this cell; import it here
# so the cell does not depend on a star-import or on notebook execution order.
import random

# Count and collect example ids per split. No compositional split is tagged
# for P3, so every example is expected to land in "train".
p3_splits_distribution = OrderedDict({})
p3_splits_assignment = OrderedDict({})
for index, splits in p3_id_splits_map.items():
    if splits:
        # Explicit raise (instead of `assert False`) so the check survives -O.
        raise AssertionError(
            f"example {index} unexpectedly belongs to splits {splits}"
        )
    split = "train" # let us split this up later!
    p3_splits_distribution[split] = p3_splits_distribution.get(split, 0) + 1
    p3_splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_all_fake_train)*0.01)
gscan_test_size = int(len(p3_all_fake_train)*0.052)
p3_all_example_id = p3_splits_assignment.get("train", [])
random.shuffle(p3_all_example_id)
# Use non-negative slice boundaries. With negative indices a size of zero
# breaks the partition: lst[-0:] is lst[0:], i.e. the whole list, so "test"
# would contain every example (overlapping train) for small inputs.
# NOTE(review): the slice stored as "dev" has gscan_test_size items and the
# slice stored as "test" has gscan_dev_size items, i.e. the size names are
# swapped relative to the splits they produce. Kept unchanged so the resulting
# split sizes stay the same -- confirm whether this is intended.
p3_total = len(p3_all_example_id)
p3_test_start = max(p3_total - gscan_dev_size, 0)
p3_dev_start = max(p3_test_start - gscan_test_size, 0)
p3_train_example_id = p3_all_example_id[:p3_dev_start]
p3_dev_example_id = p3_all_example_id[p3_dev_start:p3_test_start]
p3_test_example_id = p3_all_example_id[p3_test_start:]
p3_splits_assignment["train"] = p3_train_example_id
p3_splits_assignment["dev"] = p3_dev_example_id
p3_splits_assignment["test"] = p3_test_example_id

In [83]:
# Report how many example ids each P3 split holds.
for split, all_ids in p3_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 549634 examples.
for dev split, we have 30470 examples.
for test split, we have 5859 examples.


In [84]:
# Rebuild the examples mapping: for each split, look every assigned id back up
# in the id -> example map.
updated_examples = OrderedDict()
for split, all_ids in p3_splits_assignment.items():
    updated_examples[split] = [p3_id_example_map[_id] for _id in all_ids]

In [85]:
# save it to the disk
# The "examples" entry is replaced by the per-split mapping and the whole
# object is written out as JSON with 4-space indentation.
p3_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p3/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_data_json, fd, indent=4)

#### P3: Double Clause (combining the P3 shards; this section is shared across all patterns as well!)

In [56]:
import os
import json
import re

# Which pattern's shards to consolidate, plus an optional suffix that selects a
# special-condition variant of the shard directory.
pattern = "p4"
special_condition = ""
prefix = f"{pattern}-{special_condition}" if special_condition != "" else pattern
sharding_dir = f"../../data-files-{prefix}/"

# Number of distinct commands to keep for each supported pattern.
upper_limit_by_pattern = {
    "p1": 675,
    "p2": 2025,
    "p3": 3375,
    "p4": 4000,
}
# Fail right here on an unsupported pattern instead of leaving `upper_limit`
# undefined and hitting a NameError in a later cell.
try:
    upper_limit = upper_limit_by_pattern[pattern]
except KeyError:
    raise ValueError(
        f"unsupported pattern {pattern!r}; expected one of {sorted(upper_limit_by_pattern)}"
    ) from None

In [57]:
# Walk every shard directory (those whose path contains "jobid"): check that
# its generator log reports completion, then collect the distinct commands
# after removing the ",a," / ",the," article tokens.
unique_command = set([])
article_splitter = re.compile(',a,|,the,')  # compiled once, reused per example
for subdir, dirs, files in os.walk(sharding_dir):
    if "jobid" not in subdir:
        continue

    # Completeness check!
    logging_file = os.path.join(subdir, "generator.log")
    with open(logging_file) as f:
        # Stream the log and stop at the first line carrying the finish marker.
        completed = any("==FINISH==" in line for line in f)
    jobid = logging_file.split("/")[-2].split("-")[-1]
    print(f"jobid={jobid}, status=complete={completed}")
    if not completed:
        # NOTE(review): this stops the whole scan at the first unfinished job;
        # the remaining shards are not inspected.
        break

    # Uniqueness check!
    data_file_path = os.path.join(subdir, "data-train.txt")
    print(f"scanning for file: {data_file_path}")
    # Context manager so each shard's file handle is closed after parsing.
    with open(data_file_path, "r") as data_fd:
        data_file = json.load(data_fd)
    for example in data_file["examples"]["train"]:
        command_split = article_splitter.split(example['command'])
        command_mono = ",".join(command_split)
        unique_command.add(command_mono)
# Sanity check: more distinct commands were found than will be kept.
assert len(unique_command) > upper_limit

jobid=45, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-45/data-train.txt
jobid=32, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-32/data-train.txt
jobid=38, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-38/data-train.txt
jobid=19, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-19/data-train.txt
jobid=13, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-13/data-train.txt
jobid=3, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-3/data-train.txt
jobid=21, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-21/data-train.txt
jobid=9, status=complete=True
scanning for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-9/data-train.txt
jobid=4, status=complete=True
scanning for file: ../

In [58]:
# Display the number of distinct article-stripped commands collected.
len(unique_command)

4484

In [59]:
# Merge the examples of all shards, grouped by their article-stripped command,
# keeping at most `max_examples_per_command` examples for each command.
shared_example_combined = {}
per_command_mono_count = {}
max_examples_per_command = 180  # for p4 this may never hit!
article_splitter = re.compile(',a,|,the,')  # compiled once, reused per example
for subdir, dirs, files in os.walk(sharding_dir):
    if "jobid" not in subdir:
        continue
    data_file_path = os.path.join(subdir, "data-train.txt")
    print(f"Collecting for file: {data_file_path}")
    # Context manager so each shard's file handle is closed after parsing.
    with open(data_file_path, "r") as data_fd:
        data_file = json.load(data_fd)
    for example in data_file["examples"]["train"]:
        command_split = article_splitter.split(example['command'])
        command_mono = ",".join(command_split)
        seen_so_far = per_command_mono_count.get(command_mono, 0)
        if seen_so_far == max_examples_per_command:
            continue # we are not adding this example since redundant!
        per_command_mono_count[command_mono] = seen_so_far + 1
        shared_example_combined.setdefault(command_mono, []).append(example)

Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-45/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-32/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-38/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-19/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-13/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-3/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-21/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-9/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-4/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-26/data-train.txt
Collecting for file: ../../data-files-p4/ReaSCAN-compositional-p4-jobid-14/data-train.txt
Collecting fo

In [60]:
# Pick `upper_limit` commands at random and flatten their examples into one
# list (the actual write to disk happens in a later cell).
import random
shared_examples = []
commands_mono = list(shared_example_combined.keys())
# NOTE(review): no seed is set before this shuffle in this file, so the
# selected commands differ from run to run.
random.shuffle(commands_mono)
# Slice instead of indexing range(upper_limit): this cannot raise IndexError
# when fewer than `upper_limit` distinct commands were collected.
for command_mono in commands_mono[:upper_limit]:
    shared_examples.extend(shared_example_combined[command_mono])

In [61]:
# `data_file` is the shard JSON left over from the most recent load in the
# loops above; it is reused as the container for the merged train examples.
data_file["examples"]["train"] = shared_examples

In [62]:
# Print the total number of merged examples that were kept.
print(len(shared_examples))

11810


In [64]:
# Write the consolidated examples as indented JSON under the directory derived
# from `prefix`.
with open(f"../../data-files-{prefix}/ReaSCAN-compositional-{prefix}/data-train.txt", "w") as fd:
    json.dump(data_file, fd, indent=4)

#### P3-RD: Double Clause with Only Random Distractors (and some contextual distractors, which are also random)

In [86]:
# Load the P3-RD data file.
p3_rd_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p3-rd/data-train.txt"
logger.info(f"Reading dataset from file: {p3_rd_path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(p3_rd_path_to_data, "r") as p3_rd_fd:
    p3_rd_data_json = json.load(p3_rd_fd)

2021-06-08 17:40 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p3-rd/data-train.txt...


In [87]:
# All P3-RD examples are read from the "train" key; the trailing expression
# displays how many there are.
p3_rd_all_fake_train = p3_rd_data_json["examples"]["train"]
# for dev and test, it is simple, let us just shuffle, and random select.
len(p3_rd_all_fake_train)

607500

In [88]:
# Compositional splits have to be decided before the random dev/test
# partition, so that dev and test only contain commands that also appear in
# train; a plain random partition at the very end is then sufficient.

# Step 1: give every example an integer id and an (initially empty) set of
# compositional splits it belongs to.
p3_rd_id_example_map = OrderedDict()
p3_rd_id_splits_map = OrderedDict()
for index, example in enumerate(p3_rd_data_json["examples"]["train"]):
    p3_rd_id_example_map[index] = example
    p3_rd_id_splits_map[index] = set()  # set of splits that this example belongs to.
# Leave the counter at the number of examples processed.
index = len(p3_rd_id_example_map)

In [89]:
# Count and collect example ids per split. No compositional split is tagged
# for P3-RD, so every example is expected to land in "train".
p3_rd_splits_distribution = OrderedDict({})
p3_rd_splits_assignment = OrderedDict({})
for index, splits in p3_rd_id_splits_map.items():
    if splits:
        # Explicit raise (instead of `assert False`) so the check survives -O.
        raise AssertionError(
            f"example {index} unexpectedly belongs to splits {splits}"
        )
    split = "train" # let us split this up later!
    p3_rd_splits_distribution[split] = p3_rd_splits_distribution.get(split, 0) + 1
    p3_rd_splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_rd_all_fake_train)*0.01)
gscan_test_size = int(len(p3_rd_all_fake_train)*0.052)
p3_rd_all_example_id = p3_rd_splits_assignment.get("train", [])
random.shuffle(p3_rd_all_example_id)
# Use non-negative slice boundaries. With negative indices a size of zero
# breaks the partition: lst[-0:] is lst[0:], i.e. the whole list, so "test"
# would contain every example (overlapping train) for small inputs.
# NOTE(review): the slice stored as "dev" has gscan_test_size items and the
# slice stored as "test" has gscan_dev_size items, i.e. the size names are
# swapped relative to the splits they produce. Kept unchanged so the resulting
# split sizes stay the same -- confirm whether this is intended.
p3_rd_total = len(p3_rd_all_example_id)
p3_rd_test_start = max(p3_rd_total - gscan_dev_size, 0)
p3_rd_dev_start = max(p3_rd_test_start - gscan_test_size, 0)
p3_rd_train_example_id = p3_rd_all_example_id[:p3_rd_dev_start]
p3_rd_dev_example_id = p3_rd_all_example_id[p3_rd_dev_start:p3_rd_test_start]
p3_rd_test_example_id = p3_rd_all_example_id[p3_rd_test_start:]
p3_rd_splits_assignment["train"] = p3_rd_train_example_id
p3_rd_splits_assignment["dev"] = p3_rd_dev_example_id
p3_rd_splits_assignment["test"] = p3_rd_test_example_id

In [90]:
# Report how many example ids each P3-RD split holds.
for split, all_ids in p3_rd_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for train split, we have 569835 examples.
for dev split, we have 31590 examples.
for test split, we have 6075 examples.


In [91]:
# Rebuild the examples mapping: for each split, look every assigned id back up
# in the id -> example map.
updated_examples = OrderedDict()
for split, all_ids in p3_rd_splits_assignment.items():
    updated_examples[split] = [p3_rd_id_example_map[_id] for _id in all_ids]

In [92]:
# save it to the disk
# The "examples" entry is replaced by the per-split mapping and the whole
# object is written out as JSON with 4-space indentation.
p3_rd_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p3-rd/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_rd_data_json, fd, indent=4)

#### P1+P2+P3: Compositional Splits

In [111]:
import random
import numpy as np
# Fix both RNG seeds so the shuffles in this section are repeatable.
random.seed(42)
np.random.seed(42)

# Combine all of three together
# We downsample it to make it trainable within reasonable time frame!
p1_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p1/data-train.txt"
logger.info(f"Reading dataset from file: {p1_path_to_data}...")
# Context managers so the file handles are closed as soon as parsing is done.
with open(p1_path_to_data, "r") as p1_fd:
    p1_data_json = json.load(p1_fd)

p2_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p2/data-train.txt"
logger.info(f"Reading dataset from file: {p2_path_to_data}...")
with open(p2_path_to_data, "r") as p2_fd:
    p2_data_json = json.load(p2_fd)

2021-06-08 18:03 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p1/data-train.txt...
2021-06-08 18:04 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p2/data-train.txt...


In [127]:
# Load the P3 data file again for the combined P1+P2+P3 splits.
p3_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p3/data-train.txt"
logger.info(f"Reading dataset from file: {p3_path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(p3_path_to_data, "r") as p3_fd:
    p3_data_json = json.load(p3_fd)

2021-06-08 18:24 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p3/data-train.txt...


In [128]:
# Merge everything into one big train list. The P3 list is grown in place, so
# `data_json` and `p3_data_json` refer to the same combined object afterwards.
p1_examples = p1_data_json["examples"]["train"]
p2_examples = p2_data_json["examples"]["train"]
data_json = p3_data_json
for extra_examples in (p1_examples, p2_examples):
    data_json["examples"]["train"].extend(extra_examples)

In [129]:
# let us downsample it to ?K
# Display the size of the combined train list (the P3 list extended in place
# with the P1 and P2 examples).
len(p3_data_json["examples"]["train"])

1070986

In [130]:
# Compositional splits have to be decided before the random dev/test
# partition, so that dev and test only contain commands that also appear in
# train; a plain random partition at the very end is then sufficient.

# Step 1: give every example an integer id and tag it with every compositional
# split whose condition it meets (an empty tag set means plain "train").
id_example_map = OrderedDict()
id_splits_map = OrderedDict()

# Placed-object keys inspected by the a2 visual check, per derivation.
a2_object_keys_by_derivation = {
    "$OBJ_0": ("0",),
    "$OBJ_0 ^ $OBJ_1": ("0", "1"),
    "$OBJ_0 ^ $OBJ_1 & $OBJ_2": ("0", "1", "2"),
}
# Command fragments that trigger the a3 split.
a3_command_fragments = (
    "small,cylinder",
    "small,red,cylinder",
    "small,blue,cylinder",
    "small,yellow,cylinder",
    "small,green,cylinder",
)
# Command fragments that trigger the d split.
d_command_fragments = (
    "is,inside,of,a,yellow,box",
    "is,inside,of,the,yellow,box",
    "is,inside,of,a,small,yellow,box",
    "is,inside,of,the,small,yellow,box",
    "is,inside,of,a,big,yellow,box",
    "is,inside,of,the,big,yellow,box",
)

for index, example in enumerate(data_json["examples"]["train"]):
    id_example_map[index] = example
    id_splits_map[index] = set()  # set of splits that this example belongs to.

    # a1_novel_color_attribute
    if "yellow,square" in example['command']:
        id_splits_map[index].add("a1_novel_color_attribute")

    # a2_novel_color_attribute_visual: "red,square" in the command, or one of
    # the derivation's referred objects is a red square in the situation.
    # The situation is only inspected when the command check fails.
    a2_object_keys = a2_object_keys_by_derivation.get(example["derivation"])
    if a2_object_keys is not None:
        if "red,square" in example['command'] or any(
            example['situation']['placed_objects'][obj_key]['object']['shape'] == "square"
            and example['situation']['placed_objects'][obj_key]['object']['color'] == "red"
            for obj_key in a2_object_keys
        ):
            id_splits_map[index].add("a2_novel_color_attribute_visual")

    # gscan_small_cylinder_command_target_only
    if any(fragment in example['command'] for fragment in a3_command_fragments):
        id_splits_map[index].add("a3_novel_size_attribute")

    # novel_yellow_square_blue_circle_coexist_shape
    if "yellow,square" in example['command'] and "blue,circle" in example['command']:
        id_splits_map[index].add("b_novel_object_coexist")

    # novel_same_shape_is_inside_coexist_relation
    if "same,shape" in example['command'] and "is,inside" in example['command']:
        id_splits_map[index].add("c_novel_relation_coexist")

    # novel_inside_of_as_yellow_box
    if any(fragment in example['command'] for fragment in d_command_fragments):
        id_splits_map[index].add("d_novel_object_relation_pair")

    # The four-object pattern goes to the clause-length split.
    if example['grammer_pattern'] == "$OBJ_0 ^ $OBJ_1 & $OBJ_2 & $OBJ_3":
        id_splits_map[index].add("e_novel_clause_length")

# Leave the counter at the number of examples processed.
index = len(id_example_map)

In [131]:
# Count and collect example ids per split. Untagged examples go to "train";
# a tagged example is added to every compositional split it was tagged with.
splits_distribution = OrderedDict({})
splits_assignment = OrderedDict({})
count = 0   # number of untagged examples
ccount = 0  # number of examples tagged with at least one compositional split
for index, splits in id_splits_map.items():
    if splits:
        ccount += 1
        target_splits = splits
    else:
        count += 1
        target_splits = ("train",) # let us split this up later!
    for split in target_splits:
        splits_distribution[split] = splits_distribution.get(split, 0) + 1
        splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
all_example_id = splits_assignment.get("train", [])
gscan_dev_size = int(len(all_example_id)*0.01)
gscan_test_size = int(len(all_example_id)*0.052)
random.shuffle(all_example_id)
# Use non-negative slice boundaries. With negative indices a size of zero
# breaks the partition: lst[-0:] is lst[0:], i.e. the whole list, so "test"
# would contain every example (overlapping train) for small inputs.
# NOTE(review): the slice stored as "dev" has gscan_test_size items and the
# slice stored as "test" has gscan_dev_size items, i.e. the size names are
# swapped relative to the splits they produce. Kept unchanged so the resulting
# split sizes stay the same -- confirm whether this is intended.
total_train_pool = len(all_example_id)
test_start = max(total_train_pool - gscan_dev_size, 0)
dev_start = max(test_start - gscan_test_size, 0)
train_example_id = all_example_id[:dev_start]
dev_example_id = all_example_id[dev_start:test_start]
test_example_id = all_example_id[test_start:]
splits_assignment["train"] = train_example_id
splits_assignment["dev"] = dev_example_id
splits_assignment["test"] = test_example_id

In [132]:
# Display the per-split counts; "train" here is the count before dev and test
# are carved out of it.
splits_distribution

OrderedDict([('a3_novel_size_attribute', 214892),
             ('train', 575395),
             ('a2_novel_color_attribute_visual', 187302),
             ('c_novel_relation_coexist', 5940),
             ('d_novel_object_relation_pair', 37620),
             ('a1_novel_color_attribute', 127825),
             ('b_novel_object_coexist', 10260)])

In [133]:
# Report how many example ids each split holds after the dev/test partition.
for split, all_ids in splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

for a3_novel_size_attribute split, we have 214892 examples.
for train split, we have 539722 examples.
for a2_novel_color_attribute_visual split, we have 187302 examples.
for c_novel_relation_coexist split, we have 5940 examples.
for d_novel_object_relation_pair split, we have 37620 examples.
for a1_novel_color_attribute split, we have 127825 examples.
for b_novel_object_coexist split, we have 10260 examples.
for dev split, we have 29920 examples.
for test split, we have 5753 examples.


In [134]:
# Rebuild the examples mapping for every split by looking each assigned id
# back up in the id -> example map, then install it on the data object.
updated_examples = OrderedDict()
for split, all_ids in splits_assignment.items():
    updated_examples[split] = [id_example_map[_id] for _id in all_ids]
# save it to the disk
data_json["examples"] = updated_examples

In [74]:
# Write the data object (with whatever "examples" mapping it currently holds)
# as indented JSON.
with open("../../data-files-updated/ReaSCAN-compositional/data-compositional-splits-all.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [125]:
# Rebuild the examples mapping, this time keeping only the train, dev and
# test splits (the compositional splits are left out).
updated_examples = OrderedDict()
for split, all_ids in splits_assignment.items():
    if split not in ("train", "dev", "test"):
        continue
    updated_examples[split] = [id_example_map[_id] for _id in all_ids]

In [126]:
# save it to the disk
# The "examples" entry is replaced by the filtered mapping and the whole
# object is written out as JSON with 4-space indentation.
data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional/data-compositional-splits-train.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

#### P1+P2+P3: Compositional Splits Continue
We need to make sure that the novel-attribute splits actually require reasoning about the attribute; otherwise the splits become less meaningful and may inflate accuracy afterwards.

In [None]:
# Reload the file that holds every split (train/dev/test plus the
# compositional ones) for the filtering cells below.
# The path points at data-files-updated/, where this file writes
# data-compositional-splits-all.txt; the previous "../../data-files/..."
# location is not written anywhere in this file.
path_to_data = "../../data-files-updated/ReaSCAN-compositional/data-compositional-splits-all.txt"
logger.info(f"Reading dataset from file: {path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(path_to_data, "r") as all_splits_fd:
    data_json = json.load(all_splits_fd)

In [135]:
# a1: keep only examples that have an attribute distractor whose first
# metadata entry modifies the $COLOR of an object expression containing
# "yellow square".
a1_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a1_novel_color_attribute']:
    if not example['has_attribute_distractor']:
        continue
    for k, v in example['object_expression'].items():
        if "yellow square" not in v:
            continue
        # Looked up lazily, only once a matching expression is found.
        distractor_meta = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor_meta['modified_obj'] == k and distractor_meta['modified_attribute'] == "$COLOR":
            a1_attribute_example_filtered.append(example)
            attribute_change += 1
    

In [136]:
# Print the number of examples kept by the a1 filter.
print(f"Actual examples for a1 = {attribute_change}")

Actual examples for a1 = 22057


In [137]:
# a2: examples whose command does not mention "red,square" are kept as is;
# the others are kept only when the first attribute-distractor metadata entry
# modifies the $COLOR of an object expression containing "red square".
a2_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a2_novel_color_attribute_visual']:
    if "red,square" not in example["command"]:
        # this is for the visual part, we automatically added in.
        a2_attribute_example_filtered.append(example)
        attribute_change += 1
        continue
    if not example['has_attribute_distractor']:
        continue
    for k, v in example['object_expression'].items():
        if "red square" not in v:
            continue
        # Looked up lazily, only once a matching expression is found.
        distractor_meta = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor_meta['modified_obj'] == k and distractor_meta['modified_attribute'] == "$COLOR":
            a2_attribute_example_filtered.append(example)
            attribute_change += 1

In [138]:
# Print the number of examples kept by the a2 filter.
print(f"Actual examples for a2 = {attribute_change}")

Actual examples for a2 = 81349


In [139]:
# a3: keep only examples that have an attribute distractor whose first
# metadata entry modifies the $SIZE of an object expression containing both
# "small" and "cylinder".
a3_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a3_novel_size_attribute']:
    if not example['has_attribute_distractor']:
        continue
    for k, v in example['object_expression'].items():
        if not ("small" in v and "cylinder" in v):
            continue
        # Looked up lazily, only once a matching expression is found.
        distractor_meta = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor_meta['modified_obj'] == k and distractor_meta['modified_attribute'] == "$SIZE":
            a3_attribute_example_filtered.append(example)
            attribute_change += 1

In [140]:
# Print the number of examples kept by the a3 filter.
print(f"Actual examples for a3 = {attribute_change}")

Actual examples for a3 = 35675


In [141]:
# b1 is the unfiltered 'b_novel_object_coexist' split, copied into a new list.
b1_attribute_example_filtered = list(data_json["examples"]['b_novel_object_coexist'])

In [142]:
# Display the number of b1 examples.
len(b1_attribute_example_filtered)

10260

In [143]:
# b2 is the unfiltered 'c_novel_relation_coexist' split, copied into a new list.
b2_attribute_example_filtered = list(data_json["examples"]['c_novel_relation_coexist'])

In [144]:
# Display the number of b2 examples.
len(b2_attribute_example_filtered)

5940

In [145]:
# b3 is the unfiltered 'd_novel_object_relation_pair' split, copied into a new list.
b3_attribute_example_filtered = list(data_json["examples"]['d_novel_object_relation_pair'])

In [146]:
# Partition the random test split by derivation, one bucket per pattern.
# Examples with any other derivation are dropped.
p1_test_example_filtered = []
p2_test_example_filtered = []
p3_test_example_filtered = []
test_bucket_by_derivation = {
    "$OBJ_0": p1_test_example_filtered,
    "$OBJ_0 ^ $OBJ_1": p2_test_example_filtered,
    "$OBJ_0 ^ $OBJ_1 & $OBJ_2": p3_test_example_filtered,
}
for example in data_json["examples"]["test"]:
    test_bucket = test_bucket_by_derivation.get(example['derivation'])
    if test_bucket is not None:
        test_bucket.append(example)
print(f"p1 test example count={len(p1_test_example_filtered)}")
print(f"p2 test example count={len(p2_test_example_filtered)}")
print(f"p3 test example count={len(p3_test_example_filtered)}")

p1 test example count=921
p2 test example count=2120
p3 test example count=2712


In [147]:
# Each split is written to its own file so it can be loaded much faster.
# The a1 examples are stored under a single "test" key.
data_json["examples"] = {"test": a1_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a1/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [148]:
# Write the a2 examples to their own file under a single "test" key.
data_json["examples"] = {"test": a2_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a2/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [149]:
# Write the a3 examples to their own file under a single "test" key.
data_json["examples"] = {"test": a3_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a3/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [150]:
# Write the b1 examples to their own file under a single "test" key.
data_json["examples"] = {"test": b1_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b1/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [151]:
# Write the b2 examples to their own file under a single "test" key.
data_json["examples"] = {"test": b2_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b2/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# Write the b3 examples to their own file under a single "test" key.
data_json["examples"] = {"test": b3_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b3/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [153]:
# Each split is written to its own file so it can be loaded much faster.
# The P1 portion of the test split is stored under a single "test" key.
data_json["examples"] = {"test": p1_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p1-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [154]:
# Each split is written to its own file so it can be loaded much faster.
# The P2 portion of the test split is stored under a single "test" key.
data_json["examples"] = {"test": p2_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p2-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [155]:
# Each split is written to its own file so it can be loaded much faster.
# The P3 portion of the test split is stored under a single "test" key.
data_json["examples"] = {"test": p3_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p3-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

#### Novel Clause Length Split

In [156]:
# Load the P4 data file used for the novel clause-length split.
path_to_data = "../../data-files-updated/ReaSCAN-compositional-p4/data-train.txt"
logger.info(f"Reading dataset from file: {path_to_data}...")
# Context manager so the file handle is closed as soon as parsing is done.
with open(path_to_data, "r") as p4_fd:
    data_json = json.load(p4_fd)

2021-06-08 18:33 Reading dataset from file: ../../data-files-updated/ReaSCAN-compositional-p4/data-train.txt...


In [157]:
# Every P4 example (stored under "train" in the input) becomes a test example:
# re-key the list as "test" and write it to its own file.
p4_test_example_filtered = data_json["examples"]["train"]
data_json["examples"] = {"test": p4_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p4-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [158]:
# Display the number of examples in the P4 test split.
len(data_json["examples"]["test"])

8375