### Scripts for generating splits
This script assumes you have already generated the main ReaSCAN dataset with the generate_ReaSCAN.py script. After that, you can use this file to generate/extrapolate different splits. In the future, we may consolidate the two files.

In [None]:
from collections import namedtuple, OrderedDict
import os
from typing import List
from typing import Tuple
import logging
from collections import defaultdict
from collections import Counter
import json
import torch
import numpy as np

def isnotebook():
    """Return True only when running under a Jupyter notebook / qtconsole kernel.

    The check relies on the name of the class returned by ``get_ipython()``:
    ``ZMQInteractiveShell`` means a notebook or qtconsole. Any other shell
    (e.g. terminal IPython) and a plain interpreter, where ``get_ipython`` is
    not defined at all, both yield False.
    """
    try:
        shell_name = get_ipython().__class__.__name__
    except NameError:
        # `get_ipython` only exists inside IPython sessions.
        return False
    return shell_name == 'ZMQInteractiveShell'
# Notebook sessions are pinned to the CPU; any other run uses CUDA when torch
# reports it as available. The short-circuit keeps the CUDA probe from running
# inside a notebook.
device = torch.device(
    "cuda" if (not isnotebook() and torch.cuda.is_available()) else "cpu"
)

# Timestamped INFO-level logging shared by the cells below.
FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT, datefmt="%Y-%m-%d %H:%M")
logger = logging.getLogger(__name__)

from world import *
from object_vocabulary import *
from vocabulary import *
from grammer import *
from simulator import *
from relation_graph import *

#### P1: gSCAN Pattern

In [None]:
p1_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p1/data-train.txt"
logger.info(f"Reading dataset from file: {p1_path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(p1_path_to_data, "r") as fd:
    p1_data_json = json.load(fd)

# Everything arrives under "train"; dev and test are carved out of it later by
# shuffling and slicing.
p1_all_fake_train = p1_data_json["examples"]["train"]
len(p1_all_fake_train)

In [None]:
# Compositional hold-outs have to be decided before the random dev/test
# partition: dev and test must only contain commands that also appear in
# train, so the random partition is applied last.

# Step 1: give every example an integer id (its position in the raw "train"
# list) and start it with an empty set of split memberships.
p1_id_example_map = OrderedDict(enumerate(p1_data_json["examples"]["train"]))
p1_id_splits_map = OrderedDict(
    (example_id, set()) for example_id in p1_id_example_map
)
index = len(p1_id_example_map)  # same final value as a running counter

In [None]:
import random  # used below; no `import random` is visible before this cell

# No compositional hold-outs are defined for this pattern, so every example is
# a candidate for the random train/dev/test partition. An example that already
# carries a split label indicates an inconsistent state.
p1_splits_distribution = OrderedDict({})
p1_splits_assignment = OrderedDict({})
p1_all_example_id = []
for index, splits in p1_id_splits_map.items():
    if splits:
        raise ValueError(
            f"example {index} unexpectedly belongs to splits {sorted(splits)}"
        )
    p1_all_example_id.append(index)
if p1_all_example_id:
    p1_splits_distribution["train"] = len(p1_all_example_id)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p1_all_fake_train)*0.01)
gscan_test_size = int(len(p1_all_fake_train)*0.052)
random.shuffle(p1_all_example_id)
# Slice with explicit non-negative boundaries: dev receives exactly
# gscan_dev_size ids and test exactly gscan_test_size ids, and a size of 0
# gives an empty split (a `[-0:]` slice would return the whole list instead).
p1_train_end = max(0, len(p1_all_example_id) - gscan_dev_size - gscan_test_size)
p1_dev_end = p1_train_end + gscan_dev_size
p1_train_example_id = p1_all_example_id[:p1_train_end]
p1_dev_example_id = p1_all_example_id[p1_train_end:p1_dev_end]
p1_test_example_id = p1_all_example_id[p1_dev_end:]
p1_splits_assignment["train"] = p1_train_example_id
p1_splits_assignment["dev"] = p1_dev_example_id
p1_splits_assignment["test"] = p1_test_example_id

In [None]:
for split, all_ids in p1_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

In [None]:
# Rebuild the examples section: for each split, look up the actual example
# behind every assigned id, keeping the split order of the assignment map.
updated_examples = OrderedDict(
    (split_name, [p1_id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in p1_splits_assignment.items()
)

In [None]:
# Replace the loaded "examples" section with the per-split mapping and write
# the whole document back as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
p1_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p1/data-compositional-splits.txt", "w") as fd:
    json.dump(p1_data_json, fd, indent=4)

#### P2: Single Clause

In [None]:
p2_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p2/data-train.txt"
logger.info(f"Reading dataset from file: {p2_path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(p2_path_to_data, "r") as fd:
    p2_data_json = json.load(fd)

# Everything arrives under "train"; dev and test are carved out of it later by
# shuffling and slicing.
p2_all_fake_train = p2_data_json["examples"]["train"]
len(p2_all_fake_train)

In [None]:
# Compositional hold-outs have to be decided before the random dev/test
# partition: dev and test must only contain commands that also appear in
# train, so the random partition is applied last.

# Step 1: give every example an integer id (its position in the raw "train"
# list) and start it with an empty set of split memberships.
p2_id_example_map = OrderedDict(enumerate(p2_data_json["examples"]["train"]))
p2_id_splits_map = OrderedDict(
    (example_id, set()) for example_id in p2_id_example_map
)
index = len(p2_id_example_map)  # same final value as a running counter

In [None]:
import random  # used below; no `import random` is visible before this cell

# No compositional hold-outs are defined for this pattern, so every example is
# a candidate for the random train/dev/test partition. An example that already
# carries a split label indicates an inconsistent state.
p2_splits_distribution = OrderedDict({})
p2_splits_assignment = OrderedDict({})
p2_all_example_id = []
for index, splits in p2_id_splits_map.items():
    if splits:
        raise ValueError(
            f"example {index} unexpectedly belongs to splits {sorted(splits)}"
        )
    p2_all_example_id.append(index)
if p2_all_example_id:
    p2_splits_distribution["train"] = len(p2_all_example_id)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p2_all_fake_train)*0.01)
gscan_test_size = int(len(p2_all_fake_train)*0.052)
random.shuffle(p2_all_example_id)
# Slice with explicit non-negative boundaries: dev receives exactly
# gscan_dev_size ids and test exactly gscan_test_size ids, and a size of 0
# gives an empty split (a `[-0:]` slice would return the whole list instead).
p2_train_end = max(0, len(p2_all_example_id) - gscan_dev_size - gscan_test_size)
p2_dev_end = p2_train_end + gscan_dev_size
p2_train_example_id = p2_all_example_id[:p2_train_end]
p2_dev_example_id = p2_all_example_id[p2_train_end:p2_dev_end]
p2_test_example_id = p2_all_example_id[p2_dev_end:]
p2_splits_assignment["train"] = p2_train_example_id
p2_splits_assignment["dev"] = p2_dev_example_id
p2_splits_assignment["test"] = p2_test_example_id

In [None]:
for split, all_ids in p2_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

In [None]:
# Rebuild the examples section: for each split, look up the actual example
# behind every assigned id, keeping the split order of the assignment map.
updated_examples = OrderedDict(
    (split_name, [p2_id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in p2_splits_assignment.items()
)

In [None]:
# Replace the loaded "examples" section with the per-split mapping and write
# the whole document back as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
p2_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p2/data-compositional-splits.txt", "w") as fd:
    json.dump(p2_data_json, fd, indent=4)

#### P3: Double Clause

In [None]:
p3_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p3/data-train.txt"
logger.info(f"Reading dataset from file: {p3_path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(p3_path_to_data, "r") as fd:
    p3_data_json = json.load(fd)

In [None]:
p3_all_fake_train = p3_data_json["examples"]["train"]
# Everything is still under "train" at this point; dev and test are produced
# later by shuffling and slicing. The bare expression below just displays the
# example count in the notebook.
len(p3_all_fake_train)

In [None]:
# Compositional hold-outs have to be decided before the random dev/test
# partition: dev and test must only contain commands that also appear in
# train, so the random partition is applied last.

# Step 1: give every example an integer id (its position in the raw "train"
# list) and start it with an empty set of split memberships.
p3_id_example_map = OrderedDict(enumerate(p3_data_json["examples"]["train"]))
p3_id_splits_map = OrderedDict(
    (example_id, set()) for example_id in p3_id_example_map
)
index = len(p3_id_example_map)  # same final value as a running counter

In [None]:
import random  # used below; no `import random` is visible before this cell

# No compositional hold-outs are defined for this pattern, so every example is
# a candidate for the random train/dev/test partition. An example that already
# carries a split label indicates an inconsistent state.
p3_splits_distribution = OrderedDict({})
p3_splits_assignment = OrderedDict({})
p3_all_example_id = []
for index, splits in p3_id_splits_map.items():
    if splits:
        raise ValueError(
            f"example {index} unexpectedly belongs to splits {sorted(splits)}"
        )
    p3_all_example_id.append(index)
if p3_all_example_id:
    p3_splits_distribution["train"] = len(p3_all_example_id)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_all_fake_train)*0.01)
gscan_test_size = int(len(p3_all_fake_train)*0.052)
random.shuffle(p3_all_example_id)
# Slice with explicit non-negative boundaries: dev receives exactly
# gscan_dev_size ids and test exactly gscan_test_size ids, and a size of 0
# gives an empty split (a `[-0:]` slice would return the whole list instead).
p3_train_end = max(0, len(p3_all_example_id) - gscan_dev_size - gscan_test_size)
p3_dev_end = p3_train_end + gscan_dev_size
p3_train_example_id = p3_all_example_id[:p3_train_end]
p3_dev_example_id = p3_all_example_id[p3_train_end:p3_dev_end]
p3_test_example_id = p3_all_example_id[p3_dev_end:]
p3_splits_assignment["train"] = p3_train_example_id
p3_splits_assignment["dev"] = p3_dev_example_id
p3_splits_assignment["test"] = p3_test_example_id

In [None]:
for split, all_ids in p3_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

In [None]:
# Rebuild the examples section: for each split, look up the actual example
# behind every assigned id, keeping the split order of the assignment map.
updated_examples = OrderedDict(
    (split_name, [p3_id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in p3_splits_assignment.items()
)

In [None]:
# Replace the loaded "examples" section with the per-split mapping and write
# the whole document back as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
p3_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p3/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_data_json, fd, indent=4)

#### P3: Double Clause (combining with P3 sharding; this section is shared across all patterns as well!)

In [None]:
import os
import json
import re

# Which pattern's shards to merge, plus an optional variant suffix that is
# appended to the directory prefix as "<pattern>-<special_condition>".
pattern = "p4"
special_condition = ""
prefix = f"{pattern}-{special_condition}" if special_condition != "" else pattern
sharding_dir = f"../../data-files-{prefix}/"

# Number of distinct (article-stripped) commands kept per pattern.
upper_limit_by_pattern = {
    "p1": 675,
    "p2": 2025,
    "p3": 3375,
    "p4": 4000,
}
# Fail loudly on an unknown pattern: in a notebook a silent fall-through would
# leave `upper_limit` undefined, or worse, stale from an earlier run.
if pattern not in upper_limit_by_pattern:
    raise ValueError(f"No upper_limit configured for pattern {pattern!r}")
upper_limit = upper_limit_by_pattern[pattern]

In [None]:
# Walk every shard directory (any directory whose path contains "jobid"),
# check that its generator finished, and collect the distinct commands seen
# across shards. Commands are compared with the articles "a"/"the" removed.
unique_command = set()
for subdir, dirs, files in os.walk(sharding_dir):
    if "jobid" not in subdir:
        continue

    # Completeness check! A shard counts as complete when its log contains
    # the "==FINISH==" marker on any line.
    logging_file = os.path.join(subdir, "generator.log")
    with open(logging_file) as f:
        completed = any("==FINISH==" in line for line in f)
    # The job id is the last "-"-separated piece of the shard directory name;
    # os.path keeps this independent of the path separator.
    jobid = os.path.basename(os.path.normpath(subdir)).split("-")[-1]
    print(f"jobid={jobid}, status=complete={completed}")
    if not completed:
        # Stop scanning at the first incomplete shard.
        break

    # Uniqueness check!
    data_file_path = os.path.join(subdir, "data-train.txt")
    print(f"scanning for file: {data_file_path}")
    with open(data_file_path, "r") as shard_fd:
        data_file = json.load(shard_fd)
    for example in data_file["examples"]["train"]:
        # Collapse ",a," / ",the," into a single "," so that commands which
        # differ only by article are counted once.
        command_mono = re.sub(',a,|,the,', ',', example['command'])
        unique_command.add(command_mono)
assert len(unique_command) > upper_limit

In [None]:
len(unique_command)

In [None]:
# Group the examples of all shards by their article-stripped command, keeping
# at most 180 examples per command (further ones are dropped as redundant).
shared_example_combined = {}
per_command_mono_count = {}
for subdir, dirs, files in os.walk(sharding_dir):
    if "jobid" not in subdir:
        continue
    data_file_path = os.path.join(subdir, "data-train.txt")
    print(f"Collecting for file: {data_file_path}")
    # Close each shard file as soon as it has been parsed.
    with open(data_file_path, "r") as shard_fd:
        data_file = json.load(shard_fd)
    for example in data_file["examples"]["train"]:
        command_mono = re.sub(',a,|,the,', ',', example['command'])
        seen_so_far = per_command_mono_count.get(command_mono, 0)
        if seen_so_far == 180: # for p4 this may never hit!
            continue # we are not adding this example since redundant!
        per_command_mono_count[command_mono] = seen_so_far + 1
        shared_example_combined.setdefault(command_mono, []).append(example)

In [None]:
# Pick `upper_limit` commands at random and flatten their example groups into
# one list (nothing is written to disk in this cell).
import random
commands_mono = list(shared_example_combined.keys())
random.shuffle(commands_mono)
shared_examples = []
for position in range(upper_limit):
    shared_examples.extend(shared_example_combined[commands_mono[position]])

In [None]:
data_file["examples"]["train"] = shared_examples

In [None]:
print(len(shared_examples))

In [None]:
# Write the merged document as indented JSON under the prefix's data folder.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
with open(f"../../data-files-{prefix}/ReaSCAN-compositional-{prefix}/data-train.txt", "w") as fd:
    json.dump(data_file, fd, indent=4)

#### P3-RD: Double Clause with Only Random Distractors (and some contextual distractors, which are also random)

In [None]:
p3_rd_path_to_data = "../../data-files-updated/ReaSCAN-compositional-p3-rd/data-train.txt"
logger.info(f"Reading dataset from file: {p3_rd_path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(p3_rd_path_to_data, "r") as fd:
    p3_rd_data_json = json.load(fd)

In [None]:
p3_rd_all_fake_train = p3_rd_data_json["examples"]["train"]
# Everything is still under "train" at this point; dev and test are produced
# later by shuffling and slicing. The bare expression below just displays the
# example count in the notebook.
len(p3_rd_all_fake_train)

In [None]:
# Compositional hold-outs have to be decided before the random dev/test
# partition: dev and test must only contain commands that also appear in
# train, so the random partition is applied last.

# Step 1: give every example an integer id (its position in the raw "train"
# list) and start it with an empty set of split memberships.
p3_rd_id_example_map = OrderedDict(enumerate(p3_rd_data_json["examples"]["train"]))
p3_rd_id_splits_map = OrderedDict(
    (example_id, set()) for example_id in p3_rd_id_example_map
)
index = len(p3_rd_id_example_map)  # same final value as a running counter

In [None]:
import random  # keeps this cell runnable on its own

# No compositional hold-outs are defined for this pattern, so every example is
# a candidate for the random train/dev/test partition. An example that already
# carries a split label indicates an inconsistent state.
p3_rd_splits_distribution = OrderedDict({})
p3_rd_splits_assignment = OrderedDict({})
p3_rd_all_example_id = []
for index, splits in p3_rd_id_splits_map.items():
    if splits:
        raise ValueError(
            f"example {index} unexpectedly belongs to splits {sorted(splits)}"
        )
    p3_rd_all_example_id.append(index)
if p3_rd_all_example_id:
    p3_rd_splits_distribution["train"] = len(p3_rd_all_example_id)

# Let us further segment train into dev and test!
gscan_dev_size = int(len(p3_rd_all_fake_train)*0.01)
gscan_test_size = int(len(p3_rd_all_fake_train)*0.052)
random.shuffle(p3_rd_all_example_id)
# Slice with explicit non-negative boundaries: dev receives exactly
# gscan_dev_size ids and test exactly gscan_test_size ids, and a size of 0
# gives an empty split (a `[-0:]` slice would return the whole list instead).
p3_rd_train_end = max(0, len(p3_rd_all_example_id) - gscan_dev_size - gscan_test_size)
p3_rd_dev_end = p3_rd_train_end + gscan_dev_size
p3_rd_train_example_id = p3_rd_all_example_id[:p3_rd_train_end]
p3_rd_dev_example_id = p3_rd_all_example_id[p3_rd_train_end:p3_rd_dev_end]
p3_rd_test_example_id = p3_rd_all_example_id[p3_rd_dev_end:]
p3_rd_splits_assignment["train"] = p3_rd_train_example_id
p3_rd_splits_assignment["dev"] = p3_rd_dev_example_id
p3_rd_splits_assignment["test"] = p3_rd_test_example_id

In [None]:
for split, all_ids in p3_rd_splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

In [None]:
# Rebuild the examples section: for each split, look up the actual example
# behind every assigned id, keeping the split order of the assignment map.
updated_examples = OrderedDict(
    (split_name, [p3_rd_id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in p3_rd_splits_assignment.items()
)

In [None]:
# Replace the loaded "examples" section with the per-split mapping and write
# the whole document back as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
p3_rd_data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional-p3-rd/data-compositional-splits.txt", "w") as fd:
    json.dump(p3_rd_data_json, fd, indent=4)

#### P1+P2+P3: Compositional Splits

In [None]:
import random
import numpy as np
# Fix both RNG seeds so the shuffles in the following cells are repeatable.
random.seed(42)
np.random.seed(42)

# Reload the raw P1 and P2 training files; they are merged with P3 into a
# single training pool further below.
p1_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p1/data-train.txt"
print(f"Reading dataset from file: {p1_path_to_data}...")
# Context managers close each file handle as soon as parsing is done.
with open(p1_path_to_data, "r") as fd:
    p1_data_json = json.load(fd)

p2_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p2/data-train.txt"
print(f"Reading dataset from file: {p2_path_to_data}...")
with open(p2_path_to_data, "r") as fd:
    p2_data_json = json.load(fd)

In [None]:
p3_path_to_data = "../../ReaSCAN-v1.0/ReaSCAN-compositional-p3/data-train.txt"
print(f"Reading dataset from file: {p3_path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(p3_path_to_data, "r") as fd:
    p3_data_json = json.load(fd)

In [None]:
# Merge everything into one big training pool. The P3 document is reused as
# the container, so its "train" list is extended in place with the P1 and then
# the P2 examples (re-running this cell would append them a second time).
p1_examples = p1_data_json["examples"]["train"]
p2_examples = p2_data_json["examples"]["train"]
data_json = p3_data_json
for extra_examples in (p1_examples, p2_examples):
    data_json["examples"]["train"].extend(extra_examples)

In [None]:
# let us downsample it to ?K
len(data_json["examples"]["train"])

In [None]:
# Compositional hold-outs have to be decided before the random dev/test
# partition: dev and test must only contain commands that also appear in
# train, so the random partition is applied last.

# Step 1: label every example with the compositional splits it belongs to.

# A2 inspects a fixed set of placed-object keys per derivation; derivations
# not listed here are never labelled A2.
a2_object_keys_by_derivation = {
    "$OBJ_0": ("0",),
    "$OBJ_0 ^ $OBJ_1": ("0", "1"),
    "$OBJ_0 ^ $OBJ_1 & $OBJ_2": ("0", "1", "2"),
}
# A3 fires on any of these command fragments.
a3_command_markers = (
    "small,cylinder",
    "small,red,cylinder",
    "small,blue,cylinder",
    "small,yellow,cylinder",
    "small,green,cylinder",
)

id_example_map = OrderedDict({})
id_splits_map = OrderedDict({})
for index, example in enumerate(data_json["examples"]["train"]):
    held_out_splits = set() # set of splits that this example belongs to.
    id_example_map[index] = example
    id_splits_map[index] = held_out_splits
    command = example['command']

    # A1: the command mentions a yellow square.
    if "yellow,square" in command:
        held_out_splits.add("a1_novel_color_attribute")

    # A2: the command mentions a red square, or one of the inspected placed
    # objects is a red square.
    a2_object_keys = a2_object_keys_by_derivation.get(example["derivation"], ())
    if a2_object_keys and (
        "red,square" in command
        or any(
            example['situation']['placed_objects'][key]['object']['shape'] == "square"
            and example['situation']['placed_objects'][key]['object']['color'] == "red"
            for key in a2_object_keys
        )
    ):
        held_out_splits.add("a2_novel_color_attribute_visual")

    # A3: the command mentions a small cylinder (with or without a color).
    if any(marker in command for marker in a3_command_markers):
        held_out_splits.add("a3_novel_size_attribute")

    # B1: we default to active generation process, not holding out.

    # B2: "same size" and "inside of" relations appear in the same command.
    if "same,size" in command and "inside,of" in command:
        held_out_splits.add("c_novel_relation_coexist")

    # Commands built from the four-object grammar pattern.
    if example['grammer_pattern'] == "$OBJ_0 ^ $OBJ_1 & $OBJ_2 & $OBJ_3":
        held_out_splits.add("e_novel_clause_length")

    # C1 and C2: we default to active generation process, not holding out.

index = len(id_example_map)  # same final value as a running counter

In [None]:
# Turn the per-example split labels into per-split id lists and counts.
# Examples without any compositional label go to "train"; labelled examples
# are listed under every split they belong to.
splits_distribution = OrderedDict({})
splits_assignment = OrderedDict({})
count = 0   # examples without a compositional label
ccount = 0  # examples with at least one compositional label
for index, splits in id_splits_map.items():
    if splits:
        ccount += 1
        assigned_splits = splits
    else:
        count += 1
        assigned_splits = ("train",) # let us split this up later!
    for split in assigned_splits:
        splits_distribution[split] = splits_distribution.get(split, 0) + 1
        splits_assignment.setdefault(split, []).append(index)

# Let us further segment train into dev and test!
# setdefault keeps this working even if no example ended up in "train".
all_example_id = splits_assignment.setdefault("train", [])
gscan_dev_size = int(len(all_example_id)*0.01)
gscan_test_size = int(len(all_example_id)*0.052)
random.shuffle(all_example_id)
# Slice with explicit non-negative boundaries: dev receives exactly
# gscan_dev_size ids and test exactly gscan_test_size ids, and a size of 0
# gives an empty split (a `[-0:]` slice would return the whole list instead).
train_end = max(0, len(all_example_id) - gscan_dev_size - gscan_test_size)
dev_end = train_end + gscan_dev_size
train_example_id = all_example_id[:train_end]
dev_example_id = all_example_id[train_end:dev_end]
test_example_id = all_example_id[dev_end:]
splits_assignment["train"] = train_example_id
splits_assignment["dev"] = dev_example_id
splits_assignment["test"] = test_example_id

In [None]:
splits_distribution

In [None]:
for split, all_ids in splits_assignment.items():
    print(f"for {split} split, we have {len(all_ids)} examples.")

In [None]:
# Rebuild the examples section with every split (train/dev/test plus the
# compositional ones): each id is replaced by the example it stands for.
updated_examples = OrderedDict(
    (split_name, [id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in splits_assignment.items()
)
# Attach it to the document that gets saved next.
data_json["examples"] = updated_examples

In [None]:
# Write the document, with its current "examples" section, as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
with open("../../data-files-updated/ReaSCAN-compositional/data-compositional-splits-all.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# Rebuild the examples section again, this time keeping only the three random
# splits and dropping every compositional split.
updated_examples = OrderedDict(
    (split_name, [id_example_map[example_id] for example_id in example_ids])
    for split_name, example_ids in splits_assignment.items()
    if split_name in ("train", "dev", "test")
)

In [None]:
# Replace the "examples" section with the current `updated_examples` mapping
# and write the document as indented JSON.
# NOTE: open(..., "w") does not create missing parent directories, so the
# target folder has to exist beforehand.
data_json["examples"] = updated_examples
with open("../../data-files-updated/ReaSCAN-compositional/data-compositional-splits-train.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

#### P1+P2+P3: Compositional Splits Continue
We need to make sure novel attribute splits actually require the attribute to reason, otherwise, it becomes less meaningful, and may cause accuracy inflation afterwards.

In [None]:
# Load the file that holds every split (random and compositional) so the
# attribute splits can be filtered below.
# NOTE(review): this reads from "../../data-files/", whereas the
# "data-compositional-splits-all.txt" written earlier in this notebook goes to
# "../../data-files-updated/" - confirm which location is intended.
path_to_data = "../../data-files/ReaSCAN-compositional/data-compositional-splits-all.txt"
logger.info(f"Reading dataset from file: {path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(path_to_data, "r") as fd:
    data_json = json.load(fd)

In [None]:
data_json["examples"].keys()

In [None]:
# a1: keep only examples where the first attribute distractor changes the
# $COLOR of the object whose expression contains "yellow square".
a1_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a1_novel_color_attribute']:
    if not example['has_attribute_distractor']:
        continue
    for object_key, expression in example['object_expression'].items():
        if "yellow square" not in expression:
            continue
        distractor = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor['modified_obj'] == object_key and \
                distractor['modified_attribute'] == "$COLOR":
            a1_attribute_example_filtered.append(example)
            attribute_change += 1
    

In [None]:
print(f"Actual examples for a1 = {attribute_change}")

In [None]:
# a2: examples that do not mention "red,square" in the command belong to the
# visual part and are always kept. The others are kept only when the first
# attribute distractor changes the $COLOR of the object whose expression
# contains "red square".
a2_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a2_novel_color_attribute_visual']:
    if "red,square" not in example["command"]:
        # this is for the visual part, we automatically added in.
        a2_attribute_example_filtered.append(example)
        attribute_change += 1
        continue
    if not example['has_attribute_distractor']:
        continue
    for object_key, expression in example['object_expression'].items():
        if "red square" not in expression:
            continue
        distractor = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor['modified_obj'] == object_key and \
                distractor['modified_attribute'] == "$COLOR":
            a2_attribute_example_filtered.append(example)
            attribute_change += 1

In [None]:
print(f"Actual examples for a2 = {attribute_change}")

In [None]:
# a3: keep only examples where the first attribute distractor changes the
# $SIZE of the object whose expression contains both "small" and "cylinder".
a3_attribute_example_filtered = []
attribute_change = 0
for example in data_json["examples"]['a3_novel_size_attribute']:
    if not example['has_attribute_distractor']:
        continue
    for object_key, expression in example['object_expression'].items():
        if not ("small" in expression and "cylinder" in expression):
            continue
        distractor = example['attribute_distractor_metadata'][0]['distractor_metadata'][0]
        if distractor['modified_obj'] == object_key and \
                distractor['modified_attribute'] == "$SIZE":
            a3_attribute_example_filtered.append(example)
            attribute_change += 1

In [None]:
print(f"Actual examples for a3 = {attribute_change}")

In [None]:
# b1 is taken as-is: a shallow copy of the whole split, no filtering.
b1_attribute_example_filtered = list(data_json["examples"]['b_novel_object_coexist'])

In [None]:
len(b1_attribute_example_filtered)

In [None]:
# b2 is taken as-is: a shallow copy of the whole split, no filtering.
b2_attribute_example_filtered = list(data_json["examples"]['c_novel_relation_coexist'])

In [None]:
len(b2_attribute_example_filtered)

In [None]:
# b3 is taken as-is: a shallow copy of the whole split, no filtering.
b3_attribute_example_filtered = list(data_json["examples"]['d_novel_object_relation_pair'])

In [None]:
# Bucket the random test split by derivation; examples with any other
# derivation are left out of all three buckets.
p1_test_example_filtered = []
p2_test_example_filtered = []
p3_test_example_filtered = []
test_bucket_by_derivation = {
    "$OBJ_0": p1_test_example_filtered,
    "$OBJ_0 ^ $OBJ_1": p2_test_example_filtered,
    "$OBJ_0 ^ $OBJ_1 & $OBJ_2": p3_test_example_filtered,
}
for example in data_json["examples"]["test"]:
    bucket = test_bucket_by_derivation.get(example['derivation'])
    if bucket is not None:
        bucket.append(example)
print(f"p1 test example count={len(p1_test_example_filtered)}")
print(f"p2 test example count={len(p2_test_example_filtered)}")
print(f"p3 test example count={len(p3_test_example_filtered)}")

In [None]:
# Each split goes into its own file, stored under a single "test" key, so it
# can be loaded much faster.
data_json["examples"] = {"test": a1_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a1/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The a2 split gets its own file, stored under a single "test" key.
data_json["examples"] = {"test": a2_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a2/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The a3 split gets its own file, stored under a single "test" key.
data_json["examples"] = {"test": a3_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-a3/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The b1 split gets its own file, stored under a single "test" key.
data_json["examples"] = {"test": b1_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b1/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The b2 split gets its own file, stored under a single "test" key.
data_json["examples"] = {"test": b2_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b2/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The b3 split gets its own file, stored under a single "test" key.
data_json["examples"] = {"test": b3_attribute_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-b3/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The P1 portion of the random test split gets its own file, stored under a
# single "test" key, so it can be loaded much faster.
data_json["examples"] = {"test": p1_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p1-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The P2 portion of the random test split gets its own file, stored under a
# single "test" key, so it can be loaded much faster.
data_json["examples"] = {"test": p2_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p2-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
# The P3 portion of the random test split gets its own file, stored under a
# single "test" key, so it can be loaded much faster.
data_json["examples"] = {"test": p3_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p3-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

#### Novel Clause Length Split

In [None]:
path_to_data = "../../data-files-updated/ReaSCAN-compositional-p4/data-train.txt"
logger.info(f"Reading dataset from file: {path_to_data}...")
# Read through a context manager so the file handle is closed right after parsing.
with open(path_to_data, "r") as fd:
    data_json = json.load(fd)

In [None]:
# The whole P4 "train" list is re-published as a test-only file: the examples
# section is replaced by a dict holding just the "test" key.
p4_test_example_filtered = data_json["examples"]["train"]
data_json["examples"] = {"test": p4_test_example_filtered}
with open("../../data-files-updated/ReaSCAN-compositional-p4-test/data-compositional.txt", "w") as fd:
    json.dump(data_json, fd, indent=4)

In [None]:
len(data_json["examples"]["test"])

#### Explore more possibilities of harder splits!

The command stays constant, but the distractor sampling strategies become more complex!

In [None]:
# Word lists used to build the vocabulary for this exploration.
intransitive_verbs = ["walk"]
transitive_verbs = ["push", "pull"]
adverbs = ["while zigzagging", "while spinning", "cautiously", "hesitantly"]
nouns = ["circle", "cylinder", "square", "box"]
color_adjectives = ["red", "blue", "green", "yellow"]
size_adjectives = ["big", "small"]
relative_pronouns = ["that is"]
# Five "in the same <attribute> as" clauses followed by "inside of".
relation_clauses = [
    f"in the same {attribute} as"
    for attribute in ("row", "column", "color", "shape", "size")
]
relation_clauses.append("inside of")

vocabulary = Vocabulary.initialize(
    intransitive_verbs=intransitive_verbs,
    transitive_verbs=transitive_verbs,
    adverbs=adverbs,
    nouns=nouns,
    color_adjectives=color_adjectives,
    size_adjectives=size_adjectives,
    relative_pronouns=relative_pronouns,
    relation_clauses=relation_clauses,
)

# Object sizes range from 1 to 4; shapes and colors come from the vocabulary.
min_object_size, max_object_size = 1, 4
object_vocabulary = ObjectVocabulary(
    shapes=vocabulary.get_semantic_shapes(),
    colors=vocabulary.get_semantic_colors(),
    min_size=min_object_size,
    max_size=max_object_size,
)

grammer = Grammer(vocabulary)

simulator = Simulator(object_vocabulary, vocabulary, grid_size=6, n_object_max=13)

In [None]:
# Let us explore different distractor sampling!
# `pattern` is the suffix appended to "ReaSCAN-compositional" to select which
# generated dataset directory to read; "" selects the main compositional one.
# NOTE(review): the p4 directory read earlier in this notebook is spelled
# "ReaSCAN-compositional-p4" (with a dash), so `pattern == "p4"` would build
# "ReaSCAN-compositionalp4/..." -- confirm the intended value before using it.
pattern = ""
if pattern == "p4":
    ReaSCAN_data_file = f"ReaSCAN-compositional{pattern}/data-train.txt"
else:
    ReaSCAN_data_file = f"ReaSCAN-compositional{pattern}/data-compositional-splits.txt"
# Close the file handle deterministically instead of leaking it.
with open(os.path.join("../../data-files-updated/", ReaSCAN_data_file)) as fd:
    ReaSCAN_data_json = json.load(fd)

In [None]:
# Build a test split that re-uses training commands but samples fresh worlds
# with relation + random distractors only (attribute and isomorphism
# distractors are switched off in the sampler call below).
# NOTE(review): `random` and `copy` are not imported in the visible import
# cell; presumably they come in through the star imports -- verify.
synthetic_tasks = []

per_command_world_target_count = 3 # Need to increase this for some tasks!
per_command_world_retry_max = 200
sampled_command_count = 5000

# NOTE(review): random.sample raises ValueError when the train split holds
# fewer than `sampled_command_count` examples.
examples = ReaSCAN_data_json["examples"]["train"]
random.shuffle(examples)
sampled_examples = random.sample(examples, k=sampled_command_count)
progress = 0

for example_selected in sampled_examples:
    
    # Only two-relation commands of this exact shape are used for this split.
    if example_selected['grammer_pattern'] != '$OBJ_0 ^ $OBJ_1 & $OBJ_2':
        continue
    
    # `progress` counts only the examples that passed the pattern filter.
    progress += 1
    if progress % 10 == 0:
        print(f"count={len(synthetic_tasks)}")
        print(f"progress={progress}")
    if len(synthetic_tasks) > 10000:
        break # we get enough!
        
    # The stored relation map is a list of [key, value] entries; rebuild it as
    # an ordered mapping keyed by tuples.
    rel_map = OrderedDict({})
    for ele in example_selected["relation_map"]:
        rel_map[tuple(ele[0])] = ele[1]
    example_struct = {'obj_pattern_map': example_selected["object_pattern_map"],
     'rel_map': rel_map,
     'obj_map': example_selected["object_expression"],
     'grammer_pattern': example_selected['grammer_pattern'],
     'adverb': example_selected['adverb_in_command'],
     'verb': example_selected['verb_in_command']}
    
    obj_pattern_map = example_struct["obj_pattern_map"]
    rel_map = example_struct["rel_map"]
    obj_map = example_struct["obj_map"]
    grammer_pattern = example_struct["grammer_pattern"]
    verb = example_struct["verb"]
    adverb = example_struct["adverb"]

    # Try to collect up to `per_command_world_target_count` worlds per command.
    # Each world gets at most `per_command_world_retry_max` sampling attempts;
    # if none of them is accepted, nothing is added for that round.
    for _ in range(per_command_world_target_count):
        for i in range(per_command_world_retry_max): # retry essentially!
            # Deep copies are passed so the sampler receives fresh copies of the
            # command structures on every attempt.
            sampled_world = simulator.sample_situations_from_grounded_grammer(
                    copy.deepcopy(grammer_pattern), 
                    copy.deepcopy(obj_pattern_map), 
                    copy.deepcopy(rel_map), 
                    copy.deepcopy(obj_map),
                    is_plot=False,
                    include_relation_distractor=True, 
                    include_attribute_distractor=False, 
                    include_isomorphism_distractor=False, 
                    include_random_distractor=True,
                    full_relation_probability=1.0, # 0.5 seems to work as well!
                    debug=False
                )
            
            # print(sampled_world)
            # _ = simulator._world.render_simple()
            
            # Sanity check: the sampled object map and the simulator's current
            # situation must contain the same number of objects.
            assert len(sampled_world['obj_map']) == len(simulator._world.get_current_situation().to_representation()["placed_objects"])

            # Graph over the objects of the sampled world.
            graph = ReaSCANGraph(
                objects=sampled_world["obj_map"], 
                object_patterns=sampled_world["obj_pattern_map"], 
                vocabulary=vocabulary,
                positions=sampled_world["pos_map"], 
                referred_object=sampled_world["referred_obj"],
                debug=False
            )

            # Graph over the command itself (objects + relations, no positions).
            pattern_graph = ReaSCANGraph(
                objects=obj_map, 
                object_patterns=None,
                vocabulary=vocabulary,
                relations=rel_map, 
                referred_object='$OBJ_0', 
                debug=False
            )

            potential_referent_target = graph.find_referred_object_super_fast(
                pattern_graph, referred_object='$OBJ_0', 
                debug=False
            )

            # Accept the world only when the search returns exactly one
            # candidate and that candidate is '$OBJ_0'.
            if len(potential_referent_target) == 1 and '$OBJ_0' in potential_referent_target:
                # test_unique_find += 1
                # print(f"{test_unique_find} / {i+1} unique solution find!")

                # Form the command with grounded determiners!
                obj_determiner_map = graph.find_determiners(
                    pattern_graph, 
                    referred_object='$OBJ_0', 
                    debug=False,
                )
                command_str = grammer.repre_str_command(
                    grammer_pattern, rel_map, obj_map, 
                    obj_determiner_map, 
                    verb,
                    adverb,
                )

                # Form the golden label for the action list!
                is_transitive = False
                if verb in simulator.vocabulary.get_transitive_verbs():
                    is_transitive = True
                # Direct walk.
                action = "walk" # the agent always walks to the target first
                primitive_command = simulator.vocabulary.translate_word(action)
                target_position = sampled_world["situation"].target_object.position
                simulator._world.go_to_position(
                    position=target_position, manner=adverb, 
                    primitive_command=primitive_command
                )
                # Object actions.
                if is_transitive:
                    semantic_action = simulator.vocabulary.translate_word(verb)
                    simulator._world.move_object_to_wall(action=semantic_action, manner=adverb)
                # Read the command sequence back from the world after the
                # walk (and the optional object action) above.
                target_commands, _ = simulator._world.get_current_observations()

                # has_relation_distractor: at least one switch is truthy.
                # full_relation_distractor: every switch is truthy (stays True
                # when the switch list is empty).
                has_relation_distractor = False
                full_relation_distractor = True
                for rel_bool in sampled_world["distractor_switch_map"]["relation"]:
                    if rel_bool:
                        has_relation_distractor = True
                    else:
                        full_relation_distractor = False

                # Save all relevant information for a task.
                # "meaning" duplicates "command" and "derivation" duplicates
                # "grammer_pattern". An `n_random_distractor` of -1 is recorded
                # as 0 / has_random_distractor=False (presumably the sampler's
                # "none" sentinel -- TODO confirm).
                task_struct = OrderedDict({
                    "command": ",".join(command_str.split(" ")),
                    "grammer_pattern": grammer_pattern,
                    "meaning": ",".join(command_str.split(" ")),
                    "derivation": grammer_pattern,
                    "situation": sampled_world["situation"].to_representation(),
                    "target_commands": ",".join(target_commands),
                    "verb_in_command": verb,
                    "adverb_in_command": adverb,
                    "referred_target": obj_map["$OBJ_0"],
                    "object_pattern_map": obj_pattern_map,
                    "relation_map": [(k, v) for k, v in rel_map.items()],
                    "object_expression": obj_map,
                    "n_object": len(sampled_world["obj_map"]),
                    "n_distractor": len(sampled_world["obj_map"])-len(obj_map),
                    "full_relation_distractor": full_relation_distractor,
                    "has_relation_distractor": has_relation_distractor,
                    "has_attribute_distractor": sampled_world["distractor_switch_map"]["attribute"],
                    "has_isomorphism_distractor": sampled_world["distractor_switch_map"]["isomorphism"],
                    "has_random_distractor": True if sampled_world["n_random_distractor"] != -1 else False,
                    "n_random_distractor": sampled_world["n_random_distractor"] if sampled_world["n_random_distractor"] != -1 else 0,
                    "relation_distractor_metadata": sampled_world["relation_distractor_metadata"],
                    "attribute_distractor_metadata": sampled_world["attribute_distractor_metadata"],
                    "isomorphism_distractor_metadata": sampled_world["isomorphism_distractor_metadata"],
                    "random_distractor_metadata": sampled_world["random_distractor_metadata"],
                })
                synthetic_tasks += [task_struct]
                break


In [None]:
# Wrap the generated tasks as the single "test" split of the dataset.
# The guard keeps the cell idempotent: without it, running the cell twice
# would nest the data as {"test": {"test": [...]}}.
if not isinstance(synthetic_tasks, dict):
    synthetic_tasks = {
        "test" : synthetic_tasks
    }
# Dataset header (grid/object settings and lexicon) plus the examples.
dataset_representation = {
    "grid_size": 6,
    "type_grammar": "ReaSCAN-Grammer",
    "min_object_size": 1,
    "max_object_size": 4,
    "percentage_train": 0.0,
    "examples": synthetic_tasks,
    "intransitive_verbs": intransitive_verbs,
    "transitive_verbs": transitive_verbs,
    "adverbs": adverbs,
    "nouns": nouns,
    "color_adjectives": color_adjectives,
    "size_adjectives": size_adjectives,
    "relative_pronouns": relative_pronouns,
    "relation_clauses": relation_clauses,
}

In [None]:
# Persist the dataset under the directory selected by `split_name`.
split_name = "-f1"
output_path = f"../../data-files-updated/ReaSCAN-compositional{split_name}/data-compositional-splits.txt"
# Create the split directory on first use so the write does not fail with
# FileNotFoundError for a brand-new split name.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as fd:
    json.dump(dataset_representation, fd, indent=4)

Unseen co-occurrence of relations and objects, but with seen atomic concepts

In [None]:
# Enumerate every grounded command for one hard-coded grammar pattern and
# attach a randomly chosen adverb and verb to each.
# Currently, we hard-code the pattern!
grammer_pattern = '$OBJ_0 ^ $OBJ_1 & $OBJ_2'
logger.info(f"Including pattern:= {grammer_pattern}...")
# Sampling relations
relations = grammer.sample_object_relation_grammer(
    '$OBJ_0', 
    grammer.build_dependency_graph(grammer_pattern))

# Candidate pools are loop-invariant, so build them once. `+ [""]` creates a
# new list; the previous in-loop `+=` extended whatever list `get_adverbs()`
# handed back, which would keep growing if that list is shared.
# The empty string stands for "no adverb".
adverb_enhance_list = vocabulary.get_adverbs() + [""]
verb_choices = vocabulary.get_transitive_verbs() + vocabulary.get_intransitive_verbs()

# Keyed by the rendered command string, so duplicates collapse to the last
# struct that produced the same string.
command_structs = {}
for relation in relations:
    obj_pattern_map = relation[0]
    rel_map = relation[1]
    grammer_bindings = grammer.grounding_grammer_with_vocabulary(grammer_pattern, obj_pattern_map, rel_map)
    for obj_map in grammer_bindings:
        # here, we also sample the verb and adverb bindings!
        # (adverb is drawn before verb, as before, to keep the RNG call order)
        command_struct = OrderedDict({
            "obj_pattern_map" : obj_pattern_map,
            "rel_map" : rel_map,
            "obj_map" : obj_map,
            "grammer_pattern" : grammer_pattern,
            "adverb" : random.choice(adverb_enhance_list),
            "verb" : random.choice(verb_choices),
        })
        # Render with fixed determiners so the string can be compared against
        # other commands rendered the same way.
        command_str = grammer.repre_str_command(
            grammer_pattern, rel_map, obj_map, 
            {"$OBJ_0" : "the", "$OBJ_1" : "a", "$OBJ_2" : "a"}, 
            command_struct["verb"],
            command_struct["adverb"],
        )
        command_structs[command_str] = command_struct

In [None]:
# Index what the training split already contains:
#   seen_command_structs : rendered command string -> command struct
#   seen_concepts        : object expression / relation -> occurrence count
#   seen_object_co       : sorted tuples of object expressions used together
#   seen_rel_co          : sorted tuples of relations used together
#   seen_rel_obj_co      : (relation, object expression) pairs
seen_command_structs = {}
seen_concepts = {} # add in seen concepts, so we can select concepts that are seen, but new composites!
seen_object_co = set()
seen_rel_co = set()
seen_rel_obj_co = set()

for example_selected in ReaSCAN_data_json["examples"]["train"]:
    pattern_str = example_selected['grammer_pattern']
    object_expression = example_selected["object_expression"]

    # Stored as [key, value] entries; rebuild with tuple keys.
    rel_map = OrderedDict(
        (tuple(entry[0]), entry[1]) for entry in example_selected["relation_map"]
    )
    example_struct = OrderedDict([
        ('obj_pattern_map', example_selected["object_pattern_map"]),
        ('rel_map', rel_map),
        ('obj_map', object_expression),
        ('grammer_pattern', pattern_str),
        ('adverb', example_selected['adverb_in_command']),
        ('verb', example_selected['verb_in_command']),
    ])

    # Count atomic concepts and record which ones co-occur in this command.
    for expression in object_expression.values():
        seen_concepts[expression] = seen_concepts.get(expression, 0) + 1
    seen_object_co.add(tuple(sorted(object_expression.values())))

    for relation_name in rel_map.values():
        seen_concepts[relation_name] = seen_concepts.get(relation_name, 0) + 1
    seen_rel_co.add(tuple(sorted(rel_map.values())))

    # Relation/object pairs anchored at $OBJ_0 for the one- and two-relation
    # patterns; other patterns contribute no pairs.
    if pattern_str in ("$OBJ_0 ^ $OBJ_1 & $OBJ_2", "$OBJ_0 ^ $OBJ_1"):
        seen_rel_obj_co.add(
            (rel_map[("$OBJ_0", "$OBJ_1")], object_expression["$OBJ_1"])
        )
    if pattern_str == "$OBJ_0 ^ $OBJ_1 & $OBJ_2":
        seen_rel_obj_co.add(
            (rel_map[("$OBJ_0", "$OBJ_2")], object_expression["$OBJ_2"])
        )

    # Render with fixed determiners so the string is comparable across commands.
    command_str = grammer.repre_str_command(
        pattern_str, rel_map, object_expression,
        {"$OBJ_0" : "the", "$OBJ_1" : "a", "$OBJ_2" : "a"},
        example_selected['verb_in_command'],
        example_selected['adverb_in_command'],
    )
    seen_command_structs[command_str] = example_struct

In [None]:
# Sort every enumerated command that does not appear in training into buckets
# according to which co-occurrence is new. Only commands whose atomic concepts
# (object expressions and relations) were all seen in training are considered.
#   unseen_rel_co_command_structs       : new relation combo, seen object combo
#   unseen_obj_co_command_structs       : new object combo, seen relation combo
#   unseen_obj_rel_co_command_structs   : both combos new
#   unseen_obj_rel_pair_command_structs : at least one (relation, object) pair new
unseen_obj_co_command_structs = []
unseen_obj_rel_co_command_structs = []
unseen_rel_co_command_structs = []
unseen_obj_rel_pair_command_structs = []
for k, v in command_structs.items():

    if k in seen_command_structs:
        continue

    # we need to ensure concepts are seen before though!
    obj_expressions = list(v['obj_map'].values())
    relations_used = list(v['rel_map'].values())
    concept_seen = all(
        concept in seen_concepts for concept in obj_expressions + relations_used
    )
    if not concept_seen:
        continue

    obj_co = tuple(sorted(obj_expressions))
    rel_co = tuple(sorted(relations_used))
    obj_co_is_seen = obj_co in seen_object_co
    rel_co_is_seen = rel_co in seen_rel_co

    if not rel_co_is_seen and obj_co_is_seen:
        unseen_rel_co_command_structs += [v]
    if not obj_co_is_seen and rel_co_is_seen:
        unseen_obj_co_command_structs += [v]
    if not obj_co_is_seen and not rel_co_is_seen:
        unseen_obj_rel_co_command_structs += [v]

    # Previously these pairs were computed but never compared against
    # `seen_rel_obj_co`, so the pair bucket always stayed empty.
    # NOTE(review): "at least one unseen pair" is the chosen criterion; switch
    # `or` to `and` if both pairs are meant to be unseen.
    rel_obj_co_1 = (v['rel_map'][("$OBJ_0", "$OBJ_1")], v["obj_map"]["$OBJ_1"])
    rel_obj_co_2 = (v['rel_map'][("$OBJ_0", "$OBJ_2")], v["obj_map"]["$OBJ_2"])
    if rel_obj_co_1 not in seen_rel_obj_co or rel_obj_co_2 not in seen_rel_obj_co:
        unseen_obj_rel_pair_command_structs += [v]

In [None]:
# Build a test split from the commands in `unseen_obj_co_command_structs`,
# sampling worlds with all four distractor kinds switched on in the sampler call.
synthetic_tasks = []
progress = 0

per_command_world_target_count = 3 # Need to increase this for some tasks!
per_command_world_retry_max = 200
# NOTE(review): `sampled_command_count` is assigned here but not read anywhere
# in this cell.
sampled_command_count = 5000

for example_selected in unseen_obj_co_command_structs:

    progress += 1
    if progress % 50 == 0:
        print(f"count={len(synthetic_tasks)}")
        print(f"progress={progress}")
    if len(synthetic_tasks) > 10000:
        break # we get enough!

    obj_pattern_map = example_selected["obj_pattern_map"]
    rel_map = example_selected["rel_map"]
    obj_map = example_selected["obj_map"]
    grammer_pattern = example_selected["grammer_pattern"]
    verb = example_selected["verb"]
    adverb = example_selected["adverb"]

    # Try to collect up to `per_command_world_target_count` worlds per command.
    # Each world gets at most `per_command_world_retry_max` sampling attempts;
    # if none of them is accepted, nothing is added for that round.
    for _ in range(per_command_world_target_count):
        for i in range(per_command_world_retry_max): # retry essentially!
            # Deep copies are passed so the sampler receives fresh copies of the
            # command structures on every attempt.
            sampled_world = simulator.sample_situations_from_grounded_grammer(
                    copy.deepcopy(grammer_pattern), 
                    copy.deepcopy(obj_pattern_map), 
                    copy.deepcopy(rel_map), 
                    copy.deepcopy(obj_map),
                    is_plot=False,
                    include_relation_distractor=True, 
                    include_attribute_distractor=True, 
                    include_isomorphism_distractor=True, 
                    include_random_distractor=True,
                    full_relation_probability=1.0, # 0.5 seems to work as well!
                    debug=False
                )

            # Sanity check: the sampled object map and the simulator's current
            # situation must contain the same number of objects.
            assert len(sampled_world['obj_map']) == len(simulator._world.get_current_situation().to_representation()["placed_objects"])

            # Graph over the objects of the sampled world.
            graph = ReaSCANGraph(
                objects=sampled_world["obj_map"], 
                object_patterns=sampled_world["obj_pattern_map"], 
                vocabulary=vocabulary,
                positions=sampled_world["pos_map"], 
                referred_object=sampled_world["referred_obj"],
                debug=False
            )

            # Graph over the command itself (objects + relations, no positions).
            pattern_graph = ReaSCANGraph(
                objects=obj_map, 
                object_patterns=None,
                vocabulary=vocabulary,
                relations=rel_map, 
                referred_object='$OBJ_0', 
                debug=False
            )

            potential_referent_target = graph.find_referred_object_super_fast(
                pattern_graph, referred_object='$OBJ_0', 
                debug=False
            )

            # Accept the world only when the search returns exactly one
            # candidate and that candidate is '$OBJ_0'.
            if len(potential_referent_target) == 1 and '$OBJ_0' in potential_referent_target:
                # print(f"{test_unique_find} / {i+1} unique solution find!")
                # Form the command with grounded determiners!
                obj_determiner_map = graph.find_determiners(
                    pattern_graph, 
                    referred_object='$OBJ_0', 
                    debug=False,
                )
                command_str = grammer.repre_str_command(
                    grammer_pattern, rel_map, obj_map, 
                    obj_determiner_map, 
                    verb,
                    adverb,
                )

                # Form the golden label for the action list!
                is_transitive = False
                if verb in simulator.vocabulary.get_transitive_verbs():
                    is_transitive = True
                # Direct walk.
                action = "walk" # the agent always walks to the target first
                primitive_command = simulator.vocabulary.translate_word(action)
                target_position = sampled_world["situation"].target_object.position
                simulator._world.go_to_position(
                    position=target_position, manner=adverb, 
                    primitive_command=primitive_command
                )
                # Object actions.
                if is_transitive:
                    semantic_action = simulator.vocabulary.translate_word(verb)
                    simulator._world.move_object_to_wall(action=semantic_action, manner=adverb)
                # Read the command sequence back from the world after the
                # walk (and the optional object action) above.
                target_commands, _ = simulator._world.get_current_observations()

                # has_relation_distractor: at least one switch is truthy.
                # full_relation_distractor: every switch is truthy (stays True
                # when the switch list is empty).
                has_relation_distractor = False
                full_relation_distractor = True
                for rel_bool in sampled_world["distractor_switch_map"]["relation"]:
                    if rel_bool:
                        has_relation_distractor = True
                    else:
                        full_relation_distractor = False

                # Save all relevant information for a task.
                # "meaning" duplicates "command" and "derivation" duplicates
                # "grammer_pattern". An `n_random_distractor` of -1 is recorded
                # as 0 / has_random_distractor=False (presumably the sampler's
                # "none" sentinel -- TODO confirm).
                task_struct = OrderedDict({
                    "command": ",".join(command_str.split(" ")),
                    "grammer_pattern": grammer_pattern,
                    "meaning": ",".join(command_str.split(" ")),
                    "derivation": grammer_pattern,
                    "situation": sampled_world["situation"].to_representation(),
                    "target_commands": ",".join(target_commands),
                    "verb_in_command": verb,
                    "adverb_in_command": adverb,
                    "referred_target": obj_map["$OBJ_0"],
                    "object_pattern_map": obj_pattern_map,
                    "relation_map": [(k, v) for k, v in rel_map.items()],
                    "object_expression": obj_map,
                    "n_object": len(sampled_world["obj_map"]),
                    "n_distractor": len(sampled_world["obj_map"])-len(obj_map),
                    "full_relation_distractor": full_relation_distractor,
                    "has_relation_distractor": has_relation_distractor,
                    "has_attribute_distractor": sampled_world["distractor_switch_map"]["attribute"],
                    "has_isomorphism_distractor": sampled_world["distractor_switch_map"]["isomorphism"],
                    "has_random_distractor": True if sampled_world["n_random_distractor"] != -1 else False,
                    "n_random_distractor": sampled_world["n_random_distractor"] if sampled_world["n_random_distractor"] != -1 else 0,
                    "relation_distractor_metadata": sampled_world["relation_distractor_metadata"],
                    "attribute_distractor_metadata": sampled_world["attribute_distractor_metadata"],
                    "isomorphism_distractor_metadata": sampled_world["isomorphism_distractor_metadata"],
                    "random_distractor_metadata": sampled_world["random_distractor_metadata"],
                })
                synthetic_tasks += [task_struct]
                break

In [None]:
# Wrap the generated tasks as the single "test" split of the dataset.
# The guard keeps the cell idempotent: without it, running the cell twice
# would nest the data as {"test": {"test": [...]}}.
if not isinstance(synthetic_tasks, dict):
    synthetic_tasks = {
        "test" : synthetic_tasks
    }
# Dataset header (grid/object settings and lexicon) plus the examples.
dataset_representation = {
    "grid_size": 6,
    "type_grammar": "ReaSCAN-Grammer",
    "min_object_size": 1,
    "max_object_size": 4,
    "percentage_train": 0.0,
    "examples": synthetic_tasks,
    "intransitive_verbs": intransitive_verbs,
    "transitive_verbs": transitive_verbs,
    "adverbs": adverbs,
    "nouns": nouns,
    "color_adjectives": color_adjectives,
    "size_adjectives": size_adjectives,
    "relative_pronouns": relative_pronouns,
    "relation_clauses": relation_clauses,
}

In [None]:
# Persist the dataset under the directory selected by `split_name`.
split_name = "-d2"
output_path = f"../../data-files-updated/ReaSCAN-compositional{split_name}/data-compositional-splits.txt"
# Create the split directory on first use so the write does not fail with
# FileNotFoundError for a brand-new split name.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as fd:
    json.dump(dataset_representation, fd, indent=4)

Two "that is" clauses

In [None]:
# Turn training commands of the form "$OBJ_0 ^ $OBJ_1 & $OBJ_2" into chained
# commands "$OBJ_0 ^ $OBJ_1 ^ $OBJ_2" by re-attaching the second relation to
# $OBJ_1 instead of $OBJ_0. A command is kept only when the $OBJ_1 expression
# does not contain "box" and both relations are $SAME_ROW or $SAME_COLUMN.
possible_command_structs = []
for example_selected in ReaSCAN_data_json["examples"]["train"]:
    # Stored as [key, value] entries; rebuild with tuple keys.
    rel_map = OrderedDict(
        (tuple(entry[0]), entry[1]) for entry in example_selected["relation_map"]
    )
    example_struct = OrderedDict([
        ('obj_pattern_map', example_selected["object_pattern_map"]),
        ('rel_map', rel_map),
        ('obj_map', example_selected["object_expression"]),
        ('grammer_pattern', example_selected['grammer_pattern']), # force it to be double recursive
        ('adverb', example_selected['adverb_in_command']),
        ('verb', example_selected['verb_in_command']),
    ])

    if example_struct['grammer_pattern'] != '$OBJ_0 ^ $OBJ_1 & $OBJ_2':
        continue
    # the second object cannot be box!
    if "box" in example_struct["obj_map"]['$OBJ_1']:
        continue
    # other filters as well! Checked one at a time so the second lookup only
    # happens once the first relation has passed.
    if rel_map[("$OBJ_0", "$OBJ_1")] not in ["$SAME_ROW", "$SAME_COLUMN"]:
        continue
    if rel_map[("$OBJ_0", "$OBJ_2")] not in ["$SAME_ROW", "$SAME_COLUMN"]:
        continue

    example_struct['grammer_pattern'] = '$OBJ_0 ^ $OBJ_1 ^ $OBJ_2'
    # The relation that linked $OBJ_0 to $OBJ_2 now links $OBJ_1 to $OBJ_2.
    example_struct['rel_map'] = OrderedDict([
        (("$OBJ_0", "$OBJ_1"), rel_map[("$OBJ_0", "$OBJ_1")]),
        (("$OBJ_1", "$OBJ_2"), rel_map[("$OBJ_0", "$OBJ_2")]),
    ])
    possible_command_structs.append(example_struct)

In [None]:
# Build a test split from the chained commands in `possible_command_structs`,
# sampling worlds with all four distractor kinds switched on in the sampler call.
# NOTE: `per_command_world_target_count` and `per_command_world_retry_max` are
# not defined in this cell; they must already exist from an earlier cell.
synthetic_tasks = []
progress = 0
for example_selected in possible_command_structs:

    progress += 1
    if progress % 10 == 0:
        print(f"count={len(synthetic_tasks)}")
        print(f"progress={progress}")
    if len(synthetic_tasks) > 8000:
        break # we get enough!
        
    obj_pattern_map = example_selected["obj_pattern_map"]
    rel_map = example_selected["rel_map"]
    obj_map = example_selected["obj_map"]
    grammer_pattern = example_selected["grammer_pattern"]
    verb = example_selected["verb"]
    adverb = example_selected["adverb"]

    # Try to collect up to `per_command_world_target_count` worlds per command.
    # Each world gets at most `per_command_world_retry_max` sampling attempts;
    # if none of them is accepted, nothing is added for that round.
    for _ in range(per_command_world_target_count):
        for i in range(per_command_world_retry_max): # retry essentially!
            # Deep copies are passed so the sampler receives fresh copies of the
            # command structures on every attempt.
            sampled_world = simulator.sample_situations_from_grounded_grammer(
                    copy.deepcopy(grammer_pattern), 
                    copy.deepcopy(obj_pattern_map), 
                    copy.deepcopy(rel_map), 
                    copy.deepcopy(obj_map),
                    is_plot=False,
                    include_relation_distractor=True, 
                    include_attribute_distractor=True, 
                    include_isomorphism_distractor=True, 
                    include_random_distractor=True,
                    full_relation_probability=1.0, # 0.5 seems to work as well!
                    debug=False
                )

            # Sanity check: the sampled object map and the simulator's current
            # situation must contain the same number of objects.
            assert len(sampled_world['obj_map']) == len(simulator._world.get_current_situation().to_representation()["placed_objects"])

            # Graph over the objects of the sampled world.
            graph = ReaSCANGraph(
                objects=sampled_world["obj_map"], 
                object_patterns=sampled_world["obj_pattern_map"], 
                vocabulary=vocabulary,
                positions=sampled_world["pos_map"], 
                referred_object=sampled_world["referred_obj"],
                debug=False
            )

            # Graph over the command itself (objects + relations, no positions).
            pattern_graph = ReaSCANGraph(
                objects=obj_map, 
                object_patterns=None,
                vocabulary=vocabulary,
                relations=rel_map, 
                referred_object='$OBJ_0', 
                debug=False
            )

            # Unlike the other generation cells, the grammar pattern is passed
            # explicitly to the search here.
            potential_referent_target = graph.find_referred_object_super_fast(
                pattern_graph, referred_object='$OBJ_0', 
                pattern=grammer_pattern,
                debug=False
            )

            # Accept the world only when the search returns exactly one
            # candidate and that candidate is '$OBJ_0'.
            if len(potential_referent_target) == 1 and '$OBJ_0' in potential_referent_target:
                # print(f"{test_unique_find} / {i+1} unique solution find!")
                # Form the command with grounded determiners!
                obj_determiner_map = graph.find_determiners(
                    pattern_graph, 
                    referred_object='$OBJ_0', 
                    debug=False,
                )
                command_str = grammer.repre_str_command(
                    grammer_pattern, rel_map, obj_map, 
                    obj_determiner_map, 
                    verb,
                    adverb,
                )

                # Form the golden label for the action list!
                is_transitive = False
                if verb in simulator.vocabulary.get_transitive_verbs():
                    is_transitive = True
                # Direct walk.
                action = "walk" # the agent always walks to the target first
                primitive_command = simulator.vocabulary.translate_word(action)
                target_position = sampled_world["situation"].target_object.position
                simulator._world.go_to_position(
                    position=target_position, manner=adverb, 
                    primitive_command=primitive_command
                )
                # Object actions.
                if is_transitive:
                    semantic_action = simulator.vocabulary.translate_word(verb)
                    simulator._world.move_object_to_wall(action=semantic_action, manner=adverb)
                # Read the command sequence back from the world after the
                # walk (and the optional object action) above.
                target_commands, _ = simulator._world.get_current_observations()

                # has_relation_distractor: at least one switch is truthy.
                # full_relation_distractor: every switch is truthy (stays True
                # when the switch list is empty).
                has_relation_distractor = False
                full_relation_distractor = True
                for rel_bool in sampled_world["distractor_switch_map"]["relation"]:
                    if rel_bool:
                        has_relation_distractor = True
                    else:
                        full_relation_distractor = False

                # Save all relevant information for a task.
                # "meaning" duplicates "command" and "derivation" duplicates
                # "grammer_pattern". An `n_random_distractor` of -1 is recorded
                # as 0 / has_random_distractor=False (presumably the sampler's
                # "none" sentinel -- TODO confirm).
                task_struct = OrderedDict({
                    "command": ",".join(command_str.split(" ")),
                    "grammer_pattern": grammer_pattern,
                    "meaning": ",".join(command_str.split(" ")),
                    "derivation": grammer_pattern,
                    "situation": sampled_world["situation"].to_representation(),
                    "target_commands": ",".join(target_commands),
                    "verb_in_command": verb,
                    "adverb_in_command": adverb,
                    "referred_target": obj_map["$OBJ_0"],
                    "object_pattern_map": obj_pattern_map,
                    "relation_map": [(k, v) for k, v in rel_map.items()],
                    "object_expression": obj_map,
                    "n_object": len(sampled_world["obj_map"]),
                    "n_distractor": len(sampled_world["obj_map"])-len(obj_map),
                    "full_relation_distractor": full_relation_distractor,
                    "has_relation_distractor": has_relation_distractor,
                    "has_attribute_distractor": sampled_world["distractor_switch_map"]["attribute"],
                    "has_isomorphism_distractor": sampled_world["distractor_switch_map"]["isomorphism"],
                    "has_random_distractor": True if sampled_world["n_random_distractor"] != -1 else False,
                    "n_random_distractor": sampled_world["n_random_distractor"] if sampled_world["n_random_distractor"] != -1 else 0,
                    "relation_distractor_metadata": sampled_world["relation_distractor_metadata"],
                    "attribute_distractor_metadata": sampled_world["attribute_distractor_metadata"],
                    "isomorphism_distractor_metadata": sampled_world["isomorphism_distractor_metadata"],
                    "random_distractor_metadata": sampled_world["random_distractor_metadata"],
                })
                synthetic_tasks += [task_struct]
                break

In [None]:
# Wrap the generated tasks as the single "test" split of the dataset.
# The guard keeps the cell idempotent: without it, running the cell twice
# would nest the data as {"test": {"test": [...]}}.
if not isinstance(synthetic_tasks, dict):
    synthetic_tasks = {
        "test" : synthetic_tasks
    }
# Dataset header (grid/object settings and lexicon) plus the examples.
dataset_representation = {
    "grid_size": 6,
    "type_grammar": "ReaSCAN-Grammer",
    "min_object_size": 1,
    "max_object_size": 4,
    "percentage_train": 0.0,
    "examples": synthetic_tasks,
    "intransitive_verbs": intransitive_verbs,
    "transitive_verbs": transitive_verbs,
    "adverbs": adverbs,
    "nouns": nouns,
    "color_adjectives": color_adjectives,
    "size_adjectives": size_adjectives,
    "relative_pronouns": relative_pronouns,
    "relation_clauses": relation_clauses,
}

In [None]:
# Persist the dataset under the directory selected by `split_name`.
split_name = "-e"
output_path = f"../../data-files-updated/ReaSCAN-compositional{split_name}/data-compositional-splits.txt"
# Create the split directory on first use so the write does not fail with
# FileNotFoundError for a brand-new split name.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as fd:
    json.dump(dataset_representation, fd, indent=4)