In [1]:
import argparse
import json
import logging
import os
from typing import Dict, List, Union

import numpy as np

# Root logging setup: emit DEBUG and above, each message prefixed with a
# minute-resolution timestamp.
FORMAT = "%(asctime)-15s %(message)s"
logging.basicConfig(
    level=logging.DEBUG,
    datefmt="%Y-%m-%d %H:%M",
    format=FORMAT,
)
# Logger used by the functions defined in this notebook.
logger = logging.getLogger(__name__)

In [2]:
def parse_sparse_situation(situation_representation: dict, grid_size: int) -> np.ndarray:
    """
    Convert a sparse situation description into a dense integer grid.

    Each grid cell in a situation is fully specified by a vector:
    [_ _ _ _ _ _ _   _       _      _       _   _ _ _ _]
     1 2 3 4 r g b circle square cylinder agent E S W N
     _______ _____ ______________________ _____ _______
       size  color        shape           agent agent dir.

    The number of object channels is not hard-coded: it is taken from the length of
    situation_representation["target_object"]["vector"]. Five channels are appended
    after the object channels: one agent-presence bit and four agent-direction bits.

    Objects that share a cell (with each other or with the agent) are summed
    channel-wise, so a cell may hold values greater than 1.

    :param situation_representation: data from dataset.txt at key "situation". Must provide
        "target_object" -> "vector", "agent_position" -> "row"/"column", "agent_direction"
        and "placed_objects" (a dict whose values have "vector" and "position" -> "row"/"column").
    :param grid_size: int determining row/column number.
    :return: array of shape [grid_size, grid_size, num_object_attributes + 5] to be parsed by
        computational models.
    """
    num_object_attributes = len([int(bit) for bit in situation_representation["target_object"]["vector"]])
    # Agent bit + agent direction bits (see docstring).
    num_agent_channels = 1 + 4
    num_grid_channels = num_object_attributes + num_agent_channels

    # Initialize the grid.
    grid = np.zeros([grid_size, grid_size, num_grid_channels], dtype=int)

    # Place the agent. The builtin `int` is used as dtype because the `np.int` alias
    # no longer exists in current NumPy releases.
    agent_row = int(situation_representation["agent_position"]["row"])
    agent_column = int(situation_representation["agent_position"]["column"])
    agent_direction = int(situation_representation["agent_direction"])
    agent_representation = np.zeros([num_grid_channels], dtype=int)
    agent_representation[-5] = 1
    # Directions 0..3 select one of the last four channels.
    agent_representation[-4 + agent_direction] = 1
    grid[agent_row, agent_column, :] = agent_representation

    # Loop over the objects in the world and place them. Every object is *added* to its
    # cell, so overlapping objects (and an object sharing the agent's cell) accumulate
    # instead of overwriting each other.
    agent_padding = np.zeros([num_agent_channels], dtype=int)
    for placed_object in situation_representation["placed_objects"].values():
        object_vector = np.array([int(bit) for bit in placed_object["vector"]], dtype=int)
        object_row = int(placed_object["position"]["row"])
        object_column = int(placed_object["position"]["column"])
        grid[object_row, object_column, :] += np.concatenate([object_vector, agent_padding])
    return grid


def data_loader(file_path: str) -> Dict[str, List[dict]]:
    """
    Loads a grounded SCAN dataset from a JSON text file and converts every example into
    tokenized input/target command lists plus a dense situation grid.

    The file must contain the keys "grid_size" and "examples"; "examples" maps each split
    name to a list of examples with the keys "command", "target_commands" (both
    comma-separated strings) and "situation".

    :param file_path: Full path to file containing dataset (dataset.txt)
    :returns: dict with as keys all splits and values list of example dicts with "input"
        (list of str), "target" (list of str) and "situation" (nested list produced by
        parse_sparse_situation(...).tolist()).
    """
    # JSON text is UTF-8; read it explicitly as such instead of relying on the locale default,
    # and release the file handle before the conversion loop starts.
    with open(file_path, 'r', encoding='utf-8') as infile:
        all_data = json.load(infile)

    grid_size = int(all_data["grid_size"])
    splits = list(all_data["examples"].keys())
    logger.info("Found data splits: {}".format(splits))
    loaded_data = {}
    for split in splits:
        logger.info("Now loading data for split: {}".format(split))
        split_examples = []
        for data_example in all_data["examples"][split]:
            input_command = data_example["command"].split(',')
            target_command = data_example["target_commands"].split(',')
            situation = parse_sparse_situation(situation_representation=data_example["situation"],
                                               grid_size=grid_size)
            split_examples.append({"input": input_command,
                                   "target": target_command,
                                   "situation": situation.tolist()})  # .tolist() necessary to be serializable
        loaded_data[split] = split_examples
        logger.info("Loaded {} examples in split {}.\n".format(len(split_examples), split))
    return loaded_data


Loading ReaSCAN-novel-action-length

In [3]:
# Parse all splits of the novel-action-length dataset into memory.
data = data_loader(
    file_path="../../../data-files/ReaSCAN-novel-action-length/data-compositional-splits.txt"
)

2021-09-10 17:10 Found data splits: ['train', 'test', 'dev', 'new_action_length']
2021-09-10 17:10 Now loading data for split: train
2021-09-10 17:11 Loaded 85000 examples in split train.

2021-09-10 17:11 Now loading data for split: test
2021-09-10 17:11 Loaded 6694 examples in split test.

2021-09-10 17:11 Now loading data for split: dev
2021-09-10 17:11 Loaded 6694 examples in split dev.

2021-09-10 17:11 Now loading data for split: new_action_length
2021-09-10 17:11 Loaded 1110 examples in split new_action_length.



In [4]:
# Write every split to its own JSON Lines file (one serialized example per line).
# NOTE(review): the output root ('./data-files') differs from the input root
# ('../../../data-files') used when loading; confirm this is intentional.
output_dir = './data-files/ReaSCAN-novel-action-length/'
# open(..., 'w') does not create missing parent directories, so create the target
# directory first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
for split, dt in data.items():
    with open(output_dir + split + '.json', 'w') as f:
        for line in dt:
            f.write(json.dumps(line) + '\n')

Loading ReaSCAN-novel-attribute

In [5]:
# Parse all splits of the novel-attribute dataset into memory.
data = data_loader(
    file_path="../../../data-files/ReaSCAN-novel-attribute/data-compositional-splits.txt"
)

2021-09-10 17:13 Found data splits: ['train', 'test', 'dev', 'new_color', 'new_size']
2021-09-10 17:13 Now loading data for split: train
2021-09-10 17:14 Loaded 76301 examples in split train.

2021-09-10 17:14 Now loading data for split: test
2021-09-10 17:14 Loaded 4951 examples in split test.

2021-09-10 17:14 Now loading data for split: dev
2021-09-10 17:14 Loaded 4967 examples in split dev.

2021-09-10 17:14 Now loading data for split: new_color
2021-09-10 17:14 Loaded 7177 examples in split new_color.

2021-09-10 17:14 Now loading data for split: new_size
2021-09-10 17:14 Loaded 7261 examples in split new_size.



In [6]:
# Write every split to its own JSON Lines file (one serialized example per line).
# NOTE(review): the output root ('./data-files') differs from the input root
# ('../../../data-files') used when loading; confirm this is intentional.
output_dir = './data-files/ReaSCAN-novel-attribute/'
# open(..., 'w') does not create missing parent directories, so create the target
# directory first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
for split, dt in data.items():
    with open(output_dir + split + '.json', 'w') as f:
        for line in dt:
            f.write(json.dumps(line) + '\n')

Loading ReaSCAN-novel-direction

In [7]:
# Parse all splits of the novel-direction dataset into memory.
data = data_loader(
    file_path="../../../data-files/ReaSCAN-novel-direction/data-compositional-splits.txt"
)

2021-09-10 17:14 Found data splits: ['train', 'test', 'dev', 'new_direction']
2021-09-10 17:14 Now loading data for split: train
2021-09-10 17:14 Loaded 85000 examples in split train.

2021-09-10 17:14 Now loading data for split: test
2021-09-10 17:14 Loaded 5543 examples in split test.

2021-09-10 17:14 Now loading data for split: dev
2021-09-10 17:14 Loaded 5543 examples in split dev.

2021-09-10 17:14 Now loading data for split: new_direction
2021-09-10 17:14 Loaded 8000 examples in split new_direction.



In [8]:
# Write every split to its own JSON Lines file (one serialized example per line).
# NOTE(review): the output root ('./data-files') differs from the input root
# ('../../../data-files') used when loading; confirm this is intentional.
output_dir = './data-files/ReaSCAN-novel-direction/'
# open(..., 'w') does not create missing parent directories, so create the target
# directory first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(output_dir, exist_ok=True)
for split, dt in data.items():
    with open(output_dir + split + '.json', 'w') as f:
        for line in dt:
            f.write(json.dumps(line) + '\n')