In [5]:
import pandas as pd
import random 
import copy
import uuid

from common_bench.utils.py_io import *

In [6]:
# Read the tomi CSV into a DataFrame, then turn it into a list with one dict per row.
df_tomi = pd.read_csv("data/tomi/tomi.csv")
tomi_data = df_tomi.to_dict(orient="records")
# Display the record at index 1 to inspect which fields each row carries.
tomi_data[1]

{'story': 'Alexander entered the playroom. Jack entered the playroom. The potato is in the box. Emily entered the basement. Jack moved the potato to the suitcase. Alexander exited the playroom. Emily exited the basement. Alexander entered the playroom.',
 'question': 'Where does Alexander think that Jack searches for the potato?',
 'answer': 'suitcase',
 'i': 1,
 'dataGenSeq': 'enter_agent_1,enter_agent_0,agent_0_moves_obj,agent_2_enters,agent_1_exits,agent_1_reenters_loc,agent_2_exits',
 'sType': 'true_belief',
 'qTypeRaw': 'second_order_0_no_tom',
 'qTypeTomOrNot': False,
 'qOrder': 'second_order',
 'storyHasToM': False,
 'answerMem': 'box',
 'answerReal': 'suitcase',
 'qToMandOmniReader': False,
 'answerMemOrReal': 'reality',
 'cands': '["box", "suitcase"]',
 'correct': 1,
 'falseTrueBelief': True,
 'factVsMind': 'mind'}

In [9]:
# Columns of each raw record that are copied unchanged into the item's "metadata".
TOMI_METADATA_KEYS = (
    "storyHasToM",
    "sType",
    "qOrder",
    "qTypeRaw",
    "qTypeTomOrNot",
    "factVsMind",
)

# Convert every raw record into the benchmark item layout: a fresh random uuid,
# the story/question/answer text, an empty options list, and a metadata dict
# tagged with the task name.
tomi_test = []
for data in tomi_data:
    tomi_test.append({
        "uuid": str(uuid.uuid4()),
        "story": data["story"],
        "question": data["question"],
        "answer": data["answer"],
        "options": [],
        # "task" comes first so the key order of the metadata dict is preserved.
        "metadata": {"task": "tomi", **{key: data[key] for key in TOMI_METADATA_KEYS}},
    })

In [10]:
# Spot-check the first converted item.
tomi_test[0]

{'uuid': '06bc4a35-e464-4f0c-882d-7e8a090cf93c',
 'story': 'James entered the living room. Hunter entered the living room. The pajamas is in the bucket. James moved the pajamas to the drawer. Hunter exited the living room. Olivia entered the pantry.',
 'question': 'Where is the pajamas really?',
 'answer': 'drawer',
 'options': [],
 'metadata': {'task': 'tomi',
  'storyHasToM': False,
  'sType': 'true_belief',
  'qOrder': 'reality',
  'qTypeRaw': 'reality',
  'qTypeTomOrNot': False,
  'factVsMind': 'fact'}}

In [11]:
# Write the converted items to disk. `write_jsonl` is not defined in the visible
# cells — presumably it comes from the star import of common_bench.utils.py_io (verify).
write_jsonl(tomi_test, "data/tomi/test.jsonl")

In [37]:
# Kinship words that get_knowledge recognises as a relation token.
rels = [
    "son",
    "daughter",
    "brother",
    "sister",
    "father",
    "mother",
    "husband",
    "wife",
    "grandfather",
    "grandmother",
    "grandson",
    "granddaughter",
    "uncle",
    "aunt",
    "son-in-law",
    "daughter-in-law",
    "father-in-law",
    "mother-in-law",
    "brother-in-law",
    "sister-in-law",
    "nephew",
    "niece",
]

# Single-letter aliases: a numeric token k is replaced by persons[k - 1].
persons = [
    'A', 'B', 'C', 'D', 'H',
    'J', 'K', 'L', 'M', 'N',
    'O', 'P', 'Q', 'R', 'S',
    'T', 'V', 'X', 'Y', 'Z',
]


def get_knowledge(tokens):
    """Extract the two person aliases and the relation word from a token sequence.

    Args:
        tokens: iterable of string tokens. Tokens made only of digits are read
            as person ids; tokens found in ``rels`` are read as the relation.

    Returns:
        A tuple ``(pair, relation)`` where ``pair`` is the two aliases joined by
        a single space (in order of appearance) and ``relation`` is the last
        relation token seen, or ``None`` when there is none.

    Raises:
        AssertionError: if the number of numeric tokens is not exactly two.
        IndexError: if a numeric id is larger than ``len(persons)``.
    """
    people = []
    relation = None
    for word in tokens:
        # The two checks are independent; a later relation token overrides an earlier one.
        if word in rels:
            relation = word
        if word.isdigit():
            # NOTE: an id of 0 becomes index -1 and therefore maps to the last alias.
            people.append(persons[int(word) - 1])
    assert len(people) == 2
    if relation is None:
        # No relation word found: show the offending tokens and still return.
        print(tokens)
    return ' '.join(people), relation


In [38]:
def simplify(dataset):
    """Return a deep copy of ``dataset`` with facts and questions reduced to symbols.

    For every record in the copy:
      * ``record['facts']`` is replaced by a list of ``(pair, relation)`` tuples
        produced by ``get_knowledge`` on the whitespace-split fact text.
      * each question (a mutable sequence) has element 0 replaced by a fixed
        "How are X and Y related to each other ?" prompt, element 1 replaced by
        the alias pair extracted from the original element 1, and the relation
        appended as a new last element.

    The input ``dataset`` itself is left untouched.
    """
    simplified = copy.deepcopy(dataset)
    for record in simplified:
        record['facts'] = [get_knowledge(fact.split()) for fact in record['facts']]

        for qa in record['questions']:
            pair, relation = get_knowledge(qa[1].split())
            # First and last characters of the "X Y" pair string are the two aliases.
            first, second = pair[0], pair[-1]
            qa[0] = f"How are {first} and {second} related to each other ?"
            qa[1] = pair
            # Sanity check that the generated prompt has exactly ten tokens.
            assert len(qa[0].split()) == 10
            qa.append(relation)
    return simplified

In [39]:
# NOTE(review): `clutrr_4` is not assigned in any cell visible here (execution
# counts jump from 11 to 37) — presumably it was loaded in a cell that has since
# been removed; confirm and restore it so a fresh-kernel run works.
simple_clutrr_4 = simplify(clutrr_4)

In [40]:
# Write the simplified 4-hop set to its dev file.
write_jsonl(simple_clutrr_4, "data/clutrr_4_hop/dev.jsonl")

In [41]:
# NOTE(review): `clutrr_6` is not assigned in any cell visible here — presumably
# loaded in a removed cell; confirm and restore it so a fresh-kernel run works.
simple_clutrr_6 = simplify(clutrr_6)

In [42]:
# Write the simplified 6-hop set to its dev file.
write_jsonl(simple_clutrr_6, "data/clutrr_6_hop/dev.jsonl")