In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import json
import pandas as pd
import os.path as osp
from glob import glob
from data_utils import parse_round, gameid_order

# Extend the import search path with the parent directory so that the
# `configuration` import below can resolve (presumably the module lives one
# level up — the relative entry is resolved against the current working directory).
sys.path.append('../')
from configuration import Config

# Project settings object; later cells read `colorgrid_data` and `data_dir` from it.
config = Config()

In [3]:
# Data comes from this paper: https://www.aclweb.org/anthology/P19-1059/
# glob() yields paths in arbitrary, filesystem-dependent order; sorting them makes
# every order-sensitive step that consumes `files` deterministic across machines.
files = sorted(
    glob(osp.join(config.colorgrid_data, 'examples', 'games', 'json', 'colorGrids', 'raw', '*/*'))
)

In [4]:
# collect game data: maps each game id to the list of records stored in its file

all_games = dict()

for file in files:
    # every file is a game

    # JSON text is UTF-8; be explicit instead of relying on the platform's
    # default encoding (which differs between operating systems / locales).
    with open(file, "r", encoding="utf-8") as f:
        game = json.load(f)

    records = game["records"]
    gameid = game["gameid"]
    # tag each record with the id of the game it belongs to
    for record in records:
        record["gameid"] = gameid

    # NOTE: a later file with the same game id replaces the earlier entry
    all_games[gameid] = records

In [5]:
# collect, parse and filter rounds for all games

# store all rounds in flat list (game by game, keeping each game's record order)
all_rounds = [record for rounds in all_games.values() for record in rounds]

# reformat rounds
parsed_rounds = [parse_round(r) for r in all_rounds]

# filter out invalid rounds
valid_rounds = [
    p
    for p in parsed_rounds
    # identity test: `success` may legitimately be falsy, only a missing value (None) disqualifies
    if p["success"] is not None  # keep only if states included
    and len(p["utterances"]) > 0  # keep only if utterances included
]

print('rounds (total):', len(parsed_rounds))
print('valid: (valid):', len(valid_rounds))

rounds (total): 10961
valid: (valid): 10925


In [6]:
# build data frame (with rounds in separate rows)
rounds_df = pd.DataFrame(valid_rounds)

# to reproduce our order: games follow `gameid_order`, rounds within a game follow `roundNum`.
# `sort_values` calls `key` once for EVERY column listed in `by`, so the lookup has to be
# restricted to the 'gameid' column. Passing 'roundNum' through `gameid_order.get` as well
# would (presumably — round numbers are not game ids) map it to all-missing values and
# silently disable the secondary sort.
rounds_df = rounds_df.sort_values(
    by=['gameid', 'roundNum'],
    key=lambda column: column.map(gameid_order.get) if column.name == 'gameid' else column,
).reset_index(drop=True)

# unique identifier per round: '<gameid>_<roundNum>' (vectorized instead of a row-wise apply)
rounds_df['round_id'] = rounds_df['gameid'].astype(str) + '_' + rounds_df['roundNum'].astype(str)
rounds_df['n_utterances'] = rounds_df.utterances.map(len)  # number of utterances in this round
# consecutive game number in order of first appearance after sorting
game_idx = pd.factorize(rounds_df['gameid'])[0] + 1  # offset to match roundNum (starts with 1)
rounds_df['game_idx'] = game_idx

# reorder columns (columns not listed here are dropped)
rounds_df = rounds_df[[
    'gameid',
    'game_idx',
    'roundNum',
    'round_id',
    'condition',
    'success',
    'utterances',
    'n_utterances',
    'objs',
    'target',
    'speaker_order',
    'listener_order',
    'listener_clicked',
]]

In [7]:
# Persist the round table as a JSON array with one object per row,
# written into the data directory taken from the project configuration.
out_path = osp.join(
    config.data_dir,
    'color_grid_data.json',
)
rounds_df.to_json(path_or_buf=out_path, orient='records')