In [119]:
import numpy as np
import pickle
import json
import os
import pandas as pd
from tqdm.notebook import tqdm
from processor import Log
from collections import defaultdict

In [61]:
def load_logs(log_repository, data_path):
    """Load every completed game log stored as JSON under a directory tree.

    Args:
        log_repository: Directory name (relative to ``data_path``) to scan.
        data_path: Base directory that ``log_repository`` is joined onto.

    Returns:
        A list of ``Log`` objects, one per ``.json`` file whose parsed log
        reports ``complete`` as truthy. Files without a ``.json`` suffix are
        ignored.

    Note:
        The "files found" message counts *all* files under the directory,
        not only the ``.json`` ones.
    """
    filepath = os.path.join(data_path, log_repository)
    print("Loading logs from {}...".format(filepath))

    # First pass: report how many files live under the directory tree.
    file_count = sum(len(files) for _, _, files in os.walk(filepath))
    print("{} files found.".format(file_count))

    logs = []
    for root, _, files in os.walk(filepath):
        for file in files:
            if not file.endswith(".json"):
                continue
            # JSON text is UTF-8; do not depend on the platform default.
            with open(os.path.join(root, file), 'r', encoding='utf-8') as logfile:
                log = Log(json.load(logfile))
            if log.complete:
                logs.append(log)

    print("DONE. Loaded {} completed game logs.".format(len(logs)))
    return logs


def collect_dataset(logs):
    """Flatten game logs into a DataFrame with one row per message.

    Each row repeats the fields of the game and of the round that the
    message belongs to, followed by the message's own fields.

    Args:
        logs: Iterable of log objects exposing the game-level attributes read
            below, plus ``rounds``; each round exposes ``messages``.

    Returns:
        A ``pandas.DataFrame`` with 25 columns (game, round and message
        fields). ``Round_Nr`` holds ``round_nr - 1``.
    """
    columns = [
        "Game_ID", "Game_Domain_ID", "Game_Domain_1", "Game_Domain_2",
        "Game_Duration", "Game_Score",
        "Feedback_A", "Feedback_B", 'Agent_1', "Agent_2",
        "Round_Nr", "Round_Duration", "Round_Scores",
        "Round_Images_A", "Round_Images_B",
        "Round_Common", "Round_Highlighted_A", "Round_Highlighted_B",
        "Message_Nr", "Message_Timestamp", "Message_Turn", "Message_Agent_ID",
        "Message_Speaker", "Message_Type", "Message_Text",
    ]

    def _iter_rows():
        # Yield one flat record per message, prefixed by its game and round fields.
        for game in logs:
            game_part = [
                game.game_id,
                game.domain_id,
                game.domains[0],
                game.domains[1],
                game.duration.total_seconds(),
                game.total_score,
                game.feedback["A"],
                game.feedback["B"],
                game.agent_ids[0],
                game.agent_ids[1],
            ]
            for rnd in game.rounds:
                round_part = [
                    rnd.round_nr - 1,
                    rnd.duration.total_seconds(),
                    rnd.total_score,
                    rnd.images["A"],
                    rnd.images["B"],
                    rnd.common,
                    rnd.highlighted["A"],
                    rnd.highlighted["B"],
                ]
                for msg in rnd.messages:
                    message_part = [
                        msg.message_id,
                        msg.timestamp,
                        msg.turn,
                        msg.agent_id,
                        msg.speaker,
                        msg.type,
                        msg.text,
                    ]
                    yield game_part + round_part + message_part

    return pd.DataFrame(list(_iter_rows()), columns=columns)


def remove_prefixing_zeros(s):
    """Strip leading ``'0'`` characters from a string of digits.

    Args:
        s: The string to strip.

    Returns:
        ``s`` without its leading zeros. A non-empty string made up only of
        zeros collapses to ``"0"`` so the result still parses as a number;
        an empty string is returned unchanged.
    """
    # Compare against the character '0': the integer 0 never equals a
    # one-character string, so such a check would strip nothing.
    stripped = s.lstrip("0")
    if stripped or not s:
        return stripped
    return "0"


def path_to_id(path):
    """Extract the numeric image id from an image path.

    The id is the text after the last underscore of the file name, with the
    extension removed, e.g.
    ``'person_couch/COCO_train2014_000000388344.jpg'`` -> ``388344``.

    Args:
        path: ``'/'``-separated path. Any number of leading directories
            (including none) is accepted; only the final component is used.

    Returns:
        The id as an ``int``.

    Raises:
        ValueError: If the extracted text is not a decimal number.
    """
    filename = path.rsplit('/', 1)[-1]
    stem = filename.split('.')[0]
    id_str = stem.split('_')[-1]
    # int() parses zero-padded decimal strings directly, so no manual
    # stripping of leading zeros is needed.
    return int(id_str)


---

In [28]:
# Load the attribute annotations (A) and relationship annotations (R).
with open('visual_genome/attributes.json', 'r') as attributes_file:
    A = json.load(attributes_file)

with open('visual_genome/relationships.json', 'r') as relationships_file:
    R = json.load(relationships_file)


In [79]:
# Parse the completed game logs and flatten them into one row per message.
logs = load_logs('logs', '')
dataset = collect_dataset(logs)

# Dataset columns that hold the images shown in a round, and the position
# of each column name in that list.
F = ['Round_Images_A', 'Round_Images_B']
f2i = dict(zip(F, range(len(F))))

Loading logs from logs...
2502 files found.
DONE. Loaded 2502 completed game logs.


In [93]:
# Gather every distinct image referenced in the image columns of the dataset.
unique_pb_images = set()

for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
    for field in F:
        unique_pb_images.update(row[field])

# Materialise as a list; the order of a set is arbitrary.
all_pb_images = list(unique_pb_images)


HBox(children=(IntProgress(value=0, max=294370), HTML(value='')))




['person_couch/COCO_train2014_000000388344.jpg',
 'person_bench/COCO_train2014_000000449919.jpg',
 'person_bench/COCO_train2014_000000015354.jpg',
 'couch_laptop/COCO_train2014_000000297632.jpg',
 'person_elephant/COCO_train2014_000000065220.jpg',
 'person_refrigerator/COCO_train2014_000000005373.jpg',
 'car_motorcycle/COCO_train2014_000000480807.jpg',
 'person_motorcycle/COCO_train2014_000000226176.jpg',
 'person_umbrella/COCO_train2014_000000568064.jpg',
 'person_elephant/COCO_train2014_000000571351.jpg',
 'person_oven/COCO_train2014_000000136200.jpg',
 'person_umbrella/COCO_train2014_000000205931.jpg',
 'dining_table_refrigerator/COCO_train2014_000000130011.jpg',
 'person_tv/COCO_train2014_000000544994.jpg',
 'person_tv/COCO_train2014_000000420617.jpg',
 'person_bicycle/COCO_train2014_000000326202.jpg',
 'person_bed/COCO_train2014_000000528563.jpg',
 'person_truck/COCO_train2014_000000087219.jpg',
 'couch_dining_table/COCO_train2014_000000180606.jpg',
 'person_cake/COCO_train2014_00

In [100]:
# Map the numeric id parsed from each image path back to that path
# (a later path wins if two paths share an id).
all_pb_image_ids = dict(zip(map(path_to_id, all_pb_images), all_pb_images))

In [101]:
# Build two-way lookups between COCO ids and 'image_id' values, restricted
# to the images that occur in the dataset.
with open('visual_genome/image_data.json', 'r') as image_data_file:
    idata = json.load(image_data_file)

coco2imageid = {}
image2cocoid = {}

for record in idata:
    coco_id = record['coco_id']
    # Skip records without a COCO id and records for images we never use.
    if not coco_id or coco_id not in all_pb_image_ids:
        continue
    vg_image_id = record['image_id']
    coco2imageid[coco_id] = vg_image_id
    image2cocoid[vg_image_id] = coco_id

In [99]:
# One entry per dataset image, in the order of all_pb_images: the mapped
# 'image_id', or None when the image has no entry in coco2imageid.
relevant_ids = []
for path in all_pb_images:
    relevant_ids.append(coco2imageid.get(path_to_id(path)))
                                        

In [126]:
# For every dataset image, collect a bag of lower-cased words: the tokens of
# each annotated entity name plus each entity's attribute strings.
attributes = defaultdict(set)

# relevant_ids is a list that also holds None placeholders; a None-free set
# gives constant-time membership tests instead of a linear scan per entry.
relevant_id_set = {image_id for image_id in relevant_ids if image_id is not None}

for entry in A:
    if entry['image_id'] not in relevant_id_set:
        continue

    coco_id = image2cocoid[entry['image_id']]
    im_path = all_pb_image_ids[coco_id]

    for att in entry['attributes']:
        # Not every annotated entity carries an 'attributes' list.
        # Attribute strings are kept whole (not split into tokens).
        entity_attributes = {a.lower() for a in att.get('attributes', ())}

        # Entity names may be multi-word phrases; split them on spaces.
        entity_names = {
            tok.lower()
            for phrase in att['names']
            for tok in phrase.split(' ')
        }

        attributes[im_path] |= (entity_names | entity_attributes)

In [140]:
# For every dataset image, collect a bag of lower-cased tokens taken from the
# predicate, subject name and object name of each annotated relationship.
relations = defaultdict(set)

# relevant_ids is a list that also holds None placeholders; a None-free set
# gives constant-time membership tests instead of a linear scan per entry.
relevant_id_set = {image_id for image_id in relevant_ids if image_id is not None}

for entry in R:
    if entry['image_id'] not in relevant_id_set:
        continue

    coco_id = image2cocoid[entry['image_id']]
    im_path = all_pb_image_ids[coco_id]

    for rel in entry['relationships']:
        predicate_tokens = rel['predicate'].lower().split(' ')
        subject_tokens = rel['subject']['name'].lower().split(' ')
        # Named *_tokens so the builtin `object` is not shadowed.
        object_tokens = rel['object']['name'].lower().split(' ')

        relations[im_path].update(predicate_tokens, subject_tokens, object_tokens)

In [145]:
# Persist both per-image word collections as pickles.
with open('visual_genome/attributes.dict', 'wb') as attributes_out:
    pickle.dump(attributes, attributes_out)

with open('visual_genome/relationships.dict', 'wb') as relations_out:
    pickle.dump(relations, relations_out)

In [148]:
# Read the pickled attributes mapping back from disk.
# (Only unpickle files you produced yourself: unpickling can execute code.)
with open('visual_genome/attributes.dict', 'rb') as attributes_in:
    vg_attributes = pickle.loads(attributes_in.read())

In [144]:
# Display the token set collected for one image as a spot check.
# NOTE: `relations` is a defaultdict(set), so looking up a path that is not
# present inserts an empty set for it as a side effect.
relations['person_train/COCO_train2014_000000010275.jpg']

{'and',
 'barrette',
 'beige',
 'black',
 'both',
 'brick',
 'broken',
 'brown',
 'camel',
 'chin',
 'coat',
 'glasses',
 'grey',
 'hair',
 'hand',
 'has',
 'head',
 'holding',
 'in',
 'jacket',
 'lady',
 'looks',
 'man',
 'no',
 'nt',
 'of',
 'on',
 'open',
 'out',
 'outside',
 'people',
 'person',
 'pony',
 'rain',
 'red',
 'says',
 'sign',
 'sleeping',
 'smoking',
 'sticker',
 'striped',
 'stripes',
 'tail',
 'tan',
 'umbrella',
 'umbrellas',
 'wall',
 'wears',
 'white',
 'window',
 'winter',
 'with',
 'woman',
 'working'}