In [None]:
import numpy as np
import pandas as pd
import random
import math
import os
import json
from pb_processor import Log

In [None]:
def load_logs(dir_path):
    """Load every completed game log stored as JSON below ``dir_path``.

    The directory tree is searched recursively for files whose name ends in
    ``.json``. Each file is parsed and wrapped in a ``Log``; logs whose
    ``complete`` attribute is falsy are discarded.

    Args:
        dir_path: Root directory to search.

    Returns:
        dict mapping ``log.game_id`` to its ``Log``. Entries are inserted in
        sorted file-path order, so the iteration order of the result does not
        depend on the filesystem. If several files share a ``game_id``, the
        one with the greatest path wins.
    """
    print('>> Loading logs from "{}"'.format(dir_path))

    # Walk the tree once. os.walk yields entries in a filesystem-dependent
    # order, so sort the paths to make the resulting dict (and anything that
    # iterates over it) reproducible across machines.
    json_paths = sorted(
        os.path.join(root, name)
        for root, _, names in os.walk(dir_path)
        for name in names
        if name.endswith('.json')
    )
    print('{} files found.'.format(len(json_paths)))

    logs = {}
    for path in json_paths:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(path, 'r', encoding='utf-8') as logfile:
            log = Log(json.load(logfile))
        if log.complete:
            logs[log.game_id] = log

    print('DONE. Loaded {} completed game logs.'.format(len(logs)))
    return logs

In [None]:
# Read all completed game logs from the local PhotoBook download;
# `logs` maps each game id to its Log object.
logs = load_logs('downloaded_data/photobook/logs/')

In [None]:
# Deterministic split: order the game ids, then cut after the first
# ceil(70%) of them. No shuffling is applied.
_ids = sorted(logs)  # iterating the dict yields its keys

_n_train = math.ceil(len(_ids) / 100 * 70)
train, val = np.split(np.array(_ids), [_n_train])

# Realised share of games in each split, as (train, val).
len(train) / len(_ids), len(val) / len(_ids)

In [None]:
# Column order of the exported CSV files.
_UTTERANCE_COLUMNS = [
    'dialogue_id', 'speaker', 'position_in_dialogue', 'position_in_round', 'round_number', 'text',
    'game_score', 'round_score', 'game_duration', 'round_duration',
]


def _build_utterance_frame(logs, game_ids):
    """Flatten the selected games into one row per non-empty text message.

    Args:
        logs: Mapping of game id to log object. Each log is read through its
            ``rounds``, ``total_score`` and ``duration`` attributes; each round
            through ``messages``, ``total_score`` and ``duration``; each
            message through ``type``, ``text`` and ``speaker``.
        game_ids: Array-like of the game ids to include.

    Returns:
        pandas.DataFrame with the columns listed in ``_UTTERANCE_COLUMNS``.
        Games appear in the iteration order of ``logs``. Both position columns
        are 1-based and count only the messages that are kept (type ``'text'``
        with non-blank content); ``round_number`` is the 1-based index of the
        round within its game.
    """
    # Set membership instead of `x in ndarray`, which scans the whole array
    # for every game. tolist() converts NumPy scalars to plain Python values.
    wanted = set(np.asarray(game_ids).tolist())

    rows = []
    for game_id, game in logs.items():
        if game_id not in wanted:
            continue

        position_in_dialogue = 1
        # Named `game_round` so the builtin round() is not shadowed.
        for round_nr, game_round in enumerate(game.rounds, start=1):
            position_in_round = 1
            for msg in game_round.messages:
                if msg.type != 'text':
                    continue
                text = msg.text.strip()
                if not text:
                    continue

                rows.append((
                    game_id, msg.speaker, position_in_dialogue, position_in_round, round_nr, text,
                    game.total_score, game_round.total_score, game.duration, game_round.duration,
                ))
                position_in_dialogue += 1
                position_in_round += 1

    return pd.DataFrame(rows, columns=_UTTERANCE_COLUMNS)


# Write one CSV per split: the `val` ids become "analysis", the `train` ids "train".
for IDS, SPLIT in [(val, 'analysis'), (train, 'train')]:
    dataframe = _build_utterance_frame(logs, IDS)
    dataframe.to_csv('downloaded_data/photobook/{}.csv'.format(SPLIT), index=False)

In [None]:
# Number of games in each split, shown as (val, train).
len(val), len(train)