In [1]:
%autoreload 2

In [2]:
from ast import literal_eval
from collections import defaultdict, Counter
from contextlib import contextmanager
from itertools import combinations, chain
import hashlib
import json
import os
import shelve
from types import SimpleNamespace
import typing


import colorcet as cc
import krippendorff
from IPython.display import display, HTML, Markdown
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statannotations.Annotator import Annotator
import statsmodels.api as sm
import statsmodels.formula.api as smf
from tabulate import tabulate


# import arviz as az
# az.style.use("arviz-darkgrid")
# from cmdstanpy import CmdStanModel

from sklearn.model_selection import train_test_split


In [3]:
class ExtendedSimpleNamespace(SimpleNamespace):
    """A ``SimpleNamespace`` that also offers a small dict-like read interface.

    Attributes can be read with ``ns.name``, ``ns['name']`` or
    ``ns.get('name', default)``, and membership can be tested with
    ``'name' in ns``. Missing names raise ``AttributeError`` from
    ``ns['name']`` (not ``KeyError``), because lookup is attribute-based.
    """

    def __init__(self, **kwargs):
        # Keyword-only construction; each keyword becomes an attribute.
        super().__init__(**kwargs)

    def __contains__(self, key):
        # Attribute-based membership: methods such as ``keys`` also count.
        try:
            getattr(self, key)
        except AttributeError:
            return False
        return True

    def __getitem__(self, key):
        value = getattr(self, key)
        return value

    def get(self, key, default=None):
        """Return attribute ``key`` if present, otherwise ``default``."""
        return getattr(self, key, default)

    def keys(self):
        """List the instance attribute names that do not start with an underscore."""
        public_names = []
        for name in vars(self):
            if name.startswith("_"):
                continue
            public_names.append(name)
        return public_names
        

In [4]:
# Full game IDs (formatted as '<game_id tuple>-unmatched') of 30 unmatched games.
# Rows whose `full_game_id` is in this list are duplicated in `loaded_data_to_df`
# under the extra game type 'unmatched_top_30'.
# NOTE(review): the ranking that makes these the "top 30" is not shown in this
# notebook section -- confirm the selection criterion with the generating code.
TOP_30_UNMATCHED_FULL_GAME_IDS = [
    '(1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1)-unmatched',
    '(1, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0)-unmatched',
    '(1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0)-unmatched',
    '(1, 1, 4, 0, 0, 1, 1, 1, 0, 1, 0, 0)-unmatched',
    '(1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0)-unmatched',
    '(1, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1)-unmatched',
    '(1, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0)-unmatched',
    '(1, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 1)-unmatched',
    '(1, 1, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0)-unmatched',
    '(1, 0, 4, 0, 1, 1, 0, 1, 0, 1, 0, 0)-unmatched',
    '(1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0)-unmatched',
    '(1, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 4, 0, 2, 0, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0)-unmatched',
    '(1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, 0)-unmatched',
    '(1, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0)-unmatched',
    '(1, 1, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 3, 0, 0, 2, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0)-unmatched',
    '(1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0)-unmatched',
    '(1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0)-unmatched',
 ]

In [6]:
# File locations, all relative to the notebook's working directory.
# CURRENT_DATA_PATH: raw participant JSON read by `load_data`.
# FULL_GAME_ID_TO_FITNESS_PATH / FULL_GAME_ID_TO_NODE_COUNT_PATH: CSVs that are
#   left-merged onto the results on `full_game_id` in `loaded_data_to_df`.
# CURRENT_DF_PATH: where the flattened dataframe is written as CSV.
CURRENT_DATA_PATH = './human_evals_data/real-complete_only-main-pilot_02-02_data.json'
FULL_GAME_ID_TO_FITNESS_PATH = './human_evals_data/full_game_id_to_fitness.csv'
FULL_GAME_ID_TO_NODE_COUNT_PATH = './human_evals_data/node_counts.csv'
CURRENT_DF_PATH = './human_evals_data/current_participants_df.csv'

# Fields copied from every single-game result into a dataframe row.
GAME_KEYS = [
    'id', 'real', 'matched', 
    'confident', 'fun_play', 'fun_watch', 
    'capability', 'goldilocks', 'creativity',
    'human-likeness', 'explain', 'overall'
]

# Raw field name -> dataframe column name, for the fields that are renamed.
RENAMED_GAME_KEYS = {
    'id': 'game_id',
    'human-likeness': 'human_likeness'
}

# Score columns that receive `normalized_*` / `fully_normalized_*` variants
# and are summarized in the results table.
NUMBER_SCORE_ATTRIBUTES = [
    'confident', 'fun_play', 
    'fun_watch',  'capability', 
    'goldilocks', 'creativity',
    'human_likeness',
]

# Human-readable label for each score attribute (used as the row label in the
# summary table).
# NOTE(review): the 'capability' label looks truncated ("...learning to interact") --
# confirm the intended full question text.
NUMBER_SCORE_AXIS_NAMES = {
    'confident': 'How confident are you that you understand the game?',
    'fun_play': 'How fun would it be to play the game yourself?',
    'fun_watch': 'How fun would it be to watch someone else play the game?',
    'capability': 'How helpful would it be for learning to interact',
    'goldilocks': 'Too easy, appropriately difficult, or too hard for you?',
    'creativity': 'How creatively designed is the game?',
    'human_likeness': 'How human-like do you think this game is?',
}


def load_data(data_path: str) -> typing.Tuple[typing.List[ExtendedSimpleNamespace], typing.List[ExtendedSimpleNamespace]]:
    """Read the participant JSON file and split participants by recruitment service.

    Every JSON object is decoded into an ``ExtendedSimpleNamespace`` so nested
    fields can be read as attributes.

    Returns:
        A ``(prolific, non_prolific)`` pair of lists. A participant goes into
        the first list when ``participant.data.recruitment_service`` equals
        ``'prolific'`` and into the second otherwise; file order is preserved
        within each list.
    """
    def _to_namespace(decoded: dict) -> ExtendedSimpleNamespace:
        return ExtendedSimpleNamespace(**decoded)

    with open(data_path, 'r') as data_file:
        participants = json.load(data_file, object_hook=_to_namespace)

    prolific, others = [], []
    for participant in participants:
        bucket = prolific if participant.data.recruitment_service == 'prolific' else others
        bucket.append(participant)

    return prolific, others


def real_matched_to_str(series: pd.Series) -> str:
    """Map a row's ``real`` / ``matched`` flags to a game-type label.

    ``real`` takes precedence: a truthy ``real`` yields ``'real'`` without
    looking at ``matched``. Otherwise the result is ``'matched'`` or
    ``'unmatched'`` depending on ``matched``.
    """
    return 'real' if series.real else ('matched' if series.matched else 'unmatched')


def full_game_id_to_str(series: pd.Series) -> str:
    """Build the full game ID for a row as ``'<game_id>-<game_type>'``."""
    return "{}-{}".format(series.game_id, series.game_type)


def loaded_data_to_df(loaded_data: typing.List[ExtendedSimpleNamespace]) -> pd.DataFrame:
    """Flatten participant records into one dataframe row per rated game and add derived columns.

    Steps, in order:
      1. One row per entry of ``participant.data.single_game_results``, holding
         the participant id and the ``GAME_KEYS`` fields (renamed through
         ``RENAMED_GAME_KEYS``).
      2. Derived ``game_type``, ``full_game_id`` and ``num_preferences`` columns.
      3. Left-merge of the node-count CSV on ``full_game_id``.
      4. Per-participant z-scores (``normalized_<attr>``) and whole-dataframe
         z-scores (``fully_normalized_<attr>``) for ``NUMBER_SCORE_ATTRIBUTES``.
      5. Rows listed in ``TOP_30_UNMATCHED_FULL_GAME_IDS`` are appended a second
         time with ``game_type == 'unmatched_top_30'``.
      6. Left-merge of the fitness CSV and z-scoring of ``fitness`` and
         ``archive_distance``.

    Reads ``FULL_GAME_ID_TO_NODE_COUNT_PATH`` and ``FULL_GAME_ID_TO_FITNESS_PATH``
    from disk.

    Args:
        loaded_data: participant records exposing ``.id`` and
            ``.data.single_game_results``.

    Returns:
        The assembled dataframe. Note that top-30 unmatched games appear twice
        (once as 'unmatched', once as 'unmatched_top_30').
    """
    rows = []

    for participant in loaded_data:
        participant_id = participant.id

        for game_result in participant.data.single_game_results:
            rows.append({
                'participant_id': participant_id,
                **{RENAMED_GAME_KEYS.get(key, key): game_result[key] for key in GAME_KEYS}
            })
    
    df = pd.DataFrame(rows)
    df = df.assign(game_type=df.apply(real_matched_to_str, axis=1))
    # `normalized_*` columns are created as 0.0 placeholders and filled in below.
    # `num_preferences` is the third element of the tuple parsed from the game_id string.
    df = df.assign(full_game_id=df.apply(full_game_id_to_str, axis=1), **{f'normalized_{attribute}': 0.0 for attribute in NUMBER_SCORE_ATTRIBUTES},
                   num_preferences=[literal_eval(x)[2] for x in df.game_id])
    
    full_game_id_to_node_count_df = pd.read_csv(FULL_GAME_ID_TO_NODE_COUNT_PATH)
    # Columns of the CSV that collide with existing ones get the '_y' suffix and are dropped,
    # so the dataframe's own version of such columns wins.
    df = pd.merge(df, full_game_id_to_node_count_df, on='full_game_id', how='left', suffixes=('', '_y'))
    df = df.drop(columns=[c for c in df.columns if c.endswith('_y')])

    participant_ids = df.participant_id.unique()

    # Z-score each attribute within each participant's own ratings.
    for participant_id in participant_ids:
        for attribute in NUMBER_SCORE_ATTRIBUTES:
            participant_attr_mean = df[df.participant_id == participant_id][attribute].mean()
            participant_attr_std = df[df.participant_id == participant_id][attribute].std()

            # A participant who gave the same rating everywhere has zero spread;
            # store 0 rather than dividing by zero.
            if participant_attr_std == 0:
                df.loc[df.participant_id == participant_id, f'normalized_{attribute}'] = 0
            else:
                df.loc[df.participant_id == participant_id, f'normalized_{attribute}'] = (df.loc[df.participant_id == participant_id, attribute] - participant_attr_mean) / participant_attr_std


    # Z-score each attribute over all rows (computed before the top-30 rows are duplicated).
    df = df.assign(**{f'fully_normalized_{attribute}': (df[attribute] - df[attribute].mean()) / df[attribute].std() for attribute in NUMBER_SCORE_ATTRIBUTES})

    # Duplicate the top-30 unmatched rows under their own game type; their
    # full_game_id is rebuilt so it ends in '-unmatched_top_30'.
    top_30_df = df[df.full_game_id.isin(TOP_30_UNMATCHED_FULL_GAME_IDS)].copy()
    top_30_df.game_type = 'unmatched_top_30'
    top_30_df = top_30_df.assign(full_game_id=top_30_df.apply(full_game_id_to_str, axis=1))

    df = pd.concat([df, top_30_df])
    df = df.assign(
        participant_id=pd.Categorical(df.participant_id, categories=sorted(df.participant_id.unique()), ordered=True),
        game_type=pd.Categorical(df.game_type, categories=['real', 'matched', 'unmatched', 'unmatched_top_30'], ordered=True),
    )
    
    full_game_id_to_fitness_df = pd.read_csv(FULL_GAME_ID_TO_FITNESS_PATH)
    df = pd.merge(df, full_game_id_to_fitness_df, on='full_game_id', how='left')
    # Expects `fitness` and `archive_distance` columns to exist after the merges.
    # These z-scores are taken over all rows, i.e. including the duplicated top-30 rows.
    df = df.assign(
        normalized_fitness=(df.fitness - df.fitness.mean()) / (df.fitness.std()),
        normalized_archive_distance=(df.archive_distance - df.archive_distance.mean()) / (df.archive_distance.std()),
        full_game_id=pd.Categorical(df.full_game_id, categories=sorted(df.full_game_id.unique()), ordered=True),
    )

    return df


def load_data_to_df(data_path: str, prolific_only: bool = True):
    """Load participants from ``data_path`` and build their results dataframe.

    Args:
        data_path: path of the participant JSON file.
        prolific_only: when True (default) only Prolific participants are kept;
            when False the non-Prolific participants are appended after them.

    Returns:
        ``(participants, df)``: the retained participant records and the
        dataframe produced from them by ``loaded_data_to_df``. Participants
        whose ``data`` has a falsy ``quiz_passed`` are dropped; a missing
        ``quiz_passed`` counts as passed.
    """
    participants, other_participants = load_data(data_path)
    if not prolific_only:
        participants += other_participants

    passed_quiz = [
        participant
        for participant in participants
        if participant.data.get('quiz_passed', True)
    ]

    return passed_quiz, loaded_data_to_df(passed_quiz)


# Load the participants (Prolific only, per the `prolific_only=True` default)
# and flatten their ratings into a dataframe.
current_participants_raw_data, current_participants_df = load_data_to_df(CURRENT_DATA_PATH)
print(f'Loaded current participant data with shape {current_participants_df.shape} from {CURRENT_DATA_PATH}')


print(f'Dataframe shape: {current_participants_df.shape}')
# Save the flattened dataframe as CSV (without the index column).
current_participants_df.to_csv(CURRENT_DF_PATH, index=False)
# Last expression of the cell: rendered as the cell's output preview.
current_participants_df.head()


Loaded current participant data with shape (1292, 35) from ./human_evals_data/real-complete_only-main-pilot_02-02_data.json
Dataframe shape: (1292, 35)


Unnamed: 0,participant_id,game_id,real,matched,confident,fun_play,fun_watch,capability,goldilocks,creativity,...,fully_normalized_fun_play,fully_normalized_fun_watch,fully_normalized_capability,fully_normalized_goldilocks,fully_normalized_creativity,fully_normalized_human_likeness,fitness,archive_distance,normalized_fitness,normalized_archive_distance
0,00c9bf44-28f3-469a-8a71-ea972af61bab-p102,"(1, 1, 4, 0, 2, 0, 0, 0, 1, 0, 0, 0)",False,False,4,3,3,5,3,2,...,0.636636,0.694003,1.696015,0.32443,-0.21321,-0.387244,36.53178,4,-0.094034,1.934673
1,00c9bf44-28f3-469a-8a71-ea972af61bab-p102,"(1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0)",False,False,4,3,3,4,3,3,...,0.636636,0.694003,0.878293,0.32443,0.737669,-0.387244,36.065792,2,-0.776234,0.56428
2,00c9bf44-28f3-469a-8a71-ea972af61bab-p102,"(1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0)",True,True,4,2,2,5,4,3,...,-0.250393,-0.179182,1.696015,1.277683,0.737669,-0.387244,36.065643,0,-0.776452,-0.806114
3,00c9bf44-28f3-469a-8a71-ea972af61bab-p102,"(1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0)",False,True,5,4,4,3,2,2,...,1.523665,1.567189,0.060572,-0.628822,-0.21321,-0.387244,37.139469,0,0.795615,-0.806114
4,00c9bf44-28f3-469a-8a71-ea972af61bab-p102,"(1, 1, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0)",False,False,1,1,1,1,5,1,...,-1.137421,-1.052368,-1.574871,2.230935,-1.164089,0.43761,35.682575,3,-1.337259,1.249477


In [7]:
def add_linebreaks(string: str, n_lines: int = 2, min_break_length: int = 36):
    """Split ``string`` into up to ``n_lines`` lines of roughly equal length.

    Breaks are placed only at spaces: for each of the ``n_lines - 1`` target
    positions (``k * len(string) / n_lines``), the first space at or after
    the target replaces a newline. Fewer lines are produced when no suitable
    space remains.

    Args:
        string: text to wrap.
        n_lines: maximum number of lines in the result.
        min_break_length: strings of this length or shorter are returned unchanged.

    Returns:
        The text with the chosen spaces replaced by ``'\\n'``.
    """
    if len(string) <= min_break_length:
        return string

    prev_break_index = 0
    line_parts = []

    for line_number in range(1, n_lines):
        target_index = line_number * len(string) // n_lines
        # Never search before the previous break: otherwise a long word that
        # spans two targets makes both searches hit the same space and an
        # empty line is emitted.
        break_index = string.find(' ', max(target_index, prev_break_index))
        if break_index == -1:
            break

        line_parts.append(string[prev_break_index:break_index])
        prev_break_index = break_index + 1

    line_parts.append(string[prev_break_index:])
    return '\n'.join(line_parts)


# Human evaluation result summary

Table SI-2, from which Table 1 is derived.

In [9]:
rows = []
UNMATCHED_TOP_30 = 'unmatched_top_30'
REAL = 'real'
MATCHED = 'matched'
AST = '*'
# When True, summarize the per-participant `normalized_<attribute>` columns
# instead of the raw scores.
USE_NORMALIZED = False
# When True, very small p-values are reported as bounds (P < 1e-5 / P < 1e-3).
TRUNCATE_P_VALUES = False
RESULT_COLUMN_TEXT_SIZE = 'footnotesize'  # not used by the active formatting below

category_order_for_means = [REAL, MATCHED, UNMATCHED_TOP_30]

for attribute in NUMBER_SCORE_ATTRIBUTES:
    # The axis names are keyed by the raw attribute, so look the label up
    # before choosing which dataframe column to read. (Rewriting `attribute`
    # first raised a KeyError whenever USE_NORMALIZED was True.)
    attribute_name = NUMBER_SCORE_AXIS_NAMES[attribute]
    score_column = f'normalized_{attribute}' if USE_NORMALIZED else attribute

    # Select each category's scores once; reused for the means and the tests.
    scores_by_category = {
        category: current_participants_df.loc[current_participants_df.game_type == category, score_column]
        for category in category_order_for_means
    }

    row = [attribute_name]

    # Mean +/- standard error per game type.
    for category in category_order_for_means:
        curr = scores_by_category[category]
        row.append(f'${curr.mean():.3f} \\pm {curr.sem():.3f}$')

    # Pairwise Mann-Whitney U tests, in the order
    # (real, matched), (real, unmatched_top_30), (matched, unmatched_top_30).
    for first, second in combinations(category_order_for_means, 2):
        result = stats.mannwhitneyu(scores_by_category[first], scores_by_category[second])
        statistic, p_value = result.statistic, result.pvalue

        # One star each for p < 0.05, p < 0.01 and p < 0.001, as a LaTeX superscript.
        stars = AST * int(p_value < 0.05) + AST * int(p_value < 0.01) + AST * int(p_value < 0.001)
        if stars:
            stars = f'^{{ {stars} }}'

        if TRUNCATE_P_VALUES and p_value < 1e-5:
            p_value_text = r'$P < \num{1e-5}' + stars + '$'
        elif TRUNCATE_P_VALUES and p_value < 1e-3:
            p_value_text = r'$P < \num{1e-3}' + stars + '$'
        elif p_value < 1e-2:
            # Scientific notation (via \num) for small p-values.
            p_value_text = f'$P = \\num{{ {p_value:.3e} }}{stars}$'
        else:
            p_value_text = f'$P = {p_value:.3f}{stars}$'

        row.append(f'{statistic:.1f}, {p_value_text}')

    rows.append(row)


# The 'Unmatched' column holds the 'unmatched_top_30' game type; the three test
# columns follow the pair order listed above.
HEADERS = [
    'Attribute', 'Real', 'Matched', 'Unmatched',
    'U-stat, p-value', 'U-stat, p-value', 'U-stat, p-value'
]

tabulated = tabulate(rows, headers=HEADERS, floatfmt='.3f', tablefmt='latex_raw')
print(tabulated)
    

\begin{tabular}{lllllll}
\hline
 Attribute                                                & Real              & Matched           & Unmatched         & U-stat, p-value      & U-stat, p-value                          & U-stat, p-value                          \\
\hline
 How confident are you that you understand the game?      & $3.943 \pm 0.068$ & $3.923 \pm 0.070$ & $3.331 \pm 0.075$ & 45088.0, $P = 0.906$ & 55921.5, $P = \num{ 1.718e-09 }^{ *** }$ & 55846.0, $P = \num{ 3.733e-09 }^{ *** }$ \\
 How fun would it be to play the game yourself?           & $2.522 \pm 0.066$ & $2.430 \pm 0.064$ & $2.068 \pm 0.062$ & 46752.5, $P = 0.352$ & 54040.5, $P = \num{ 3.235e-07 }^{ *** }$ & 52539.5, $P = \num{ 1.826e-05 }^{ *** }$ \\
 How fun would it be to watch someone else play the game? & $2.385 \pm 0.068$ & $2.313 \pm 0.066$ & $2.024 \pm 0.064$ & 46169.0, $P = 0.519$ & 51636.5, $P = \num{ 8.793e-05 }^{ *** }$ & 50515.0, $P = \num{ 1.027e-03 }^{ ** }$  \\
 How helpful would it be for learning to 