In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import json
from collections import defaultdict, Counter, namedtuple
from itertools import combinations
from tabulate import tabulate
import sys
import os
import json
import editdistance

# NOTE(review): absolute, machine-specific path -- on any other machine this must be edited
# before the project-local `schema` package imported below can be found.
PROJECT_PATH = '/Users/guydavidson/projects/game-generation-modeling'
# Make packages under the project root importable from this notebook.
sys.path.append(PROJECT_PATH)

## Load and munge data

In [2]:
from schema.validate_schema import load_and_validate_game_schema
# Both paths are relative to the notebook's working directory.
# Passed as the second argument to load_and_validate_game_schema.
SCHEMA_FILE = '../schema/game_schema_with_refs.json'
# Passed as the first argument to load_and_validate_game_schema.
GAME_SCHEMAS_FILE = '../schema/interactive_beta.json'

In [3]:
def schema_to_df_row(game_schema):
    """Flatten one game schema dict into a single row mapping for a DataFrame.

    The row starts from the schema's 'metadata' entries, then gains:
      * 'game_name', copied from the row's 'prolific_id' entry;
      * 'is_throwing' / 'is_building' / 'is_organizing' flags (0 or 1);
      * 'throwing_*', 'building_*' and 'organizing*' fields for the sections
        that are present in the schema;
      * 'objects_with_predicates', gathered from the 'to'/'from'/'on' entries
        of all sections.

    Returns a defaultdict(list), so keys that were never set read as [].
    """
    row = defaultdict(list)
    row.update(game_schema['metadata'])

    row['game_name'] = row['prolific_id']
    for section in ('throwing', 'building', 'organizing'):
        row[f'is_{section}'] = 0

    collected = []

    if 'throwing' in game_schema:
        throwing = game_schema['throwing']
        row['is_throwing'] = 1
        row['throwing_objects'] = throwing['what']
        row['throwing_goal'] = throwing['goal']

        # Each of these optional entries is a sequence, hence extend().
        for relation in ('to', 'from', 'on'):
            if relation not in throwing:
                continue

            row[f'throwing_{relation}'] = throwing[relation]
            collected.extend(throwing[relation])

    if 'building' in game_schema:
        building = game_schema['building']
        row['is_building'] = 1
        row['building_objects'] = building['objects']
        row['building_goal'] = building['goal']
        # Optional fields default to None when absent.
        row['building_structure'] = building.get('structure')
        row['building_order'] = building.get('order')

        # A building 'on' entry is added as one item, hence append().
        if 'on' in building:
            collected.append(building['on'])

    if 'organizing' in game_schema:
        organizing = game_schema['organizing']
        row['is_organizing'] = 1
        row['organizing'] = organizing
        for sub_game in organizing:
            # 'organizing_objects' is created on first access by the defaultdict.
            row['organizing_objects'].extend(sub_game['what'])
            if 'from' in sub_game:
                collected.append(sub_game['from'])
            collected.extend(sub_game['to'])

    row['objects_with_predicates'] = collected

    return row


In [27]:
# Ad-hoc check: the distinct prolific_id values that occur more than once in firestore_stats_df.
# NOTE(review): firestore_stats_df is created in a later cell, so on a fresh kernel this cell
# only works after that cell has run.
set(firestore_stats_df.prolific_id[firestore_stats_df.prolific_id.duplicated()])

{'5f9aba6600cdf11f1c9b915c',
 '60306cf6330619ee41fa3cd2',
 '6103ec2bf88328284fd894bc',
 '61093eae2bc2e47e6f26c7d7'}

In [25]:
# firestore_stats_df.prolific_id[firestore_stats_df.prolific_id.duplicated()]

# Ad-hoc check: how many firestore rows carry this particular prolific_id.
# NOTE(review): like the cell above, this relies on firestore_stats_df from a later cell.
(firestore_stats_df.prolific_id == '60306cf6330619ee41fa3cd2').sum()

2

In [21]:
# Per-game DSL statistics; the CSV's 'Index' column becomes the DataFrame index.
raw_stats_df = pd.read_csv('../data/dsl_statistics_interactive.csv', index_col='Index')
# Sanity check: number of repeated game_name values (total minus unique).
print(raw_stats_df.game_name.size - raw_stats_df.game_name.nunique())
print(raw_stats_df.shape)

# Load the game schemas (the helper's name suggests it also validates them against SCHEMA_FILE)
# and flatten each one into a row with schema_to_df_row.
game_schemas = load_and_validate_game_schema(GAME_SCHEMAS_FILE, SCHEMA_FILE)
game_schema_rows = [schema_to_df_row(game_schema) for game_schema in game_schemas]

schema_df = pd.DataFrame(game_schema_rows)
# Same repeated-value check, this time on prolific_id.
print(schema_df.prolific_id.size - schema_df.prolific_id.nunique())
print(schema_df.shape)
# manual_df = pd.read_csv('../data/manual_dsl_statistics.csv')
# Join the DSL statistics with the schema rows on game_name (pandas' default inner join).
stats_df = raw_stats_df.merge(schema_df, on='game_name', copy=True)

# Participant responses; presumably exported from the Firestore DB (inferred from the file name).
firestore_stats_df = pd.read_csv('../data/interactive_beta_firestore_statistics.csv')
# Normalize column prefixes: 'game_*' and 'gameScore_*' columns both become 'raw_game_*'.
# str.replace substitutes every occurrence of the prefix text, not only the leading one.
firestore_stats_df = firestore_stats_df.rename(columns={key: key.replace('game_', 'raw_game_') for key in firestore_stats_df.columns if key.startswith('game_')})
firestore_stats_df = firestore_stats_df.rename(columns={key: key.replace('gameScore_', 'raw_game_') for key in firestore_stats_df.columns if key.startswith('gameScore_')})
print(firestore_stats_df.prolific_id.size - firestore_stats_df.prolific_id.nunique())
print(firestore_stats_df.shape)

# NOTE(review): a prolific_id that is repeated in firestore_stats_df produces one merged row
# per matching pair in the join below, so the same game can appear more than once in
# stats_df -- confirm whether the duplicates should be dropped before merging.
stats_df = stats_df.merge(firestore_stats_df, on='prolific_id', copy=True)

stats_df.shape

0
(98, 14)
98
0
(98, 21)
4
(117, 14)


(101, 47)

In [5]:
# Load the repeated-structures table and report its shape.
repeated_structures_df = pd.read_csv('../data/dsl_repeated_structures_temporal_operator.csv')
print(repeated_structures_df.shape)

# repeated_structures_replaced_predicates_df = pd.read_csv('../data/dsl_repeated_structures_pred_names_replaced.csv')
# print(repeated_structures_replaced_predicates_df.shape)

(134, 4)


In [6]:
# Treat a missing terminal_exists value as False.
stats_df.loc[stats_df.terminal_exists.isna(), 'terminal_exists'] = False

# Integer room code derived from domain_name: 0 by default, 1 if the name contains 'medium',
# 2 if it contains 'many' (the 'many' assignment runs last, so it wins if both match).
room = np.zeros((len(stats_df),), dtype=np.int32)
room[['medium' in d for d in stats_df.domain_name]] = 1
room[['many' in d for d in stats_df.domain_name]] = 2

# Matching human-readable label, defaulting to 'Few' and overridden below.
room_name = ['Few'] * len(stats_df)
stats_df = stats_df.assign(room=room, room_name=room_name)

stats_df.loc[['medium' in d for d in stats_df.domain_name], 'room_name'] = 'Medium'
stats_df.loc[['many' in d for d in stats_df.domain_name], 'room_name'] = 'Many'

# The participant-reported difficulty is used as an index into DIFFICULTIES.
# assumes raw_game_difficulty holds integers in 0..4 -- TODO confirm
DIFFICULTIES = ('Very Easy', 'Easy', 'Medium', 'Hard', 'Very Hard')
stats_df = stats_df.assign(difficulty=[DIFFICULTIES[i] for i in stats_df.raw_game_difficulty])


# Shorten src_file by removing the 'problems-' and '.pddl' substrings.
stats_df.src_file = stats_df.src_file.apply(lambda s: s.replace('problems-', '').replace('.pddl', ''))
# `room` is reused here: entries whose src_file contains 'interactive' are overwritten
# with 3, and the resulting array is stored as the `src` column.
room[['interactive' in s for s in stats_df.src_file]] = 3
stats_df = stats_df.assign(src=room)

def list_from_text(list_text):
    """Parse a bracketed, comma-separated string such as '[1, 2, 3]' into an int32 array.

    Anything that is not a string (for example the NaN of a missing cell)
    yields an empty list instead.
    """
    if not isinstance(list_text, str):
        return []

    # Drop the first and last characters (the surrounding brackets) before parsing.
    inner_text = list_text[1:-1]
    return np.fromstring(inner_text, dtype=np.int32, sep=',')

# Replace the text form of each list-valued column with the result of list_from_text
# (an int32 array for strings, an empty list for anything else).
stats_df = stats_df.assign(length_of_then=stats_df.length_of_then.apply(list_from_text))
stats_df = stats_df.assign(setup_objects_quantified=stats_df.setup_objects_quantified.apply(list_from_text))
stats_df = stats_df.assign(preference_objects_quantified=stats_df.preference_objects_quantified.apply(list_from_text))

def average_list_series(df, name):
    """Return a copy of `df` with an added `average_<name>` column.

    Each entry of `df[name]` is expected to be list-like (list or array).
    The new column holds the mean of each non-empty entry and NaN for
    empty entries.

    Parameters:
        df: DataFrame containing the column `name`.
        name: name of the list-valued column to average.

    Returns:
        A new DataFrame (from `df.assign`); `df` itself is not modified.
    """
    # Float array pre-filled with NaN, so the resulting column is numeric.
    averages = np.full(len(df), np.nan)

    # Fill by position rather than by index label, so this also works when
    # the frame does not have a default 0..n-1 index.
    for position, entry in enumerate(df[name]):
        if len(entry) > 0:
            averages[position] = np.mean(entry)

    return df.assign(**{f'average_{name}': averages})

# Add an average_* column for each of the parsed list columns.
stats_df = average_list_series(stats_df, 'length_of_then')
stats_df = average_list_series(stats_df, 'setup_objects_quantified')
stats_df = average_list_series(stats_df, 'preference_objects_quantified')

# For each row, the names of the game types whose is_<type> flag is truthy.
GAME_TYPES = ('throwing', 'building', 'organizing')
game_type_data = [[type_name for type_name in GAME_TYPES if row[f'is_{type_name}']]
    for i, row in stats_df.iterrows()]


# Cast every column whose name starts with 'is_' to bool.
for col in stats_df.columns:
    if col.startswith('is_'):
        stats_df[col] = stats_df[col].astype('bool')


stats_df = stats_df.assign(game_type=game_type_data)
# Sorted type names joined with '_'; a row with no types gets ''.
stats_df = stats_df.assign(game_type_str=['_'.join(sorted(types)) for types in stats_df.game_type])

def dict_from_str_with_key_filters(keys_to_filter):
    """Build a parser that turns a dict's string form into a dict without certain keys.

    Parameters:
        keys_to_filter: iterable of top-level keys to remove from every parsed dict.

    Returns:
        A function taking the string and returning the parsed dict with the
        filtered keys removed.

    Raises (from the returned function):
        json.JSONDecodeError if the string can be parsed neither as JSON
        (after swapping single quotes for double quotes) nor as a Python literal.
    """
    import ast

    def inner(dict_str):
        try:
            # Fast path: a dict repr with single-quoted strings becomes valid JSON
            # once the quotes are swapped.
            d = json.loads(dict_str.replace("'", '"'))
        except json.JSONDecodeError as json_error:
            # The quote swap corrupts strings that themselves contain an apostrophe
            # or a double quote; fall back to parsing the original text as a
            # Python literal.
            try:
                d = ast.literal_eval(dict_str)
            except (ValueError, SyntaxError):
                # Keep the exception type callers saw before the fallback existed.
                raise json_error

        for key in keys_to_filter:
            d.pop(key, None)

        return d

    return inner

# Keys removed from the parsed object-type dicts.
# NOTE(review): these look like positions/orientations rather than object types -- confirm.
OBJECT_TYPES_REFERENCED_KEYS_TO_FILTER = ('back', 'front', 'left', 'right', 'front_left_corner', 'upright', 'upside_down', 'sideways')

# Parse the dict-valued columns from their string form; predicates_referenced is parsed
# without removing any keys.
stats_df = stats_df.assign(object_types_referenced=stats_df.object_types_referenced.apply(dict_from_str_with_key_filters(OBJECT_TYPES_REFERENCED_KEYS_TO_FILTER)))
stats_df = stats_df.assign(predicates_referenced=stats_df.predicates_referenced.apply(dict_from_str_with_key_filters([])))
stats_df = stats_df.assign(type_to_pred_counts=stats_df.type_to_pred_counts.apply(dict_from_str_with_key_filters(OBJECT_TYPES_REFERENCED_KEYS_TO_FILTER)))

# Missing edited_game_fields becomes '', and non-empty values are split on ','.
# NOTE(review): rows without edited fields keep '' (a str) rather than an empty list, so the
# column mixes str and list values -- confirm downstream code expects that.
stats_df.loc[stats_df.edited_game_fields.isna(), 'edited_game_fields'] = ''
stats_df = stats_df.assign(edited_game_fields=[fields.split(',') if fields else '' for fields in stats_df.edited_game_fields])

    

In [7]:
# Collect every object type name that appears as a key in any game's
# object_types_referenced dict.
all_types = set()
for type_dict in stats_df.object_types_referenced:
    all_types.update(type_dict.keys())

# Category names used to group object types.
AGENT = 'agent'
BALLS = 'balls'
BLOCKS = 'blocks'
COLORS = 'colors'
FURNITURE = 'furniture'
# NOTE(review): 'OBJCETS' is a misspelling of 'OBJECTS'; renaming it requires updating
# every use of the name.
LARGE_OBJCETS = 'large_objects'
MEDIUM_OBJECTS = 'medium_objects'
OTHER_OBJECTS = 'other_objects'
ROOM_FEATURES = 'room_features'
SMALL_OBJECTS = 'small_objects'


# Hand-coded mapping from each category to the object types that belong to it.
CATEGORIES_TO_TYPES = {
    AGENT: ('agent', ),
    BALLS: (
        'ball', 'basketball', 'beachball', 'blue_dodgeball', 'dodgeball',
        'golfball', 'green_golfball', 'pink_dodgeball', 'red_dodgeball',
    ),
    BLOCKS: (
        'block', 'bridge_block', 'cube_block', 'cylindrical_block', 
        'flat_block', 'pyramid_block', 'tall_cylindrical_block', 'triangle_block',
        'tan_cube_block', 'red_pyramid_block', 'blue_cube_block', 'blue_pyramid_block',
        'yellow_pyramid_block', 'yellow_cube_block', 
    ),
    COLORS: (
        'color', 'blue', 'brown', 'green', 'orange', 
        'pink', 'purple', 'red', 'tan', 'white', 'yellow', 
    ),
    FURNITURE: (
        'bed', 'blinds', 'chair', # TODO: does chair qualify as furniture? since it's movable
        'desk', 'desk_shelf', 'drawer', 'main_light_switch', 'desktop', 
        'top_drawer', 'side_table',
    ),
    OTHER_OBJECTS: (
        'building',  'game_object', '',
    ),
    LARGE_OBJCETS: (
        'curved_wooden_ramp', 'doggie_bed', 'green_triangular_ramp', 'hexagonal_bin',  'triangular_ramp',
    ),
    MEDIUM_OBJECTS: (
        'laptop', 'pillow', 'teddy_bear',
    ),
    ROOM_FEATURES: (
        'door', 'floor', 'rug', 'shelf', 'bottom_shelf', 'top_shelf', 'sliding_door',
        'south_sliding_door', 'south_west_corner', 'wall', 'east_wall',  'south_wall', 'west_wall',
    ),
    SMALL_OBJECTS: (
        'alarm_clock', 'book', 'cd', 'cellphone',  'credit_card', 
        'key_chain', 'lamp',  'mug', 'pen', 'pencil', 'watch',
    )
}

# Inverse lookup: object type -> category. A type listed under several categories would
# keep the category that appears last in CATEGORIES_TO_TYPES.
TYPES_TO_CATEGORIES = {type_name: cat for cat, type_names in CATEGORIES_TO_TYPES.items() for type_name in type_names}


# Cross-check the category table against the types found in the data.
# A categorized type that no game references is reported; one that is referenced is removed
# from all_types, so whatever remains in all_types afterwards has no category.
# This loop mutates all_types.
for category_objects in CATEGORIES_TO_TYPES.values():
    for obj in category_objects:
        if obj not in all_types:
            print(f'Found unexpected type: "{obj}"')
        else:
            all_types.remove(obj)

# Types referenced by games but missing from CATEGORIES_TO_TYPES.
if len(all_types) > 0:
    print(f'Found unaccounted types: {all_types}')


# Per-game category summaries, one dict per row of stats_df:
#   object_categories_referenced       -> number of distinct types referenced per category
#   object_categories_referenced_total -> sum of the per-type counts per category
object_categories_referenced = []
object_categories_referenced_total = []

for object_to_count_dict in stats_df.object_types_referenced:
    refs = defaultdict(lambda: 0)
    total_refs = defaultdict(lambda: 0)

    for obj, count in object_to_count_dict.items():
        # Raises KeyError for a type that is missing from CATEGORIES_TO_TYPES.
        obj_type = TYPES_TO_CATEGORIES[obj]
        refs[obj_type] += 1
        total_refs[obj_type] += count

    # Store plain dicts rather than defaultdicts in the DataFrame.
    object_categories_referenced.append(dict(refs))
    object_categories_referenced_total.append(dict(total_refs))

stats_df = stats_df.assign(object_categories_referenced=object_categories_referenced, object_categories_referenced_total=object_categories_referenced_total)

Found unexpected type: ""


In [13]:
# Room labels; these match the values stored in the room_name column.
FEW = 'Few'
MEDIUM = 'Medium'
MANY = 'Many'
ROOM_NAMES = (FEW, MEDIUM, MANY)


# Hand-coded set of object types treated as available in each room.
# Used below to decide in which rooms a game's referenced types all exist.
ROOMS_TO_AVAILABLE_OBJECTS = {
    FEW: set([
        'agent',
        'ball', 'dodgeball', 'blue_dodgeball', 'pink_dodgeball', 
        'block', 'cube_block', 'yellow_cube_block', 'blue_cube_block', 'tan_cube_block',
        'color', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'tan', 'white', 'yellow', 
        'bed', 'blinds', 'chair', 'desk', 'desk_shelf', 'drawer', 'main_light_switch', 'desktop', 'top_drawer', 'side_table',
        'building',  'game_object', '',
        'curved_wooden_ramp', 'hexagonal_bin',
        'laptop', 'pillow', 
        'door', 'floor', 'rug', 'shelf', 'top_shelf', 'bottom_shelf', 'sliding_door', 'south_west_corner', 'south_sliding_door', 'wall', 'east_wall', 'south_wall', 'west_wall',
        'alarm_clock', 'book', 'cd', 'cellphone',  'credit_card', 'key_chain', 'lamp',  'mug', 'pen', 'pencil', 'watch',
    ]),
    MEDIUM: set([
        'agent',
        'ball', 'basketball', 'beachball', 'dodgeball', 'red_dodgeball',
        'block', 'bridge_block', 'cube_block', 'cylindrical_block', 'flat_block', 'pyramid_block', 'tall_cylindrical_block', 
        'yellow_pyramid_block', 'red_pyramid_block', 'yellow_cube_block', 'blue_cube_block',
        'color', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'tan', 'white', 'yellow', 
        'bed', 'blinds', 'chair', 'desk', 'desk_shelf', 'drawer', 'main_light_switch', 'desktop', 'top_drawer', 'side_table',
        'building',  'game_object', '',
        'doggie_bed', 'hexagonal_bin',  'triangular_ramp',
        'laptop', 'pillow', 'teddy_bear',
        'door', 'floor', 'rug', 'shelf', 'top_shelf', 'bottom_shelf', 'sliding_door', 'south_west_corner', 'south_sliding_door', 'wall', 'east_wall', 'south_wall', 'west_wall',
        'alarm_clock', 'book', 'cd', 'cellphone',  'credit_card', 'key_chain', 'lamp',  'mug', 'pen', 'pencil', 'watch',
    ]),
    MANY: set([
        'agent',
        'ball', 'beachball', 'dodgeball', 'blue_dodgeball', 'pink_dodgeball', 'red_dodgeball', 'golfball', 'green_golfball',
        'block', 'bridge_block', 'cube_block', 'cylindrical_block', 'flat_block', 'pyramid_block', 'tall_cylindrical_block', 'triangle_block',
        'yellow_pyramid_block', 'red_pyramid_block', 'blue_pyramid_block', 'yellow_cube_block', 'blue_cube_block', 'tan_cube_block',
        'color', 'blue', 'brown', 'green', 'orange', 'pink', 'purple', 'red', 'tan',  'white', 'yellow', 
        'bed', 'blinds', 'chair', 'desk', 'desk_shelf', 'drawer', 'main_light_switch', 'desktop', 'top_drawer', 'side_table',
        'building',  'game_object', '',
        'curved_wooden_ramp', 'doggie_bed', 'green_triangular_ramp', 'hexagonal_bin',  'triangular_ramp',
        'laptop', 'pillow', 'teddy_bear',
        'door', 'floor', 'rug', 'shelf', 'top_shelf', 'bottom_shelf', 'sliding_door', 'south_west_corner', 'south_sliding_door', 'wall', 'east_wall', 'south_wall', 'west_wall',
        'alarm_clock', 'book', 'cd', 'cellphone',  'credit_card', 'key_chain', 'lamp',  'mug', 'pen', 'pencil', 'watch',
    ]),
}

# For each game, list the rooms whose available-object set contains every object type
# the game references.
game_valid_rooms = []
# NOTE: idx is a position from enumerate but is also used as an index label
# (stats_df.room_name[idx]); this relies on stats_df having a default 0..n-1 index.
for idx, obj_types in enumerate(stats_df.object_types_referenced):
    g = [room_name for room_name, room_objects in ROOMS_TO_AVAILABLE_OBJECTS.items() 
        if set(obj_types.keys()) <= room_objects]

    # Report games that are not valid in their own room, along with the types that
    # room does not provide.
    if stats_df.room_name[idx] not in g:
        room_objects = ROOMS_TO_AVAILABLE_OBJECTS[stats_df.room_name[idx]]
        print(idx, stats_df.game_name[idx], stats_df.room_name[idx], g, set(obj_types.keys()) - room_objects)
    
    game_valid_rooms.append(g)

# print(game_valid_rooms)


stats_df = stats_df.assign(game_valid_rooms=game_valid_rooms, num_game_valid_rooms=[len(g) for g in game_valid_rooms])

50 602a1735bf92e79a5e7cb632 Many [] {'basketball'}
85 61093eae2bc2e47e6f26c7d7 Few ['Medium', 'Many'] {'triangular_ramp'}
86 61093eae2bc2e47e6f26c7d7 Few ['Medium', 'Many'] {'triangular_ramp'}


In [15]:
# Number of rows whose game_name repeats an earlier one (total minus unique).
stats_df.game_name.size - stats_df.game_name.nunique()

3

In [None]:
# Regroup each game's type -> predicate -> count mapping three ways, one dict per row:
#   category_to_pred_counts : category  -> predicate -> count
#   pred_to_type_counts     : predicate -> type      -> count
#   pred_to_category_counts : predicate -> category  -> count
category_to_pred_counts = []
pred_to_type_counts = []
pred_to_category_counts = []

for type_to_pred in stats_df.type_to_pred_counts:
    cat_to_pred = defaultdict(lambda: defaultdict(lambda: 0))
    pred_to_type = defaultdict(lambda: defaultdict(lambda: 0))
    pred_to_cat = defaultdict(lambda: defaultdict(lambda: 0))

    for type_name, pred_counts in type_to_pred.items():
        # Raises KeyError for a type that is missing from CATEGORIES_TO_TYPES.
        cat_name = TYPES_TO_CATEGORIES[type_name]
        for pred, count in pred_counts.items():
            cat_to_pred[cat_name][pred] += count
            pred_to_type[pred][type_name] += count
            pred_to_cat[pred][cat_name] += count

    # Convert the inner defaultdicts to plain dicts before storing them.
    category_to_pred_counts.append({outer_key: dict(inner_dict) for outer_key, inner_dict in cat_to_pred.items()})
    pred_to_type_counts.append({outer_key: dict(inner_dict) for outer_key, inner_dict in pred_to_type.items()})
    pred_to_category_counts.append({outer_key: dict(inner_dict) for outer_key, inner_dict in pred_to_cat.items()})
    
stats_df = stats_df.assign(category_to_pred_counts=category_to_pred_counts, pred_to_type_counts=pred_to_type_counts, pred_to_category_counts=pred_to_category_counts)


In [None]:
# Same tuple as the DIFFICULTIES defined in the munging cell earlier in the notebook.
DIFFICULTIES = ('Very Easy', 'Easy', 'Medium', 'Hard', 'Very Hard')

# Final column selection and order for stats_df.
REORDERED_COLUMNS = ['src', 'src_file', 'game_name', 'domain_name', 'room', 'room_name',
    'num_preferences', 'length_of_then', 'average_length_of_then',
    'setup_objects_quantified', 'average_setup_objects_quantified',
    'preference_objects_quantified', 'average_preference_objects_quantified',
    'terminal_exists', 'object_types_referenced', 'predicates_referenced', 
    'type_to_pred_counts', 'category_to_pred_counts', 'pred_to_type_counts', 'pred_to_category_counts',
    'object_categories_referenced', 'object_categories_referenced_total',
    'game_valid_rooms', 'num_game_valid_rooms',
    'max_depth', 'ast_nodes', 'setup_nodes',
    'game_type', 'game_type_str', 'is_throwing', 'is_building', 'is_organizing', 
    'objects_with_predicates',
    'throwing_objects', 'throwing_goal', 'throwing_to', 'throwing_from', 'throwing_on',
    'building_objects', 'building_goal', 'building_structure', 'building_order', 
    'organizing', 'organizing_objects',
    'raw_game_setup', 'raw_game_gameplay', 'raw_game_scoring',
    'raw_game_difficulty', 'raw_game_firstTimeScore', 'raw_game_score',
    'raw_game_thoughts', 'raw_game_edited', 'edited_game_fields', 'difficulty',
]
# reindex keeps only the listed columns, in this order; a listed column that stats_df
# lacks is created and filled with NaN, and unlisted columns are dropped.
stats_df = stats_df.reindex(columns=REORDERED_COLUMNS)
stats_df.shape

In [None]:
# Preview the first rows of the assembled frame.
stats_df.head()

# Dataframe columns readme:
* `src/src_file`: which file the data came from (survey by specific room or interactive beta)
* `game_name`: room-\[row number in my spreadsheet\].
* `domain_name`: the room identifier as it's in the games (domain is a PDDL thing)
* `room/room_name`: simplifications of the room designation from the game/domain names to ease working with the data.
* `num_preferences`: how many preferences I used to represent the game.
* `length_of_then`: length of all `then` operators in this game.
* `setup/preference_objects_quantified`: average number of objects quantified over in quantifiers (`exists`/`forall`) in the game representation, split by which section of the game it's in.
* `terminal_exists`: whether the game uses a `terminal` clause.
* `object_types_referenced`: how many times each object type was quantified in each game, combined between the setup and preferences. 
* `object_categories_referenced`: how many times each game refers to object types from each category (as coded above)
* `predicates_referenced`: how many times each predicate was referenced in each game (as above, combined between setup and preferences). 
* `type_to_pred_counts`: a mapping, for each game, from object type, to how many times it's used with each predicate
* `category_to_pred_counts`: same as above, but for object categories -- for each game, from each object category, to how many times it's used with each predicate
* `pred_to_type_counts`: same as the two above, but inverted -- for each game, for each predicate, how often it's used with each object type
* `pred_to_category_counts`: same as above, but for object categories, rather than individual types -- for each game, for each predicate, how often it's used with objects from each category
* `game_valid_rooms`: in which rooms is this game valid, by the types of objects it appears in
* `num_game_valid_rooms`: same as above, but as a number, rather than a list of room names
* `max_depth`: what's the deepest the game's AST goes
* `ast_nodes`: how many total nodes of type AST (so not strings, lists, etc.) exist in the AST.
* `is_throwing/building/organizing`: does the schema representation of this game utilize this sort of block?
* `game_type`: a combination of the above into a single list
* `game_type_str`: a combination of the above into a sorted and joined string
* `objects_with_predicates`: a list of all of the `objectWithPredicate` types used in the schema representation of this game
* `throwing_*/building_*/organizing_*`: individual fields from the schema representation for each type.
* `raw_game_*`: raw data from the participants, as loaded from our firestore DB
* `raw_game_edited`: did participants opt to edit their games after playing them?
* `edited_game_fields`: for participants who edited their games, which fields did they edit?
* `difficulty`: the participant-reported difficulty rating converted to a string


# Schema-based analyses
## How many games of each type exist



In [None]:
# Count games for every combination of the three is_* flags that occurs in the data.
game_type_counts = stats_df.groupby(['is_throwing', 'is_building', 'is_organizing']).size().reset_index().rename(columns={0: 'num'})

# One bar per flag combination, labelled with the active game types joined by '+'
# and a line break.
labels = []
values = []
for index, row in game_type_counts.iterrows():
    elements = []
    if row.is_throwing: elements.append('throwing') 
    if row.is_building: elements.append('building') 
    if row.is_organizing: elements.append('organizing') 
    labels.append('+\n'.join(elements))
    values.append(row.num)

x_values = np.arange(len(values))
plt.bar(x_values, values)
plt.xticks(x_values, labels=labels)
plt.title('Count of games by type')
plt.show()

In [None]:
# LaTeX snippet printed by save_plot for a full-width figure. It is filled in with
# str.format (fields: save_path, label_name), which is why literal braces are doubled.
FIGURE_TEMPLATE = r'''\begin{{figure}}[!htb]
% \vspace{{-0.225in}}
\centering
\includegraphics[width=\linewidth]{{figures/{save_path}}}
\caption{{ {{\bf FIGURE TITLE.}} FIGURE DESCRIPTION.}}
\label{{fig:{label_name}}}
% \vspace{{-0.2in}}
\end{{figure}}
'''
# Same idea for a half-width wrapfigure; uses the same two format fields.
WRAPFIGURE_TEMPLATE = r'''\begin{{wrapfigure}}{{r}}{{0.5\linewidth}}
\vspace{{-.3in}}
\begin{{spacing}}{{1.0}}
\centering
\includegraphics[width=0.95\linewidth]{{figures/{save_path}}}
\caption{{ {{\bf FIGURE TITLE.}} FIGURE DESCRIPTION.}}
\label{{fig:{label_name}}}
\end{{spacing}}
% \vspace{{-.25in}}
\end{{wrapfigure}}'''

# Directory (relative to the working directory) that save_plot prepends to save paths
# that do not already start with it.
SAVE_PATH_PREFIX = '../figures'


def save_plot(save_path, bbox_inches='tight', should_print=False):
    """Save the current matplotlib figure, optionally printing LaTeX snippets for it.

    Parameters:
        save_path: target file path; None makes the call a no-op. Paths that do
            not start with SAVE_PATH_PREFIX are placed under it.
        bbox_inches: forwarded to plt.savefig.
        should_print: when True, print FIGURE_TEMPLATE and WRAPFIGURE_TEMPLATE
            filled in for this figure (using the path as originally given).
    """
    if save_path is None:
        return

    stem = os.path.splitext(save_path)[0]

    if should_print:
        # The LaTeX label is the extension-less path with '/' and '_' turned into '-'.
        label_name = stem.replace('/', '-').replace('_', '-')
        print('Figure:\n')
        print(FIGURE_TEMPLATE.format(save_path=save_path, label_name=label_name))
        print('\nWrapfigure:\n')
        print(WRAPFIGURE_TEMPLATE.format(save_path=save_path, label_name=label_name))
        print('')

    if not save_path.startswith(SAVE_PATH_PREFIX):
        save_path = os.path.join(SAVE_PATH_PREFIX, save_path)

    save_path = os.path.abspath(save_path)
    # Create the target directory on demand.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    figure = plt.gcf()
    plt.savefig(save_path, bbox_inches=bbox_inches, facecolor=figure.get_facecolor(), edgecolor='none')


In [None]:
def _add_new_histogram_results(all_results, new_results, key):
    """Add `new_results` to the list stored at `all_results[key]`.

    A list or tuple is merged element by element; any other value is
    appended as a single item.
    """
    bucket = all_results[key]

    if not isinstance(new_results, (list, tuple)):
        bucket.append(new_results)
        return

    bucket.extend(new_results)


def _extract_histogram_data(df, group_by_col, row_value_func, split_group_by_values):
    """Group per-row values by the contents of `group_by_col`.

    For every row, `row_value_func(row)` supplies the value(s) to record.
    A str/int group value is used as the key directly. Any other group value
    is treated as a collection of keys: with `split_group_by_values` the
    row's values are recorded under each key, otherwise under a single key
    made of the sorted keys joined with '_'.

    Returns a defaultdict(list) mapping each key to its collected values.
    """
    results_by_key = defaultdict(list)

    for _, row in df.iterrows():
        group_value = row[group_by_col]
        row_values = row_value_func(row)

        if isinstance(group_value, (str, int)):
            target_keys = [group_value]
        elif split_group_by_values:
            target_keys = group_value
        else:
            target_keys = ['_'.join(sorted(group_value))]

        for target_key in target_keys:
            _add_new_histogram_results(results_by_key, row_values, target_key)

    return results_by_key


def _add_new_bar_chart_results(all_results, new_results, key):
    """Accumulate `new_results` into the counter stored at `all_results[key]`.

    Accepted shapes of `new_results`:
      * str or int  -- counted once;
      * dict        -- each value is added to the count of its key;
      * list/tuple  -- every element is handled recursively.

    Raises ValueError for any other type.
    """
    if isinstance(new_results, (str, int)):
        all_results[key][new_results] += 1
        return

    if isinstance(new_results, dict):
        for result_key, amount in new_results.items():
            all_results[key][result_key] += amount
        return

    if isinstance(new_results, (list, tuple)):
        for item in new_results:
            _add_new_bar_chart_results(all_results, item, key)
        return

    raise ValueError(f'_add_new_bar_chart_results expected dict (or list/tuple of dicts), received {type(new_results)}: {new_results}')


def _extract_bar_chart_data(df, group_by_col, row_value_func, split_group_by_values, swap_outer_inner_keys=False):
    """Build nested counts {group key: {value key: count}} from the rows of `df`.

    For every row, `row_value_func(row)` supplies what to count. A str/int
    group value is used as the outer key directly. Any other group value is
    treated as a collection of keys: with `split_group_by_values` the row is
    counted under each key, otherwise under the sorted keys joined with '_'.

    With `swap_outer_inner_keys` the nesting of the result is inverted, giving
    {value key: {group key: count}}.

    Returns a defaultdict of defaultdicts whose missing entries read as 0.
    """
    results_by_key = defaultdict(lambda: defaultdict(lambda: 0))

    for _, row in df.iterrows():
        group_value = row[group_by_col]
        row_values = row_value_func(row)

        if isinstance(group_value, (str, int)):
            target_keys = [group_value]
        elif split_group_by_values:
            target_keys = group_value
        else:
            target_keys = ['_'.join(sorted(group_value))]

        for target_key in target_keys:
            _add_new_bar_chart_results(results_by_key, row_values, target_key)

    if not swap_outer_inner_keys:
        return results_by_key

    # Invert the nesting: inner keys become outer keys and vice versa.
    swapped = defaultdict(lambda: defaultdict(lambda: 0))
    for outer_key, inner_dict in results_by_key.items():
        for inner_key, count in inner_dict.items():
            swapped[inner_key][outer_key] = count

    return swapped


def parallel_histograms(df, group_by_col, row_value_func, split_group_by_values=False,
    figsize=(16, 6), plot_density=True, title='',
    super_title_fontsize=24, ax_title_fontsize=16, ax_label_fontsize=16):
    """Plot one histogram per group, side by side, over a shared value range.

    Parameters:
        df: DataFrame to iterate over.
        group_by_col: column whose value(s) determine each row's group(s).
        row_value_func: callable taking a row and returning the value(s) to plot.
        split_group_by_values: forwarded to _extract_histogram_data.
        figsize: figure size passed to plt.subplots.
        plot_density: plot densities instead of counts.
        title: optional figure-level title.
        super_title_fontsize, ax_title_fontsize, ax_label_fontsize: font sizes.

    Raises:
        ValueError: if no group has any values to plot.
    """
    results_by_key = _extract_histogram_data(df, group_by_col, row_value_func, split_group_by_values)

    non_empty_values = [values for values in results_by_key.values() if len(values) > 0]
    if not non_empty_values:
        raise ValueError('parallel_histograms: no values to plot')

    # All panels share one range so they are visually comparable.
    global_min = min(min(values) for values in non_empty_values)
    global_max = max(max(values) for values in non_empty_values)

    # squeeze=False keeps `axes` two-dimensional even when there is a single group;
    # otherwise plt.subplots returns a bare Axes that cannot be indexed.
    fig, axes = plt.subplots(1, len(results_by_key), figsize=figsize, squeeze=False)

    for index, key in enumerate(results_by_key):
        ax = axes[0, index]
        ax.hist(results_by_key[key], range=(global_min, global_max), density=plot_density)
        ax.set_title(key, fontsize=ax_title_fontsize)

        # Only the left-most panel carries the y label and ticks.
        if index == 0:
            ax.set_ylabel('Density' if plot_density else 'Count', fontsize=ax_label_fontsize)
        else:
            ax.set_yticks([])

    if title:
        fig.suptitle(title, fontsize=super_title_fontsize)

    plt.show()


def single_ax_histograms(df, group_by_col, row_value_func, split_group_by_values=False,
    figsize=(8, 6), plot_density=False, stacked=True, title='', cmap='tab10', legend_loc='best', xlabel='',
    super_title_fontsize=24, ax_title_fontsize=16, ax_label_fontsize=16):
    """Plot the histograms of all groups on a single axis, one color per group.

    Parameters:
        df: DataFrame to iterate over.
        group_by_col: column whose value(s) determine each row's group(s).
        row_value_func: callable taking a row and returning the value(s) to plot.
        split_group_by_values: forwarded to _extract_histogram_data.
        figsize: figure size.
        plot_density: plot densities instead of counts.
        stacked: stack the groups instead of drawing them side by side.
        title, xlabel: figure title and x-axis label.
        cmap: name of the colormap that group colors are taken from.
        legend_loc: legend location.
        super_title_fontsize, ax_label_fontsize: font sizes.
        ax_title_fontsize: accepted for signature compatibility; not used here.
    """
    results_by_key = _extract_histogram_data(df, group_by_col, row_value_func, split_group_by_values)
    keys = list(results_by_key.keys())
    values = [results_by_key[key] for key in keys]

    # plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib releases removed.
    colormap = plt.get_cmap(cmap)
    colors = [colormap(i) for i in range(len(keys))]

    plt.figure(figsize=figsize)
    plt.hist(values, label=keys, density=plot_density, stacked=stacked, color=colors)
    plt.legend(loc=legend_loc)
    plt.title(title, fontsize=super_title_fontsize)
    plt.ylabel('Density' if plot_density else 'Count', fontsize=ax_label_fontsize)
    plt.xlabel(xlabel, fontsize=ax_label_fontsize)
    plt.show()


def single_ax_bar_chart(df, group_by_col, row_value_func, split_group_by_values=False, swap_outer_inner_keys=False, *,
    figsize=(8, 6), title='', cmap='tab10', legend_loc='best', xlabel='', ylabel='Count',
    super_title_fontsize=24, ax_title_fontsize=16, ax_label_fontsize=16,
    inner_key_order=None, outer_key_order=None, normalize_columns=False, 
    vertical_bar_labels=False, save_path=None, print_save_latex=False):
    """Count values per group from `df` and draw them as a stacked bar chart.

    The counting is done by _extract_bar_chart_data (driven by the first five
    parameters); every keyword-only option is passed through unchanged to
    single_ax_bar_chart_from_results, which does the drawing.
    """
    results_by_key = _extract_bar_chart_data(
        df, group_by_col, row_value_func, split_group_by_values, swap_outer_inner_keys)

    # Gather the pass-through plotting options in one place.
    plot_options = dict(
        figsize=figsize, title=title, cmap=cmap, legend_loc=legend_loc,
        xlabel=xlabel, ylabel=ylabel,
        super_title_fontsize=super_title_fontsize,
        ax_title_fontsize=ax_title_fontsize,
        ax_label_fontsize=ax_label_fontsize,
        inner_key_order=inner_key_order, outer_key_order=outer_key_order,
        normalize_columns=normalize_columns,
        vertical_bar_labels=vertical_bar_labels,
        save_path=save_path, print_save_latex=print_save_latex,
    )
    single_ax_bar_chart_from_results(results_by_key, **plot_options)


def single_ax_bar_chart_from_results(results_by_key, *,
    figsize=(8, 6), title='', cmap='tab10', legend_loc='best', legend_bbox_to_anchor=None, xlabel='', ylabel='Count',
    super_title_fontsize=24, ax_title_fontsize=16, ax_label_fontsize=16, ax_tick_fontsize=12,
    inner_key_order=None, outer_key_order=None, normalize_columns=False, vertical_bar_labels=False, 
    color_dict=None, save_path=None, print_save_latex=False):
    """Draw a stacked bar chart from nested counts {outer_key: {inner_key: count}}.

    Each inner key becomes one bar on the x axis; each outer key becomes one
    colored segment (and legend entry) within the bars.

    Parameters:
        results_by_key: nested mapping of counts. Plain dicts and defaultdicts
            both work; missing combinations count as 0 and the mapping is not
            modified.
        figsize, title, xlabel, ylabel: basic figure options.
        cmap: colormap name used for the segments when `color_dict` is None.
        color_dict: optional mapping outer key -> color.
        legend_loc, legend_bbox_to_anchor: legend placement.
        super_title_fontsize, ax_title_fontsize, ax_label_fontsize,
            ax_tick_fontsize: font sizes.
        inner_key_order, outer_key_order: explicit key orders; default to sorted.
        normalize_columns: scale every bar to sum to 1. Also switches a default
            ylabel of 'Count' to 'Proportion' and, unless
            `legend_bbox_to_anchor` is given, moves the legend outside the
            axes on the right.
        vertical_bar_labels: keep the bar labels as-is and rotate them
            vertically; otherwise '_' in a label is replaced with ' +' plus a
            line break.
        save_path, print_save_latex: when `save_path` is given, the figure is
            saved through save_plot.
    """
    def count_for(outer_key, inner_key):
        # .get works for plain dicts as well as defaultdicts and, unlike indexing
        # a defaultdict, does not insert missing keys into the caller's data.
        return results_by_key.get(outer_key, {}).get(inner_key, 0)

    if outer_key_order is None:
        outer_key_order = sorted(results_by_key.keys())

    if inner_key_order is None:
        inner_key_order = sorted({key for inner_dict in results_by_key.values() for key in inner_dict})

    # Running height of each bar, i.e. where the next segment starts.
    current_start_values = np.zeros(len(inner_key_order))

    if normalize_columns:
        inner_key_sums = [sum(count_for(outer_key, inner_key) for outer_key in outer_key_order)
            for inner_key in inner_key_order]

        if ylabel == 'Count':
            ylabel = 'Proportion'

        if legend_bbox_to_anchor is None:
            legend_bbox_to_anchor = (1.0, 0.5)
            legend_loc = 'center left'

    if not vertical_bar_labels:
        inner_key_names = [key.replace('_', ' +\n') for key in inner_key_order]
    else:
        inner_key_names = inner_key_order

    if color_dict is None:
        # plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib releases removed.
        colormap = plt.get_cmap(cmap)
        colors = [colormap(i) for i in range(len(outer_key_order))]
    else:
        colors = [color_dict[key] for key in outer_key_order]

    plt.figure(figsize=figsize)

    for index, outer_key in enumerate(outer_key_order):
        current_key_values = [count_for(outer_key, inner_key) for inner_key in inner_key_order]
        if normalize_columns:
            # Zero counts are left untouched so an all-zero column never divides by zero.
            current_key_values = [x / y if x != 0 else x for x, y in zip(current_key_values, inner_key_sums)]

        # TODO: think about whether or not I want to support non-stacked
        plt.bar(inner_key_names, current_key_values, bottom=current_start_values, 
            label=outer_key, color=colors[index])
        current_start_values += np.array(current_key_values)

    plt.legend(bbox_to_anchor=legend_bbox_to_anchor, loc=legend_loc, prop=dict(size=ax_tick_fontsize))
    plt.title(title, fontsize=super_title_fontsize)

    plt.xlabel(xlabel, fontsize=ax_label_fontsize)
    plt.ylabel(ylabel, fontsize=ax_label_fontsize)

    plt.xticks(size=ax_tick_fontsize, rotation='vertical' if vertical_bar_labels else 'horizontal')
    # NOTE(review): y tick labels use ax_title_fontsize while x tick labels use
    # ax_tick_fontsize -- confirm this is intended.
    plt.yticks(size=ax_title_fontsize)

    if save_path is not None:
        save_plot(save_path, should_print=print_save_latex)

    plt.show()


In [None]:
class CoocurrenceDefinition:
    """Settings describing one co-occurrence analysis between two keys.

    Stores the outer and inner key together with, for each side, whether
    only primary objects are used and whether categories are used. When no
    `name` is given, one is derived as

        <outer_key>[_all][_cat]_<inner_key>[_all][_cat]

    where 'all' is added when the side is not restricted to primary objects
    and 'cat' when the side uses categories.
    """
    def __init__(self, outer_key, inner_key, *,
        outer_primary_obj_only=True, inner_primary_obj_only=True, 
        use_categories_outer=False, use_categories_inner=False, name=None):

        self.outer_key = outer_key
        self.inner_key = inner_key
        self.outer_primary_obj_only = outer_primary_obj_only
        self.inner_primary_obj_only = inner_primary_obj_only
        self.use_categories_outer = use_categories_outer
        self.use_categories_inner = use_categories_inner

        if name is None:
            sides = (
                (outer_key, outer_primary_obj_only, use_categories_outer),
                (inner_key, inner_primary_obj_only, use_categories_inner),
            )
            parts = []
            for side_key, primary_only, use_categories in sides:
                parts.append(side_key)
                if not primary_only:
                    parts.append('all')
                if use_categories:
                    parts.append('cat')

            name = '_'.join(parts)

        self.name = name

# Result type of separate_objects_and_predicates: three parallel collections.
SplitObjectPredicates = namedtuple(
    'SplitObjectPredicates', ('primary_objects', 'secondary_objects', 'predicates')
)

# Factory for a fresh two-level counter: cooc[outer][inner] reads as 0 until incremented.
empty_coocurrence_dict = lambda: defaultdict(lambda: defaultdict(lambda: 0))

def update_coocurrence_dict(cooc_dict, outer_keys, inner_keys, omit_equals=False):
    """Add one count to ``cooc_dict[outer][inner]`` for every (outer, inner) pair.

    ``cooc_dict`` is modified in place. With ``omit_equals=True``, pairs whose
    two members compare equal are not counted.
    """
    for outer_value in outer_keys:
        for inner_value in inner_keys:
            if not (omit_equals and outer_value == inner_value):
                cooc_dict[outer_value][inner_value] += 1

def sort_by_count_desc(key_to_count):
    """Return the keys of ``key_to_count`` as a list, largest count first.

    Keys with equal counts keep the relative order in which the mapping yields them.
    """
    return sorted(key_to_count, key=key_to_count.get, reverse=True)


def coocurrence_dict_to_matrix(cooc_dict):
    """Convert a nested count dict into a dense matrix with sorted row/column keys.

    Args:
        cooc_dict: mapping ``outer_key -> {inner_key: count}``. Inner mappings may be
            sparse (not every inner key needs to appear under every outer key).

    Returns:
        ``(cooc_mat, sorted_outer_keys, sorted_inner_keys)`` where ``cooc_mat[i, j]`` is
        the count for ``sorted_outer_keys[i]`` and ``sorted_inner_keys[j]`` (0 when the
        pair is absent). Rows are ordered by decreasing row total and columns by
        decreasing column total.

    The input is only read: missing pairs are looked up with ``.get`` so that nested
    defaultdicts are not filled with zero entries and plain dicts do not raise KeyError.
    """
    outer_key_counts = {
        outer_key: sum(inner_counts.values())
        for outer_key, inner_counts in cooc_dict.items()
    }

    # Accumulate column totals in first-seen order so that ties are ordered
    # deterministically (iterating a set of strings is not stable across runs).
    inner_key_counts = {}
    for inner_counts in cooc_dict.values():
        for inner_key, count in inner_counts.items():
            inner_key_counts[inner_key] = inner_key_counts.get(inner_key, 0) + count

    sorted_outer_keys = sort_by_count_desc(outer_key_counts)
    sorted_inner_keys = sort_by_count_desc(inner_key_counts)

    cooc_mat = np.zeros((len(sorted_outer_keys), len(sorted_inner_keys)))
    for i, outer in enumerate(sorted_outer_keys):
        row_counts = cooc_dict[outer]
        for j, inner in enumerate(sorted_inner_keys):
            cooc_mat[i, j] = row_counts.get(inner, 0)

    return cooc_mat, sorted_outer_keys, sorted_inner_keys


def separate_objects_and_predicates(objects_with_predicates_list):
    """Split a list of ``{'object': ..., 'predicates': [...]}`` dicts into flat lists.

    Returns a ``SplitObjectPredicates`` with:
      * ``primary_objects``: the ``'object'`` of every entry,
      * ``secondary_objects``: the ``'object'`` of every predicate description that has one,
      * ``predicates``: the ``'predicate'`` of every predicate description that has one.

    Anything that is not a list (for example a missing value) yields three empty lists.
    """
    split = SplitObjectPredicates([], [], [])
    if not isinstance(objects_with_predicates_list, list):
        return split

    for entry in objects_with_predicates_list:
        split.primary_objects.append(entry['object'])

        if 'predicates' not in entry:
            continue

        for predicate_desc in entry['predicates']:
            if 'object' in predicate_desc:
                split.secondary_objects.append(predicate_desc['object'])

            if 'predicate' in predicate_desc:
                split.predicates.append(predicate_desc['predicate'])

    return split


def extract_all_coocurrences(df, column_prefix, coocurrence_defs, type_to_category_mapping=TYPES_TO_CATEGORIES):
    """Build co-occurrence count tables from the ``{column_prefix}_*`` columns of ``df``.

    Two families of tables are returned in one dict:

    * one table per entry of ``coocurrence_defs``, stored under ``cooc_def.name``;
    * for every objects-with-predicates column ``col`` (a column in which at least one
      cell is a non-empty list of dicts), the generic tables ``'{col}_object_predicate'``
      and ``'{col}_object_object'``, plus their sums over all such columns under
      ``'all_object_predicate'`` and ``'all_object_object'``.

    Args:
        df: DataFrame that is processed row by row.
        column_prefix: only columns whose name starts with ``f'{column_prefix}_'`` are used.
        coocurrence_defs: sequence of ``CoocurrenceDefinition``-like objects. It is
            iterated several times, so it must not be a one-shot iterator.
        type_to_category_mapping: mapping applied to each value when a definition sets
            ``use_categories_outer`` / ``use_categories_inner``.

    Returns:
        dict mapping table name to a nested defaultdict ``counts[outer][inner]``.

    Raises:
        ValueError: if a definition has ``None`` as name, inner key, or outer key.
        KeyError: if a definition refers to a column that ``df`` does not have, or a
            value is missing from ``type_to_category_mapping``.
    """
    relevant_columns = [c for c in df.columns if c.startswith(f'{column_prefix}_')]

    # The emptiness check must come before x[0]: an empty list in any cell would
    # otherwise raise IndexError while the column kinds are being detected.
    obj_with_pred_columns = [
        c for c in relevant_columns
        if any(isinstance(x, list) and len(x) > 0 and isinstance(x[0], dict) for x in df[c])
    ]

    if any(cooc_def.name is None or cooc_def.inner_key is None or cooc_def.outer_key is None for cooc_def in coocurrence_defs):
        raise ValueError(f'Received at least one cooc def without a name, inner key, or outer key: {coocurrence_defs}')

    coocurrence_dicts = {
        cooc_def.name: empty_coocurrence_dict()
        for cooc_def in coocurrence_defs
    }

    for col in obj_with_pred_columns:
        coocurrence_dicts[f'{col}_object_predicate'] = empty_coocurrence_dict()
        coocurrence_dicts[f'{col}_object_object'] = empty_coocurrence_dict()

    for _, row in df.iterrows():
        # Objects-with-predicates cells are pre-split; every other cell is used as is.
        row_values = {col: separate_objects_and_predicates(row[col]) if col in obj_with_pred_columns else row[col] for col in relevant_columns}

        # compute predefined coocurrences
        for cooc_def in coocurrence_defs:
            outer_col = f'{column_prefix}_{cooc_def.outer_key}'
            inner_col = f'{column_prefix}_{cooc_def.inner_key}'

            outer_values = row_values[outer_col]
            inner_values = row_values[inner_col]

            if outer_col in obj_with_pred_columns:
                if cooc_def.outer_primary_obj_only:
                    outer_values = outer_values.primary_objects[:]
                else:
                    outer_values = outer_values.primary_objects + outer_values.secondary_objects

            if cooc_def.use_categories_outer:
                outer_values = [type_to_category_mapping[val] for val in outer_values]

            if inner_col in obj_with_pred_columns:
                if cooc_def.inner_primary_obj_only:
                    inner_values = inner_values.primary_objects[:]
                else:
                    inner_values = inner_values.primary_objects + inner_values.secondary_objects

            if cooc_def.use_categories_inner:
                inner_values = [type_to_category_mapping[val] for val in inner_values]

            update_coocurrence_dict(coocurrence_dicts[cooc_def.name], outer_values, inner_values)

        # compute generic object-object and object-predicate coocurrences
        for col in obj_with_pred_columns:
            if isinstance(row[col], list):
                for object_with_predicates in row[col]:
                    first_object = object_with_predicates['object']
                    if 'predicates' in object_with_predicates:
                        for predicate_desc in object_with_predicates['predicates']:
                            second_object = None
                            if 'object' in predicate_desc:
                                second_object = predicate_desc['object']
                                # object-object counts are recorded in both directions
                                coocurrence_dicts[f'{col}_object_object'][first_object][second_object] += 1
                                coocurrence_dicts[f'{col}_object_object'][second_object][first_object] += 1

                            if 'predicate' in predicate_desc:
                                predicate = predicate_desc['predicate']

                                # the predicate is credited to the primary object and, when
                                # present, to the object named inside the predicate as well
                                coocurrence_dicts[f'{col}_object_predicate'][first_object][predicate] += 1
                                if second_object:
                                    coocurrence_dicts[f'{col}_object_predicate'][second_object][predicate] += 1

    # combine the individual object-predicate coocurrences to the combined ones
    coocurrence_dicts['all_object_predicate'] = empty_coocurrence_dict()
    coocurrence_dicts['all_object_object'] = empty_coocurrence_dict()

    for col in obj_with_pred_columns:
        for template in '{key}_object_predicate', '{key}_object_object':
            overall_cooc_dict = coocurrence_dicts[template.format(key='all')]
            current_cooc_dict = coocurrence_dicts[template.format(key=col)]

            for outer_key in current_cooc_dict:
                for inner_key in current_cooc_dict[outer_key]:
                    overall_cooc_dict[outer_key][inner_key] += current_cooc_dict[outer_key][inner_key]

    return coocurrence_dicts


def plot_coocurrence_data(cooc_dict, title='', xlabel='', ylabel='', figsize=(12, 12),
    title_fontsize=24, ax_label_fontsize=20, tick_fontsize=16, cmap='gist_yarg'): 
    """Draw a nested co-occurrence count dict as a heatmap and show it.

    The dict is converted with ``coocurrence_dict_to_matrix``; outer keys label the
    rows (y axis) and inner keys label the columns (x axis, ticks drawn on top).

    Args:
        cooc_dict: nested mapping ``outer_key -> inner_key -> count``.
        title, xlabel, ylabel: figure title and axis labels.
        figsize: figure size passed to ``plt.figure``.
        title_fontsize, ax_label_fontsize, tick_fontsize: font sizes for the title,
            the axis labels, and the tick / colorbar labels respectively.
        cmap: colormap passed to ``plt.imshow``.
    """
    cooc_mat, outer_keys, inner_keys = coocurrence_dict_to_matrix(cooc_dict)

    fig = plt.figure(figsize=figsize)
    ax = fig.gca()
    plt.imshow(cooc_mat, cmap=cmap)

    # Column labels come from the inner keys and are rotated so long names do not overlap.
    plt.xticks(np.arange(len(inner_keys)), inner_keys, rotation='vertical', fontsize=tick_fontsize)
    ax.xaxis.set_ticks_position('top')
    # Left disabled: the x axis label itself stays at the bottom even though the ticks are on top.
    # ax.xaxis.set_label_position('top')
    plt.yticks(np.arange(len(outer_keys)), outer_keys, fontsize=tick_fontsize)
    cbar = plt.colorbar()
    cbar.ax.tick_params(labelsize=tick_fontsize)

    plt.title(title, fontsize=title_fontsize)
    plt.xlabel(xlabel, fontsize=ax_label_fontsize)
    plt.ylabel(ylabel, fontsize=ax_label_fontsize)
    plt.show()




# Potentially relevant plots to the paper

In [None]:
# Print the kernel's working directory (IPython shell escape).
# NOTE(review): looks like a debugging leftover — confirm whether this cell is still needed.
!pwd

In [None]:
# Object categories referenced, grouped by game type; the figure is also saved to disk.
single_ax_bar_chart(
    stats_df, 'game_type',
    lambda row: row.object_categories_referenced,
    swap_outer_inner_keys=True,
    normalize_columns=True,
    title='Use of different object categories by game type',
    xlabel='Game Type',
    figsize=(12, 6),
    save_path='object_categories_by_game_type.png',
)


In [None]:
# Game types grouped by the room the game was created in; the figure is also saved to disk.
single_ax_bar_chart(
    stats_df, 'room_name',
    lambda row: row.game_type_str,
    swap_outer_inner_keys=True,
    normalize_columns=True,
    inner_key_order=ROOM_NAMES,
    title='Game types by room',
    xlabel='Game type',
    figsize=(8, 6),
    save_path='game_types_by_room.png',
)


In [None]:
# Difficulty values grouped by room, with explicit orderings for both key sets.
single_ax_bar_chart(
    stats_df, 'room_name',
    lambda row: row.difficulty,
    inner_key_order=DIFFICULTIES,
    outer_key_order=ROOM_NAMES,
    title='Difficulty by room',
    xlabel='Room',
    figsize=(12, 6),
)


In [None]:
# Game types grouped by difficulty.
single_ax_bar_chart(
    stats_df, 'difficulty',
    lambda row: row.game_type_str,
    outer_key_order=DIFFICULTIES,
    title='Game types by difficulty',
    xlabel='Game type',
    figsize=(12, 6),
)


In [None]:
# Mean of raw_game_edited within each game type, drawn as a bar chart.
(
    stats_df
    .groupby('game_type_str')
    .raw_game_edited
    .mean()
    .plot(kind='bar', title='Probability of editing game by game type')
)

In [None]:
# Mean of raw_game_edited within each room. The original grouped by game_type_str,
# which only repeated the previous chart under a title about rooms.
(
    stats_df
    .groupby('room_name')
    .raw_game_edited
    .mean()
    .plot(kind='bar', title='Probability of editing game by room')
)

In [None]:
# The outer keys are the stacked, differently colored series (here, the object categories)
# and the inner keys are the labels along the x axis (here, the predicates).


def combine_nested_dict_results(df, nested_dict_field):
    """Merge the nested dicts stored in ``df[nested_dict_field]`` into one result dict.

    Every value of the column must be a mapping ``outer_key -> inner_results``; each
    pair is folded into the shared result with ``_add_new_bar_chart_results``.
    Returns the combined nested defaultdict (missing entries read as 0).
    """
    totals = defaultdict(lambda: defaultdict(int))

    for per_row_results in df[nested_dict_field]:
        for outer_key, inner_results in per_row_results.items():
            _add_new_bar_chart_results(totals, inner_results, outer_key)

    return totals

category_to_predicate_results_all_games = combine_nested_dict_results(stats_df, 'category_to_pred_counts')

# One fixed color per object category, shared by the charts that pass color_dict.
# `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` performs the same lookup and exists on both sides of that change.
tab10_colormap = plt.get_cmap('tab10')
OBJECT_CATEGORY_TO_COLOR = {obj_category: tab10_colormap(i) for i, obj_category in enumerate(CATEGORIES_TO_TYPES)}

single_ax_bar_chart_from_results(category_to_predicate_results_all_games, 
    title='Predicate use by object category', xlabel='Predicate', figsize=(12, 6), 
    vertical_bar_labels=True, color_dict=OBJECT_CATEGORY_TO_COLOR  
)


In [None]:
# Restrict to games whose game_type_str is exactly 'throwing'.
category_to_predicate_results_throwing = combine_nested_dict_results(
    stats_df[stats_df.game_type_str == 'throwing'],
    'category_to_pred_counts',
)

single_ax_bar_chart_from_results(
    category_to_predicate_results_throwing,
    title='Predicate use by object category in throwing games',
    xlabel='Predicate',
    figsize=(12, 6),
    vertical_bar_labels=True,
    color_dict=OBJECT_CATEGORY_TO_COLOR,
    save_path='throwing_only_predicate_use_by_object_category.png',
)

In [None]:
# Compare the flag against 1, as the other cells that filter on is_building do, so .loc
# always receives a boolean row mask. A 0/1 integer column passed directly would be
# interpreted as row labels rather than as a mask.
category_to_predicate_results_building = combine_nested_dict_results(
    stats_df.loc[stats_df.is_building == 1, :], 'category_to_pred_counts')

single_ax_bar_chart_from_results(category_to_predicate_results_building, 
    title='Predicate use by object category in games involving building', xlabel='Predicate', figsize=(12, 6), 
    vertical_bar_labels=True, color_dict=OBJECT_CATEGORY_TO_COLOR,
    save_path='all_building_predicate_use_by_object_category.png'
)

In [None]:
# Games whose game_type_str is exactly 'building'. Stored under its own name: the original
# assigned to category_to_predicate_results_throwing, overwriting the throwing results.
category_to_predicate_results_building_only = combine_nested_dict_results(
    stats_df.loc[stats_df.game_type_str == 'building', :], 'category_to_pred_counts')


single_ax_bar_chart_from_results(category_to_predicate_results_building_only, 
    title='Predicate use by object category in building games', xlabel='Predicate', figsize=(12, 6), 
    vertical_bar_labels=True, color_dict=OBJECT_CATEGORY_TO_COLOR,
    save_path='building_only_predicate_use_by_object_category.png'    
)

In [None]:
# Compare the flag against 1, as the other cells that filter on is_organizing do, so .loc
# always receives a boolean row mask. A 0/1 integer column passed directly would be
# interpreted as row labels rather than as a mask.
category_to_predicate_results_organizing = combine_nested_dict_results(
    stats_df.loc[stats_df.is_organizing == 1, :], 'category_to_pred_counts')

single_ax_bar_chart_from_results(category_to_predicate_results_organizing, 
    title='Predicate use by object category in games involving organizing', xlabel='Predicate', figsize=(12, 6),  
    vertical_bar_labels=True, color_dict=OBJECT_CATEGORY_TO_COLOR,
    save_path='all_organizing_predicate_use_by_object_category.png'
)

In [None]:
# Games whose game_type_str is exactly 'organizing'. Stored under its own name: the original
# assigned to category_to_predicate_results_throwing, overwriting the throwing results.
category_to_predicate_results_organizing_only = combine_nested_dict_results(
    stats_df.loc[stats_df.game_type_str == 'organizing', :], 'category_to_pred_counts')


single_ax_bar_chart_from_results(category_to_predicate_results_organizing_only, 
    title='Predicate use by object category in organizing games', xlabel='Predicate', figsize=(12, 6), 
    vertical_bar_labels=True, color_dict=OBJECT_CATEGORY_TO_COLOR,
    save_path='organizing_only_predicate_use_by_object_category.png'
)

## Context sensitivity analyses

In [None]:
# Mean of num_game_valid_rooms within each room, drawn as a bar chart.
(
    stats_df
    .groupby('room_name')
    .num_game_valid_rooms
    .mean()
    .plot(kind='bar', title='Average number of valid rooms by room')
)

In [None]:
# Entry (i, j) counts the games created in ROOM_NAMES[i] whose game_valid_rooms contains
# ROOM_NAMES[j]; each row is then divided by its diagonal entry.

# Build each "valid in room" mask once, as a NumPy boolean array: combining a Series with a
# plain Python list via `&` is deprecated in pandas, and the mask does not depend on i.
valid_in_room_masks = {
    candidate_room: np.array([candidate_room in rooms for rooms in stats_df.game_valid_rooms], dtype=bool)
    for candidate_room in ROOM_NAMES
}

room_name_validity_matrix = np.zeros((len(ROOM_NAMES), len(ROOM_NAMES)))
for i, room_name in enumerate(ROOM_NAMES):
    created_in_room = (stats_df.room_name == room_name).to_numpy()
    for j, other_room_name in enumerate(ROOM_NAMES):
        room_name_validity_matrix[i, j] = np.count_nonzero(created_in_room & valid_in_room_masks[other_room_name])

    # A zero diagonal (no game of this room is valid in it) would turn the whole row
    # into NaN/inf, so such rows are left as raw zero counts.
    if room_name_validity_matrix[i, i] > 0:
        room_name_validity_matrix[i, :] /= room_name_validity_matrix[i, i]


plt.matshow(room_name_validity_matrix, cmap='coolwarm_r')

plt.colorbar()
plt.xticks(np.arange(len(ROOM_NAMES)), ROOM_NAMES, rotation='vertical')
plt.yticks(np.arange(len(ROOM_NAMES)), ROOM_NAMES)

ax = plt.gca()
ax.tick_params(axis='both', which='both',length=0)

# Write each cell's value on top of the heatmap.
for (i, j), p in np.ndenumerate(room_name_validity_matrix):
    ax.text(j, i, '{:0.2f}'.format(p), ha='center', va='center', size=12)

plt.xlabel('Room valid in', fontsize=16)
plt.ylabel('Room created in', fontsize=16)
plt.title('Context Sensitivity', fontsize=20)

In [None]:
# Mean of num_game_valid_rooms within each game type, drawn as a bar chart.
(
    stats_df
    .groupby('game_type_str')
    .num_game_valid_rooms
    .mean()
    .plot(kind='bar', title='Average number of valid rooms by game type')
)

In [None]:
# Spot check: pred_to_type_counts for the row with index label 8.
# NOTE(review): the hard-coded label 8 looks like an ad-hoc inspection — confirm it is still wanted.
stats_df.pred_to_type_counts[8]

In [None]:
# Spot check: pred_to_category_counts and the is_building flag for the row with index label 8.
stats_df.pred_to_category_counts[8], stats_df.is_building[8]

# Between-game-type visualizations


In [None]:
# ast_nodes values, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    lambda row: row.ast_nodes,
    title='Total AST nodes by game type',
    xlabel='Total AST nodes',
    legend_loc='upper right',
)

In [None]:
# setup_nodes values, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    lambda row: row.setup_nodes,
    title='Total setup nodes by game type',
    xlabel='Total setup nodes',
    legend_loc='upper right',
)

In [None]:
# max_depth values, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    lambda row: row.max_depth,
    title='Max depth by game type',
    xlabel='Max depth',
    legend_loc='upper right',
)

In [None]:
# num_preferences values, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    lambda row: row.num_preferences,
    title='Number of preferences by game type',
    xlabel='# of preferences',
    legend_loc='upper right',
)

In [None]:
def extract_num_types_referenced(row):
    """Return how many keys ``row.object_types_referenced`` has, or 0 if it is not a dict."""
    referenced = row.object_types_referenced
    return len(referenced) if isinstance(referenced, dict) else 0


# Number of distinct object types referenced, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    extract_num_types_referenced,
    title='Total number of types referenced by game type',
    xlabel='# of types',
    legend_loc='upper right',
)

In [None]:
def extract_num_categories_referenced(row):
    """Return how many keys ``row.object_categories_referenced`` has, or 0 if it is not a dict."""
    referenced = row.object_categories_referenced
    return len(referenced) if isinstance(referenced, dict) else 0


# Number of distinct object categories referenced, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    extract_num_categories_referenced,
    title='Total number of object categories referenced by game type',
    xlabel='# of categories',
    legend_loc='upper right',
)

In [None]:
def extract_predicates_referenced(row):
    """Return how many keys ``row.predicates_referenced`` has, or 0 if it is not a dict."""
    referenced = row.predicates_referenced
    return len(referenced) if isinstance(referenced, dict) else 0


# Number of distinct predicates referenced, grouped by game_type.
single_ax_histograms(
    stats_df, 'game_type',
    extract_predicates_referenced,
    title='Total number of predicates referenced by game type',
    xlabel='# of predicates',
    legend_loc='upper left',
)

In the next two plots, and the two plots that follow, I plot references to types (in the first two) and predicates (in the second two) by game type. The first of each two plots counts individual references to each type (so if a game refers to a type three times, it adds 3), while the second of each two plots counts how many games refer to each type (so the same game referring to a type three times will only add 1).

In [None]:
# Per-type reference counts (a type referenced three times in a game adds 3), by game type.
single_ax_bar_chart(
    stats_df, 'game_type',
    lambda row: row.object_types_referenced,
    title='Total object type references by game type',
    xlabel='Object type',
    figsize=(12, 6),
)


In [None]:
def dict_keys_to_1(row):
    """Map every key of ``row.object_types_referenced`` to 1; return None if it is not a dict.

    NOTE(review): other cells redefine ``dict_keys_to_1`` for a different attribute,
    so the version in effect depends on which cell ran last.
    """
    referenced = row.object_types_referenced
    if not isinstance(referenced, dict):
        return None

    return dict.fromkeys(referenced, 1)

# Each game contributes at most 1 per object type, so bars count games rather than references.
single_ax_bar_chart(
    stats_df, 'game_type',
    dict_keys_to_1,
    title='Count of games referring to each object type',
    xlabel='Object type',
    figsize=(12, 6),
)


In [None]:
# Per-predicate reference counts (a predicate used three times in a game adds 3), by game type.
single_ax_bar_chart(
    stats_df, 'game_type',
    lambda row: row.predicates_referenced,
    title='Total predicate references by game type',
    xlabel='Predicate',
    figsize=(12, 6),
)


In [None]:
def dict_keys_to_1(row):
    """Map every key of ``row.predicates_referenced`` to 1; return None if it is not a dict.

    NOTE(review): other cells define ``dict_keys_to_1`` for a different attribute,
    so the version in effect depends on which cell ran last.
    """
    referenced = row.predicates_referenced
    if not isinstance(referenced, dict):
        return None

    return dict.fromkeys(referenced, 1)

# Each game contributes at most 1 per predicate, so bars count games rather than references.
single_ax_bar_chart(
    stats_df, 'game_type',
    dict_keys_to_1,
    title='Count of games referring to each predicate',
    xlabel='Predicate',
    figsize=(12, 6),
)


# Throwing game visualizations

* The first set of plots visualizes co-occurrence matrices between different game elements. For example:
    * Thrown object <> thrown game goal (to get something in, on, to hit another object, etc.)
    * Thrown object <> target object co-occurrence
    * Co-occurrences between various types of objects in the thrown game schema (the object thrown to, the object thrown from, objects and predicates, etc.)
* **In all of these, I don't currently control for the fact that some objects appear in more of the rooms than others -- I could count how many games I have in each room, note which objects appear in each room, and account for that in my analysis, right?**
* I can also generate analyses for throwing games like the analyses I generated for all games above. I'll generate a few of those plots, to give some examples, below the co-occurrence matrices. Some of the things I could plot include:
    * Number of preferences?
    * Average length/depth of preferences?
    * Max depth?
    * Total type references
    * Number of types referenced
    * **Would complexity of the setup be an interesting thing to quantify?**

In [None]:
# Keep only the rows flagged as involving throwing, then preview them.
is_throwing_game = stats_df.is_throwing == 1
throwing_df = stats_df[is_throwing_game]

throwing_df.head()

In [None]:
# (outer key, inner key, extra options) for every predefined throwing co-occurrence table.
# Table names are auto-generated by CoocurrenceDefinition from the keys and options.
THROWING_COOCURRENCE_DEFINITIONS = tuple(
    CoocurrenceDefinition(outer_key, inner_key, **options)
    for outer_key, inner_key, options in (
        ('objects', 'goal', {}),
        ('objects', 'goal', dict(use_categories_outer=True)),
        ('objects', 'to', {}),
        ('objects', 'to', dict(use_categories_inner=True)),
        ('objects', 'on', {}),
        ('objects', 'on', dict(use_categories_inner=True)),
        ('goal', 'to', {}),
        ('goal', 'to', dict(use_categories_inner=True)),
        ('on', 'to', {}),
        ('from', 'to', dict(outer_primary_obj_only=False, inner_primary_obj_only=False)),
    )
)

throwing_coocurrences = extract_all_coocurrences(
    throwing_df, 'throwing', THROWING_COOCURRENCE_DEFINITIONS)

# Show which tables were produced.
throwing_coocurrences.keys()

In [None]:
# Rows: thrown object category; columns: goal.
plot_coocurrence_data(
    throwing_coocurrences['objects_cat_goal'],
    title='Thrown category <> goal cocurrence',
    xlabel='Goal',
    ylabel='Thrown Object Category',
    figsize=(8, 8),
)

In [None]:
# Rows: thrown object; columns: goal.
plot_coocurrence_data(
    throwing_coocurrences['objects_goal'],
    title='Thrown object <> goal cocurrence',
    xlabel='Goal',
    ylabel='Thrown Object',
    figsize=(6, 8),
)

In [None]:
# Rows: thrown object; columns: target object.
plot_coocurrence_data(
    throwing_coocurrences['objects_to'],
    title='Thrown object <> target object cocurrence',
    xlabel='Object',
    ylabel='Thrown Object',
    figsize=(12, 6),
    cmap='gist_yarg',
)

In [None]:
# Rows: thrown object; columns: target object category.
plot_coocurrence_data(
    throwing_coocurrences['objects_to_cat'],
    title='Thrown object <> target object category',
    xlabel='Object Category',
    ylabel='Thrown Object',
    figsize=(6, 8),
)

In [None]:
# Rows: goal; columns: target object.
plot_coocurrence_data(
    throwing_coocurrences['goal_to'],
    title='Goal <> target object cocurrence',
    xlabel='Target Object',
    ylabel='Goal',
    figsize=(12, 4),
    cmap='gist_yarg',
)

In [None]:
# Rows: goal; columns: target object category.
plot_coocurrence_data(
    throwing_coocurrences['goal_to_cat'],
    title='Goal <> target object category',
    xlabel='Target Object Category',
    ylabel='Goal',
    figsize=(12, 4),
    cmap='gist_yarg',
)

In [None]:
# Rows: "on" object; columns: target object.
plot_coocurrence_data(
    throwing_coocurrences['on_to'],
    title='On object <> target object cocurrence',
    xlabel='Target Object',
    ylabel='On Object',
    figsize=(12, 4),
    cmap='gist_yarg',
)

In [None]:
# The from/to definition is created with outer_primary_obj_only=False and
# inner_primary_obj_only=False, so its auto-generated table name is 'from_all_to_all';
# looking up 'from_to' raises KeyError because no table is stored under that name.
plot_coocurrence_data(throwing_coocurrences['from_all_to_all'], 
    'From object <> target object cocurrence', 'Target Object', 'From Object', (12, 4), cmap='gist_yarg')

In [None]:
# Rows: object; columns: predicate, summed over all objects-with-predicates columns.
plot_coocurrence_data(
    throwing_coocurrences['all_object_predicate'],
    title='Object <> predicate cocurrence',
    xlabel='Predicate',
    ylabel='Object',
    figsize=(12, 8),
    cmap='gist_yarg',
)

In [None]:
# Object-by-object counts, summed over all objects-with-predicates columns.
plot_coocurrence_data(
    throwing_coocurrences['all_object_object'],
    title='Object <> object cocurrence',
    xlabel='Object',
    ylabel='Object',
    figsize=(12, 12),
    cmap='gist_yarg',
)

## Throwing game bar charts

In [None]:
def dict_keys_to_1(row):
    """Map every key of ``row.object_types_referenced`` to 1; return None if it is not a dict.

    NOTE(review): other cells define ``dict_keys_to_1`` for a different attribute,
    so the version in effect depends on which cell ran last.
    """
    referenced = row.object_types_referenced
    if not isinstance(referenced, dict):
        return None

    return dict.fromkeys(referenced, 1)

# Throwing games only, grouped by throwing_goal; each game adds at most 1 per object type.
single_ax_bar_chart(
    throwing_df, 'throwing_goal',
    dict_keys_to_1,
    title='Count of throwing games referring to each object type',
    xlabel='Object type',
    figsize=(12, 6),
)



In [None]:
def dict_keys_to_1(row):
    """Map every key of ``row.predicates_referenced`` to 1; return None if it is not a dict.

    NOTE(review): other cells define ``dict_keys_to_1`` for a different attribute,
    so the version in effect depends on which cell ran last.
    """
    referenced = row.predicates_referenced
    if not isinstance(referenced, dict):
        return None

    return dict.fromkeys(referenced, 1)

# Throwing games only, grouped by throwing_goal; each game adds at most 1 per predicate.
# The x axis lists predicates, so it is labelled 'Predicate' (it previously said 'Object type').
single_ax_bar_chart(throwing_df, 'throwing_goal', dict_keys_to_1,
    title='Count of throwing games referring to each predicate', xlabel='Predicate', figsize=(12, 6))



In [None]:
# num_preferences values for throwing games, grouped by throwing_goal.
# Title typo fixed: 'preferneces' -> 'preferences'.
single_ax_histograms(throwing_df, 'throwing_goal', lambda row: row.num_preferences, 
    title='Total number of preferences by throwing game type', xlabel='# of preferences', legend_loc='upper right')

In [None]:
# setup_nodes values for throwing games, grouped by throwing_goal.
single_ax_histograms(
    throwing_df, 'throwing_goal',
    lambda row: row.setup_nodes,
    title='Total setup nodes by throwing game type',
    xlabel='Total setup nodes',
    legend_loc='upper right',
)

In [None]:
# Subsets by game-kind flag; the final tuple shows how many rows each subset has.
building_df = stats_df[stats_df.is_building == 1]
organizing_df = stats_df[stats_df.is_organizing == 1]
non_throwing_df = stats_df[stats_df.is_throwing == 0]
building_df_or_organizing_df = stats_df[
    np.logical_or(stats_df.is_building == 1, stats_df.is_organizing == 1)
]

(
    len(building_df_or_organizing_df),
    len(building_df),
    len(organizing_df),
    len(non_throwing_df),
)

## Non-throwing game statistics

See the above cell -- we have a total of 9 games that don't involve any throwing, or alternatively, a total of 13 games that involve either building or organizing. 

**Is this enough to separately extract statistics over? Or too little? If yes, anything specific we want to see?**

In [None]:
# Number of rows for each distinct value of the `room` column.
# NOTE(review): other cells group by `room_name`; confirm `room` is the intended column here.
stats_df.room.value_counts()

In [None]:
# All games ordered by ascending ast_nodes, showing only the identifying columns.
(
    stats_df
    .sort_values('ast_nodes')
    .loc[:, ['game_name', 'ast_nodes', 'game_type']]
)

In [None]:
# Building games ordered by ascending ast_nodes.
sorted_df = stats_df.sort_values('ast_nodes')
sorted_df.loc[
    sorted_df.is_building == 1,
    ['game_name', 'ast_nodes', 'game_type'],
]

In [None]:
# Throwing games that have at least one setup node, ordered by ascending ast_nodes.
sorted_df = stats_df.sort_values('ast_nodes')
sorted_df.loc[
    (sorted_df.is_throwing == 1) & (sorted_df.setup_nodes > 0),
    ['game_name', 'ast_nodes', 'game_type'],
]

In [None]:
# Throwing games ordered by ascending ast_nodes, with the room they were created in.
sorted_df = stats_df.sort_values('ast_nodes')
sorted_df.loc[
    sorted_df.is_throwing == 1,
    ['game_name', 'room_name', 'ast_nodes', 'game_type'],
]

In [None]:
# Display the object_types_referenced column.
stats_df.object_types_referenced

In [None]:
# Preview repeated_structures_df.
# NOTE(review): this frame is not created in the visible part of the notebook — confirm
# it is defined in an earlier cell so that Restart & Run All still works.
repeated_structures_df.head()

In [None]:
# Histograms of `count`, one panel per structure_start value, laid out as 1 row x 3 columns.
# NOTE(review): layout=(1, 3) presumably matches three distinct structure_start values — confirm.
plt.figure(figsize=(12, 4))
ax = plt.gca()
repeated_structures_df.hist(column='count', ax=ax, grid=False, by='structure_start', layout=(1, 3))
# plt.suptitle('Repeated structure count')

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

def edit_distance_matrix(structure_df):
    """Return the square matrix of pairwise edit distances between structures.

    Entry ``[i, j]`` is ``editdistance.eval`` applied to the i-th and j-th values of
    ``structure_df.structure``; the matrix has one row and column per row of the frame.
    """
    structures = list(structure_df.structure)
    n_structures = len(structure_df)

    distances = np.zeros((n_structures, n_structures))
    for row_idx, row_structure in enumerate(structures):
        distances[row_idx, :] = [
            editdistance.eval(row_structure, col_structure)
            for col_structure in structures
        ]

    return distances


def edit_distance_and_dendrogram(structure_df):
    """Show the pairwise edit-distance heatmap and a single-linkage dendrogram.

    Args:
        structure_df: frame accepted by ``edit_distance_matrix`` (it reads the
            ``structure`` column).

    Two figures are shown: the distance matrix as an image, and the dendrogram of the
    hierarchical clustering computed from those distances.
    """
    # Use the argument: the original read the global repeated_structures_df here, so the
    # function silently ignored whatever frame the caller passed in.
    distances = edit_distance_matrix(structure_df)
    plt.figure(figsize=(8, 8))
    plt.imshow(distances, cmap='coolwarm', interpolation='nearest')
    plt.colorbar()
    plt.show()

    # linkage expects the condensed (upper-triangle) form of the distance matrix.
    square_distances = squareform(distances)
    linkage_matrix = linkage(square_distances, "single")

    plt.figure(figsize=(12, 8))
    dendrogram(linkage_matrix)
    plt.show()

edit_distance_and_dendrogram(repeated_structures_df)

In [None]:
# The ten rows with the largest `count`.
repeated_structures_df.sort_values('count', ascending=False).head(10)

In [None]:
# Sum of `count` over rows whose structure_start is not 'hold-while'.
repeated_structures_df[repeated_structures_df.structure_start != 'hold-while' ]['count'].sum()

In [None]:
# Shape of the subset with structure_start != 'hold-while' and count == 2
# (the first element is the number of such rows).
repeated_structures_df[(repeated_structures_df.structure_start != 'hold-while') & (repeated_structures_df['count'] == 2)].shape

In [None]:
# NOTE(review): hard-coded ratio; the operands are presumably copied from earlier cell
# outputs — confirm, and compute them from repeated_structures_df so the value cannot go stale.
148 / 260