In [None]:
%autoreload 2

In [None]:
from collections import defaultdict
from argparse import Namespace
from ast import literal_eval
import copy
import gzip
import itertools
import json
import math
import os
import pickle
import sys
import textwrap
import typing

import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

import duckdb
from IPython.display import display, Markdown, HTML  # type: ignore
import matplotlib
import matplotlib.axes
import matplotlib.pyplot as plt
from Levenshtein import distance as _edit_distance
import numpy as np
import pandas as pd
import tabulate
import tatsu
import tatsu.ast
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import tabulate
from tqdm.notebook import tqdm
from scipy import stats
from scipy.special import comb
import seaborn as sns
import sklearn
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.pipeline import Pipeline
from tqdm import tqdm

sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../src'))
from src.ast_utils import _extract_game_id, deepcopy_ast, replace_child
from src.ast_printer import ast_to_lines
from src import fitness_energy_utils as utils
from src.fitness_energy_utils import NON_FEATURE_COLUMNS
from src.fitness_features import *
from src.ast_counter_sampler import *
from src.evolutionary_sampler import *
from src import fitness_features_by_category, latest_model_paths

In [None]:
# Read the DSL grammar with a context manager so the file handle is closed deterministically
# (the previous bare `open(...).read()` left it to the garbage collector).
with open('../dsl/dsl.ebnf') as grammar_file:
    grammar = grammar_file.read()

# Compile the grammar and parse every game in the interactive-beta file into an AST.
grammar_parser = tatsu.compile(grammar)
game_asts = list(cached_load_and_parse_games_from_file('../dsl/interactive-beta.pddl', grammar_parser, False, relative_path='..'))

In [None]:

# Render each parsed game AST back to text (the '\n' argument is presumably the line separator -- confirm).
real_game_texts = [ast_printer.ast_to_string(ast, '\n') for ast in game_asts]


# regrown_game_texts = list(load_games_from_file('../dsl/ast-real-regrowth-samples.pddl'))
# regrown_game_1024_texts = list(load_games_from_file('../dsl/ast-real-regrowth-samples-1024.pddl'))
# print(len(real_game_texts), len(regrown_game_texts), len(regrown_game_texts) / 98, len(regrown_game_1024_texts), len(regrown_game_1024_texts) / 98)

# Load the fitness-feature table and list which source files contributed rows.
# NOTE(review): the file name suggests 1024 regrowth samples per real game -- confirm.
fitness_df = utils.load_fitness_data('../data/fitness_features_1024_regrowths.csv.gz')
print(fitness_df.src_file.unique())
fitness_df.head()

In [None]:
# trace_filter_results_path = '../samples/trace_filter_results_max_exemplar_preferences_by_bcs_with_expected_values_2023_11_29_2023_12_05_1.pkl.gz'
# Load the MAP-Elites model registered under `model_key`.
model_key = 'max_exemplar_preferences_by_bcs_with_expected_values'
model_spec = latest_model_paths.MAP_ELITES_MODELS[model_key]
model = typing.cast(MAPElitesSampler, model_spec.load())

# Score every real game and record which archive cell key it falls into:
#   key_to_real_game_index: key -> list of game indices sharing that key
#   real_game_index_to_key: game index -> key
#   real_game_fitness_scores / ALL_REAL_GAME_KEYS: per-game score and key, in game order
key_to_real_game_index = defaultdict(list)
real_game_index_to_key = {}
real_game_fitness_scores = []
ALL_REAL_GAME_KEYS = []
for i, ast in enumerate(game_asts):
    # Deliberately NOT named `fitness_score`: a helper function with that name is defined later
    # in this notebook, and re-running this cell would otherwise overwrite it with a number.
    real_game_score, features = model._score_proposal(ast, return_features=True)  # type: ignore
    real_game_fitness_scores.append(real_game_score)
    key = model._features_to_key(ast, features)
    key_to_real_game_index[key].append(i)
    real_game_index_to_key[i] = key
    ALL_REAL_GAME_KEYS.append(key)

# Trace filter results for the loaded model; the last expression displays its top-level keys.
trace_filter_results = model_spec.load_trace_filter_data()
trace_filter_results.keys()

In [None]:
# Indices (into game_asts) of the real games included in this analysis.
# NOTE(review): presumably the games used in the human evaluation dataset -- confirm.
REAL_GAME_INDICES_TO_INCLUDE = [
    0, 4, 6, 7, 11,
    14, 17, 23, 26, 28,
    31, 32, 35, 37, 40,
    41, 42, 45, 49, 51,
    52, 55, 58, 59, 64,
    74, 88, 90, 94, 96,
]

# Archive cell key of each included real game, in the order listed above.
REAL_GAME_KEY_LIST = [real_game_index_to_key[i] for i in REAL_GAME_INDICES_TO_INCLUDE]
# Maps each key to its POSITION within REAL_GAME_KEY_LIST (0..len-1), not to the game's index in game_asts.
REAL_GAME_KEY_DICT = {key: i for i, key in enumerate(REAL_GAME_KEY_LIST)}
# Set form of the keys; its size is smaller than the list if two included games share a key.
REAL_GAME_KEYS = set(REAL_GAME_KEY_LIST)
print(len(REAL_GAME_KEYS))

In [None]:
# Thirty archive cell keys (12-element tuples) used further down to restrict the novel-archive-cell
# translations when UNMATCHED_ONLY_TOP_30 is set.
# NOTE(review): presumably the 30 top-scoring cells with no corresponding real game -- confirm.
UNMATCHED_TOP_30_KEYS = [
    (1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0),
    (1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1),
    (1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),
    (1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0),
    (1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0),
    (1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0),
    (1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0),
    (1, 1, 2, 0, 0, 0, 0, 0, 1, 0, 0, 0),
    (1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0),
    (1, 1, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1),
    (1, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0),
    (1, 0, 2, 1, 0, 0, 1, 0, 0, 0, 0, 0),
    (1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0),
    (1, 1, 2, 0, 0, 0, 1, 0, 1, 0, 0, 0),
    (1, 1, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0),
    (1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, 0),
    (1, 1, 3, 0, 0, 2, 0, 0, 0, 0, 0, 0),
    (1, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0),
    (1, 0, 3, 0, 0, 0, 0, 0, 1, 0, 0, 0),
    (1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    (1, 1, 3, 0, 1, 0, 0, 0, 0, 0, 0, 0),
    (1, 1, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0),
    (1, 0, 4, 0, 1, 1, 0, 1, 0, 1, 0, 0),
    (1, 0, 4, 0, 0, 0, 0, 0, 3, 0, 0, 0),
    (1, 1, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0),
    (1, 1, 4, 0, 0, 1, 1, 1, 0, 1, 0, 0),
    (1, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 1),
    (1, 1, 4, 0, 2, 0, 0, 0, 1, 0, 0, 0),
    (1, 1, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    (1, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0)
]

In [None]:
TRANSLATIONS_DIR = '../llm_tests/translations'
TRANSLATION_DATE = '2024_01_12'
UNMATCHED_ONLY_TOP_30 = True


def _load_translations(path: str) -> dict:
    """Load one translations JSON file and turn its string keys back into Python literals.

    The JSON object keys are parsed with `literal_eval`; the values are returned unchanged.
    """
    with open(path) as translations_file:
        raw_translations = json.load(translations_file)

    return {literal_eval(raw_key): text for raw_key, text in raw_translations.items()}


human_game_texts = _load_translations(f'{TRANSLATIONS_DIR}/human_games_translations_split_{TRANSLATION_DATE}.json')
human_cell_archive_game_texts = _load_translations(f'{TRANSLATIONS_DIR}/human_cell_archive_games_translations_split_{TRANSLATION_DATE}.json')
novel_archive_cell_game_texts = _load_translations(f'{TRANSLATIONS_DIR}/novel_archive_cell_games_translations_split_{TRANSLATION_DATE}.json')

# Optionally keep only the novel archive cells listed in UNMATCHED_TOP_30_KEYS.
if UNMATCHED_ONLY_TOP_30:
    novel_archive_cell_game_texts = {k: v for k, v in novel_archive_cell_game_texts.items() if k in UNMATCHED_TOP_30_KEYS}

### Map each archive cell key to the relevant AST

@gdrtodd -- we could also pull in more human games, etc., but these are the ones from the human eval dataset.

In [None]:
# Map each included real game's archive key to that game's AST.
# This must index game_asts by the actual game index: REAL_GAME_KEY_DICT stores each key's position
# within REAL_GAME_KEY_LIST (0..N-1), so indexing game_asts with those values would silently pick
# the first N games of the file instead of the games in REAL_GAME_INDICES_TO_INCLUDE.
real_game_key_to_ast = {real_game_index_to_key[i]: game_asts[i] for i in REAL_GAME_INDICES_TO_INCLUDE}

# Generated games come straight from the model's archive, looked up by cell key.
matched_game_key_to_ast = {key: model.population[key] for key in human_cell_archive_game_texts.keys()}
unmatched_game_key_to_ast = {key: model.population[key] for key in novel_archive_cell_game_texts.keys()}

### Extract fitness features for a game

This returns a dict where each key is a fitness feature name and each value is the value of that feature

If for some reason you want all features, rather than the ones that the model used, set `only_used=False`

In [None]:
def extract_fitness_features(ast: tatsu.ast.AST, only_used: bool = True):
    """Return the fitness features of a game as a {feature name: value} dict.

    With `only_used=True` (the default) the result is restricted to the names in
    `model.feature_names`; with `only_used=False` every feature the model's featurizer
    produced is returned.
    """
    all_features = model._proposal_to_features(ast)
    if not only_used:
        return all_features

    return {name: value for name, value in all_features.items() if name in model.feature_names}


extract_fitness_features(real_game_key_to_ast[REAL_GAME_KEY_LIST[0]])

I don't know why you'd ever want a game's features as a Tensor, but just in case

In [None]:
def extract_fitness_tensor(ast: tatsu.ast.AST):
    """Return a game's features as a tensor, built from the full (unfiltered) feature dict."""
    return model._features_to_tensor(extract_fitness_features(ast, False))


def fitness_score(ast: tatsu.ast.AST):
    """Return the model's score for a game, without the accompanying features."""
    score = model._score_proposal(ast, return_features=False)
    return score

fitness_score(real_game_key_to_ast[REAL_GAME_KEY_LIST[0]])

The below explicitly maps a game to its BCs, mostly useful in case you want to know which BC index is which feature

In [None]:
def game_to_behavioral_feature_dict(ast: tatsu.ast.AST):
    """Return the output of the model's custom featurizer for a game.

    The featurizer is given the AST together with the game's full (unfiltered) fitness features.
    """
    all_features = extract_fitness_features(ast, False)
    behavioral_features = model.custom_featurizer.get_game_features(ast, all_features)
    return behavioral_features


game_to_behavioral_feature_dict(real_game_key_to_ast[REAL_GAME_KEY_LIST[0]])



# Comparisons Between Real and Matched Games

### Average Cosine Similarity in Feature Space

In [None]:
def _stack_fitness_tensors(asts):
    """Stack the fitness tensors of the given ASTs into one tensor, one row per game."""
    return torch.stack([extract_fitness_tensor(ast) for ast in asts])


# Row order follows REAL_GAME_KEY_LIST and the key order of human_cell_archive_game_texts respectively.
real_game_fitness_tensors = _stack_fitness_tensors(real_game_key_to_ast[key] for key in REAL_GAME_KEY_LIST)
matched_game_fitness_tensors = _stack_fitness_tensors(matched_game_key_to_ast[key] for key in human_cell_archive_game_texts.keys())

In [None]:
def _cosine(first, second) -> float:
    """Cosine similarity of two 1-D feature tensors as a Python float."""
    return F.cosine_similarity(first, second, dim=0).item()


n_real_games = len(REAL_GAME_KEY_LIST)
n_matched_games = len(human_cell_archive_game_texts)

# Every unordered pair of real games
real_game_similarities = np.array([
    _cosine(real_game_fitness_tensors[i], real_game_fitness_tensors[j])
    for i, j in itertools.combinations(range(n_real_games), 2)
])

# Every unordered pair of matched games
matched_game_similarities = np.array([
    _cosine(matched_game_fitness_tensors[i], matched_game_fitness_tensors[j])
    for i, j in itertools.combinations(range(n_matched_games), 2)
])

# Every (real game, matched game) pair
real_matched_game_similarities = np.array([
    _cosine(real_game_fitness_tensors[i], matched_game_fitness_tensors[j])
    for i, j in itertools.product(range(n_real_games), range(n_matched_games))
])

# Report mean +/- standard deviation for each group
print(f'Average cosine similarity between real games: {real_game_similarities.mean()} +/- {real_game_similarities.std()}')
print(f'Average cosine similarity between matched games: {matched_game_similarities.mean()} +/- {matched_game_similarities.std()}')
print(f'Average cosine similarity between real and matched games: {real_matched_game_similarities.mean()} +/- {real_matched_game_similarities.std()}')


### Number of Differing Features

In [None]:
def _count_non_ngram_differences(first_features, second_features) -> int:
    """Count the model features whose values differ between two games, ignoring ngram features."""
    return sum(
        1
        for name in model.feature_names
        if first_features[name] != second_features[name] and 'ngram' not in name
    )


# One count per archive cell that has both a real game and a matched generated game
differences = np.array([
    _count_non_ngram_differences(
        extract_fitness_features(real_game_key_to_ast[key]),
        extract_fitness_features(matched_game_key_to_ast[key]),
    )
    for key in human_cell_archive_game_texts.keys()
])
print(f'Average number of differing positions excluding ngram features: {differences.mean()} +/- {differences.std()} (of {len(model.feature_names)} total)')

### [+] Which Features Differ the Most?

### Training a Discriminator Between Real and Corresponding Matched Games

### [+] Edit Distance (both on anonymized ASTs and on Stage 1 Translations)

### [+] Reward Machine: Are the Same Preferences Activated from the Same Traces?

In [None]:
# Reuse the `utils` alias (src.fitness_energy_utils) imported at the top of the notebook instead of
# importing the same module a second time, under a different name, in the middle of the notebook.
# The bare `load_data` name is kept so anything that refers to it keeps working.
load_data = utils.load_data

# Trace filter results computed for the human-authored games file.
human_games_trace_filter_data = load_data('', 'samples', f'/trace_filter_results_interactive-beta.pddl_2024_03_19', relative_path='..')
human_games_trace_filter_data.keys()

In [None]:
def get_activating_traces(filter_info, key, exclude_setup=False):
    """Collect, for one game, the set of traces that activate each of its sub-ASTs.

    Args:
        filter_info: mapping with a 'full' entry of the form
            {game key: {sub-AST name: {trace: activation count}}}.
        key: the game key to look up in filter_info['full'].
        exclude_setup: if True, drop every sub-AST whose name contains 'setup'.

    Returns:
        A dict mapping each retained sub-AST name to the set of traces with activation > 0,
        plus two aggregate entries: 'all' (traces activating every retained sub-AST) and
        'any' (traces activating at least one). Both aggregates are empty sets when no
        sub-AST is retained.

    Raises:
        KeyError: if `key` is not present in filter_info['full'].
    """
    sub_ast_to_activating_traces = {
        sub_ast: {trace for trace, activation in trace_activations.items() if activation > 0}
        for sub_ast, trace_activations in filter_info['full'][key].items()
        if not (exclude_setup and 'setup' in sub_ast)
    }

    # Snapshot the per-sub-AST sets before the aggregate entries are added.
    per_sub_ast_traces = list(sub_ast_to_activating_traces.values())

    # set.intersection(*[]) raises TypeError, which used to crash on games left with no sub-ASTs
    # (e.g. only a setup section with exclude_setup=True); treat that case as "no traces".
    sub_ast_to_activating_traces['all'] = set.intersection(*per_sub_ast_traces) if per_sub_ast_traces else set()
    sub_ast_to_activating_traces['any'] = set().union(*per_sub_ast_traces)

    return sub_ast_to_activating_traces

In [None]:
# Re-key the human games' trace filter data by archive cell key instead of game index,
# keeping only the games listed in REAL_GAME_INDICES_TO_INCLUDE.
remapped_human_games_trace_filter_data = {
    "full": {
        real_game_index_to_key[real_game_idx]: per_game_filter_data
        for real_game_idx, per_game_filter_data in human_games_trace_filter_data['full'].items()
        if real_game_idx in REAL_GAME_INDICES_TO_INCLUDE
    }
}

In [None]:
# Spot check: show the entries recorded under the '(:setup' sub-AST for one hard-coded cell key.
remapped_human_games_trace_filter_data['full'][(1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0)]['(:setup'].keys()

In [None]:
def _compare_trace_sets(generated_traces, human_traces):
    """Return the overlap and both one-sided differences of two sets of activating traces."""
    return {
        'intersection': generated_traces.intersection(human_traces),
        'union': generated_traces.union(human_traces),
        'generated_minus_human': generated_traces - human_traces,
        'human_minus_generated': human_traces - generated_traces,
    }


# For every matched archive cell, compare the generated game's activating traces with those of
# the human game in the same cell, under both the 'any' and the 'all' aggregation.
trace_comparison_results = {}
for key in human_cell_archive_game_texts.keys():
    activating_traces = get_activating_traces(trace_filter_results, key)

    if key not in remapped_human_games_trace_filter_data['full']:
        print(f'Key {key} not in remapped_human_games_trace')
        continue

    corresponding_human_activating_traces = get_activating_traces(remapped_human_games_trace_filter_data, key)

    trace_comparison_results[key] = {
        ast_type: _compare_trace_sets(activating_traces[ast_type], corresponding_human_activating_traces[ast_type])
        for ast_type in ['any', 'all']
    }

In [None]:
# For each compared cell, split the traces that activate at least one sub-AST ('any' aggregation)
# in either game into: unique to the generated game, unique to the human game, and shared.
prop_unique_generated_any = []
prop_unique_human_any = []
prop_unique_either_any = []
prop_not_unique_any = []
jaccard_similarities = []

for key in trace_comparison_results.keys():
    results = trace_comparison_results[key]['any']

    all_trace_count = len(results['union'])
    if all_trace_count == 0:
        # Same guard as the 'all' aggregation uses: with no activating traces in either game the
        # proportions are undefined, and dividing would raise ZeroDivisionError.
        print(f'Games at {key} have no traces that activate any sub-AST in either game')
        continue

    prop_unique_generated_any.append(len(results['generated_minus_human']) / all_trace_count)
    prop_unique_human_any.append(len(results['human_minus_generated']) / all_trace_count)
    prop_unique_either_any.append(len(results['generated_minus_human'].union(results['human_minus_generated'])) / all_trace_count)
    prop_not_unique_any.append(len(results['intersection']) / all_trace_count)

    jaccard_similarities.append(len(results['intersection']) / len(results['union']))

# Plot histograms of the three proportions side by side
fig, ax = plt.subplots(1, 3, figsize=(18, 4))
ax[0].hist(prop_unique_generated_any, bins=10)
ax[0].set_xlabel('Proportion of activated traces unique to generated game')
ax[0].set_ylabel('Number of cells')

ax[1].hist(prop_unique_human_any, bins=10, color='g')
ax[1].set_xlabel('Proportion activating traces unique to human game')
ax[1].set_ylabel('Number of cells')

ax[2].hist(prop_unique_either_any, bins=10, color='r')
ax[2].set_xlabel('Proportion activating traces unique to *either* game')
ax[2].set_ylabel('Number of cells')

plt.show()

# Then show the same proportions as bar plots, one bar per cell, sorted in ascending order
# (np.argsort sorts from smallest to largest)
fig, ax = plt.subplots(1, 3, figsize=(18, 4))
prop_unique_generated_any = np.array(prop_unique_generated_any)
prop_unique_human_any = np.array(prop_unique_human_any)
prop_unique_either_any = np.array(prop_unique_either_any)
prop_not_unique_any = np.array(prop_not_unique_any)

sorted_indices = np.argsort(prop_unique_generated_any)
ax[0].bar(np.arange(len(prop_unique_generated_any)), prop_unique_generated_any[sorted_indices], color='b')

sorted_indices = np.argsort(prop_unique_human_any)
ax[1].bar(np.arange(len(prop_unique_human_any)), prop_unique_human_any[sorted_indices], color='g')

sorted_indices = np.argsort(prop_unique_either_any)
ax[2].bar(np.arange(len(prop_unique_either_any)), prop_unique_either_any[sorted_indices], color='r')

plt.show()


In [None]:
# Stacked bar per archive cell: share of activating traces unique to the generated game, unique to
# the human game, and shared by both. Cells are ordered by their shared proportion (ascending).
fig, ax = plt.subplots(figsize=(12, 6))

indices_sorted_by_not_unique = np.argsort(prop_not_unique_any)

generated_heights = prop_unique_generated_any[indices_sorted_by_not_unique]
human_heights = prop_unique_human_any[indices_sorted_by_not_unique]
shared_heights = prop_not_unique_any[indices_sorted_by_not_unique]

ax.bar(
    np.arange(len(prop_unique_generated_any)),
    generated_heights,
    color='dodgerblue',
    label='Unique to generated game',
)
ax.bar(
    np.arange(len(prop_unique_human_any)),
    human_heights,
    bottom=generated_heights,
    color='lightgreen',
    label='Unique to human game',
)
ax.bar(
    np.arange(len(prop_not_unique_any)),
    shared_heights,
    bottom=generated_heights + human_heights,
    color='slateblue',
    label='Shared',
)

# The x positions are just sort ranks, so hide the ticks
ax.set_xticks([])

ax.set_xlabel('Archive cells')
ax.set_ylabel('Proportion of activating traces')
ax.legend()
plt.show()

In [None]:
# Summary statistics, computed once and reused for both the plot and the printout
mean_jaccard_similarity = np.mean(jaccard_similarities)
median_jaccard_similarity = np.median(jaccard_similarities)

# Histogram of the per-cell Jaccard similarities, with the mean marked by a dashed red line
plt.hist(jaccard_similarities, bins=10)
plt.axvline(mean_jaccard_similarity, color='r', linestyle='--')

plt.xlabel('Jaccard similarity')
plt.ylabel('Number of cells')
plt.title('Jaccard Similarity of Activating Traces Between Human and Generated Games')

print(f"Average and median Jaccard similarity: {mean_jaccard_similarity}, {median_jaccard_similarity}")

In [None]:
# Same breakdown as for 'any', but using the 'all' aggregation: only traces that activate every
# sub-AST of a game are considered. Cells with no such trace in either game are reported and skipped.
prop_unique_generated_all = []
prop_unique_human_all = []
prop_unique_either_all = []

for key, comparison in trace_comparison_results.items():
    results = comparison['all']

    all_trace_count = len(results['union'])
    if all_trace_count == 0:
        print(f'Games at {key} have no traces that activate setup and all preferences in either game')
        continue

    only_generated = results['generated_minus_human']
    only_human = results['human_minus_generated']

    prop_unique_generated_all.append(len(only_generated) / all_trace_count)
    prop_unique_human_all.append(len(only_human) / all_trace_count)
    prop_unique_either_all.append(len(only_generated.union(only_human)) / all_trace_count)

# One histogram per proportion; color=None keeps matplotlib's default color for the first panel
histogram_panels = [
    (prop_unique_generated_all, 'Proportion of activated traces unique to generated game', None),
    (prop_unique_human_all, 'Proportion activating traces unique to human game', 'g'),
    (prop_unique_either_all, 'Proportion activating traces unique to *either* game', 'r'),
]

fig, ax = plt.subplots(1, 3, figsize=(18, 4))
for panel_ax, (proportions, x_label, bar_color) in zip(ax, histogram_panels):
    panel_ax.hist(proportions, bins=10, color=bar_color)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Number of cells')

plt.show()

## [+] Most Similar Generated Game to Specific Real Games Using Reward Machine

In [None]:
# Keys of every generated game under consideration: the matched cells first, then the unmatched ones
all_generated_game_keys = [*matched_game_key_to_ast, *unmatched_game_key_to_ast]

In [None]:
# Archive cell key of every real game (a live dict view; a key repeats if several games share a cell).
all_human_game_keys = real_game_index_to_key.values()

In [None]:
# Archive cell keys of the *generated* games for which we want the most similar real game: the
# search cell below looks these keys up in trace_filter_results and reports "Gen. key --> real key".
TARGET_KEYS = [
    (1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0), # matched 14
    (1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0), # matched 31
    (1, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0), # matched 40

    (1, 0, 2, 0, 1, 0, 0, 0, 0, 1, 0, 0), # unmatched (place the bin near the north wall...)
    (1, 1, 3, 1, 0, 0, 1, 0, 0, 0, 1, 0), # unmatched (credit cards and CDs)
    (1, 1, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0), # unmatched (block stacking)
]

In [None]:
# Spot check: number of activating traces per sub-AST (plus 'all' / 'any') for one generated game
example_activating_traces = get_activating_traces(trace_filter_results, (1, 0, 2, 0, 0, 1, 0, 0, 0, 0, 1, 0))
for sub_ast_name in example_activating_traces:
    print(sub_ast_name, len(example_activating_traces[sub_ast_name]))

In [None]:
EXCLUDE_SETUP = True
AGGREGATION = "all"


def _jaccard_with_sizes(first_traces, second_traces):
    """Return (Jaccard similarity, intersection size, union size); similarity is 0 for an empty union."""
    intersection_size = len(first_traces.intersection(second_traces))
    union_size = len(first_traces.union(second_traces))
    similarity = intersection_size / union_size if union_size > 0 else 0
    return similarity, intersection_size, union_size


# For each target (generated) key, find the real game whose activating traces overlap most with it.
# A real game only becomes the best match if its similarity is strictly greater than the current
# best, so targets with no overlapping real game end up paired with None.
paired_keys = []
print(f"CLOSEST MATCH RESULTS (aggregation = {AGGREGATION}, exclude setup = {EXCLUDE_SETUP})")
for target in TARGET_KEYS:
    target_activating_traces = get_activating_traces(trace_filter_results, target, EXCLUDE_SETUP)[AGGREGATION]

    best_key, best_jaccard_similarity = None, 0
    best_union, best_intersection = None, None
    for key in all_human_game_keys:
        if key in remapped_human_games_trace_filter_data['full']:
            activating_traces = get_activating_traces(remapped_human_games_trace_filter_data, key, EXCLUDE_SETUP)[AGGREGATION]
            jaccard_similarity, n_intersection, n_union = _jaccard_with_sizes(target_activating_traces, activating_traces)

            if jaccard_similarity > best_jaccard_similarity:
                best_key, best_jaccard_similarity = key, jaccard_similarity
                best_union, best_intersection = n_union, n_intersection

    print(f"Gen. key {target} --> real key {best_key} (sim = {best_jaccard_similarity:.2f}, {best_intersection} / {best_union})")
    paired_keys.append((target, best_key))
    

In [None]:
# Print, for each (generated, closest real) pair, how many traces activate each sub-AST of both games.
for generated_key, real_game_key in paired_keys:
    print("\n\n" + "=" * 80)
    print(f"[GEN] Preference breakdown for {generated_key}")
    for key, value in get_activating_traces(trace_filter_results, generated_key).items():
        print(key, len(value))

    # The closest-match search pairs a generated key with None when no real game had a similarity
    # above zero; looking None up in the trace filter data would raise KeyError.
    if real_game_key is None:
        print(f"[REAL] No matching real game was found for {generated_key}")
        continue

    print(f"[REAL] Preference breakdown for {real_game_key}")
    for key, value in get_activating_traces(remapped_human_games_trace_filter_data, real_game_key).items():
        print(key, len(value))

## [+] Archive Diversity Computation Using Reward Machine

In [None]:
# Load the "no custom ops" ablation model from the registry of MAP-Elites model specs.
ablation_no_custom_ops_key = 'ablation_max_exemplar_preferences_by_bcs_with_expected_values_no_custom_ops'
ablation_no_custom_ops_model_spec = latest_model_paths.MAP_ELITES_MODELS[ablation_no_custom_ops_key]
ablation_no_custom_ops_model = typing.cast(MAPElitesSampler, ablation_no_custom_ops_model_spec.load())

# Load the "no custom ops, no crossover" ablation model the same way.
ablation_no_custom_ops_no_crossover_key = 'ablation_max_exemplar_preferences_by_bcs_with_expected_values_no_custom_ops_no_crossover'
ablation_no_custom_ops_no_crossover_model_spec = latest_model_paths.MAP_ELITES_MODELS[ablation_no_custom_ops_no_crossover_key]
ablation_no_custom_ops_no_crossover_model = typing.cast(MAPElitesSampler, ablation_no_custom_ops_no_crossover_model_spec.load())

In [None]:
# Trace filter results for each ablation, loaded through the corresponding model spec.
ablation_no_custom_ops_trace_filter_data = ablation_no_custom_ops_model_spec.load_trace_filter_data()
ablation_no_custom_ops_no_crossover_model_trace_filter_data = ablation_no_custom_ops_no_crossover_model_spec.load_trace_filter_data()

In [None]:
# Display name -> trace filter results, for the full model and the two ablations.
# The insertion order here determines the order of the groups in the plots and printouts below.
crossover_ablation_trace_filter_mapping = {
    "Full Model": trace_filter_results,
    "No Custom Ops": ablation_no_custom_ops_trace_filter_data,
    "No Custom Ops No Crossover": ablation_no_custom_ops_no_crossover_model_trace_filter_data
}

In [None]:
# For each model variant, compute the Jaccard similarity between the 'any' activating-trace sets of
# every pair of archive cells.
crossover_ablation_pairwise_similarities_mapping = {}

for ablation_key in crossover_ablation_trace_filter_mapping.keys():
    ablation_trace_filter_data = crossover_ablation_trace_filter_mapping[ablation_key]

    valid_keys = [key for key in ablation_trace_filter_data['full'].keys() if key[0] == 1] # filter out keys with the invalid BC active

    ablation_activating_traces = {
        key: get_activating_traces(ablation_trace_filter_data, key)
        for key in valid_keys
    }

    pairwise_jaccard_similarities = []
    # exact=True yields an exact Python int; the default returns a float, which makes the progress
    # bar display a fractional total.
    total = comb(len(valid_keys), 2, exact=True)

    for key1, key2 in tqdm(itertools.combinations(valid_keys, 2), desc="Computing pairwise similarities", total=total):
        traces1 = ablation_activating_traces[key1]['any']
        traces2 = ablation_activating_traces[key2]['any']

        intersection = traces1.intersection(traces2)
        union = traces1.union(traces2)

        # TODO: how to handle the case where the union is empty?
        # For now such pairs are skipped, so they do not contribute to the distribution.
        if len(union) < 1:
            continue

        pairwise_jaccard_similarities.append(len(intersection) / len(union))

    crossover_ablation_pairwise_similarities_mapping[ablation_key] = pairwise_jaccard_similarities

    

In [None]:
# Sanity check: number of distinct similarity values versus number of pairs, per model variant
for key, similarities in crossover_ablation_pairwise_similarities_mapping.items():
    n_distinct_values = len(set(similarities))
    print(f"{key}: {n_distinct_values} unique, {len(similarities)} total")

In [None]:
tick_labels = ['Full Model', 'No Custom Ops', 'No Custom Ops\nNo Crossover']

# One distribution per model variant, in the insertion order of the mapping
similarity_distributions = list(crossover_ablation_pairwise_similarities_mapping.values())

# Violin plot of the pairwise Jaccard similarities, overlaid with the mean and its 95% CI
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=similarity_distributions, ax=ax, inner='quart', alpha=0.75)
sns.pointplot(
    data=list(crossover_ablation_pairwise_similarities_mapping.values()),
    errorbar=('ci', 95),
    linestyle='none',
    color='black',
    markers='d',
    markersize=15,
    ax=ax,
)
ax.set_xticklabels(tick_labels)
ax.set_ylabel('Jaccard similarity')
ax.set_xlabel('Ablation')


In [None]:
print("Number of Activating Traces for Games Under Each Ablation")

# Model variant -> list with the size of each valid cell's 'any' activating-trace set
num_traces_per_cell = {}
for ablation_key, ablation_trace_filter_data in crossover_ablation_trace_filter_mapping.items():
    valid_keys = [key for key in ablation_trace_filter_data['full'].keys() if key[0] == 1] # filter out keys with the invalid BC active

    ablation_activating_traces = {
        key: get_activating_traces(ablation_trace_filter_data, key)
        for key in valid_keys
    }

    num_traces = [len(ablation_activating_traces[key]['any']) for key in valid_keys]
    num_traces_per_cell[ablation_key] = num_traces

    average_num_traces = np.mean(num_traces)
    std_num_traces = np.std(num_traces)
    median_num_traces = np.median(num_traces)

    print(f"{ablation_key}: {average_num_traces} +/- {std_num_traces}, median {median_num_traces}")

# T-Tests between each pair of ablations
# from scipy.stats import ttest_rel
# for key1, key2 in itertools.combinations(crossover_ablation_pairwise_similarities_mapping.keys(), 2):
#     nums1 = num_traces_per_cell[key1]
#     nums2 = num_traces_per_cell[key2]

#     t_stat, p_val = ttest_rel(nums1, nums2)
#     print(f"T-Test between {key1} and {key2}: t={t_stat}, p={p_val}")

In [None]:
tick_labels = ['Full Model', 'No Custom Ops', 'No Custom Ops\nNo Crossover']

# One list of per-cell trace counts per model variant, in the insertion order of the mapping
trace_count_distributions = list(num_traces_per_cell.values())

# Violin plot of the number of activating traces per cell, overlaid with the mean and its 95% CI
fig, ax = plt.subplots(figsize=(8, 6))
sns.violinplot(data=trace_count_distributions, ax=ax, inner='quart', alpha=0.75)
sns.pointplot(
    data=list(num_traces_per_cell.values()),
    errorbar=('ci', 95),
    linestyle='none',
    color='black',
    markers='d',
    markersize=15,
    ax=ax,
)
ax.set_xticklabels(tick_labels)
ax.set_ylabel('Number of activating traces')
ax.set_xlabel('Ablation')

In [None]:
print("Number of *Unique* Activating Traces for Games Under Each Ablation")
for ablation_key in crossover_ablation_trace_filter_mapping.keys():
    ablation_trace_filter_data = crossover_ablation_trace_filter_mapping[ablation_key]

    valid_keys = [key for key in ablation_trace_filter_data['full'].keys() if key[0] == 1] # filter out keys with the invalid BC active

    ablation_activating_traces = {
        key: get_activating_traces(ablation_trace_filter_data, key)
        for key in valid_keys
    }

    # Count, for every trace, how many cells have it in their 'all' set. A trace is unique to a
    # cell exactly when that count is 1. This replaces rebuilding the union of all other cells'
    # traces for every cell (quadratic in the number of cells), and also works with a single
    # valid cell, where set.union(*[]) used to raise TypeError.
    cells_per_trace = defaultdict(int)
    for key in valid_keys:
        for trace in ablation_activating_traces[key]['all']:
            cells_per_trace[trace] += 1

    num_unique = []
    for key in tqdm(valid_keys, desc="Computing unique traces"):
        traces = ablation_activating_traces[key]['all']
        num_unique.append(sum(1 for trace in traces if cells_per_trace[trace] == 1))

    average_num_unique = np.mean(num_unique)
    std_num_unique = np.std(num_unique)
    median_num_unique = np.median(num_unique)

    print(f"{ablation_key}: {average_num_unique} +/- {std_num_unique}, median {median_num_unique}")