In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import json
from collections import defaultdict, Counter, namedtuple
from itertools import combinations
from tabulate import tabulate
import sys
import json
import python_jsonschema_objects as pjs
PROJECT_PATH = '/Users/guydavidson/projects/game-generation-modeling'
sys.path.append(PROJECT_PATH)

In [41]:
from schema.validate_schema import load_and_validate_game_schema
SCHEMA_FILE = '../schema/game_schema_with_refs.json'
GAME_SCHEMAS_FILE = '../schema/interactive_beta.json'
with open(SCHEMA_FILE, 'r') as schema_file:
    schema = json.load(schema_file)
game_schemas = load_and_validate_game_schema(GAME_SCHEMAS_FILE, SCHEMA_FILE)

44


In [159]:
DEFAULT_RANDOM_SEED = 33


class PriorBase:
    def __init__(self, key, seed=DEFAULT_RANDOM_SEED):
        self.key = key
        self.rng = np.random.default_rng(seed)
        self.is_fit = False
        self.samples = []

    def add_observation(self, observation):
        self._add_observation(observation)
        self.is_fit = False
        
    def _add_observation(self, observation):
        raise NotImplementedError()

    def fit(self):
        self._fit()
        self.is_fit = True

    def _fit(self):
        raise NotImplementedError()

    def sample(self, sample_params=None):
        if sample_params is None:
            sample_params = dict()
        
        if self.is_fit:
            sample = self._sample(sample_params)
            self.samples.append(sample)
            return sample
        else:
            print('Call fit() before attempting to sample')
    
    def _sample(self, sample_params=None):
        raise NotImplementedError()


class ArrayPrior(PriorBase):
    def __init__(self, key, item_prior, seed=DEFAULT_RANDOM_SEED):
        super().__init__(key, seed)
        self.item_prior = item_prior
        self.lengths = []

        self.duplicates_observed = False
        self.lambda_mle = None

    def _add_observation(self, observation):
        if observation is None or not observation:
            self.lengths.append(0)

        elif isinstance(observation, list):
            self.lengths.append(len(observation))
        
            if hasattr(observation[0], '__hash__') and observation.__hash__ is not None and len(set(observation)) < len(observation):
                self.duplicates_observed = True

            for item in observation:
                self.item_prior.add_observation(item)

        else:
            raise ValueError(f'ArrayPrior expected to receive a list obervation, instead received a {type(observation)}: {observation}')

    def _fit(self):
        self.lambda_mle = np.mean(self.lengths)
        self.observed_lambda_mle = np.mean([l for l in self.lengths if l > 0])
        self.item_prior.fit()

    def _sample(self, sample_params=None):
        if 'required' in sample_params and sample_params['required']:
            sample_length = 0 
            while sample_length == 0:
                sample_length = self.rng.poisson(self.observed_lambda_mle)

        else:    
            sample_length = self.rng.poisson(self.lambda_mle)
            if sample_length == 0:
                return None

        array_prefix = f'[]{KEY_DELIMITER}'
        child_params = {key.replace(array_prefix, ''): value for key, value in sample_params.items() if key.startswith(array_prefix)}

        if self.duplicates_observed:
            return [self.item_prior.sample(child_params) for _ in range(sample_length)]

        sample = []
        while len(sample) < sample_length:
            candidate = self.item_prior.sample(child_params)
            if candidate not in sample:
                sample.append(candidate)

        return sample

DEFAULT_EPSILON = 0.05


class EnumPrior(PriorBase):
    def __init__(self, key, options, epsilon=DEFAULT_EPSILON, seed=DEFAULT_RANDOM_SEED):
        super().__init__(key, seed)
        self.epsilon = epsilon
        self.options = list(options)
        self.counts = {option: 0 for option in options}
        self.counts[None] = 0

        self.observed_options = []
        self.probabilities = None

    def _add_observation(self, observation):
        if observation is None:
            self.counts[None] += 1

        elif observation not in self.counts:
            raise ValueError(f'EnumItemPrior received an observation {observation} that was not in the initialized counts: {self.counts}')
        
        self.counts[observation] += 1

    def _fit(self):
        self.observed_options = [opt for opt in self.options if self.counts[opt] > 0]
        self.probabilities = np.array([self.counts[opt] for opt in self.observed_options])
        self.probabilities = self.probabilities / self.probabilities.sum()

    def _sample(self, sample_params=None):
        if self.rng.uniform() < self.epsilon:
            return self.rng.choice(self.options)
        
        return self.rng.choice(self.observed_options, p=self.probabilities)


REQUIRED = 'required'
OMIT = 'omit'


class ObjectItemPrior(PriorBase):
    def __init__(self, key, properties_to_priors, required_properties, keys_to_skip=None, seed=DEFAULT_RANDOM_SEED):
        super().__init__(key, seed)
        self.properties_to_priors = properties_to_priors
        self.required_properties = required_properties
        if keys_to_skip is None:
            keys_to_skip = []
        self.keys_to_skip = keys_to_skip
        
        self.observed_count = 0
        self.unobserved_count = 0

    def _add_observation(self, observation):
        if observation is None:
            self.unobserved_count += 1
            return
        
        for property in observation:
            if property not in self.properties_to_priors and property not in self.keys_to_skip:
                raise ValueError(f'ObjectItemPrior received unexpected property "{property}", expected only the following: {list(self.properties_to_priors.keys())}')

        for property in self.properties_to_priors:
            if property not in observation and property in self.required_properties:
                raise ValueError(f'ObjectItemPrior expected to receive required property "{property}", expecting the following: {list(self.properties_to_priors.keys())}')

            if property in observation:
                self.properties_to_priors[property].add_observation(observation[property])

            else:
                self.properties_to_priors[property].add_observation(None)
        
        self.observed_count += 1

    def _fit(self):
        total_count = self.observed_count + self.unobserved_count
        if total_count == 0:
            self.observe_p = 0

        else:
            self.observe_p = self.observed_count / total_count
        
        for prior in self.properties_to_priors.values():
            prior.fit()

    def _sample(self, sample_params=None):
        required = ('required' in sample_params and sample_params['required'])
        if required and self.observe_p == 0:
            raise ValueError(f'Cannot require to sample ObjectItemPrior {self.key} which has never been observed')

        if required or self.rng.uniform() < self.observe_p:
            sample = {}
            for property, prior in self.properties_to_priors.items():
                property_params = {}
                if property in sample_params:
                    if sample_params[property] == REQUIRED or property in self.required_properties:
                        property_params[REQUIRED] = True
                    elif sample_params[property] == OMIT:
                        continue
                    
                property_prefix = f'{property}{KEY_DELIMITER}'
                child_params = {key.replace(property_prefix, ''): value for key, value in sample_params.items() if key.startswith(property_prefix)}
                property_params.update(child_params)
                sample_value = prior.sample(property_params)
                if sample_value is not None:
                    sample[property] = sample_value

            return sample


class BooleanPrior(PriorBase):
    def __init__(self, key, default_value=None, epsilon=DEFAULT_EPSILON, seed=DEFAULT_RANDOM_SEED):
        super().__init__(key, seed)
        self.default_value = default_value
        self.epsilon = epsilon
        self.counts = {False: 0, True: 0}
        if self.default_value is None:
            self.counts[None] = 0

        self.observed_options = []
        self.probabilities = None

    def _add_observation(self, observation):
        if observation is None:
            if self.default_value is None:
                self.counts[None] += 1

            else:
                self.counts[self.default_value] += 1

        elif isinstance(observation, bool):
            self.counts[observation] += 1

        else:
            raise ValueError(f'BooleanPrior expected a boolean observation, received {observation}')

    def _fit(self):
        self.observed_options = [opt for opt in self.counts if self.counts[opt] > 0]
        self.probabilities = np.array([self.counts[opt] for opt in self.observed_options])
        self.probabilities = self.probabilities / self.probabilities.sum()

    def _sample(self, sample_params=None):
        if self.rng.uniform() < self.epsilon:
            sample = self.rng.choice(list(self.counts.keys()))
        else:
            sample = self.rng.choice(self.observed_options, p=self.probabilities)

        if self.default_value is not None and sample == self.default_value:
            return None
        
        return sample


DEFS_KEY = '$defs'
REF_KEY = '$ref'
KEY_DELIMITER = '/'
DEFAULT_KEYS_TO_SKIP = ('metadata',)


class SchemaPriorParser:
    def __init__(self, schema, keys_to_skip=DEFAULT_KEYS_TO_SKIP, kwargs_by_prior_class=None):
        self.schema_dict = {}
        self.schema = schema
        self.keys_to_skip = keys_to_skip

        self.kwargs_by_prior_class = defaultdict(dict)
        if kwargs_by_prior_class is not None:
            self.kwargs_by_prior_class.update(kwargs_by_prior_class)

        self.ref_prior_generators = {}
        self.ref_options = defaultdict(set)
        
        self._parse_defs()
        self.schema_prior = self._parse()

    def _ref_key(self, key):
        return f'#/{DEFS_KEY}/{key}'

    def _parse_defs(self):
        defs_section = self.schema[DEFS_KEY]

        key_to_ref_dependencies = {key: self._recursive_find_refs(defs_section[key]) for key in defs_section}
        
        while key_to_ref_dependencies:
            keys_with_no_unresolved_dependencies = [key for key, deps in key_to_ref_dependencies.items() if len(deps) == 0]

            for key in keys_with_no_unresolved_dependencies:
                self._resolve_refs(key, defs_section[key])
                del key_to_ref_dependencies[key]
                for deps in key_to_ref_dependencies.values():
                    ref_key = self._ref_key(key)
                    if ref_key in deps:
                        deps.remove(ref_key)

    def _recursive_find_refs(self, start_dict):
        frontier = [start_dict]
        refs = set()
        while frontier:
            current = frontier.pop()
            new_values = None

            if isinstance(current, dict):
                if REF_KEY in current:
                    refs.add(current[REF_KEY])

                new_values = current.values()

            elif isinstance(current, (list, tuple)):
                new_values = current

            frontier.extend([value for value in new_values if isinstance(value, (list, dict, tuple))])

        return refs

    def _resolve_refs(self, key, ref_def):
        ref_key = self._ref_key(key)

        if 'enum' in ref_def:
            self.ref_options[ref_key].update(ref_def['enum'])
            
            def gen(path_key):
                return EnumPrior(f'{path_key}{KEY_DELIMITER}{key}', self.ref_options[ref_key], **self.kwargs_by_prior_class[EnumPrior])

            self.ref_prior_generators[ref_key] = gen

        elif 'anyOf' in ref_def:
            for any_ref_def in ref_def['anyOf']:
                for any_key, any_value in any_ref_def.items():
                    if any_key == REF_KEY:
                        self.ref_options[ref_key].update(self.ref_options[any_value])
                    elif any_key == 'const':
                        self.ref_options[ref_key].add(any_value)
                    else:
                        raise ValueError(f'Encountered unexpected key/value pairing in anyOf: {any_key}: {any_value}')

            def gen(path_key):
                return EnumPrior(f'{path_key}{KEY_DELIMITER}{key}', self.ref_options[ref_key], **self.kwargs_by_prior_class[EnumPrior])

            self.ref_prior_generators[ref_key] = gen

        elif 'type' in ref_def:
            def gen(path_key):
                return self._parse(ref_def, f'{path_key}{KEY_DELIMITER}{key}')

            self.ref_prior_generators[ref_key] = gen

    def _parse_enum(self, schema_obj, key):
        if 'enum' not in schema_obj:
            raise ValueError('_parse_enum was called without an enum key in the object: {schema_obj}')

        options = schema_obj['enum']
        return EnumPrior(key, options, **self.kwargs_by_prior_class[EnumPrior])

    def _parse_array(self, schema_obj, key):
        if 'items' not in schema_obj:
            raise ValueError(f'_parse_array received array without items key, currently unsupported: {schema_obj}')

        sub_key = f'{key}{KEY_DELIMITER}[]'
        item_prior = self._parse(schema_obj['items'], sub_key)
        return ArrayPrior(sub_key, item_prior, **self.kwargs_by_prior_class[ArrayPrior])

    def _parse_object(self, schema_obj, key):
        if 'properties' not in schema_obj:
            raise ValueError(f'_parse_object was called without properties key, currently unsupported: {schema_obj}')

        properties_to_priors = {prop_key: self._parse(prop_value, f'{key}{KEY_DELIMITER}{prop_key}') 
            for prop_key, prop_value 
            in schema_obj['properties'].items()
            if prop_key not in self.keys_to_skip}
        reqiured_properties = schema_obj['required_properties'] if 'required_properties' in schema_obj else []
        return ObjectItemPrior(key, properties_to_priors, reqiured_properties, keys_to_skip=self.keys_to_skip, **self.kwargs_by_prior_class[ObjectItemPrior])

    def _parse_boolean(self, schema_obj, key):
        default_value = schema_obj['default'] if 'default' in schema_obj else None
        return BooleanPrior(key, default_value, **self.kwargs_by_prior_class[BooleanPrior])

    def _parse(self, schema_obj=None, key=None):
        if schema_obj is None:
            schema_obj = self.schema

        if key is None:
            key = ''

        if '$ref' in schema_obj:
            return self.ref_prior_generators[schema_obj['$ref']](key)

        if 'enum' in schema_obj:
            return self._parse_enum(schema_obj, key)

        if 'type' in schema_obj:
            schema_type = schema_obj['type']

            if schema_type == 'array':
                return self._parse_array(schema_obj, key)

            elif schema_type == 'object':
                return self._parse_object(schema_obj, key)

            elif schema_type == 'boolean':
                return self._parse_boolean(schema_obj, key)

        else:
            raise ValueError(f'Encountered schema which did not match any rule: {schema_obj}')



In [145]:
prior = SchemaPriorParser(schema).schema_prior
for game in game_schemas:
    prior.add_observation(game)
prior.fit()

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


## Next stages
* Prior-sampling:
    * Controllable sampling -- be able to sample a game with specific fields (at least the top-level ones)
    * Related: understand why I'm currently getting games with fields missing -- could it be that the sampling result is an empty dict, etc.? How can it happen with mandatory fields? 
    * Curate some games
* **Tree-based mutation:**
    * Keep a mapping of full path to value
    * Assuming I'll start by mutating same path => same path, rather than same type => same type
    * Implement replacements, additions (in arrays), and deletions (ditto)
    * Note I can also add a field that doesn't exist (e.g. from, on) or delete a non-required field entirely
    * Curate some games
* **Share samples of original games and both mutated types with Todd and Brenden**

In [146]:
org_p = prior.properties_to_priors['organizing']
b_p = prior.properties_to_priors['building']

In [149]:
prior.sample({'throwing': 'required', 'building': 'omit', 'throwing/from': 'required'})

{'throwing': {'what': ['golfball'],
  'from': [{'object': 'rug',
    'predicates': [{'object': 'desk', 'predicate': 'adjacent'}]}],
  'on': [{'object': 'curved_wooden_ramp',
    'predicates': [{'object': 'hexagonal_bin', 'predicate': 'touch'}]}],
  'goal': ['in'],
  'to': [{'object': 'hexagonal_bin',
    'predicates': [{'object': 'room_center', 'predicate': 'on'}]}]},
 'organizing': [{'what': ['desktop', 'alarm_clock', 'cellphone'],
   'to': [{'object': 'laptop'}]}]}

In [157]:
prior.sample({'throwing': 'omit', 'building': 'omit', 'organizing': 'required'})

{'organizing': [{'what': ['laptop', 'book'], 'from': {'object': 'room_side'}}]}

## Record full paths for mutation

In [160]:
def parse_game_by_keys(mapping, schema_obj, key=None, keys_to_skip=DEFAULT_KEYS_TO_SKIP):
    if key is None:
        key = ''

    if isinstance(schema_obj, dict):
        if key:
            mapping[key].append(schema_obj)
        for inner_key, inner_value in schema_obj.items():
            if inner_key in keys_to_skip: 
                continue

            new_key = f'{key}{KEY_DELIMITER}{inner_key}' if key else inner_key
            parse_game_by_keys(mapping, inner_value, new_key, keys_to_skip)

    elif isinstance(schema_obj, (list, tuple)):
        new_key = f'{key}{KEY_DELIMITER}[]'
        for inner_value in schema_obj:
            parse_game_by_keys(mapping, inner_value, new_key, keys_to_skip)

    else:
        mapping[key].append(schema_obj)


mapping = defaultdict(list)
for game in game_schemas:
    parse_game_by_keys(mapping, game)

In [161]:
mapping.keys()

dict_keys(['throwing', 'throwing/what/[]', 'throwing/on/[]', 'throwing/on/[]/object', 'throwing/on/[]/predicates/[]', 'throwing/on/[]/predicates/[]/object', 'throwing/on/[]/predicates/[]/predicate', 'throwing/goal/[]', 'throwing/to/[]', 'throwing/to/[]/object', 'throwing/from/[]', 'throwing/from/[]/object', 'throwing/from/[]/predicates/[]', 'throwing/from/[]/predicates/[]/predicate', 'throwing/to/[]/predicates/[]', 'throwing/to/[]/predicates/[]/predicate', 'building', 'building/objects/[]', 'building/structure', 'building/goal', 'throwing/to/[]/predicates/[]/object', 'building/order/[]', 'building/on', 'building/on/object', 'building/on/predicates/[]', 'building/on/predicates/[]/object', 'building/on/predicates/[]/predicate', 'organizing/[]', 'organizing/[]/what/[]', 'organizing/[]/to/[]', 'organizing/[]/to/[]/object', 'organizing/[]/to/[]/predicates/[]', 'organizing/[]/to/[]/predicates/[]/object', 'organizing/[]/to/[]/predicates/[]/predicate', 'organizing/[]/to/[]/predicates/[]/negate