In [1]:
%autoreload 2

In [27]:
from argparse import Namespace
from collections import defaultdict
import copy
import difflib
import gzip
import itertools
import os
import pickle
import sys
import typing

from IPython.display import display, Markdown, HTML
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tatsu
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
import tqdm.notebook as tqdm
import sklearn
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.pipeline import Pipeline

sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../src'))
from src import fitness_energy_utils as utils
from src.fitness_energy_utils import NON_FEATURE_COLUMNS
from src.ast_counter_sampler import *
from src.ast_utils import cached_load_and_parse_games_from_file, load_games_from_file, _extract_game_id
from src import ast_printer

In [3]:
# Load the fitness feature table, list the source files it contains, and preview it.
fitness_df = utils.load_fitness_data()
print(fitness_df['src_file'].unique())
fitness_df.head()

['interactive-beta.pddl' 'ast-real-regrowth-samples.pddl']


Unnamed: 0,Index,src_file,game_name,domain_name,all_variables_defined,all_variables_used,all_preferences_used,setup_objects_used,no_adjacent_once,starts_and_ends_once,...,mean_depth_setup,mean_depth_constraints,mean_depth_terminal,mean_depth_scoring,node_count_setup,node_count_constraints,node_count_terminal,node_count_scoring,real,original_game_name
0,0,interactive-beta.pddl,6172feb1665491d1efbce164-0,medium-objects-room-v1,1.0,1.0,1.0,1.0,1.0,0.5,...,5.73913,8.538462,2.6,1.5,23,78,10,4,True,6172feb1665491d1efbce164-0
1,1,interactive-beta.pddl,5f77754ba932fb2c4ba181d8-2,many-objects-room-v1,1.0,1.0,1.0,1.0,1.0,1.0,...,3.0,8.710843,2.6,4.921053,10,83,10,38,True,5f77754ba932fb2c4ba181d8-2
2,2,interactive-beta.pddl,614b603d4da88384282967a7-3,many-objects-room-v1,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,5.611111,0.0,2.5,0,18,0,6,True,614b603d4da88384282967a7-3
3,3,interactive-beta.pddl,5bc79f652885710001a0e82a-5,few-objects-room-v1,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,9.215686,0.0,1.5,0,51,0,4,True,5bc79f652885710001a0e82a-5
4,4,interactive-beta.pddl,614dec67f6eb129c3a77defd-6,medium-objects-room-v1,1.0,1.0,1.0,0.25,1.0,1.0,...,5.264706,9.622449,0.0,4.53125,34,98,0,32,True,614dec67f6eb129c3a77defd-6


In [4]:
def create_filtered_df(df: pd.DataFrame, 
    synthetic_data_src_files: typing.Sequence[str] = ('interactive-beta.pddl', 'ast-real-regrowth-samples.pddl'),
    ) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``src_file`` is one of the given files.

    The result is a new frame with a fresh 0..n-1 index; ``df`` itself is not
    modified. Rows coming from the first entry of ``synthetic_data_src_files``
    have their ``real`` column set to 1.

    Args:
        df: Table with at least ``src_file`` and ``real`` columns.
        synthetic_data_src_files: Source file names to keep. The first entry
            identifies the rows that are marked ``real = 1``.

    Returns:
        The filtered copy of ``df``.
    """
    # Filter the frame that was passed in (not a notebook-level global), so the
    # function gives the right answer for any caller-supplied table.
    synthetic_df = df[df.src_file.isin(synthetic_data_src_files)].reset_index(drop=True)
    synthetic_df.loc[synthetic_df.src_file == synthetic_data_src_files[0], 'real'] = 1
    return synthetic_df

# Filtered copy of the fitness table, using the default source-file selection.
filtered_fitness_df = create_filtered_df(fitness_df)

In [5]:
# Preview the first rows of the filtered table.
filtered_fitness_df.head()

Unnamed: 0,Index,src_file,game_name,domain_name,all_variables_defined,all_variables_used,all_preferences_used,setup_objects_used,no_adjacent_once,starts_and_ends_once,...,mean_depth_setup,mean_depth_constraints,mean_depth_terminal,mean_depth_scoring,node_count_setup,node_count_constraints,node_count_terminal,node_count_scoring,real,original_game_name
0,0,interactive-beta.pddl,6172feb1665491d1efbce164-0,medium-objects-room-v1,1.0,1.0,1.0,1.0,1.0,0.5,...,5.73913,8.538462,2.6,1.5,23,78,10,4,1,6172feb1665491d1efbce164-0
1,1,interactive-beta.pddl,5f77754ba932fb2c4ba181d8-2,many-objects-room-v1,1.0,1.0,1.0,1.0,1.0,1.0,...,3.0,8.710843,2.6,4.921053,10,83,10,38,1,5f77754ba932fb2c4ba181d8-2
2,2,interactive-beta.pddl,614b603d4da88384282967a7-3,many-objects-room-v1,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,5.611111,0.0,2.5,0,18,0,6,1,614b603d4da88384282967a7-3
3,3,interactive-beta.pddl,5bc79f652885710001a0e82a-5,few-objects-room-v1,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,9.215686,0.0,1.5,0,51,0,4,1,5bc79f652885710001a0e82a-5
4,4,interactive-beta.pddl,614dec67f6eb129c3a77defd-6,medium-objects-room-v1,1.0,1.0,1.0,0.25,1.0,1.0,...,5.264706,9.622449,0.0,4.53125,34,98,0,32,1,614dec67f6eb129c3a77defd-6


In [6]:
# Rows that come from the regrowth-samples file.
fake_games = filtered_fitness_df[filtered_fitness_df.src_file == 'ast-real-regrowth-samples.pddl'].reset_index()

# Build a game_name -> first row label lookup once, instead of re-scanning the
# whole table for every regrown game. Iterating in reverse makes the earliest
# row win when a name occurs more than once, matching a "first match" lookup.
first_row_by_game_name = dict(zip(filtered_fitness_df.game_name.values[::-1], filtered_fitness_df.index[::-1]))

# One row per regrown game, holding the row whose game_name equals that game's original_game_name.
broadcasted_original = filtered_fitness_df.loc[[first_row_by_game_name[original_name] for original_name in fake_games.original_game_name], :].reset_index()

# Feature-wise difference between each original row and its regrown counterpart.
original_regrown_diffs = (broadcasted_original.drop(NON_FEATURE_COLUMNS, axis=1) - fake_games.drop(NON_FEATURE_COLUMNS, axis=1))

# The 'index' column is an artifact of reset_index() and is excluded from the comparison.
unchanged_games_prop = (original_regrown_diffs.drop('index', axis=1) == 0).all(axis=1).sum() / len(original_regrown_diffs)
print(f'In {unchanged_games_prop * 100:.2f}% of the games, the regrown game was identical to the original game.')

In 11.13% of the games, the regrown game was identical to the original game.


# Synthetic model-fitting experiment approach
Now that we have a large dataset, I can cross-validate over some of the modeling choices I might make:
* Loss function
* Parameters of each loss function (margin, etc.)
* Batch size
* Number of negative examples sampled per iteration
* Regularization strength?
* ...?



In [7]:
def cross_validate(train: pd.DataFrame, feature_columns: typing.List[str],
    param_grid: typing.Union[typing.List[typing.Dict[str, typing.Any]], typing.Dict[str, typing.Any]],
    scoring_function: typing.Callable = utils.evaluate_fitness,
    model_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None,
    train_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None, 
    cv_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None,
    n_folds: int = 5, verbose: int = 0):
    """Grid-search a scaler + fitness-model pipeline with shuffled K-fold CV.

    The caller's ``param_grid`` and ``cv_kwargs`` are left untouched: defaults
    and the ``fitness__n_features`` entry are applied to copies, so the same
    dictionaries can be reused across calls with different settings.

    Args:
        train: Training rows; converted to a tensor with ``utils.df_to_tensor``.
        feature_columns: Columns of ``train`` used as model inputs.
        param_grid: A single grid (dict) or a list of grids for ``GridSearchCV``.
            Every grid gets ``fitness__n_features = [len(feature_columns)]``.
        scoring_function: Passed to ``GridSearchCV`` as ``scoring``.
        model_kwargs: Forwarded to ``utils.SklearnFitnessWrapper``.
        train_kwargs: Forwarded to ``utils.SklearnFitnessWrapper``. If it has a
            ``random_seed`` entry, that value also seeds the fold shuffling.
        cv_kwargs: Extra keyword arguments for ``GridSearchCV``. ``n_jobs``
            defaults to -1 and ``verbose`` to the ``verbose`` argument when
            they are not given here.
        n_folds: Number of K-fold splits.
        verbose: Default verbosity for ``GridSearchCV``.

    Returns:
        Whatever ``GridSearchCV.fit`` returns for the training tensor.
    """
    if model_kwargs is None:
        model_kwargs = {}

    if train_kwargs is None:
        train_kwargs = {}

    # Copy before filling in defaults: writing into the caller's dict would make
    # the first call's 'verbose'/'n_jobs' stick for every later call that
    # reuses the same cv_kwargs object.
    cv_kwargs = dict(cv_kwargs) if cv_kwargs is not None else {}
    cv_kwargs.setdefault('n_jobs', -1)
    cv_kwargs.setdefault('verbose', verbose)

    train_tensor = utils.df_to_tensor(train, feature_columns)
    pipeline = Pipeline(steps=[('scaler', utils.CustomSklearnScaler()), ('fitness', utils.SklearnFitnessWrapper(model_kwargs=model_kwargs, train_kwargs=train_kwargs))])

    # Add the feature count to (shallow copies of) every grid rather than
    # mutating the grids the caller handed in.
    n_features_entry = {'fitness__n_features': [len(feature_columns)]}
    if isinstance(param_grid, list):
        param_grid = [{**param_grid_dict, **n_features_entry} for param_grid_dict in param_grid]
    else:
        param_grid = {**param_grid, **n_features_entry}

    random_seed = train_kwargs.get('random_seed')

    cv = GridSearchCV(pipeline, param_grid, scoring=scoring_function, 
        cv=KFold(n_folds, shuffle=True, random_state=random_seed), 
        **cv_kwargs)
    return cv.fit(train_tensor, None)


def model_fitting_experiment(df: pd.DataFrame,
    param_grid: typing.Union[typing.List[typing.Dict[str, typing.Any]], typing.Dict[str, typing.Any]], 
    feature_columns: typing.Optional[typing.List[str]] = None, 
    random_seed: int = utils.DEFAULT_RANDOM_SEED,
    scoring_function: typing.Callable = utils.evaluate_fitness,
    model_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None,
    train_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None,
    cv_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None,
    n_folds: int = 5, verbose: int = 0
    ):
    """Filter the data, split it by game name, and cross-validate on the train part.

    Args:
        df: Fitness table handed to ``create_filtered_df``.
        param_grid: Grid (or list of grids) forwarded to ``cross_validate``.
        feature_columns: Model input columns; defaults to every column of the
            filtered table that is not in ``NON_FEATURE_COLUMNS``.
        random_seed: Seed for the train/test split; also supplied to
            ``cross_validate`` as ``train_kwargs['random_seed']`` unless
            ``train_kwargs`` already provides one.
        scoring_function: Forwarded to ``cross_validate``.
        model_kwargs: Forwarded to ``cross_validate``.
        train_kwargs: Forwarded to ``cross_validate`` (merged with the seed).
        cv_kwargs: Forwarded to ``cross_validate``.
        n_folds: Forwarded to ``cross_validate``.
        verbose: Forwarded to ``cross_validate``.

    Returns:
        ``(cv, test_df)``: the result of ``cross_validate`` and the held-out test split.
    """
    model_kwargs = {} if model_kwargs is None else model_kwargs
    train_kwargs = {} if train_kwargs is None else train_kwargs

    synthetic_df = create_filtered_df(df)

    if feature_columns is None:
        feature_columns = [
            column for column in synthetic_df.columns
            if column not in NON_FEATURE_COLUMNS
        ]

    train_df, test_df = utils.train_test_split_by_game_name(synthetic_df, random_seed=random_seed)

    # Entries in train_kwargs take precedence over the default seed entry.
    seeded_train_kwargs = dict(random_seed=random_seed)
    seeded_train_kwargs.update(train_kwargs)

    cv = cross_validate(
        train_df,
        feature_columns,
        param_grid,
        scoring_function=scoring_function,
        train_kwargs=seeded_train_kwargs,
        model_kwargs=model_kwargs,
        cv_kwargs=cv_kwargs,
        n_folds=n_folds,
        verbose=verbose,
    )

    # TODO: add evaluation on the held-out part of the dataset here
    return cv, test_df



In [8]:
# Hyperparameter grids for the cross-validated search. Keys use the
# '<pipeline step>__<parameter>' form, targeting the 'fitness' pipeline step.
test_param_grid = [
    {
        'fitness__loss_function': [utils.fitness_hinge_loss_with_cross_example],
        'fitness__weight_decay': [0.0, 0.25, 1, 2],  
        'fitness__margin': [1, 2, 4, 8, 16],
        'fitness__lr': [1e-1, 3e-2, 1e-2, 3e-3],
        'fitness__k': [32, 64, 128],  # [16, 32, 64, 128],
        'fitness__batch_size': [2, 4, 8, 16],  # [1, 4, 8, 16],
        'fitness__alpha': [0, 0.1, 0.2, 0.3],  # [0, 0.25, 0.5, 0.75, 1],
    },
    # NOTE(review): every entry of the next grid is commented out, so it is an
    # empty dict. It appears to still contribute one candidate (the run reports
    # 3841 candidates = 3840 from the first grid + 1) -- confirm this extra
    # all-defaults candidate is intended.
    {
    #     'fitness__loss_function': [utils.fitness_log_loss],
    #     'fitness__weight_decay': [0.0, 0.125, 0.25, 0.5, 1],  
    #     'fitness__lr': [1e-2, 3e-3, 1e-3, 3e-4],
    #     'fitness__k': [16, 32, 64, 128],
    #     'fitness__batch_size': [1, 4, 8, 16],
    # },
    # {
    #     'fitness__loss_function': [utils.fitness_square_square_loss],
    #     'fitness__weight_decay': [0.0, 0.125, 0.25, 0.5, 1],  
    #     'fitness__margin': [1, 2, 4],
    #     'fitness__lr': [1e-2, 3e-3, 1e-3, 3e-4],
    #     'fitness__k': [16, 32, 64, 128],
    #     'fitness__batch_size': [1, 4, 8, 16],
    },
    
]

all_feature_columns = [c for c in filtered_fitness_df.columns if c not in NON_FEATURE_COLUMNS]
model_kwargs = dict(output_activation=nn.Identity())
train_kwargs = dict()
# With two named scorers, refit names the one used to pick the best estimator.
cv_kwargs = dict(refit='overall_ecdf')
# Combine two evaluation functions under the names used in cv_results_ keys.
scoring = utils.build_multiple_scoring_function(
    [utils.evaluate_fitness_overall_ecdf, utils.evaluate_fitness_single_game_rank],
    ['overall_ecdf', 'single_game_rank'],
)

# Run the search; test_df is the held-out split returned alongside the search object.
cv, test_df = model_fitting_experiment(fitness_df, test_param_grid,
    scoring_function=scoring, verbose=1, 
    model_kwargs=model_kwargs, train_kwargs=train_kwargs, cv_kwargs=cv_kwargs)


Fitting 5 folds for each of 3841 candidates, totalling 19205 fits




In [9]:
# One row per hyperparameter candidate: its parameters followed by the mean,
# standard deviation and rank of each test score.
cv_df = pd.concat(
    [
        pd.DataFrame(cv.cv_results_["params"]),
        *(
            pd.DataFrame(cv.cv_results_[result_key], columns=[column_name])
            for result_key, column_name in (
                ("mean_test_overall_ecdf", 'ecdf_mean'),
                ("std_test_overall_ecdf", 'ecdf_std'),
                ("rank_test_overall_ecdf", 'ecdf_rank'),
                ("mean_test_single_game_rank", 'game_rank_mean'),
                ("std_test_single_game_rank", 'game_rank_std'),
                ("rank_test_single_game_rank", 'game_rank_rank'),
            )
        ),
    ],
    axis=1,
)

# Ten best candidates by the overall-ECDF ranking.
cv_df.sort_values(by='ecdf_rank').head(10)

Unnamed: 0,fitness__alpha,fitness__batch_size,fitness__k,fitness__loss_function,fitness__lr,fitness__margin,fitness__n_features,fitness__weight_decay,ecdf_mean,ecdf_std,ecdf_rank,game_rank_mean,game_rank_std,game_rank_rank
1396,0.1,4.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.349669,0.013739,1,0.82278,0.01533,8
2176,0.2,4.0,32.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,16.0,637,0.0,-0.351975,0.024274,2,0.818535,0.015734,80
2492,0.2,8.0,64.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,8.0,637,0.0,-0.352183,0.017712,3,0.815807,0.011878,155
2812,0.2,16.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,8.0,637,0.0,-0.352193,0.022594,4,0.818724,0.01807,75
2105,0.2,2.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,2.0,637,0.25,-0.353201,0.013059,5,0.819395,0.016009,48
3536,0.3,8.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,16.0,637,0.0,-0.353611,0.023927,6,0.819121,0.018374,57
3076,0.3,2.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.353613,0.016485,7,0.818092,0.017409,96
676,0.0,8.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.35384,0.016235,8,0.814622,0.016468,201
356,0.0,4.0,64.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.354218,0.028299,9,0.818822,0.013135,72
1232,0.1,4.0,32.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,8.0,637,0.0,-0.354349,0.011832,10,0.817526,0.014408,111


In [10]:
# Ten best candidates by the single-game-rank ranking.
cv_df.sort_values(by='game_rank_rank').head(10)

Unnamed: 0,fitness__alpha,fitness__batch_size,fitness__k,fitness__loss_function,fitness__lr,fitness__margin,fitness__n_features,fitness__weight_decay,ecdf_mean,ecdf_std,ecdf_rank,game_rank_mean,game_rank_std,game_rank_rank
3068,0.3,2.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,4.0,637,0.0,-0.384064,0.022245,434,0.82571,0.01754,1
2028,0.2,2.0,64.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,4.0,637,0.0,-0.362012,0.021653,65,0.82446,0.01927,2
3072,0.3,2.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,8.0,637,0.0,-0.385641,0.02014,472,0.8236,0.012728,3
896,0.0,16.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,16.0,637,0.0,-0.355094,0.017712,13,0.823314,0.015734,4
28,0.0,2.0,32.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,4.0,637,0.0,-0.380788,0.033998,366,0.823138,0.0181,5
1616,0.1,8.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,16.0,637,0.0,-0.370034,0.027585,192,0.822969,0.01317,6
1065,0.1,2.0,64.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,2.0,637,0.25,-0.364534,0.019764,94,0.822904,0.013646,7
1396,0.1,4.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.349669,0.013739,1,0.82278,0.01533,8
1216,0.1,4.0,32.0,<function fitness_hinge_loss_with_cross_exampl...,0.1,16.0,637,0.0,-0.356968,0.023505,22,0.82276,0.01403,9
436,0.0,4.0,128.0,<function fitness_hinge_loss_with_cross_exampl...,0.03,16.0,637,0.0,-0.36673,0.025724,130,0.822747,0.016797,10


In [11]:
# Refit the best pipeline found by the search on the full filtered dataset.
filtered_df = create_filtered_df(fitness_df)
feature_columns = [c for c in filtered_df.columns if c not in NON_FEATURE_COLUMNS]
full_tensor = utils.df_to_tensor(filtered_df, feature_columns)

cv.best_estimator_.fit(full_tensor)

original_output_path = '../models/cv_fitness_model_with_cross_example_2022_12_05.pkl.gz'

# Make sure the target directory exists so the save does not fail on a fresh checkout.
os.makedirs(os.path.dirname(original_output_path), exist_ok=True)

# Never overwrite an earlier model: append _0, _1, ... until the name is unused.
output_path = original_output_path
i = 0
while os.path.exists(output_path):
    output_path = original_output_path + f'_{i}'
    i += 1

with gzip.open(output_path, 'wb') as f:
    pickle.dump(cv.best_estimator_, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# List every column of the fitness table that holds a single distinct value.
count = 0
for column in fitness_df.columns:
    unique_values = fitness_df[column].unique()
    if len(unique_values) != 1:
        continue

    print(f'{column}: {unique_values}')
    count += 1

print(f'{count} columns with only one unique value')

In [None]:
# Rebuild the filtered dataset and split it twice by game name:
# first into train/test, then the train part into train/validation.
syntethic_fitness_df = create_filtered_df(fitness_df)
feature_columns = [
    column for column in syntethic_fitness_df.columns
    if column not in NON_FEATURE_COLUMNS
]
train_df, test_df = utils.train_test_split_by_game_name(syntethic_fitness_df, random_seed=42)
train_df, val_df = utils.train_test_split_by_game_name(train_df, random_seed=42)

train_tensor, val_tensor, test_tensor = (
    utils.df_to_tensor(split_df, feature_columns)
    for split_df in (train_df, val_df, test_df)
)

# The scaler is fitted on the training tensor only, then applied to all three splits.
scaler = utils.CustomSklearnScaler().fit(train_tensor)
train_tensor, val_tensor, test_tensor = (
    scaler.transform(split_tensor)
    for split_tensor in (train_tensor, val_tensor, test_tensor)
)

# Model input width is the size of the last tensor dimension.
model = utils.FitnessEenrgyModel(train_tensor.shape[-1], output_activation=nn.Identity())
model.apply(utils.init_weights)

results = utils.train_and_validate_model(
    model,
    train_tensor,
    val_tensor,
    loss_function=utils.fitness_hinge_loss,
    loss_function_kwargs=dict(margin=4.0),
    lr=1e-2,
    weight_decay=0.0,
    batch_size=1,
    k=32,
    patience_epochs=10,
    random_seed=42,
    eval_method=utils.evaluate_fitness_flipped_sign,
)

In [None]:
# Run the trained model on the (scaled) validation tensor.
scores = model(val_tensor)
# Split column 0 from the remaining columns; the trailing None re-inserts an
# axis where the integer index removed one.
# NOTE(review): presumably column 0 is the real game and columns 1: are its
# negatives -- confirm against the tensor layout produced by utils.
pos = scores[:, 0, None]
neg = scores[:, 1:]
pos.shape, neg.shape

In [None]:
# Shape check for an element-wise hinge term relu(pos + margin - neg).
margin = 1
torch.relu(pos + margin - neg).shape

In [None]:
# Evaluate the cross-example hinge loss on the validation scores.
utils.fitness_hinge_loss_with_cross_example(scores)

In [None]:
# Apply the combined scoring function to the model on each of the three splits.
scoring(model, train_tensor), scoring(model, val_tensor), scoring(model, test_tensor)

In [None]:
# Model outputs per split as NumPy arrays: detached from autograd, with
# size-1 dimensions squeezed away.
train_preds = model(train_tensor).detach().squeeze().numpy()
val_preds = model(val_tensor).detach().squeeze().numpy()
test_preds = model(test_tensor).detach().squeeze().numpy()

In [None]:
# Per row, the fraction of columns 1: whose value is strictly greater than
# column 0; then the mean of that fraction over all rows.
((train_preds[:, 0, None] < train_preds[:, 1:]).sum(axis=1) / (train_preds.shape[1] - 1)).mean()

In [None]:
def _make_labels(prds: np.ndarray):
    labels = np.zeros_like(prds)
    labels[:, 0] = 1
    return labels

# Label arrays matching each prediction array (column 0 set to 1, the rest 0).
train_labels = _make_labels(train_preds)
val_labels = _make_labels(val_preds)
test_labels = _make_labels(test_preds)


In [None]:
from sklearn import metrics


def predict_at_threshold(preds: np.ndarray, threshold: float):
    """Turn model outputs into flat (y_true, y_pred) label vectors.

    A value strictly below ``threshold`` is predicted as 1, anything else as 0.
    The true label is 1 for column 0 of every row and 0 for all other columns.
    Both arrays are returned flattened.
    """
    predicted = np.less(preds, threshold).astype(int)
    actual = np.zeros_like(predicted)
    actual[:, 0] = 1
    return actual.reshape(-1), predicted.reshape(-1)


def evaluate_f1_with_threshold(preds: np.ndarray, threshold: float):
    """Return the F1 score of the labels obtained by thresholding ``preds`` at ``threshold``."""
    y_true, y_pred = predict_at_threshold(preds, threshold)
    return metrics.f1_score(y_true, y_pred)


def find_f1_threshold(preds: np.ndarray):
    """Scan 100 evenly spaced thresholds between min and max of ``preds`` for the best F1.

    Returns:
        ``(best_threshold, best_f1)``. On ties the lowest threshold wins. If no
        threshold reaches an F1 above 0, ``(0, 0)`` is returned.
    """
    thresholds = np.linspace(preds.min(), preds.max(), 100)
    f1_scores = [evaluate_f1_with_threshold(preds, threshold) for threshold in thresholds]

    # max() keeps the first of several equal maxima, i.e. the lowest threshold.
    best_position = max(range(len(f1_scores)), key=f1_scores.__getitem__)
    if f1_scores[best_position] > 0:
        return thresholds[best_position], f1_scores[best_position]

    return 0, 0


# Choose the threshold on the training predictions, then report the F1 score
# obtained on the test predictions at that same threshold.
train_threshold, train_f1 = find_f1_threshold(train_preds)
evaluate_f1_with_threshold(test_preds, train_threshold)

In [None]:
# The precision-recall display treats larger scores as more likely positive,
# while this notebook treats *lower* model output as positive (see the
# `preds < threshold` rule used for the F1 threshold). Negate the outputs so
# the curve is drawn with the correct orientation.
metrics.PrecisionRecallDisplay.from_predictions(train_labels.reshape(-1), -train_preds.reshape(-1))

In [None]:
# Per row, how many of columns 1: have a value greater than or equal to column 0.
(train_preds[:, 0, np.newaxis] <= train_preds[:, 1:]).sum(axis=1)

In [None]:
# Overlaid histograms of the training outputs: columns 1: (labelled negatives)
# versus column 0 (labelled positives).
fig, ax = plt.subplots(figsize=(6, 6))
ax.hist(train_preds[:, 1:].reshape(-1), bins=100, label='negatives')
ax.hist(train_preds[:, 0], bins=100, label='positives')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel('Model output')
ax.set_ylabel('Count')
ax.set_title('Training-set model outputs: positives vs. negatives')
ax.legend()
plt.show()

In [None]:
from statsmodels.distributions.empirical_distribution import ECDF

In [None]:
# Empirical CDF over all training predictions, with every column pooled together.
train_ecdf = ECDF(train_preds.reshape(-1))

In [None]:
# Mean ECDF value of the column-0 predictions.
train_ecdf(train_preds[:, 0]).mean()

In [None]:
# Torch tensor copy of the training predictions.
t = torch.tensor(train_preds)

In [None]:
# Column 0 versus the remaining columns of the prediction tensor.
p = t[:, 0]
n = t[:, 1:]

# Per row, the fraction of columns 1: strictly greater than column 0; the
# boolean comparison is averaged as floats.
(p[:, None] < n).mean(axis=1, dtype=torch.float)