# Dataset stats workbook
- Created by Gabe (2022-05-07)
- For Theo to use to compute dataset statistics
- Note: this notebook should be run from the top-level directory of laps/

In [None]:
import numpy as np
from run_experiment import init_experiment_state_and_iterator
from dreamcoder.program import Program
from src.config_builder import build_config
from src.experiment_iterator import EXPORT_DIRECTORY
from src.task_loaders import GroundTruthOrderedTaskBatcher

In [None]:
# Domain whose dataset statistics are computed by the cells below.
# Leave exactly one DOMAIN assignment uncommented; the others are kept
# as a quick reference of the alternative domain names.
#DOMAIN = "drawings_nuts_bolts"
#DOMAIN = "drawings_furniture"
#DOMAIN = "drawings_dials"
#DOMAIN = "drawings_wheels"

DOMAIN = "clevr"
#DOMAIN = "re2"

In [None]:
# Collect the build_config keyword arguments first, then build the config.
# Only DOMAIN varies between runs; every other value is fixed for this
# stats workbook.
config_kwargs = dict(
    experiment_name="test_experiment",
    experiment_type="stitch",
    domain=DOMAIN,
    task_batcher="ground_truth_ordered_task_batcher",
    random_seed=111,
    global_batch_size="all",
    codex_params={},
    stitch_params={},
    compute_likelihoods=False,
    compute_description_lengths=True,
)
config = build_config(**config_kwargs)

In [None]:
# Create the experiment state and iterator from the config. The first
# positional argument is an empty dict (presumably parsed CLI args or
# overrides -- TODO confirm against run_experiment).
experiment_state, experiment_iterator = init_experiment_state_and_iterator({}, config)

# Initialize the ground-truth task frontiers for both splits, train first.
for split_name in ("train", "test"):
    experiment_state.initialize_ground_truth_task_frontiers(task_split=split_name)

In [None]:
# Fetch the frontiers of every training task and report how many there are.
train_frontiers = experiment_state.get_frontiers_for_ids(
    task_split="train",
    task_ids="all",
)
num_train_frontiers = len(train_frontiers)
print(num_train_frontiers)

In [None]:
# A frontier contains one or more programs that solve a task.
# Evaluating the first training frontier as the last expression of the cell
# shows it in the notebook output.
train_frontiers[0]

In [None]:
# Take the program from the first entry of the first training frontier.
# All domains are assumed to have exactly one program per frontier, so the
# first entry is the only one that matters.
first_entry = train_frontiers[0].entries[0]
p = first_entry.program
print(p)

In [None]:
# Description length of p: the number of tokens returned by
# Program.left_order_tokens (called with show_vars=True).
p_tokens = Program.left_order_tokens(p, show_vars=True)
len(p_tokens)

In [None]:
# Character length of p: the length of its string form.
p_text = str(p)
len(p_text)

In [None]:
# TODO(theoxo): Compute and report the following for the paper
# - number of programs in each domain, broken down by train/test
# - mean and std of description and character lengths for all domains, broken down by train/test
# - any other relevant program stats you can think of

In [None]:
# Gather per-split statistics into data[split], where each split maps to:
#   "count": number of frontiers in the split
#   "dls":   array of description lengths (token counts), one per frontier
#   "chars": array of character lengths, one per frontier
# Only the first entry of each frontier is used (one program per frontier).
data = {}
for split_name in ("train", "test"):
    split_frontiers = experiment_state.get_frontiers_for_ids(
        task_split=split_name, task_ids="all"
    )
    split_programs = [fr.entries[0].program for fr in split_frontiers]
    token_counts = [
        len(Program.left_order_tokens(prog, show_vars=True))
        for prog in split_programs
    ]
    char_counts = [len(str(prog)) for prog in split_programs]
    data[split_name] = {
        "count": len(split_frontiers),
        "dls": np.array(token_counts),
        "chars": np.array(char_counts),
    }

In [None]:
def _mean_std(values):
    """Return ``(mean, std)`` of *values* as built-in Python floats.

    Formatting a tuple inside an f-string uses the ``repr`` of each element.
    For NumPy scalars that repr is ``np.float64(...)`` on NumPy >= 2.0, which
    clutters the report. Converting to ``float`` keeps the printed values
    identical on NumPy 1.x and readable on NumPy 2.x.

    ``np.std`` is called with its default ``ddof=0`` (population standard
    deviation), matching the original computation.
    """
    return float(np.mean(values)), float(np.std(values))


# Report the program counts and the (mean, std) of the description and
# character lengths for the selected domain, broken down by train/test.
train_stats, test_stats = data["train"], data["test"]
print(f"Domain={DOMAIN}")
print(f"Number of programs: train={train_stats['count']} test={test_stats['count']}")
print(f"Mean and std-dev of description length: train={_mean_std(train_stats['dls'])} test={_mean_std(test_stats['dls'])}")
print(f"Mean and std-dev of char length: train={_mean_std(train_stats['chars'])} test={_mean_std(test_stats['chars'])}")