## TEST DATASET INITIALISATION

In [1]:
import sys
# Put the parent directory on the module search path so that the `datasets`
# package imported below can be found (path is relative to the working directory).
sys.path.append('../')

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import datasets
import importlib
# Re-execute the already-imported `datasets` package module so that edits to it
# are picked up without restarting the kernel. importlib.reload only reloads the
# module object it is given; submodules are not reloaded recursively.
importlib.reload(datasets)

# Helper functions used further down in the notebook.
from datasets.utils import dec2bin, dec2base, base2dec
from datasets.random_hierarchy_model import sample_rules, sample_trees, sample_trees_unif

In [2]:
from dataclasses import dataclass


@dataclass
class config:
    """Settings for the Random Hierarchy Model (RHM) test dataset.

    The class is used as a plain namespace: values are read as
    ``config.<name>`` and the derived quantities below are attached to the
    class right after its definition.

    Every field carries a type annotation so that ``@dataclass`` actually
    registers it; un-annotated class attributes are ignored by the decorator.
    The defaults remain available as class attributes, so existing
    ``config.<name>`` accesses keep working.
    """

    # Preferred torch device: CUDA if available, otherwise Apple MPS, otherwise CPU.
    device: str = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

    num_features: int = 64   # The number of values each variable can take (vocabulary size, int).
    num_classes: int = 64    # The number of classes (int).
    num_synonyms: int = 32   # The number of synonymic lower-level representations (multiplicity, int).
    tuple_size: int = 2      # The size of lower-level representations (int).
    num_layers: int = 2      # The number of levels in the hierarchy (int).

    seed_rules: int = 12345678    # Seed passed to the RHM for the rules.
    seed_sample: int = 56781234   # Seed passed to the RHM for the samples.
    train_size: int = 2**17       # Number of training samples.
    test_size: int = 2**15        # Number of test samples.


# Number of input positions ("pixels"); the actual input size is
# (input_size x num_features) because of one-hot encoding.
config.input_size = config.tuple_size**config.num_layers

# Total number of data: num_classes * num_synonyms**(1 + s + ... + s**(L-1)).
# The exponent is written as an explicit geometric sum. For tuple_size > 1 it
# equals (s**L - 1) // (s - 1); unlike that closed form it stays defined
# (and equals num_layers) when tuple_size == 1.
config.num_data = config.num_classes * config.num_synonyms**sum(
    config.tuple_size**level for level in range(config.num_layers)
)

# Sample-complexity scale n * m**L.
config.sample_complexity = config.num_classes * config.num_synonyms**config.num_layers

print(config.input_size, config.num_data, config.sample_complexity)
print(config.device)

4 2097152 65536
mps


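According to the PRX paper "How Deep Neural Networks Learn Compositional Data: The Random Hierarchy Model", CNNs trained on $P \gg n m^{L}$ data points achieve perfect classification accuracy, i.e. we need `config.train_size` $\gg$ `config.sample_complexity` $= n m^{L}$. With the settings above, `config.train_size` $= 2^{17}$ is twice `config.sample_complexity` $= 2^{16}$.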

In [3]:
# Build the RHM object. It is asked for train_size + test_size data and holds
# the complete tree structure together with the rules.
rhm_kwargs = {
    'v': config.num_features,
    'n': config.num_classes,
    'm': config.num_synonyms,
    's': config.tuple_size,
    'L': config.num_layers,
    'seed_rules': config.seed_rules,
    'seed_samples': config.seed_sample,
    'num_data': config.train_size + config.test_size,
    'probs': None,
    'transform': None,
}
rhm = datasets.RHM(**rhm_kwargs)

# Level `num_layers` of the trees is used as the input, level 0 as the target.
inputs, targets = rhm.trees[config.num_layers], rhm.trees[0]

# Report the tensor shapes as a quick sanity check.
print('input: tensor of size', inputs.size())
print('target: tensor of size', targets.size())
print(f'rules: {len(rhm.rules.keys())} tensors of size {rhm.rules[list(rhm.rules.keys())[0]].size()} (v, m, s)')

input: tensor of size torch.Size([163840, 4])
target: tensor of size torch.Size([163840])
rules: 2 tensors of size torch.Size([64, 32, 2]) (v, m, s)


## Generating data from given rules

In [4]:
# The rules are drawn at random here; they could equally be loaded from a file
# or taken from another simulation.
# NOTE: the seed is drawn from the `random` module, and no `random.seed(...)`
# call is visible in this section, so the rules presumably differ between runs.
seed_rules = random.randint(10000000, 99999999)
rules = sample_rules(
    config.num_features,
    config.num_classes,
    config.num_synonyms,
    config.tuple_size,
    config.num_layers,
    seed=seed_rules,
)

In [5]:
# Number of trees to sample from the rules generated above.
dataset_size = 1024
# Fresh 8-digit seed for the sampling step, drawn from the `random` module.
seed_samples = random.randint(10000000, 99999999)

trees = sample_trees(
    dataset_size,
    rules,
    prior=None,
    probs=None,
    seed=seed_samples,
)

# Same convention as for the RHM object: level `num_layers` is the input and
# level 0 the target. This rebinds the `inputs` / `targets` names set earlier.
inputs, targets = trees[config.num_layers], trees[0]