## TEST DATASET INITIALISATION

In [1]:
import sys
sys.path.append('../')

import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data_utils

import datasets
import importlib
importlib.reload(datasets)

from datasets.utils import dec2bin, dec2base, base2dec
from datasets.random_hierarchy_model import sample_rules, sample_data_from_labels, sample_data_from_labels_unif

In [2]:
# Hyper-parameters of the Random Hierarchy Model used in this section.
v = 4  # vocabulary size: number of values each variable can take
n = v  # number of classes
m = 2  # multiplicity: number of synonymic lower-level representations per symbol
s = 2  # tuple size: length of each lower-level representation
L = 2  # number of levels in the hierarchy

# Number of input positions; the actual input is (input_size x num_features) once one-hot encoded.
input_size = pow(s, L)
# (s**L - 1) // (s - 1) = 1 + s + ... + s**(L-1); one of m synonyms is picked per term, for each of n classes.
num_data = n * m ** ((input_size - 1) // (s - 1))
print(input_size, num_data)

4 32


In [26]:
# SAMPLE RULES AND DATA FROM LABELS

# rules = sample_rules( v, v, m, s, L, seed=42)
# for l in range(L):
#     print(f'level {l}, rules:')
#     for i in range(v):
#         print(f'{i}->{list(rules[l][i])}')

# labels = torch.randint(low=0, high=n, size=(32,))
# features, labels = sample_data_from_labels_unif(labels, rules)
# for i in range(features.size(0)):
#     print(features[i,:], labels[i])

# SAMPLING WITH REPLACEMENT (REQUIRED FOR DATASET LARGER THAN sys.maxsize)

In [5]:
# Build a small one-hot dataset, sampled with replacement, from fixed seeds.
random.seed()  # re-seed Python's RNG without a fixed value; only relevant if seed_sample is drawn via random.randrange
seed_rules = 12345678   # seed of the random hierarchy model
seed_sample = 99999999  # fixed here; alternative: random.randrange(10000000,99999999)
print('sampling seed:', seed_sample)

train_size = 4 # size of the training set
test_size = 0 # size of the test set
input_format = 'onehot' # other values used in this notebook: 'long', 'onehot_tuples'
# to generate the full dataset: set train_size=num_data, test_size=0
bonus = {}  # empty dict: no extra outputs requested

dataset = datasets.RandomHierarchyModel(
    num_features=v, # vocabulary size
    num_synonyms=m, # features multiplicity
    num_layers=L,   # number of layers
    num_classes=n,  # number of classes
    tuple_size=s,   # number of branches of the tree
    seed_rules=seed_rules,
    seed_sample=seed_sample,
    train_size=train_size,
    test_size=test_size,
    input_format=input_format,
    whitening=0, # 1 to whiten the input
    replacement=True,  # sample data points with replacement
    bonus=bonus
)

sampling seed: 99999999


In [4]:
# Dump the production rules level by level: symbol -> list of its synonymous tuples.
for l in range(L):
    print(f'level {l}, rules:')
    for i in range(v):
        print(f'{i}->{list(dataset.rules[l][i])}')

x = dataset.features
print(x.shape)

# One-hot inputs additionally get two summary statistics before the per-sample dump.
if 'onehot' in input_format:
    print(x.mean(dim=1).mean())
    print(x.norm(dim=1).mean())

# Both supported formats end with the same sample-by-sample listing of (input, label).
if 'onehot' in input_format or 'long' in input_format:
    for i in range(len(x)):
        print(x[i], dataset.labels[i])

level 0, rules:
0->[tensor([0, 3]), tensor([2, 2])]
1->[tensor([2, 1]), tensor([3, 3])]
2->[tensor([1, 0]), tensor([3, 1])]
3->[tensor([2, 0]), tensor([3, 2])]
level 1, rules:
0->[tensor([0, 2]), tensor([3, 0])]
1->[tensor([1, 1]), tensor([0, 1])]
2->[tensor([3, 3]), tensor([3, 1])]
3->[tensor([0, 0]), tensor([1, 3])]
torch.Size([4, 4, 4])
tensor(0.2500)
tensor(1.)
tensor([[0., 0., 1., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.],
        [1., 1., 0., 0.]]) tensor(1)
tensor([[1., 0., 0., 0.],
        [0., 0., 1., 0.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]]) tensor(0)
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0., 0.],
        [1., 1., 1., 0.]]) tensor(0)
tensor([[0., 0., 0., 1.],
        [1., 1., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 1., 0.]]) tensor(2)


# TEST PRODUCTION RULES DISTRIBUTION

In [3]:
# Single-level model (L=1) with a non-uniform distribution over production rules.
v=8    # The number of values each variable can take (vocabulary size, int).
n=v    # The number of classes (int).
m=v//2     # The number of synonymic lower-level representations (multiplicity, int).
s=2     # The size of lower-level representations (int).
L=1     # The number of levels in the hierarchy (int).

input_size = s**L # number of input positions; actual input size is (input_size x num_features) because of one-hot encoding
num_data = n * (m**((s**L-1)//(s-1))) # total number of data
print(input_size, num_data)

random.seed()  # re-seed Python's RNG without a fixed value; only relevant if seed_sample is drawn via random.randrange
seed_rules = 12345678   # seed of the random hierarchy model
seed_sample = 99999999  # fixed here; alternative: random.randrange(10000000,99999999)
print('sampling seed:', seed_sample)

# One length-m weight vector per level: uniform for levels 0..L-2 (none when L=1) ...
# presumably probability[l][j] weights the j-th synonym at level l -- confirm against RandomHierarchyModel
probability = {}
for l in range(L-1):
    probability[l] = torch.ones(m)/m
# lognormal = torch.randn(m).exp()
# probability[L-1] = lognormal /lognormal.sum()
# ... and all the mass on index 0 at the last level L-1.
rank1 = torch.zeros(m)
rank1[0] = 1.0
probability[L-1] = rank1

train_size = 2**20 # size of the training set
test_size = 0 # size of the test set
input_format = 'onehot_tuples' # other values used in this notebook: 'onehot', 'long'
# to generate the full dataset: set train_size=num_data, test_size=0

dataset = datasets.RandomHierarchyModel(
    num_features=v, # vocabulary size
    num_synonyms=m, # features multiplicity
    num_layers=L,   # number of layers
    num_classes=n,  # number of classes
    tuple_size=s,   # number of branches of the tree
    probability=probability,
    seed_rules=seed_rules,
    seed_sample=seed_sample,
    train_size=train_size,
    test_size=test_size,
    input_format=input_format,
    whitening=0, # 1 to whiten the input
    replacement=True  # sample data points with replacement
)

2 32
sampling seed: 99999999


In [4]:
# Compare the empirical frequency of the tuples produced by symbol 0 at the last level
# with the probabilities stored on the dataset.
rep0 = dataset.rules[L-1][0]  # rule entries of symbol 0 at level L-1
print(rep0.size())

# presumably reads each tuple as a base-v number to get one integer index per tuple -- confirm in datasets.utils
rep0_indices = base2dec(rep0, v)
print(rep0_indices)

x = dataset.features
print(x.size())
# Sum over samples (dim 0) and over the last dimension, then normalise by s**(L-1) and train_size.
empirical = x.sum(dim=0).sum(dim=-1)/(s**(L-1))/train_size
# Rescale by v (== n in this notebook); presumably compensates a uniform 1/n class prior -- TODO confirm
emp_extract = empirical[rep0_indices]*v
print(emp_extract)
print(dataset.probability[L-1])
# NOTE: x is rebound here from the features to the residual (empirical - stored probability).
x = emp_extract-dataset.probability[L-1]
print(train_size, x.var())

# print(f'level {L-1}, rules:')
# print(f'{0}->{list(dataset.rules[L-1][0])}, prob. {probability[L-1]}')

torch.Size([4, 2])
tensor([12, 40, 37, 14])
torch.Size([1048576, 64, 1])
tensor([0.9996, 0.0000, 0.0000, 0.0000])
tensor([1., 0., 0., 0.])
1048576 tensor(3.6380e-08)


In [5]:
# Rank-ordered comparison: sorted empirical frequencies vs. sorted stored probabilities scaled by 1/v.
x = dataset.features
print(x.shape)
empirical = x.sum(dim=0).sum(dim=-1) / (s**(L-1)) / train_size
empirical, _ = empirical.sort(descending=True)
empirical = empirical.reshape(v, -1)
true, _ = dataset.probability[L-1].sort(descending=True)
print(true * 1. / v)
print(empirical[:, 0])
# print([0.8*5/16, 0.2*3/16, 0.8*3/16, 0., 0., 0.8*3/16, 0., 0.2*5/16, 0., 0., 0., 0., 0.2*3/16, 0.2*5/16, 0.8*5/16])

torch.Size([1048576, 64, 1])
tensor([0.1250, 0.0000, 0.0000, 0.0000])
tensor([0.1253, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000])


# STANDARD SAMPLING (WITHOUT REPLACEMENT)

In [4]:
# Hyper-parameters for the sampling-without-replacement section.
v = 4  # vocabulary size (values each variable can take)
n = v  # number of classes
m = 2  # number of synonymic lower-level representations (multiplicity)
s = 2  # size of each lower-level representation
L = 2  # number of levels in the hierarchy

# Input positions; one-hot encoding makes the actual input (input_size x num_features).
input_size = s ** L
# Total number of data points: n classes times m choices for each of the (s**L-1)//(s-1) expansions.
num_data = n * pow(m, (input_size - 1) // (s - 1))
print(input_size, num_data)

4 32


In [5]:
# Build the dataset in 'long' format, sampling without replacement, and request extra outputs via `bonus`.
random.seed()  # re-seed Python's RNG without a fixed value, so seed_sample differs between runs
seed_rules = 12345678 # seed of the random hierarchy model
seed_sample = random.randrange(10000000,99999999)  # drawn at random; printed so a run can be reproduced
print('sampling seed:', seed_sample)

train_size = -1 # size of the training set; -1 appears to select the full dataset (32 rows are printed below) -- confirm
test_size = 0 # size of the test set
input_format = 'long' # alternative: onehot
# to generate the full dataset: set train_size=num_data, test_size=0
# Keys start as None; later cells read bonus['tree'], bonus['noise'] and bonus['synonyms'] as dicts,
# so the constructor is expected to fill them in -- confirm in RandomHierarchyModel.
bonus = dict.fromkeys(['tree', 'noise', 'synonyms', 'size'])
bonus['size'] = 4  # number of data points for which the extra outputs are produced

dataset = datasets.RandomHierarchyModel(
    num_features=v, # vocabulary size
    num_synonyms=m, # features multiplicity
    num_layers=L, # number of layers
    num_classes=n, # number of classes
    tuple_size=s, # number of branches of the tree
    seed_rules=seed_rules,
    seed_sample=seed_sample,
    train_size=train_size,
    test_size=test_size,
    input_format=input_format,
    whitening=0, # 1 to whiten the input
    replacement=False,  # sample data points without replacement
    bonus=bonus
)

print(dir(dataset)) 
# the input points are stored in dataset.features
print(dataset.features.size()) # for input_format='long' the printed size is 2-D: (number of data points, input_size)
# the labels are stored in dataset.labels
print(dataset.labels.size()) # dimension: number of data points

sampling seed: 94944558
['__add__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_is_protocol', 'features', 'labels', 'num_classes', 'num_features', 'num_layers', 'num_synonyms', 'rules', 'transform', 'tuple_size']
torch.Size([32, 4])
torch.Size([32])


In [6]:
# Show the whole feature tensor (small here: one row per data point).
print(dataset.features)

tensor([[1, 3, 1, 1],
        [1, 3, 2, 4],
        [4, 1, 1, 1],
        [4, 1, 2, 4],
        [4, 4, 4, 4],
        [4, 4, 4, 2],
        [4, 2, 4, 4],
        [4, 2, 4, 2],
        [4, 4, 2, 2],
        [4, 4, 1, 2],
        [4, 2, 2, 2],
        [4, 2, 1, 2],
        [1, 1, 1, 1],
        [1, 1, 2, 4],
        [2, 4, 1, 1],
        [2, 4, 2, 4],
        [2, 2, 1, 3],
        [2, 2, 4, 1],
        [1, 2, 1, 3],
        [1, 2, 4, 1],
        [1, 1, 2, 2],
        [1, 1, 1, 2],
        [2, 4, 2, 2],
        [2, 4, 1, 2],
        [4, 4, 1, 3],
        [4, 4, 4, 1],
        [4, 2, 1, 3],
        [4, 2, 4, 1],
        [1, 1, 4, 4],
        [1, 1, 4, 2],
        [2, 4, 4, 4],
        [2, 4, 4, 2]])


In [8]:
# List every entry stored under bonus['tree'], key first.
for k in bonus['tree']:
    print(k, bonus['tree'][k])

2 tensor([3, 3, 3, 3])
1 tensor([[3, 2],
        [3, 2],
        [3, 2],
        [3, 2]])


In [9]:
# List every entry stored under bonus['synonyms'], key first.
for k in bonus['synonyms']:
    print(k, bonus['synonyms'][k])

2 tensor([[4, 4, 1, 3],
        [1, 1, 4, 2],
        [4, 2, 1, 3],
        [4, 2, 4, 1]])
1 tensor([[2, 4, 4, 4],
        [1, 1, 4, 2],
        [1, 1, 4, 4],
        [1, 1, 4, 2]])


In [10]:
# List every entry stored under bonus['noise'], key first.
for k in bonus['noise']:
    print(k, bonus['noise'][k])

2 tensor([[1, 1, 1, 1],
        [1, 1, 4, 2],
        [4, 2, 4, 4],
        [2, 4, 2, 4]])
1 tensor([[1, 3, 4, 4],
        [1, 1, 4, 2],
        [4, 1, 4, 4],
        [1, 2, 4, 2]])
0 tensor([[1, 1, 3, 4],
        [1, 1, 4, 2],
        [2, 4, 1, 4],
        [2, 4, 2, 2]])


In [11]:
x = dataset.features
print(x.shape)

# One-hot inputs: summary statistics only. 'long' inputs: list each sample with its label.
if 'onehot' in input_format:
    print(x.mean(dim=1).mean())
    print(x.norm(dim=1).mean())

elif 'long' in input_format:
    for i in range(len(x)):
        print(x[i], dataset.labels[i])

torch.Size([32, 4])
tensor([1, 3, 1, 1]) tensor(0)
tensor([1, 3, 2, 4]) tensor(0)
tensor([4, 1, 1, 1]) tensor(0)
tensor([4, 1, 2, 4]) tensor(0)
tensor([4, 4, 4, 4]) tensor(0)
tensor([4, 4, 4, 2]) tensor(0)
tensor([4, 2, 4, 4]) tensor(0)
tensor([4, 2, 4, 2]) tensor(0)
tensor([4, 4, 2, 2]) tensor(1)
tensor([4, 4, 1, 2]) tensor(1)
tensor([4, 2, 2, 2]) tensor(1)
tensor([4, 2, 1, 2]) tensor(1)
tensor([1, 1, 1, 1]) tensor(1)
tensor([1, 1, 2, 4]) tensor(1)
tensor([2, 4, 1, 1]) tensor(1)
tensor([2, 4, 2, 4]) tensor(1)
tensor([2, 2, 1, 3]) tensor(2)
tensor([2, 2, 4, 1]) tensor(2)
tensor([1, 2, 1, 3]) tensor(2)
tensor([1, 2, 4, 1]) tensor(2)
tensor([1, 1, 2, 2]) tensor(2)
tensor([1, 1, 1, 2]) tensor(2)
tensor([2, 4, 2, 2]) tensor(2)
tensor([2, 4, 1, 2]) tensor(2)
tensor([4, 4, 1, 3]) tensor(3)
tensor([4, 4, 4, 1]) tensor(3)
tensor([4, 2, 1, 3]) tensor(3)
tensor([4, 2, 4, 1]) tensor(3)
tensor([1, 1, 4, 4]) tensor(3)
tensor([1, 1, 4, 2]) tensor(3)
tensor([2, 4, 4, 4]) tensor(3)
tensor([2, 4, 4, 2]) tensor(3)

In [12]:
# Describe the layout of dataset.rules: one tensor per level, indexed [symbol, synonym, position].
L = len(dataset.rules)
print('rules: list of length ', L, ',')
print('first element of size ', dataset.rules[0].shape, ', (num_classes x num_synonyms x tuple_size)')

for l in range(1, L):
    print(f'{l+1}-th element of size ', dataset.rules[l].shape, ', (num_features x num_synonyms x tuple_size)')
print('rules[l][v,j] = j-th rep of the v-th level-(L-l) feature,')
print('e.g. list of tuples corresponding to class 0:')
print(dataset.rules[0][0])

rules: list of length  2 ,
first element of size  torch.Size([4, 2, 2]) , (num_classes x num_synonyms x tuple_size)
2-th element of size  torch.Size([4, 2, 2]) , (num_features x num_synonyms x tuple_size)
rules[l][v,j] = j-th rep of the v-th level-(L-l) feature,
e.g. list of tuples corresponding to class 0:
tensor([[0, 3],
        [2, 2]])


# DATA PERTURBATIONS

In [None]:
def sample_data_from_indices(samples, rules, v, n, m, s, L, bonus):
    """
    Create data of the Random Hierarchy Model starting from a set of rules and the sampled indices.

    Each index in range(max_data), with max_data = n * m**((s**L-1)//(s-1)), is decoded into
    a class label plus one production-rule choice per expanded symbol, and the rules are
    applied level by level to produce the input features.

    Args:
        samples: A 1-D integer tensor of length batch_size with values in range(max_data), containing the indices of the data to be sampled.
        rules: Per-level rule tensors, indexed as rules[l][feature, synonym]; each entry is a tuple of s lower-level features.
        v: The number of values each variable can take (vocabulary size, int). Only used to draw the random replacement symbols for 'noise'.
        n: The number of classes (int).
        m: The number of synonymic lower-level representations (multiplicity, int).
        s: The size of lower-level representations (int).
        L: The number of levels in the hierarchy (int).
        bonus: Dictionary requesting additional outputs, or None/empty for none. It is modified in place:
            'size': number of data points (taken from the END of the batch) for which the additional
                outputs are produced; defaults to the whole batch when missing or None.
            'noise': if the key is present, replaced by a dict {level: tensor}; entry L is obtained by
                redrawing the class label, entry k < L by redrawing the next-to-last symbol of the
                level-k representation; all entries are then propagated down to the input level.
            'synonyms': if the key is present, replaced by a dict {level: tensor}; entry k is obtained
                by redrawing the production-rule choice of one level-k symbol (the only one at the
                top level, the next-to-last one otherwise) and propagated down to the input level.
            TODO: add custom positions for 'noise' and 'synonyms'.

    Returns:
        A tuple (features, labels): features has one row per sample (flattened to s**L entries after
        the L expansions), labels holds the class index of each sample. Additional outputs are
        written into the bonus dict.
    """
    # Treat a missing bonus dict as "no extra outputs", so that the membership
    # tests inside the loop below do not raise on None.
    if bonus is None:
        bonus = {}

    max_data = n * m ** ((s**L-1)//(s-1))
    data_per_hl = max_data // n 	# div by num_classes to get number of data per class

    high_level = samples.div(data_per_hl, rounding_mode='floor')	# div by data_per_hl to get class index (run in range(n))
    low_level = samples % data_per_hl					# compute remainder (run in range(data_per_hl))

    labels = high_level		# labels are the classes (features of highest level)
    features = labels		# init input features as labels (rep. size 1)
    size = 1

    if bonus:
        # dict.fromkeys([..., 'size']) leaves the key present but None: fall back to the
        # whole batch in that case too, instead of failing on `-None` below.
        if bonus.get('size') is None:
            bonus['size'] = samples.size(0)
        if 'noise' in bonus:	# add corrupted version of the last bonus['size'] data
            noise = {}
            noise[L] = torch.clone(features[-bonus['size']:])	# copy current representation (labels)...
            noise[L][:] = torch.randint(n, (bonus['size'],))	# ...and randomly change it
            bonus['noise'] = noise
        if 'synonyms' in bonus:
            synonyms = {}
            bonus['synonyms'] = synonyms


    for l in range(L):

        choices = m**(size)
        data_per_hl = data_per_hl // choices	# div by num_choices to get number of data per high-level feature

        high_level = low_level.div( data_per_hl, rounding_mode='floor')	# div by data_per_hl to get high-level feature index (1 index in range(m**size))
        # NOTE(review): squeeze() with no dim would also drop a batch dimension of size 1 -- confirm single-sample batches are not expected
        high_level = dec2base(high_level, m, length=size).squeeze()	# convert to base m (size indices in range(m), squeeze needed if index already in base m)

        if 'synonyms' in bonus:

            for ell in synonyms.keys():	# propagate modified data down the tree
                synonyms[ell] = rules[l][synonyms[ell], high_level[-bonus['size']:]]
                synonyms[ell] = synonyms[ell].flatten(start_dim=1)

            # Must run BEFORE features is expanded: the redrawn choice is applied to the
            # current (level L-l) representation.
            high_level_syn = torch.clone(high_level[-bonus['size']:]) # copy current representation indices...
            if l==0:
                high_level_syn[:] = torch.randint(m, (high_level_syn.size(0),)) # ... and randomly change it (only one index at the highest level)
            else:
                high_level_syn[:,-2] = torch.randint(m, (high_level_syn.size(0),))# ... and randomly change the next-to-last
            synonyms[L-l] = torch.clone(features[-bonus['size']:])
            synonyms[L-l] = rules[l][synonyms[L-l], high_level_syn]
            synonyms[L-l] = synonyms[L-l].flatten(start_dim=1)
            #TODO: add custom positions for 'synonyms'

        features = rules[l][features, high_level]	        		# apply l-th rule to expand to get features at the lower level (tensor of size (batch_size, size, s))
        features = features.flatten(start_dim=1)				# flatten to tensor of size (batch_size, size*s)
        size *= s								# rep. size increases by s at each level
        low_level = low_level % data_per_hl				# compute remainder (run in range(data_per_hl))

        if 'noise' in bonus:

            for ell in noise.keys():	# propagate modified data down the tree
                noise[ell] = rules[l][noise[ell], high_level[-bonus['size']:]]
                noise[ell] = noise[ell].flatten(start_dim=1)

            # Must run AFTER features is expanded: the symbol is replaced in the new
            # (level L-l-1) representation.
            noise[L-l-1] = torch.clone(features[-bonus['size']:])	# copy current representation ...
            noise[L-l-1][:,-2] = torch.randint(v, (bonus['size'],))	# ... and randomly change the next-to-last feature
            #TODO: add custom positions for 'noise'


    return features, labels

In [None]:
# Hyper-parameters for the data-perturbation experiments.
n = 2   # number of classes
v = 32  # vocabulary size
m = 2   # number of synonymic representations per symbol

L = 2   # number of levels in the hierarchy
s = 2   # size of each lower-level representation

# Input positions; one-hot encoding makes the actual input (input_size x num_features).
input_size = pow(s, L)
# Total number of data points: n classes times m choices for each of the (s**L-1)//(s-1) expansions.
max_data = n * m ** ((input_size - 1) // (s - 1))
print(input_size, max_data)

## SAMPLE NOISE

In [None]:
# Fixed rules and the full list of data indices for the noise walkthrough.
seed_rules = 12345678  # seed of the random hierarchy model
rules = sample_rules(v, n, m, s, L, seed=seed_rules)
samples = torch.arange(max_data)

# Request the 'noise' output for the last 4 data points.
bonus = {'noise': None, 'size': 4}
print(samples)

In [None]:
# Step-by-step version of the 'noise' branch of sample_data_from_indices, printing the
# intermediate representations after every level.
max_data = n * m ** ((s**L-1)//(s-1))
data_per_hl = max_data // n 	# div by num_classes to get number of data per class

high_level = samples.div(data_per_hl, rounding_mode='floor')	# div by data_per_hl to get class index (run in range(n))
low_level = samples % data_per_hl					# compute remainder (run in range(data_per_hl))

labels = high_level		# labels are the classes (features of highest level)
features = labels		# init input features as labels (rep. size 1)
size = 1

if 'noise' in bonus:
    noise = {}
    noise[L] = torch.clone(features[-bonus['size']:]) # copy current representation (labels)...
    noise[L][:] = torch.randint(n, (bonus['size'],)) # ...and randomly change it
    bonus['noise'] = noise

print(features)
print(noise[L])  # NOTE: requires 'noise' in bonus, otherwise `noise` is undefined here


for l in range(L):

    choices = m**size
    data_per_hl = data_per_hl // choices	# div by num_choices to get number of data per high-level feature

    high_level = low_level.div( data_per_hl, rounding_mode='floor')	# div by data_per_hl to get high-level feature index (1 index in range(m**size))
    high_level = dec2base(high_level, m, length=size).squeeze()	# convert to base m (size indices in range(m), squeeze needed if index already in base m)

    features = rules[l][features, high_level]			# apply l-th rule to expand to get features at the lower level (tensor of size (batch_size, size, s))
    features = features.flatten(start_dim=1)			# flatten to tensor of size (batch_size, size*s)
    size *= s								# rep. size increases by s at each level
    low_level = low_level % data_per_hl				# compute remainder (run in range(data_per_hl))

    if 'noise' in bonus:
        for ell in noise.keys(): # propagate modified data down the tree, using the same rule choices as the clean data
            noise[ell] = rules[l][noise[ell], high_level[-bonus['size']:]]
            noise[ell] = noise[ell].flatten(start_dim=1)

        noise[L-l-1] = torch.clone(features[-bonus['size']:]) # copy current representation ...
        noise[L-l-1][:,-2] = torch.randint(v, (bonus['size'],)) # ... and randomly change the next-to-last feature

    print(features)

    # one tensor per level at which a symbol was replaced so far
    for key in noise.keys():
        print(noise[key])

In [None]:
# bonus['noise'] is the same dict object as `noise` filled in by the previous cell.
print(bonus['noise'])
print(bonus)

These outputs can be checked by hand against the rule tensors `rules[0]` and `rules[1]` (recall that `rules[l][v, j]` is the j-th representation of the v-th level-(L-l) feature).

 ## SAMPLE SYNONYMS

In [None]:
# Fixed rules and the full list of data indices for the synonyms walkthrough.
seed_rules = 12345678  # seed of the random hierarchy model
rules = sample_rules(v, n, m, s, L, seed=seed_rules)
samples = torch.arange(max_data)

# Request the 'synonyms' output for the last 4 data points.
bonus = {'synonyms': None, 'size': 4}
print(samples)

In [None]:
# Step-by-step version of the 'synonyms' branch of sample_data_from_indices, printing the
# intermediate representations after every level.
max_data = n * m ** ((s**L-1)//(s-1))
data_per_hl = max_data // n 	# div by num_classes to get number of data per class

high_level = samples.div(data_per_hl, rounding_mode='floor')	# div by data_per_hl to get class index (run in range(n))
low_level = samples % data_per_hl					# compute remainder (run in range(data_per_hl))

labels = high_level		# labels are the classes (features of highest level)
features = labels		# init input features as labels (rep. size 1)
size = 1

if 'synonyms' in bonus:
    synonyms = {}
    bonus['synonyms'] = synonyms

print(features)

for l in range(L):

    choices = m**size
    data_per_hl = data_per_hl // choices	# div by num_choices to get number of data per high-level feature

    high_level = low_level.div( data_per_hl, rounding_mode='floor')	# div by data_per_hl to get high-level feature index (1 index in range(m**size))
    high_level = dec2base(high_level, m, length=size).squeeze()	# convert to base m (size indices in range(m), squeeze needed if index already in base m)

    if 'synonyms' in bonus:

        for ell in synonyms.keys(): # propagate modified data down the tree, using the same rule choices as the clean data
            synonyms[ell] = rules[l][synonyms[ell], high_level[-bonus['size']:]]
            synonyms[ell] = synonyms[ell].flatten(start_dim=1)

        high_level_syn = torch.clone(high_level[-bonus['size']:])  # copy the rule choices of the last bonus['size'] data...
        if l==0:
            high_level_syn[:] = torch.randint(m, (high_level_syn.size(0),))  # ...and redraw the single choice at the top level
        else:
            high_level_syn[:,-2] = torch.randint(m, (high_level_syn.size(0),))  # ...or redraw the next-to-last choice
        # `features` has not been expanded yet at this point: the redrawn choices are applied to the current level
        synonyms[L-l] = torch.clone(features[-bonus['size']:])
        synonyms[L-l] = rules[l][synonyms[L-l], high_level_syn]
        synonyms[L-l] = synonyms[L-l].flatten(start_dim=1)
        print(high_level_syn)
        for key in synonyms.keys():
            print(synonyms[key])

    features = rules[l][features, high_level]			# apply l-th rule to expand to get features at the lower level (tensor of size (batch_size, size, s))
    features = features.flatten(start_dim=1)			# flatten to tensor of size (batch_size, size*s)
    size *= s								# rep. size increases by s at each level
    low_level = low_level % data_per_hl				# compute remainder (run in range(data_per_hl))
    print(high_level)
    print(features)

# for l in range(L):

#     choices = m**size
#     data_per_hl = data_per_hl // choices	# div by num_choices to get number of data per high-level featur

#     high_level = low_level.div( data_per_hl, rounding_mode='floor')	# div by data_per_hl to get high-level feature index (1 index in range(m**size))
#     high_level = dec2base(high_level, m, length=size).squeeze()	# convert to base m (size indices in range(m), squeeze needed if index already in base m)

#     features = rules[l][features, high_level]			# apply l-th rule to expand to get features at the lower level (tensor of size (batch_size, size, s))
#     features = features.flatten(start_dim=1)			# flatten to tensor of size (batch_size, size*s)
#     size *= s								# rep. size increases by s at each level
#     low_level = low_level % data_per_hl				# compute remainder (run in range(data_per_hl))

#     if 'noise' in bonus:
#         for ell in noise.keys(): # propagate modified data
#             noise[ell] = rules[l][noise[ell], high_level[-bonus['size']:]]
#             noise[ell] = noise[ell].flatten(start_dim=1)

#         noise[L-l-1] = torch.clone(features[-bonus['size']:]) # copy current representation ...
#         noise[L-l-1][:,-2] = torch.randint(v, (bonus['size'],)) # ... and randomly change the next-to-last feature

#     print(features)

#     for key in noise.keys():
#         print(noise[key])