# Generate synthetic data

To test whether the net can learn to reason globally over an entire string, manufacture 1d tensors like the following:

`[0 0 0 0 0 0 0 1 0 0 0 0 0 0 7 0 0 ]`

and train the net to judge whether the first or second non-zero integer is higher.

A possible next step is to smooth this so that the tensor looks more like a curve with two maxima.

There will always be at least two zeros between the two numbers, which will hopefully still make it possible to apply convolutions to them.

In [66]:
import numpy as np
import itertools
import random
import torch
import pickle

In [67]:
arr_len = 20
# Enumerate ordered index pairs (i, j) over the array positions, keeping only
# pairs whose positions differ by more than 2, i.e. pairs that leave at least
# two zeros between the two non-zero values.
pos = [
    (i, j)
    for i in range(arr_len)
    for j in range(arr_len)
    if abs(j - i) > 2
]


In [68]:
# Number of ordered index pairs that passed the spacing filter.
len(pos)

306

In [79]:
examples = []
labels = []
# For each index pair (i, j) in pos, build one array per ordered pair of
# distinct values a, b in 1..9: a goes at index i, b at index j, zeros elsewhere.
# The label is 1.0 when the value at the later index is the larger one, else 0.0.
# NOTE(review): pos holds both (i, j) and (j, i), and (a, b) runs over both
# orders as well, so each distinct array appears to be generated twice (with
# the same label) -- confirm this duplication is intended before any
# train/test split is made from these ids.
for i, j in pos:
    for a in range(1, 10):
        for b in range(1, 10):
            if a == b:
                continue  # skip ties: only pairs of different values are kept
            vec = np.zeros([arr_len])
            vec[i], vec[j] = a, b
            examples.append(torch.tensor(vec, dtype=torch.float32))
            # Order the two values by position: smaller index comes first.
            earlier, later = (b, a) if i > j else (a, b)
            target = 1 if later > earlier else 0
            labels.append(torch.tensor(target, dtype=torch.float32))

In [80]:
# One integer id per generated example: 0 .. len(examples) - 1.
ids = [*range(len(examples))]

In [81]:
# Display the generated example tensors for a visual check.
examples

[tensor([1., 0., 0., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 7., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([1., 0., 0., 9., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([2., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),
 tensor([2., 0., 0., 3., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]),


In [82]:
# Display the generated label tensors (same order as the examples).
labels

[tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(0.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),
 tensor(1.),

In [83]:
# Lookup tables keyed by example id: id -> label tensor and id -> feature tensor.
label_dict = {key: labels[key] for key in ids}
feat_dict = {key: examples[key] for key in ids}

In [84]:
# Example id used below to spot-check one feature/label pair.
ex = 2222

In [85]:
# Show the feature tensor and then its label for the chosen id, one per line.
print(feat_dict[ex], label_dict[ex], sep="\n")

tensor([0., 8., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 7.,
        0., 0.])
tensor(0.)


In [86]:
# Sum of the 0/1 label tensors, i.e. the number of examples labelled 1.
sum(labels)

tensor(11016.)

In [87]:
# Total number of labels, for comparison with the count of 1-labels above.
len(labels)

22032

In [88]:
# Persist both lookup tables as pickle files under ../data
# (features first, then labels).
for target_path, table in (
    ('../data/synth_int_feats.pkl', feat_dict),
    ('../data/synth_int_labels.pkl', label_dict),
):
    with open(target_path, 'wb') as f:
        pickle.dump(table, f)
