# AwkwardNN: Trial 1

### Get awkward toy data
(assume awkward is installed)

- Focus on just nested and jagged arrays
- Assign maximum possible length and max possible depth for all arrays.
    * Randomly pick max length/depth from range [1, max] for each event
    and max length again for each sublist/nested list.
- Binary targets: 0, 1
- Data drawn from
    * Gaussian(-L, 1) for target 0, where L is level of data
        * e.g. for the highest level of data (nest 0) with max depth 5,
        data would be drawn from Gaussian(-5, 1), for data within the
        first nest, the max depth is 4 so data would be drawn from
        Gaussian(-4, 1), and so on.
    * Gaussian(+L, 1) for target 1
- Assign probabilities for possible elements in array. Example:
    * p( signal / target 1 ) = 0.30
    * p( noise / target 0 ) = 0.30
    * p( subarray ) = 0.40
- Starting with empty list, randomly sample elements until max length
is reached. When max depth is reached, only sample from p(signal)
and p(noise)



In [1]:
import awkward
from awkwardNN.createAwkwardData import generate_data_target

max_len = 4
max_depth = 4
p_signal = 0.30
p_noise = 0.30
p_subarray = 0.40
num_events = 10000

X_train, y_train = generate_data_target(num_events=num_events,
                                        prob_nest=p_subarray,
                                        prob_sig=p_signal,
                                        prob_bkg=p_noise,
                                        max_len=max_len,
                                        max_depth=max_depth)

X_test, y_test = generate_data_target(num_events=1000,
                                      prob_nest=p_subarray,
                                      prob_sig=p_signal,
                                      prob_bkg=p_noise,
                                      max_len=max_len,
                                      max_depth=max_depth)

for i in range(5):
    print("{}: {}".format(y_train[i], X_train[i]))


1: [[4.429801596478066] [1.550401341203096] [3.722566914905654 1.336849830707411 3.3895145610720516]]
1: [4.77513166]
1: [[2.091071741655326] [2.930729509983566 [2.341295749404614 1.074742178966952 [0.7417639830926431 1.674351306604812] 1.5545883223375734] 1.8451455792507119] 5.106839460671908]
0: [-3.74539092]
1: [5.290886329859674 [1.702121814823695 3.175989892509131 2.736146492136837]]


### Create Pytorch dataloader for neural net

In [2]:
import torch
from awkwardNN.preprocessAwkwardData import AwkwardDataset
from torch.utils.data import DataLoader

trainset = AwkwardDataset(X_train, y_train)
trainsize = len(trainset)
trainloader = DataLoader(trainset, batch_size=1, shuffle=True)

testset = AwkwardDataset(X_test, y_test)
testsize = len(testset)
testloader = DataLoader(testset, batch_size=1)


### Create pytorch neural net

First trial network: flattening all the data from an event into one list and passing through an RNN.
Loses information from nesting structure.

Second trial network:
- Recurrent neural network with a layer for each nested level of data.
- RNN processes an event one nested level at a time, passing each level through its associated layer in order from highest level to lowest level.
- Append hidden state to input for each layer, then use hidden state for through final output layer


In [3]:
import torch.nn as nn
import torch.nn.functional as F


class AwkwardNeuralNet(nn.Module):
    def __init__(self, max_depth, input_sz, hidden_sz, output_sz):
        super(AwkwardNeuralNet, self).__init__()
        self.max_depth = max_depth
        self.input_sz = input_sz
        self.hidden_sz = hidden_sz
        self.layers = []
        for _ in range(max_depth):
            self.layers.append( nn.Linear(input_sz + hidden_sz, hidden_sz) )
        self.output = nn.Linear(hidden_sz, output_sz)

    def forward(self, input_data, markers, hidden):
        i = 0
        # since we're not iterating over batches
        input_data, markers = input_data[0], markers[0]
        for marker, net_layer in zip(markers, self.layers):
            if marker == 0:
                continue
            for _ in range(marker):
                x = torch.tensor([[input_data[i]]], dtype=torch.float32)
                combined = torch.cat((x, hidden), 1)
                hidden = F.relu(net_layer(combined))
                i += 1
        output = F.log_softmax(self.output(hidden), dim=1)
        return output, hidden

### Train neural net

In [6]:
import torch.optim as optim
import awkwardNN.utils as utils

hidden_size = 32
in_size = 1
out_size = trainset.get_output_size()
max_depth = trainset.get_max_depth()
learning_rate = 0.001
epochs = 5
batch_size = 1
print_freq = int(num_events / 10)

model = AwkwardNeuralNet(max_depth=max_depth,
                         input_sz=in_size,
                         hidden_sz=hidden_size,
                         output_sz=out_size)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

def train():
    for epoch in range(epochs):
        print('\nEpoch: {}/{}'.format(epoch+1, epochs))
        train_one_epoch(epoch)
    return

def train_one_epoch(epoch):
    losses, accs = utils.AverageMeter(), utils.AverageMeter()
    model.train()
    for i, (x, marker, y) in enumerate(trainloader):
        x, marker, y, hidden = _reset(x, marker, y, batch_size)
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            y_hat, hidden = model(x, marker, hidden)
            loss = utils.get_loss(y, y_hat)
            acc = utils.get_accuracy(y, y_hat)
            loss.backward()
            optimizer.step()
            losses.update(loss.item(), x.size(0))
            accs.update(acc.item(), x.size(0))

        if i % print_freq == 0:
            utils.print_train_stat(epoch+1, i+print_freq, x, num_events, loss, acc)
    return losses.avg, accs.avg

def test():
    correct = 0
    losses = utils.AverageMeter()
    model.eval()
    for i, (x, marker, y) in enumerate(testloader):
        x, marker, y, hidden = _reset(x, marker, y, batch_size)
        with torch.no_grad():
            y_hat, _ = model(x, marker, hidden)
            loss = utils.get_loss(y, y_hat)
            _, prediction = torch.max(y_hat, 1)
            correct += prediction.eq(y.data.view_as(prediction)).sum()
            losses.update(loss.item(), x.size(0))
    acc = 100. * correct / testsize
    utils.print_test_set(losses.avg, correct, acc, testsize)
    return losses.avg, acc


def _reset(x, marker, y, batch_size):
    x = x.to(device)
    marker = marker.to(device)
    y = y.to(device)
    hidden = torch.zeros(batch_size, hidden_size)
    hidden = hidden.to(device)
    return x, marker, y, hidden

train()
test()


Epoch: 1/5

Epoch: 2/5

Epoch: 3/5

Epoch: 4/5

Epoch: 5/5

[*] Test set:
    Avg. loss: 0.0878, Accuracy: 9664/10000 (97%)



(0.08776980313313719, tensor(96.6400))

### Current issues/problems:

- Pytorch only takes in `Tensor` objects, which have to be rectangular like numpy arrays
- Varying length arrays → hard to batch data
    - because `Tensor` objects have to be rectangular
    - Could zero pad and then ignore those zeros
    - Currently: go through data one event at a time
- Other ways to pass an Awkward Data Structure through an RNN are possible, perhaps
consider other architectures