# AwkwardNN: trial 1

### Get awkward toy data

- Focus on just nested and jagged arrays
- Assign maximum possible length and max possible depth for all arrays.
    * Randomly pick max length/depth from range [1, max] for each event
    and again for each sublist/nested list.
- Binary targets: 0, 1
- Signal data drawn from
    * Gaussian(-L, 1) for target 0, where L is the level of the data
        * e.g. for the highest level of data (nest 0) with max depth 5,
        data would be drawn from Gaussian(-5, 1).
    * Gaussian(+L, 1) for target 1
- Noise data: drawn from uniform dist in range (-max_depth, +max_depth)
- Assign probabilities for possible elements in array. Example:
    * p( signal ) = 0.50
    * p( noise ) = 0.10
    * p( subarray ) = 0.40
- Starting with an empty list, randomly sample elements until the max length
is reached. Once the max depth is reached, only signal and noise elements are
sampled (no further subarrays).



In [None]:
import awkward
from awkwardNN.createAwkwardData import generate_data_target

# Settings for the toy data set: size limits, element probabilities and
# the number of events to generate.
max_len, max_depth = 3, 3
p_signal, p_noise, p_subarray = 0.50, 0.10, 0.40
num_events = 5

data, targets = generate_data_target(
    num_events=num_events,
    prob_nest=p_subarray,
    prob_sig=p_signal,
    prob_bkg=p_noise,
    max_len=max_len,
    max_depth=max_depth,
)

# Print each event as "<target>: <data>".
for idx in range(num_events):
    print(f"{targets[idx]}: {data[idx]}")


### Some details

Create configuration namespace object to assign arbitrary values for NN
parameters such as size of hidden layers, number of training epochs,
learning rate, etc.

In [None]:
from awkwardNN.config import get_config
import numpy as np
import torch

# get_config() returns a pair; only the first item (the config namespace)
# is used here.
config, _ = get_config()

# Seed every random number generator used below from the same config value.
seed = config.random_seed
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Dump every configuration entry as "<name>: <value>".
config_dict = vars(config)
for name, value in config_dict.items():
    print(f"{name}: {value}")

### Create Pytorch dataloader for neural net

In [None]:

from awkwardNN.preprocessAwkwardData import get_dataloader

# The training and validation loaders are built from the same config
# values; only the dataset size differs between them.
loader_kwargs = dict(
    batch_size=config.batch_size,
    prob_nest=config.prob_nest,
    prob_signal=config.prob_signal,
    prob_noise=config.prob_noise,
    max_len=config.max_len,
    max_depth=config.max_depth,
)
trainloader = get_dataloader(dataset_size=config.train_size, **loader_kwargs)
validloader = get_dataloader(dataset_size=config.valid_size, **loader_kwargs)
dataloader = (trainloader, validloader)


### Create pytorch neural net

First trial network: flattening all the data from an event into one list and passing through an RNN.
Loses information from nesting structure.

Second trial network:
- Recurrent neural network with a layer for each nested level of data.
- RNN processes an event one nested level at a time, passing each level through its associated layer in order from highest level to lowest level.
- Concatenate the hidden state with the input at each layer, then pass the final hidden state through the output layer


In [None]:
import torch.nn as nn
import torch.nn.functional as F


class AwkwardNN(nn.Module):
    """Recurrent network with one linear layer per nesting level of an event.

    The elements of an event are consumed one at a time. Each element is
    concatenated with the current hidden state and passed through the linear
    layer that belongs to its level; the final hidden state is mapped to
    log-probabilities by the output layer.

    Args:
        max_depth: number of per-level layers to create.
        input_sz: number of features of a single input element.
        hidden_sz: size of the hidden state.
        output_sz: number of output classes.
    """

    def __init__(self, max_depth, input_sz, hidden_sz, output_sz):
        super(AwkwardNN, self).__init__()
        self.max_depth = max_depth
        self.input_sz = input_sz
        self.hidden_sz = hidden_sz
        # nn.ModuleList (rather than a plain Python list) registers every
        # layer as a submodule, so its weights show up in parameters(), get
        # updated by the optimizer and follow the model in .to(device).
        self.layers = nn.ModuleList(
            [nn.Linear(input_sz + hidden_sz, hidden_sz) for _ in range(max_depth)]
        )
        self.output = nn.Linear(hidden_sz, output_sz)

    def forward(self, input_data, markers, hidden):
        """Run a single event through the network.

        Args:
            input_data: batched event data; only ``input_data[0]`` is read.
            markers: batched element counts; ``markers[0][k]`` is the number
                of consecutive elements of ``input_data[0]`` that are fed to
                layer ``k``.
            hidden: initial hidden state. It is concatenated with a one-row
                input along dim 1, so it must have exactly one row.

        Returns:
            Tuple ``(output, hidden)``: the log-softmax of the output layer
            applied to the final hidden state, and that final hidden state.
        """
        # Batches are not iterated over: only the first event is processed.
        input_data, markers = input_data[0], markers[0]
        i = 0  # position of the next unread element in input_data
        for marker, net_layer in zip(markers, self.layers):
            # A marker of 0 simply skips the layer (empty range).
            for _ in range(int(marker)):
                # Build the input on the same device as the hidden state so
                # the concatenation also works when the model runs on GPU.
                x = torch.as_tensor(
                    input_data[i], dtype=torch.float32, device=hidden.device
                ).reshape(1, -1)
                combined = torch.cat((x, hidden), 1)
                hidden = F.relu(net_layer(combined))
                i += 1
        output = F.log_softmax(self.output(hidden), dim=1)
        return output, hidden

### Train neural net

In [None]:
import torch.optim as optim
import awkwardNN.utils as utils

# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# One scalar input per step and two output classes (binary targets).
model = AwkwardNN(
    max_depth=config.max_depth,
    input_sz=1,
    hidden_sz=config.hidden_size,
    output_sz=2,
).to(device)

optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

def train():
    """Train for ``config.epochs`` epochs, validating after each one.

    After every epoch the validation loss and accuracy are printed together
    with the best validation accuracy seen so far.
    """
    top_acc = 0
    n_epochs = config.epochs
    for epoch in range(n_epochs):
        print('\nEpoch: {}/{}'.format(epoch + 1, n_epochs))
        train_one_epoch(epoch)
        valid_loss, valid_acc = validate_one_epoch(epoch)
        top_acc = max(valid_acc, top_acc)
        utils.print_valid_stat(valid_loss, valid_acc, config.valid_size, top_acc)

def train_one_epoch(epoch):
    """Run one optimisation pass over ``trainloader``.

    Args:
        epoch: zero-based epoch index, used only for the progress printout.

    Returns:
        Tuple ``(avg_loss, avg_acc)`` taken from the two running meters.
    """
    loss_meter = utils.AverageMeter()
    acc_meter = utils.AverageMeter()
    model.train()
    for step, batch in enumerate(trainloader):
        x, marker, y = (t.to(device) for t in batch)
        # Start every batch from a fresh zero hidden state on the model device.
        hidden = torch.zeros(config.batch_size, config.hidden_size, device=device)
        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            y_hat, hidden = model(x, marker, hidden)
            loss = utils.get_loss(y, y_hat)
            acc = utils.get_accuracy(y, y_hat)
            loss.backward()
            optimizer.step()
            # Weight the running averages by the number of rows in x.
            n_rows = x.size(0)
            loss_meter.update(loss.item(), n_rows)
            acc_meter.update(acc.item(), n_rows)

        if step % config.print_freq == 0:
            utils.print_train_stat(
                epoch + 1, step + config.print_freq, x, config.train_size, loss, acc
            )
    return loss_meter.avg, acc_meter.avg


def validate_one_epoch(epoch):
    """Evaluate the model on ``validloader`` without computing gradients.

    Args:
        epoch: accepted for symmetry with ``train_one_epoch``; not used.

    Returns:
        Tuple ``(avg_loss, avg_acc)`` taken from the two running meters.
    """
    loss_meter = utils.AverageMeter()
    acc_meter = utils.AverageMeter()
    model.eval()
    for x, marker, y in validloader:
        x, marker, y = x.to(device), marker.to(device), y.to(device)
        # Start every batch from a fresh zero hidden state on the model device.
        hidden = torch.zeros(config.batch_size, config.hidden_size, device=device)
        with torch.no_grad():
            y_hat, _ = model(x, marker, hidden)
            loss = utils.get_loss(y, y_hat)
            acc = utils.get_accuracy(y, y_hat)
            # Weight the running averages by the number of rows in x.
            n_rows = x.size(0)
            loss_meter.update(loss.item(), n_rows)
            acc_meter.update(acc.item(), n_rows)
    return loss_meter.avg, acc_meter.avg

# Run the training loop defined above.
train()

Or you can run `main.py`

### Current issues/problems:

- Pytorch only takes in `Tensor` objects, which have to be rectangular like numpy arrays
- Varying length arrays → hard to batch data
    - because `Tensor` objects have to be rectangular
    - Could zero pad and then ignore those zeros
    - Currently: go through data one event at a time
- There are other possible ways to pass an Awkward data structure through an RNN;
other architectures may be worth considering