# My first Variational Autoencoder for Jet compression

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot
import awkward

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

## Getting the data
I will only use $p_T, \eta, \phi \text{ and } E$.

In [3]:
path_to_data = '../../data/'

folder15 = 'breynold/user.breynold.data15_13TeV.00284484.physics_Main.DAOD_NTUP_JTRIG_JETM1.r9264_p3083_p3601_j042_tree.root/'
file15 = 'user.breynold.18753218._000001.tree.root'
folder16 = 'breynold/user.breynold.data16_13TeV.00307656.physics_Main.DAOD_NTUP_JTRIG_JETM1.r9264_p3083_p3601_j042_tree.root/'
file16 = 'user.breynold.18797259._000001.tree.root'

# Load a ROOT file
filePath = path_to_data + folder16 + file16
ttree = uproot.open(filePath)['outTree']['nominal']

In [4]:
branchnames = ['nAntiKt4EMTopoJets_Calib2018',
               'AntiKt4EMTopoJets_Calib2018_E',
               'AntiKt4EMTopoJets_Calib2018_pt',
               'AntiKt4EMTopoJets_Calib2018_phi',
               'AntiKt4EMTopoJets_Calib2018_eta']

jaggedE = ttree.array(branchnames[1])
jaggedpT = ttree.array(branchnames[2])
jaggedphi = ttree.array(branchnames[3])
jaggedeta = ttree.array(branchnames[4])

In [8]:
def get_leading(jaggedX):
    return jaggedX[jaggedX.counts > 1, 0]

leading_E = get_leading(jaggedE)
leading_pT = get_leading(jaggedpT)
leading_phi = get_leading(jaggedphi)
leading_eta = get_leading(jaggedeta)

In [9]:
print(leading_E.shape, leading_eta.shape, leading_phi.shape, leading_pT.shape)

(1937902,) (1937902,) (1937902,) (1937902,)


In [10]:
df = pd.DataFrame(data = {'pT': leading_pT, 'eta': leading_eta, 'phi': leading_phi, 'E': leading_E})

In [11]:
df.head(10)

Unnamed: 0,pT,eta,phi,E
0,161.850494,-0.764774,2.28735,212.2173
1,164.702454,0.205651,-1.074816,169.021805
2,405.421387,-0.064094,-2.32402,407.661316
3,139.671234,-0.289339,-2.052494,145.860703
4,227.195618,2.158644,-1.864455,996.913025
5,180.480667,1.684722,-1.049987,503.47641
6,39.454185,-0.305395,0.755678,41.94334
7,202.920883,1.583752,-0.301926,515.302856
8,263.867432,-0.141594,-2.308791,267.712372
9,234.460327,-2.949176,2.11475,2244.307617


## Splitting into training and test sets

In [12]:
n_features = len(df.loc[0])

train, test = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
print(train.shape, test.shape)

(1550321, 4) (387581, 4)


In [14]:
# Normalize the features
train_mean = train.mean()
train_std = train.std()

train = (train - train_mean) / train_std
test = (test - train_mean) / train_std  # Is this the right way to normalize? (only using train mean and std to normalize both train and test)

train_x = train
test_x = test
train_y = train_x  # y = x since we are building and AE
test_y = test_x

train_ds = TensorDataset(torch.tensor(train_x.values), torch.tensor(train_y.values))
valid_ds = TensorDataset(torch.tensor(test_x.values), torch.tensor(test_y.values))

## Building the model

In [51]:
input_size = n_features
representation_size = input_size - 1
intermediate_size = 6
class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()
        self.en1 = nn.Linear(input_size, intermediate_size)
        self.en_mu = nn.Linear(intermediate_size, representation_size)
        self.en_std = nn.Linear(intermediate_size, representation_size)
        self.de1 = nn.Linear(representation_size, intermediate_size)
        self.de2 = nn.Linear(intermediate_size, input_size)
        self.tanh = nn.Tanh()
        self.sigmoid = nn.Sigmoid()
        
    def encode(self, x):
        """Encode a batch of samples, and return posterior parameters for each point."""
        h1 = self.tanh(self.en1(x))
        return self.en_mu(h1), self.en_std(h1)
    
    def decode(self, z):
        """Decode a batch of latent variables"""
        
        h2 = self.tanh(self.de1(z))
        return self.de2(h2)
#        return self.sigmoid(self.de2(h2))
    
    def reparam(self, mu, logvar):
        """Reparameterisation trick to sample z values. 
        This is stochastic during training,  and returns the mode during evaluation."""
        
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = Variable(std.data.new(std.size()).normal_())
            return eps.mul(std).add_(mu)
        else:
            return mu
            
    
    def forward(self, x):
        """Takes a batch of samples, encodes them, and then decodes them again to compare."""
        mu, logvar = self.encode(x.view(-1, input_size))
        z = self.reparam(mu, logvar)
        return self.decode(z), mu, logvar
    
    def loss(self, reconstruction, x, mu, logvar):
        """ELBO assuming entries of x are binary variables, with closed form KLD."""
        
        #bce = torch.nn.functional.binary_cross_entropy(reconstruction, x.view(-1, input_size))
        mse = nn.functional.mse_loss(reconstruction, x)
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        # Normalise by same number of elements as in reconstruction
        KLD /= x.view(-1, input_size).data.shape[0] * input_size

        return KLD + mse #+bce
    
    def get_z(self, x):
        """Encode a batch of data points, x, into their z representations."""
        
        mu, logvar = self.encode(x.view(-1, input_size))
        return self.reparam(mu, logvar)

In [52]:
model = VAE().float()

In [53]:
# Some helper functions

def get_data(train_ds, valid_ds, bs):
    return (
        DataLoader(train_ds, batch_size=bs, shuffle=True),
        DataLoader(valid_ds, batch_size=bs * 2),
    )


def loss_batch(model, xb, yb, opt=None):
    #loss = loss_func(model(xb), yb)
    data = Variable(xb, requires_grad=False)
    reco_b, mu, logvar = model(data)
    loss = model.loss(reco_b, xb, mu, logvar)
    
    if opt is not None:
        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item(), len(xb)


def fit(epochs, model, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, xb, yb, opt)

        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, xb, yb) for xb, yb in valid_dl]
            )
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)  # MSE-Loss
        if(epoch % 1 == 0):
            print('epoch: ' + str(epoch) + ',', 'validation loss: ' + str(val_loss))

In [54]:
bs = 64  # batch size
train_dl, valid_dl = get_data(train_ds, valid_ds, bs)

## Training

In [55]:
epochs = 10
opt = optim.Adam(model.parameters(), lr=1e-3)
fit(epochs, model, opt, train_dl, valid_dl)

epoch: 0, validation loss: 0.5196816085694753
epoch: 1, validation loss: 0.5108155167093494
epoch: 2, validation loss: 0.5154185904126377
epoch: 3, validation loss: 0.5132839906139113
epoch: 4, validation loss: 0.5160012102922058
epoch: 5, validation loss: 0.5144948043346376
epoch: 6, validation loss: 0.5122744664231682
epoch: 7, validation loss: 0.5143750624095714
epoch: 8, validation loss: 0.5143660303076657
epoch: 9, validation loss: 0.5159009462526689


In [56]:
model.eval()
for ii in np.arange(100, 105):
    data = valid_ds.tensors[0][ii]
    pred = model(data)
    print('Inp:', data)
    print('Out:', pred)
    print(' ')

Inp: tensor([ 1.6863, -0.7095,  1.6946,  0.2721])
Out: (tensor([[ 0.8634, -0.3817,  0.8849, -0.1324]], grad_fn=<AddmmBackward>), tensor([[-1.1585, -0.4377, -1.1141]], grad_fn=<AddmmBackward>), tensor([[-1.1476, -1.3939, -0.9113]], grad_fn=<AddmmBackward>))
 
Inp: tensor([ 0.1766, -0.1649, -0.0310, -0.5931])
Out: (tensor([[-0.0484, -0.0383, -0.0242, -0.5046]], grad_fn=<AddmmBackward>), tensor([[ 0.0388, -0.0627,  0.0279]], grad_fn=<AddmmBackward>), tensor([[-0.7765, -1.2031, -0.5871]], grad_fn=<AddmmBackward>))
 
Inp: tensor([-0.3746,  0.6508,  1.3037, -0.5510])
Out: (tensor([[-0.2542,  0.3137,  0.8408, -0.4866]], grad_fn=<AddmmBackward>), tensor([[ 0.4190,  0.3437, -0.9271]], grad_fn=<AddmmBackward>), tensor([[-0.7277, -1.2477, -0.8198]], grad_fn=<AddmmBackward>))
 
Inp: tensor([-0.3711, -0.0072,  1.0894, -0.7293])
Out: (tensor([[-0.2641,  0.0272,  0.7654, -0.5709]], grad_fn=<AddmmBackward>), tensor([[ 0.4449,  0.0076, -0.8131]], grad_fn=<AddmmBackward>), tensor([[-0.7119, -1.1983, -0.

In [41]:
idxs = (4000, 4010)  # Choose events to compare
#Get some data for comparison
data = valid_ds.tensors[0][idxs[0]:idxs[1]]
pred = model(data)

In [46]:
data.shape

torch.Size([10, 4])

In [50]:
pred[0]

tensor([[-0.1568,  0.2490,  0.7704, -0.4352],
        [-0.0594, -0.1868, -0.8088, -0.3842],
        [-0.3419,  0.0142,  0.0736, -0.4948],
        [-0.2194, -0.2284, -0.8096, -0.4249],
        [-0.3937,  0.1628,  0.8632, -0.5142],
        [ 0.6288,  0.0240,  0.6771, -0.2426],
        [-0.2346, -0.0035,  0.5084, -0.4622],
        [-0.3577,  0.0455, -0.6020, -0.5001],
        [-0.3641, -0.9947,  0.8514,  0.7858],
        [-0.3432, -0.9999,  0.8278,  0.9889]], grad_fn=<TanhBackward>)

In [25]:
# Plot input on top of output
linewd = 3
line_style = ['-', '--']
colors = ['c', 'orange']
fontsz = 16
figsz = (4, 3)
for kk in np.arange(4):
    plt.figure(kk, figsize=figsz)
    plt.plot(pred[:, kk], color=colors[0], label='Output', linestyle=line_style[0], linewidth=linewd)
    plt.plot(data[:, kk], color=colors[1], label='Input', linestyle=line_style[1], linewidth=linewd)
    plt.title(df.columns[kk], fontsize=fontsz)
    plt.xlabel('Event', fontsize=fontsz)
    plt.ylabel(df.columns[kk], fontsize=fontsz)

TypeError: tuple indices must be integers or slices, not tuple

<Figure size 288x216 with 0 Axes>

In [26]:
kk

0

In [29]:
type(pred)

tuple

In [30]:
data

tensor([[-0.0323,  0.7058,  1.1086, -0.3939],
        [ 0.1698, -0.6500, -1.3607, -0.3466],
        [-0.4372, -0.0921,  0.0349, -0.7382],
        [-0.1991, -0.7556, -1.3839, -0.3895],
        [-0.6349,  0.5452,  1.7008, -0.6860],
        [ 1.4177, -0.0505,  0.8754, -0.3445],
        [-0.1545, -0.1721,  0.5062, -0.6675],
        [-0.4818,  0.0487, -0.7327, -0.7531],
        [-0.5573, -1.7204,  1.6769,  1.1776],
        [-0.2746, -1.9680,  1.5403,  2.9611]])

In [39]:
data = test[idxs[0]:idxs[1]].values
data = torch.tensor(data)

In [40]:
model(data)

(tensor([[-0.1568,  0.2490,  0.7704, -0.4352],
         [-0.0594, -0.1868, -0.8088, -0.3842],
         [-0.3419,  0.0142,  0.0736, -0.4948],
         [-0.2194, -0.2284, -0.8096, -0.4249],
         [-0.3937,  0.1628,  0.8632, -0.5142],
         [ 0.6288,  0.0240,  0.6771, -0.2426],
         [-0.2346, -0.0035,  0.5084, -0.4622],
         [-0.3577,  0.0455, -0.6020, -0.5001],
         [-0.3641, -0.9947,  0.8514,  0.7858],
         [-0.3432, -0.9999,  0.8278,  0.9889]], grad_fn=<TanhBackward>),
 tensor([[ 0.8532,  0.1347, -0.3429],
         [-0.9502, -0.1197,  0.3582],
         [ 0.0413,  0.4601,  0.0386],
         [-0.9644,  0.1735,  0.4069],
         [ 1.0729,  0.6124, -0.2068],
         [ 0.7463, -0.9481,  0.0175],
         [ 0.4565,  0.2650,  0.0690],
         [-0.6096,  0.4706, -0.0095],
         [ 1.0724,  0.2565,  1.5770],
         [ 1.0277, -0.0427,  1.9915]], grad_fn=<AddmmBackward>),
 tensor([[-0.7347, -0.4488, -1.0043],
         [-0.7437, -0.4104, -1.0483],
         [-0.7366, -0

In [None]:
idxs = (4000, 4010)  # Choose events to compare
#Get some data for comparison
data = test
pred = model(data)