# My first Autoencoder for Jet compression

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import uproot
import awkward

In [2]:
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

## Getting the data
I will only use $p_T, \eta, \phi \text{ and } E$.

In [3]:
# Root directory that holds the ntuple folders (relative to this notebook)
path_to_data = '../../data/'

# 2015 sample -- defined for reference, not opened in this cell
folder15 = 'breynold/user.breynold.data15_13TeV.00284484.physics_Main.DAOD_NTUP_JTRIG_JETM1.r9264_p3083_p3601_j042_tree.root/'
file15 = 'user.breynold.18753218._000001.tree.root'
# 2016 sample -- this is the one that gets loaded
folder16 = 'breynold/user.breynold.data16_13TeV.00307656.physics_Main.DAOD_NTUP_JTRIG_JETM1.r9264_p3083_p3601_j042_tree.root/'
file16 = 'user.breynold.18797259._000001.tree.root'

# Open the ROOT file and pick the 'nominal' tree inside the 'outTree' directory
filePath = f'{path_to_data}{folder16}{file16}'
root_file = uproot.open(filePath)
ttree = root_file['outTree']['nominal']

In [4]:
# Branches of interest; index 0 (the jet multiplicity) is listed but not read here
branchnames = [
    'nAntiKt4EMTopoJets_Calib2018',
    'AntiKt4EMTopoJets_Calib2018_E',
    'AntiKt4EMTopoJets_Calib2018_pt',
    'AntiKt4EMTopoJets_Calib2018_phi',
    'AntiKt4EMTopoJets_Calib2018_eta',
]

# Read E, pT, phi and eta (in that order) as jagged arrays
jaggedE, jaggedpT, jaggedphi, jaggedeta = (
    ttree.array(branch) for branch in branchnames[1:]
)

In [5]:
# Length of each sub-array of the jagged energy branch (one integer per tree entry)
jaggedE.counts

array([20,  9,  9, ...,  5, 18, 11])

In [6]:
# All energy values as one flat array, i.e. with the per-entry nesting removed
jaggedE.content

array([212.2173  , 175.15614 ,  64.451935, ...,  24.369474,  13.941364,
        60.927433], dtype=float32)

In [7]:
def get_leading(jaggedX):
    """Return element 0 of every sub-array that holds more than one element.

    Parameters
    ----------
    jaggedX : jagged array
        Object exposing a ``counts`` attribute (length of each sub-array) and
        supporting ``jaggedX[mask, index]`` indexing.

    Returns
    -------
    The result of ``jaggedX[mask, 0]``, where ``mask`` marks sub-arrays with
    ``counts > 1``.
    """
    # NOTE(review): sub-arrays with exactly one element are dropped as well;
    # ``counts > 0`` would already make index 0 valid -- confirm that requiring
    # at least two elements is intended.
    has_several = jaggedX.counts > 1
    return jaggedX[has_several, 0]

In [8]:
# Apply the same selection to all four kinematic branches
leading_E, leading_pT, leading_phi, leading_eta = (
    get_leading(branch) for branch in (jaggedE, jaggedpT, jaggedphi, jaggedeta)
)

In [9]:
# Sanity check: all four arrays must have the same length
print(*(arr.shape for arr in (leading_E, leading_eta, leading_phi, leading_pT)))

(1937902,) (1937902,) (1937902,) (1937902,)


In [10]:
# One row per selected entry, one column per kinematic variable
df = pd.DataFrame(
    {
        'pT': leading_pT,
        'eta': leading_eta,
        'phi': leading_phi,
        'E': leading_E,
    }
)

In [11]:
# Peek at the first ten rows to check columns and value ranges
df.head(10)

Unnamed: 0,pT,eta,phi,E
0,161.850494,-0.764774,2.28735,212.2173
1,164.702454,0.205651,-1.074816,169.021805
2,405.421387,-0.064094,-2.32402,407.661316
3,139.671234,-0.289339,-2.052494,145.860703
4,227.195618,2.158644,-1.864455,996.913025
5,180.480667,1.684722,-1.049987,503.47641
6,39.454185,-0.305395,0.755678,41.94334
7,202.920883,1.583752,-0.301926,515.302856
8,263.867432,-0.141594,-2.308791,267.712372
9,234.460327,-2.949176,2.11475,2244.307617


## Splitting into training and test sets

In [12]:
# Number of input features = number of DataFrame columns.
# Read from the shape rather than from a row looked up by label, so the value
# does not depend on the index containing a (unique) label 0.
n_features = df.shape[1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
# Row/column counts of the two splits
print(*(split.shape for split in (train, test)))

(1550321, 4) (387581, 4)


In [14]:
# Standardize the features: subtract the mean and divide by the standard deviation.
# Both splits are scaled with statistics computed on the training split only, so
# no information from the test split enters the preprocessing.
train_mean, train_std = train.mean(), train.std()

train = train.sub(train_mean).div(train_std)
test = test.sub(train_mean).div(train_std)

# y = x since we are building an AE: the target of each sample is the sample itself
train_x = train_y = train
test_x = test_y = test

# Wrap (input, target) pairs as tensors for use with DataLoader
train_ds = TensorDataset(*(torch.tensor(part.values) for part in (train_x, train_y)))
valid_ds = TensorDataset(*(torch.tensor(part.values) for part in (test_x, test_y)))

## Building the model

In [15]:
# Deeper autoencoder: n_features -> 8 -> 6 -> 4 -> 6 -> 8 -> n_features,
# with Tanh between the linear layers and no activation on the output.
model_big = nn.Sequential(
    # layers narrowing down to the 4-wide middle layer
    nn.Linear(in_features=n_features, out_features=8), nn.Tanh(),
    nn.Linear(in_features=8, out_features=6), nn.Tanh(),
    nn.Linear(in_features=6, out_features=4), nn.Tanh(),
    # mirrored layers widening back to the input size
    nn.Linear(in_features=4, out_features=6), nn.Tanh(),
    nn.Linear(in_features=6, out_features=8), nn.Tanh(),
    nn.Linear(in_features=8, out_features=n_features),
)

In [16]:
# Small autoencoder: n_features -> 4 -> 4 -> n_features,
# with Tanh between the linear layers and no activation on the output.
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=4), nn.Tanh(),
    nn.Linear(in_features=4, out_features=4), nn.Tanh(),
    nn.Linear(in_features=4, out_features=n_features),
)

In [17]:
# Some helper functions

def get_data(train_ds, valid_ds, bs):
    """Build the training and validation data loaders.

    Parameters
    ----------
    train_ds, valid_ds : Dataset
        Datasets to iterate over.
    bs : int
        Batch size of the training loader.

    Returns
    -------
    tuple
        ``(train_loader, valid_loader)``. The training loader shuffles and uses
        batches of ``bs``; the validation loader keeps the dataset order and
        uses batches of ``2 * bs``.
    """
    train_loader = DataLoader(train_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(valid_ds, batch_size=bs * 2)
    return train_loader, valid_loader


def loss_batch(model, loss_func, xb, yb, opt=None):
    """Compute the loss on one batch and, if an optimizer is given, take one step.

    Parameters
    ----------
    model : callable
        Maps the input batch ``xb`` to predictions.
    loss_func : callable
        Called as ``loss_func(predictions, yb)``; must return a tensor.
    xb, yb : tensors
        Input batch and target batch.
    opt : optimizer, optional
        When provided, gradients are back-propagated, one optimizer step is
        taken and the gradients are reset. When ``None`` the batch is only
        evaluated.

    Returns
    -------
    tuple
        ``(loss_value, batch_size)`` with the loss as a Python float and
        ``batch_size == len(xb)``.
    """
    batch_loss = loss_func(model(xb), yb)
    batch_size = len(xb)

    # Evaluation only: nothing to update
    if opt is None:
        return batch_loss.item(), batch_size

    batch_loss.backward()
    opt.step()
    opt.zero_grad()  # reset so gradients do not accumulate into the next batch
    return batch_loss.item(), batch_size


def fit(epochs, model, loss_func, opt, train_dl, valid_dl, print_every=1):
    """Train ``model`` and print the validation loss while doing so.

    Parameters
    ----------
    epochs : int
        Number of passes over ``train_dl``.
    model : torch.nn.Module
        Model to train; switched between ``train()`` and ``eval()`` mode.
    loss_func : callable
        Loss passed on to ``loss_batch``.
    opt : optimizer
        Optimizer used for the training batches.
    train_dl, valid_dl : iterables of ``(xb, yb)`` batches
        Training and validation batches.
    print_every : int, optional
        Print the validation loss every ``print_every`` epochs (default 1,
        i.e. every epoch). ``0`` or ``None`` disables printing.

    Returns
    -------
    None
    """
    for epoch in range(epochs):
        # Training pass: one optimizer step per batch
        model.train()
        for xb, yb in train_dl:
            loss_batch(model, loss_func, xb, yb, opt)

        # Validation pass: no optimizer, no gradient tracking
        model.eval()
        with torch.no_grad():
            losses, nums = zip(
                *[loss_batch(model, loss_func, xb, yb) for xb, yb in valid_dl]
            )
        # Weight each batch loss by its batch size so that a smaller final
        # batch does not distort the average
        val_loss = np.sum(np.multiply(losses, nums)) / np.sum(nums)
        if print_every and epoch % print_every == 0:
            print(epoch, val_loss)

In [18]:
# Training batch size (get_data uses twice this for the validation loader)
bs = 64
train_dl, valid_dl = get_data(train_ds=train_ds, valid_ds=valid_ds, bs=bs)

## Training

In [19]:
# Train the small model: 10 epochs, mean-squared reconstruction error, Adam
epochs = 10
loss_func = nn.MSELoss()
opt = optim.Adam(model.parameters(), lr=1e-3)
fit(
    epochs=epochs,
    model=model,
    loss_func=loss_func,
    opt=opt,
    train_dl=train_dl,
    valid_dl=valid_dl,
)

0 0.0010494361593958433
1 0.0004540579605446538
2 0.00034858031049211024
3 0.00026986450002987756
4 0.0004867862588560652
5 0.00020429829266885958
6 0.00022113420253831216
7 0.00018123884706553294
8 0.00013537716324884424
9 0.0014280721862093072


In [20]:
# Print a few validation inputs next to the small model's reconstruction of them
val_inputs = valid_ds.tensors[0]
for ii in np.arange(100, 110):
    data = val_inputs[ii]
    pred = model(data)
    print('Inp:', data)
    print('Out:', pred)
    print(' ')

Inp: tensor([ 1.6863, -0.7095,  1.6946,  0.2721])
Out: tensor([ 1.6963, -0.7004,  1.7025,  0.2879], grad_fn=<AddBackward0>)
 
Inp: tensor([ 0.1766, -0.1649, -0.0310, -0.5931])
Out: tensor([ 0.1274, -0.1661, -0.0452, -0.6528], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.3746,  0.6508,  1.3037, -0.5510])
Out: tensor([-0.4260,  0.6506,  1.2878, -0.6035], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.3711, -0.0072,  1.0894, -0.7293])
Out: tensor([-0.4295, -0.0094,  1.0705, -0.7884], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.7818, -0.3965,  1.1572, -0.7662])
Out: tensor([-0.8466, -0.4023,  1.1318, -0.8270], grad_fn=<AddBackward0>)
 
Inp: tensor([0.4060, 1.5628, 1.4620, 1.8385])
Out: tensor([0.4181, 1.5393, 1.5021, 1.8960], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.0816, -0.7146, -1.2172, -0.3813])
Out: tensor([-0.1268, -0.7154, -1.2359, -0.4392], grad_fn=<AddBackward0>)
 
Inp: tensor([ 0.4208,  0.0398, -1.2887, -0.5597])
Out: tensor([ 0.3739,  0.0425, -1.3057, -0.6094], grad_fn=<AddBackward0>)
 


### Train big model

In [21]:
# Train the big model with the same settings: 10 epochs, MSE loss, Adam.
# A fresh optimizer is needed because it must hold model_big's parameters.
epochs = 10
loss_func = nn.MSELoss()
opt = optim.Adam(model_big.parameters(), lr=1e-3)
fit(
    epochs=epochs,
    model=model_big,
    loss_func=loss_func,
    opt=opt,
    train_dl=train_dl,
    valid_dl=valid_dl,
)

0 0.0007220761639093102
1 0.0005584392987280982
2 0.00032148699721384167
3 0.0002659424027709085
4 0.00040910318819990164
5 0.00028860363617066976
6 0.0003309040378328268
7 0.0003966791239062391
8 0.00017902772095663744
9 0.00023927971482582953


In [22]:
# Print a few validation inputs next to the big model's reconstruction of them
val_inputs = valid_ds.tensors[0]
for ii in np.arange(100, 110):
    data = val_inputs[ii]
    pred = model_big(data)
    print('Inp:', data)
    print('Out:', pred)
    print(' ')

Inp: tensor([ 1.6863, -0.7095,  1.6946,  0.2721])
Out: tensor([ 1.6891, -0.7021,  1.7048,  0.2885], grad_fn=<AddBackward0>)
 
Inp: tensor([ 0.1766, -0.1649, -0.0310, -0.5931])
Out: tensor([ 0.1732, -0.1706, -0.0303, -0.5954], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.3746,  0.6508,  1.3037, -0.5510])
Out: tensor([-0.3743,  0.6546,  1.3109, -0.5559], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.3711, -0.0072,  1.0894, -0.7293])
Out: tensor([-0.3758, -0.0057,  1.0981, -0.7372], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.7818, -0.3965,  1.1572, -0.7662])
Out: tensor([-0.7852, -0.3941,  1.1631, -0.7795], grad_fn=<AddBackward0>)
 
Inp: tensor([0.4060, 1.5628, 1.4620, 1.8385])
Out: tensor([0.4307, 1.5773, 1.4798, 1.8541], grad_fn=<AddBackward0>)
 
Inp: tensor([-0.0816, -0.7146, -1.2172, -0.3813])
Out: tensor([-0.0888, -0.7210, -1.2308, -0.3745], grad_fn=<AddBackward0>)
 
Inp: tensor([ 0.4208,  0.0398, -1.2887, -0.5597])
Out: tensor([ 0.4208,  0.0341, -1.3000, -0.5436], grad_fn=<AddBackward0>)
 
