In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import uproot
import pandas as pd
import scipy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib import pyplot as plt

### Getting the data

In [2]:
def get_df(root_file_name, filter_name):
    """Read the ``"tree"`` TTree of a ROOT file into a pandas DataFrame.

    Parameters
    ----------
    root_file_name : str
        Path (or URL) of the ROOT file to open.
    filter_name : str, list of str, or callable
        Branch-name filter forwarded to ``TTree.arrays``; ``'*'`` keeps every
        branch.

    Returns
    -------
    Whatever ``TTree.arrays(..., library="pd")`` produces for the selected
    branches (a DataFrame when all branches are flat).
    """
    # `filter_name` is an argument of TTree.arrays, not an option of uproot.open,
    # so it has to be passed at read time to actually restrict the branches.
    # The context manager closes the file once the arrays are materialised.
    with uproot.open(root_file_name) as root_file:
        return root_file["tree"].arrays(filter_name=filter_name, library="pd")

# Input branches, grouped by the kind of observable they describe.
feature_groups = {
    "general": [
        'FatJet_pt', 'FatJet_eta', 'FatJet_phi',
        'FatJet_DDX_jetNSecondaryVertices', 'FatJet_DDX_jetNTracks',
        'FatJet_DDX_z_ratio', 'FatJet_Proba', 'FatJet_area',
        'FatJet_jetId', 'FatJet_lsf3', 'FatJet_rawFactor',
        'FatJet_n2b1', 'FatJet_n3b1',
    ],
    "tau1": [
        'FatJet_tau1', 'FatJet_DDX_tau1_flightDistance2dSig',
        'FatJet_DDX_tau1_trackEtaRel_0', 'FatJet_DDX_tau1_trackEtaRel_1',
        'FatJet_DDX_tau1_trackEtaRel_2', 'FatJet_DDX_tau1_trackSip3dSig_0',
        'FatJet_DDX_tau1_trackSip3dSig_1', 'FatJet_DDX_tau1_vertexDeltaR',
        'FatJet_DDX_tau1_vertexEnergyRatio',
    ],
    "tau2": [
        'FatJet_tau2', 'FatJet_DDX_tau2_flightDistance2dSig',
        'FatJet_DDX_tau2_trackEtaRel_0', 'FatJet_DDX_tau2_trackEtaRel_1',
        'FatJet_DDX_tau2_trackEtaRel_3', 'FatJet_DDX_tau2_trackSip3dSig_0',
        'FatJet_DDX_tau2_trackSip3dSig_1', 'FatJet_DDX_tau2_vertexEnergyRatio',
    ],
    "tau3 and tau4": [
        'FatJet_tau3', 'FatJet_tau4',
    ],
    "track": [
        'FatJet_DDX_trackSip2dSigAboveBottom_0', 'FatJet_DDX_trackSip2dSigAboveBottom_1',
        'FatJet_DDX_trackSip2dSigAboveCharm', 'FatJet_DDX_trackSip3dSig_0',
        'FatJet_DDX_trackSip3dSig_1', 'FatJet_DDX_trackSip3dSig_2',
        'FatJet_DDX_trackSip3dSig_3',
    ],
    "subjet 1": [
        'FatJet_subjet1_pt', 'FatJet_subjet1_eta', 'FatJet_subjet1_phi',
        'FatJet_subjet1_Proba', 'FatJet_subjet1_tau1', 'FatJet_subjet1_tau2',
        'FatJet_subjet1_tau3', 'FatJet_subjet1_tau4',
        'FatJet_subjet1_n2b1', 'FatJet_subjet1_n3b1',
    ],
    "subjet 2": [
        'FatJet_subjet2_pt', 'FatJet_subjet2_eta', 'FatJet_subjet2_phi',
        'FatJet_subjet2_Proba', 'FatJet_subjet2_tau1', 'FatJet_subjet2_tau2',
        'FatJet_subjet2_tau3', 'FatJet_subjet2_tau4',
        'FatJet_subjet2_n2b1', 'FatJet_subjet2_n3b1',
    ],
    "fatjet sv": [
        'FatJet_sv_costhetasvpv', 'FatJet_sv_d3dsig', 'FatJet_sv_deltaR',
        'FatJet_sv_dxysig', 'FatJet_sv_enration', 'FatJet_sv_normchi2',
        'FatJet_sv_ntracks', 'FatJet_sv_phirel', 'FatJet_sv_pt',
        'FatJet_sv_ptrel',
    ],
}

# Flatten the groups and sort alphabetically; the sorted order fixes the column
# order of the design matrix built below.
features = sorted(name for group in feature_groups.values() for name in group)

# inputfile = 'QCD_HT500to700.root'
inputfile = '/eos/user/a/afriberg/datasets/QCD_samples/QCD_HT700to1000.root'

# Load every branch, discard rows containing a NaN in *any* loaded branch, then
# keep only the selected feature columns.
# NOTE(review): because dropna runs before the column selection, a NaN in a
# branch that is not in `features` also removes the row -- confirm this is intended.
df = get_df(inputfile, '*').dropna()
df = df[features]

# Design matrix: one row per entry of df, one column per entry of `features`.
X = df.to_numpy()

In [3]:
# MUST BE RUN BEFORE THE NORMALIZATION
# Split the rows of X by the value of the secondary-vertex-count feature.
X = X.astype("float32")

# Look the column up by name instead of assuming it is column 0: it only lands
# at index 0 because `features` is sorted alphabetically, which would silently
# change if a feature sorting earlier were ever added.
nsv_col = features.index('FatJet_DDX_jetNSecondaryVertices')
nsv = X[:, nsv_col]

# Sanity checks: the counts should be integer-valued.
# The index of the maximum number of secondary vertices
max_idx = np.argmax(nsv)
print(f"This should be close to an integer: {X[max_idx, nsv_col]}")

print(f"values are all close to integers: {np.allclose(nsv - nsv.astype(int), 0)}")

# Boolean row masks for count == 0, count > 0 and count < 0.
# NOTE(review): negative counts are presumably a "not filled" sentinel -- confirm.
zero_idxs = nsv == 0
pos_idxs = nsv > 0
neg_idxs = nsv < 0

X_pos = X[pos_idxs]
X_zero = X[zero_idxs]
X_neg = X[neg_idxs]

# my_bins = range(0, 8)
# plt.hist(X[X[:,0] >= 0], bins=my_bins)
# plt.show()

print(f"Shape of design matrix: {np.shape(X)}")
print(f"Shape of zero matrix: {np.shape(X_zero)}")
print(f"Shape of positive matrix: {np.shape(X_pos)}")
print(f"Shape of negative matrix: {np.shape(X_neg)}")

# Training data: the rows with a positive count, with the count column itself
# removed.
# NOTE(review): the previous comment here said "data with 0 Secondary Vertices",
# but the code has always selected X_pos (count > 0) -- confirm which is intended.
data = np.delete(X_pos, nsv_col, axis=1)
print(f"Shape of the data is {np.shape(data)}")

This should be close to an integer: 7.0
values are all close to integers: True
Shape of design matrix: (399367, 69)
Shape of zero matrix: (224166, 69)
Shape of positive matrix: (175126, 69)
Shape of negative matrix: (75, 69)
Shape of the data is (175126, 68)


In [4]:
batch_size = 32
SPLIT_SEED = 42  # fixed so the train/test partition is the same after a kernel restart

# Scale our data using a MinMaxScaler that will scale
# each number so that it will be between 0 and 1.
# `scaler` is fitted on `data` only, so it can still be used afterwards to
# transform / inverse-transform matrices with the same columns as `data`.
# NOTE(review): the scaler is fitted before the train/test split, so the test
# rows contribute to the min/max -- consider fitting on x_train only.
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

# Use separate scaler instances here. Re-calling scaler.fit_transform would refit
# `scaler` on these matrices (which still contain the column dropped from `data`)
# and leave it unusable for `data`-shaped input. The scaled values are unchanged.
# NOTE(review): each matrix is scaled by its own min/max, so X_pos / X_neg are
# not on the same scale as `data` -- confirm before comparing them to model output.
X_pos = MinMaxScaler().fit_transform(X_pos)
X_neg = MinMaxScaler().fit_transform(X_neg)


x_train, x_test = train_test_split(data, test_size=0.20, random_state=SPLIT_SEED)
original_dim = np.size(data, axis=1)
print(original_dim)

def build_dset(df):
    """Wrap an array in a batched tf.data pipeline of (input, target) pairs.

    Input and target are the same array (autoencoder reconstruction target).
    Batches have exactly `batch_size` rows; a trailing partial batch is dropped.
    """
    df = df.copy()  # keep the dataset independent of later in-place edits
    dataset = tf.data.Dataset.from_tensor_slices((df, df))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

x_train_dataset = build_dset(x_train)
x_test_dataset = build_dset(x_test)

68


### Sampling Layer

In [5]:
class Sampling(layers.Layer):
    """Reparameterization layer: draws z from N(z_mean, exp(z_log_var)).

    Called with the pair ``[z_mean, z_log_var]`` and returns
    ``z_mean + exp(0.5 * z_log_var) * epsilon`` with ``epsilon`` drawn from a
    standard normal of the same shape, so the sample stays differentiable with
    respect to both inputs.
    """

    def call(self, inputs):
        z_mean, z_log_var = inputs
        # tf.random.normal replaces tf.keras.backend.random_normal, which is not
        # available under Keras 3. Shape and dtype follow z_mean so the layer is
        # independent of batch size, latent size and float precision.
        epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
        # exp(0.5 * log_var) is the standard deviation.
        return z_mean + tf.exp(0.5 * z_log_var) * epsilon

### Defining the Encoding model

In [13]:
latent_dim = 3  # size of the latent space

# Encoder: original_dim -> 32 -> 16 -> (mean, log-variance) -> sampled z.
# The shape is passed as an explicit tuple; a bare int is not accepted by every
# Keras version.
inputs = keras.Input(shape=(original_dim,))
x = layers.Dense(32, activation='relu')(inputs)
x = layers.Dense(16, activation='relu')(x)
z_log_var = layers.Dense(latent_dim, name="log_variance")(x)
z_mean = layers.Dense(latent_dim, name="mean")(x)
# Sampling defines no __init__ of its own, so a positional argument would be
# bound to the base Layer's first parameter (`trainable`), not to a latent size.
# The latent size is inferred from z_mean at call time.
z = Sampling(name="sampling")([z_mean, z_log_var])

# The model exposes all three tensors: the distribution parameters are needed
# for the KL term, the sample is what the decoder consumes.
encoder = keras.Model(inputs, [z_mean, z_log_var, z], name="Encoder")

### Building the decoder

In [14]:
# Decoder: latent_dim -> 16 -> 32 -> original_dim, mirroring the encoder.
# The shape is passed as an explicit tuple; a bare int is not accepted by every
# Keras version.
latent_input = keras.Input(shape=(latent_dim,))
x = layers.Dense(16, activation='relu')(latent_input)
x = layers.Dense(32, activation='relu')(x)
# Sigmoid keeps every output in (0, 1), the range required by the
# binary-crossentropy reconstruction loss.
x = layers.Dense(original_dim, activation='sigmoid')(x)
decoder = keras.Model(latent_input, x, name="Decoder")

### Defining the VAE model class

In [16]:
class VAE(keras.Model):
    """Variational autoencoder built from a separate encoder and decoder model.

    Parameters
    ----------
    encoder : keras.Model
        Maps an input batch to ``[z_mean, z_log_var, z]``.
    decoder : keras.Model
        Maps a latent sample ``z`` back to the input space.
    **kwargs
        Forwarded to ``keras.Model.__init__``.

    The loss is computed inside ``train_step`` (reconstruction binary
    crossentropy plus KL divergence), so ``compile`` only needs an optimizer.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running means reported in the progress bar / History.
        self.total_loss_tracker = keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss")

        # Print both architectures as soon as the VAE is constructed.
        self.encoder.summary()
        self.decoder.summary()

    @property
    def metrics(self):
        """Trackers exposed to Keras via the ``metrics`` property."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    # This was replaced from the original by using Otto's model
    def train_step(self, data):
        """Run one optimisation step on a batch and return the running losses."""
        # fit() passes whatever the dataset yields. build_dset yields
        # (input, target) pairs, so keep only the input; passing the whole
        # tuple to the single-input encoder would fail.
        if isinstance(data, (tuple, list)):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)

            # binary_crossentropy already averages over the last (feature) axis
            # and returns one value per row. Summing that result over axis=-1
            # would therefore add up the *batch*, not the features. Multiply by
            # the feature count to get the per-row sum over features, then
            # average over the batch.
            per_row_bce = tf.keras.losses.binary_crossentropy(data, reconstruction)
            n_features = tf.cast(tf.shape(data)[-1], per_row_bce.dtype)
            reconstruction_loss = tf.reduce_mean(n_features * per_row_bce)

            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1):
            # summed over latent dimensions, averaged over the batch.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss

        weights = self.encoder.trainable_weights + self.decoder.trainable_weights
        grads = tape.gradient(total_loss, weights)
        self.optimizer.apply_gradients(zip(grads, weights))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def call(self, data):
        """Encode `data`, sample a latent vector and return its reconstruction."""
        z_mean, z_log_var, z = self.encoder(data)
        y_pred = self.decoder(z)
        return y_pred

### Compiling the model and training it

In [None]:
# Assemble the VAE from the encoder/decoder built above. compile() only gets an
# optimizer because VAE.train_step computes the loss itself.
vae = VAE(encoder=encoder, decoder=decoder)
vae.compile(optimizer="adam")
# Training is currently disabled:
# vae.fit(x_train_dataset, shuffle=True, epochs=20, batch_size=batch_size)