# import library

In [11]:
from functools import partial

import numpy as np
import pandas as pd
import os
import random
import time
import tensorflow as tf, re, math
from tensorflow.keras import applications
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import optimizers
from tensorflow.keras import metrics
from tensorflow.keras import Model, Sequential
from tensorflow.keras import backend as K 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from matplotlib import pyplot as plt
import gc
import uproot

### Checking for TPUs

In [12]:
# Choose the tf.distribute strategy used later to build the model: a TPU strategy
# when a TPU cluster can be resolved, otherwise MirroredStrategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print("Running on TPU ", tpu.cluster_spec().as_dict()["worker"])
    # Connect to the cluster and initialise the TPU system before creating the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    # A ValueError from the block above is treated as "no TPU runtime available".
    print("Not connected to a TPU runtime. Using CPU/GPU strategy")
    strategy = tf.distribute.MirroredStrategy()
    
!nvidia-smi

Not connected to a TPU runtime. Using CPU/GPU strategy
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
/usr/bin/sh: nvidia-smi: command not found


### Loading the data

In [13]:
def get_features():
    """Return the full list of FatJet branch names in a fixed order.

    The list is assembled from per-family sub-lists on every call, so each
    caller receives its own list object.
    """
    jet_level = [
        'FatJet_pt', 'FatJet_eta', 'FatJet_phi',
        'FatJet_DDX_jetNSecondaryVertices', 'FatJet_DDX_jetNTracks', 'FatJet_DDX_z_ratio',
        'FatJet_Proba', 'FatJet_area', 'FatJet_jetId', 'FatJet_lsf3',
        'FatJet_mass', 'FatJet_msoftdrop', 'FatJet_rawFactor',
        'FatJet_n2b1', 'FatJet_n3b1',
    ]
    tau1_axis = [
        'FatJet_tau1', 'FatJet_DDX_tau1_flightDistance2dSig',
        'FatJet_DDX_tau1_trackEtaRel_0', 'FatJet_DDX_tau1_trackEtaRel_1',
        'FatJet_DDX_tau1_trackEtaRel_2',
        'FatJet_DDX_tau1_trackSip3dSig_0', 'FatJet_DDX_tau1_trackSip3dSig_1',
        'FatJet_DDX_tau1_vertexDeltaR', 'FatJet_DDX_tau1_vertexEnergyRatio',
        'FatJet_DDX_tau1_vertexMass',
    ]
    tau2_axis = [
        'FatJet_tau2', 'FatJet_DDX_tau2_flightDistance2dSig',
        'FatJet_DDX_tau2_trackEtaRel_0', 'FatJet_DDX_tau2_trackEtaRel_1',
        'FatJet_DDX_tau2_trackEtaRel_3',
        'FatJet_DDX_tau2_trackSip3dSig_0', 'FatJet_DDX_tau2_trackSip3dSig_1',
        'FatJet_DDX_tau2_vertexEnergyRatio', 'FatJet_DDX_tau2_vertexMass',
    ]
    higher_taus = ['FatJet_tau3', 'FatJet_tau4']
    track_level = [
        'FatJet_DDX_trackSip2dSigAboveBottom_0', 'FatJet_DDX_trackSip2dSigAboveBottom_1',
        'FatJet_DDX_trackSip2dSigAboveCharm',
        'FatJet_DDX_trackSip3dSig_0', 'FatJet_DDX_trackSip3dSig_1',
        'FatJet_DDX_trackSip3dSig_2', 'FatJet_DDX_trackSip3dSig_3',
    ]
    subjet1 = [
        'FatJet_subjet1_pt', 'FatJet_subjet1_eta', 'FatJet_subjet1_phi',
        'FatJet_subjet1_Proba', 'FatJet_subjet1_mass',
        'FatJet_subjet1_tau1', 'FatJet_subjet1_tau2',
        'FatJet_subjet1_tau3', 'FatJet_subjet1_tau4',
        'FatJet_subjet1_n2b1', 'FatJet_subjet1_n3b1',
    ]
    subjet2 = [
        'FatJet_subjet2_pt', 'FatJet_subjet2_eta', 'FatJet_subjet2_phi',
        'FatJet_subjet2_Proba', 'FatJet_subjet2_mass',
        'FatJet_subjet2_tau1', 'FatJet_subjet2_tau2',
        'FatJet_subjet2_tau3', 'FatJet_subjet2_tau4',
        'FatJet_subjet2_n2b1', 'FatJet_subjet2_n3b1',
    ]
    flavour = ['FatJet_hadronFlavour']
    secondary_vertex = [
        'FatJet_sv_costhetasvpv', 'FatJet_sv_d3dsig', 'FatJet_sv_deltaR',
        'FatJet_sv_dxysig', 'FatJet_sv_enration', 'FatJet_sv_mass',
        'FatJet_sv_normchi2', 'FatJet_sv_ntracks', 'FatJet_sv_phirel',
        'FatJet_sv_pt', 'FatJet_sv_ptrel',
    ]
    pf_candidates = [
        'FatJet_nFatJetPFCands', 'FatJet_pfcand_max_deltar', 'FatJet_pfcand_mean_deltar',
    ]
    gen_level = [
        'FatJet_gen_pt', 'FatJet_gen_eta', 'FatJet_gen_phi', 'FatJet_gen_hadronFlavour',
    ]

    # Concatenation order reproduces the order of the original flat literal.
    return (
        jet_level + tau1_axis + tau2_axis + higher_taus + track_level
        + subjet1 + subjet2 + flavour + secondary_vertex + pf_candidates + gen_level
    )

In [14]:
def get_df(root_file_name, filter_name):
    """Read the ``"tree"`` object of a ROOT file into pandas.

    Args:
        root_file_name: Path of the ROOT file to open.
        filter_name: Branch-name filter passed to ``arrays`` (the notebook
            always passes ``'*'``).

    Returns:
        Whatever ``arrays(library="pd")`` returns for the selected branches.
    """
    # `filter_name` selects branches when reading arrays; it is not a
    # file-opening option, so it is passed to `arrays` instead of `uproot.open`.
    # The context manager closes the file once the arrays have been read.
    with uproot.open(root_file_name) as root_file:
        return root_file["tree"].arrays(filter_name=filter_name, library="pd")

# Branches used as autoencoder inputs. Each variable family is written as its
# own literal; the families are concatenated and the result is sorted
# alphabetically, which fixes the column order of every feature matrix.
features = sorted(
    # jet-level quantities
    [
        'FatJet_pt', 'FatJet_eta', 'FatJet_phi',
        'FatJet_DDX_jetNSecondaryVertices', 'FatJet_DDX_jetNTracks', 'FatJet_DDX_z_ratio',
        'FatJet_Proba', 'FatJet_area', 'FatJet_jetId', 'FatJet_lsf3',
        'FatJet_rawFactor', 'FatJet_n2b1', 'FatJet_n3b1',
    ]
    # tau1 family
    + [
        'FatJet_tau1', 'FatJet_DDX_tau1_flightDistance2dSig',
        'FatJet_DDX_tau1_trackEtaRel_0', 'FatJet_DDX_tau1_trackEtaRel_1',
        'FatJet_DDX_tau1_trackEtaRel_2',
        'FatJet_DDX_tau1_trackSip3dSig_0', 'FatJet_DDX_tau1_trackSip3dSig_1',
        'FatJet_DDX_tau1_vertexDeltaR', 'FatJet_DDX_tau1_vertexEnergyRatio',
    ]
    # tau2 family
    + [
        'FatJet_tau2', 'FatJet_DDX_tau2_flightDistance2dSig',
        'FatJet_DDX_tau2_trackEtaRel_0', 'FatJet_DDX_tau2_trackEtaRel_1',
        'FatJet_DDX_tau2_trackEtaRel_3',
        'FatJet_DDX_tau2_trackSip3dSig_0', 'FatJet_DDX_tau2_trackSip3dSig_1',
        'FatJet_DDX_tau2_vertexEnergyRatio',
    ]
    # tau3 and tau4
    + ['FatJet_tau3', 'FatJet_tau4']
    # track-level quantities
    + [
        'FatJet_DDX_trackSip2dSigAboveBottom_0', 'FatJet_DDX_trackSip2dSigAboveBottom_1',
        'FatJet_DDX_trackSip2dSigAboveCharm',
        'FatJet_DDX_trackSip3dSig_0', 'FatJet_DDX_trackSip3dSig_1',
        'FatJet_DDX_trackSip3dSig_2', 'FatJet_DDX_trackSip3dSig_3',
    ]
    # subjet 1
    + [
        'FatJet_subjet1_pt', 'FatJet_subjet1_eta', 'FatJet_subjet1_phi',
        'FatJet_subjet1_Proba',
        'FatJet_subjet1_tau1', 'FatJet_subjet1_tau2',
        'FatJet_subjet1_tau3', 'FatJet_subjet1_tau4',
        'FatJet_subjet1_n2b1', 'FatJet_subjet1_n3b1',
    ]
    # subjet 2
    + [
        'FatJet_subjet2_pt', 'FatJet_subjet2_eta', 'FatJet_subjet2_phi',
        'FatJet_subjet2_Proba',
        'FatJet_subjet2_tau1', 'FatJet_subjet2_tau2',
        'FatJet_subjet2_tau3', 'FatJet_subjet2_tau4',
        'FatJet_subjet2_n2b1', 'FatJet_subjet2_n3b1',
    ]
    # fat-jet secondary-vertex quantities
    + [
        'FatJet_sv_costhetasvpv', 'FatJet_sv_d3dsig', 'FatJet_sv_deltaR', 'FatJet_sv_dxysig',
        'FatJet_sv_enration', 'FatJet_sv_normchi2', 'FatJet_sv_ntracks', 'FatJet_sv_phirel',
        'FatJet_sv_pt', 'FatJet_sv_ptrel',
    ]
)

root_dir = "/eos/user/a/afriberg/datasets/QCD_samples/"


def _load_feature_matrix(path):
    """Read one ROOT file and return its `features` columns as a float32 array."""
    # Rows with a NaN in *any* branch are dropped before the feature columns
    # are selected, exactly as the original cell did.
    frame = get_df(path, '*').dropna()
    return frame[features].to_numpy().astype(np.float32)


# os.listdir returns entries in arbitrary order; sorting makes the row order
# of X reproducible from run to run.
root_files = sorted(name for name in os.listdir(root_dir) if ".root" in name)
if not root_files:
    raise FileNotFoundError(f"No .root files found in {root_dir}")

# Collect the per-file arrays and concatenate once at the end, instead of
# re-copying the whole accumulated array for every file with np.append.
chunks = []
for file_idx, inputfile in enumerate(root_files):
    next_data = _load_feature_matrix(root_dir + inputfile)
    if file_idx == 0:
        print(np.shape(next_data))
    else:
        print(f"next data has shape {np.shape(next_data)}")
    chunks.append(next_data)

X = np.concatenate(chunks, axis=0)
del chunks  # the per-file arrays are no longer needed once X exists

print(f"X has shape {np.shape(X)}")

(376980, 69)
next data has shape (353590, 69)
next data has shape (340485, 69)
next data has shape (74979, 69)
next data has shape (361458, 69)
next data has shape (399367, 69)
X has shape (1906859, 69)


# Run this 
** *only* **

In [None]:
# Split first, then fit the MinMaxScaler on the training rows only, so the
# held-out rows do not influence the min/max used for scaling (the original
# fitted the scaler on all of X before splitting). Training features end up in
# [0, 1]; test features are mapped with the same transform and may fall
# slightly outside that range.
# A fixed random_state makes the split reproducible across kernel restarts.
x_train_raw, x_test_raw = train_test_split(X, test_size=0.20, random_state=42)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)
del x_train_raw, x_test_raw

# Number of input features (columns of X).
original_dim = np.size(X, axis=1)
print(original_dim)

In [None]:
batch_size = 32

def build_dset(df, shuffle=False, shuffle_buffer=100_000):
    """Wrap an array in a batched, prefetched ``tf.data.Dataset`` of ``(x, x)`` pairs.

    Args:
        df: Array-like of samples; every sample is used as both input and target.
        shuffle: When True, shuffle the samples before batching. Keras ignores
            ``fit(shuffle=True)`` for ``tf.data`` input, so any shuffling has to
            happen inside the pipeline.
        shuffle_buffer: Upper bound on the shuffle buffer size (bounded to keep
            memory use in check).

    Returns:
        A dataset yielding batches of ``batch_size`` samples; the incomplete
        final batch is dropped.
    """
    # No defensive copy is needed: from_tensor_slices does not modify its input.
    dataset = tf.data.Dataset.from_tensor_slices((df, df))
    if shuffle:
        dataset = dataset.shuffle(max(1, min(len(df), shuffle_buffer)))
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset

# Only the training stream is shuffled; the test stream keeps its order.
x_train_dataset = build_dset(x_train, shuffle=True)
x_test_dataset = build_dset(x_test)

### Sampling layer (reparameterization trick)

In [None]:
class Sampling(layers.Layer):
    """Draw a latent vector z from (z_mean, z_log_var): z = mean + sigma * noise."""

    def call(self, inputs):
        mean, log_var = inputs
        mean_shape = tf.shape(mean)
        # One standard-normal draw per (sample, latent dimension).
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        # exp(0.5 * log_var) turns the log-variance into a standard deviation.
        sigma = tf.exp(0.5 * log_var)
        return mean + sigma * noise

### build model

In [None]:
def get_encoder(original_dim, latent_dim):
    """Build the encoder: input -> Dense 32/16/8 (relu) -> Dense latent_dim (sigmoid) -> heads.

    Args:
        original_dim: Number of input features.
        latent_dim: Size of the latent space.

    Returns:
        A Keras Model named "encoder" whose outputs are ``[z_mean, z_log_var, z]``,
        with ``z`` produced by the ``Sampling`` layer.
    """
    encoder_inputs = layers.Input(shape=(original_dim,))

    # Layers are created in the same order as before (32, 16, 8, latent_dim).
    hidden = encoder_inputs
    for width in (32, 16, 8):
        hidden = layers.Dense(width, activation='relu')(hidden)
    hidden = layers.Dense(latent_dim, activation='sigmoid')(hidden)

    z_mu = layers.Dense(latent_dim, name="z_mean")(hidden)
    z_log_var = layers.Dense(latent_dim, name="z_log_var")(hidden)
    z = Sampling()([z_mu, z_log_var])

    return Model(encoder_inputs, [z_mu, z_log_var, z], name="encoder")
    
def get_decoder(original_dim, latent_dim):
    """Build the decoder: latent input -> Dense 8/16/32 (relu) -> Dense original_dim (relu).

    Args:
        original_dim: Number of features to reconstruct.
        latent_dim: Size of the latent space.

    Returns:
        A Keras Model named "decoder" mapping a latent vector to a reconstruction.
    """
    decoder_inputs = layers.Input(shape=(latent_dim,))

    # Layers are created in the same order as before (8, 16, 32, original_dim).
    # NOTE(review): the output layer uses relu while the training loss is a
    # binary cross-entropy on [0, 1]-scaled data -- confirm this is intended.
    outputs = decoder_inputs
    for width in (8, 16, 32, original_dim):
        outputs = layers.Dense(width, activation='relu')(outputs)

    return Model(decoder_inputs, outputs, name="decoder")

class vae(Model):
    """Variational autoencoder assembled from an encoder and a decoder model.

    Training minimises ``reconstruction_loss + kl_loss`` in a custom
    ``train_step``; the running means of the three quantities are exposed
    through ``metrics``.

    Args:
        encoder: Model returning ``(z_mean, z_log_var, z)`` for a batch.
        decoder: Model mapping ``z`` to a reconstruction of the input.
        **kwargs: Forwarded to ``tf.keras.Model``.
    """

    def __init__(self, encoder, decoder, **kwargs):
        super(vae, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.total_loss_tracker = tf.keras.metrics.Mean(name="total_loss")
        self.reconstruction_loss_tracker = tf.keras.metrics.Mean(
            name="reconstruction_loss"
        )
        self.kl_loss_tracker = tf.keras.metrics.Mean(name="kl_loss")
        # Print both architectures as soon as the model is constructed.
        self.encoder.summary()
        self.decoder.summary()

    @property
    def metrics(self):
        """Trackers for the total, reconstruction and KL losses."""
        return [
            self.total_loss_tracker,
            self.reconstruction_loss_tracker,
            self.kl_loss_tracker,
        ]

    def train_step(self, data):
        """Run one optimisation step and return the running loss averages.

        Args:
            data: A batch of inputs, or a tuple/list whose first element is the
                batch (e.g. the ``(x, x)`` pairs produced by a dataset).

        Returns:
            Dict with keys ``"loss"``, ``"reconstruction_loss"`` and ``"kl_loss"``.
        """
        # Datasets of (input, target) pairs deliver a tuple; the encoder takes
        # a single tensor, so only the inputs are kept.
        if isinstance(data, (tuple, list)):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data)
            reconstruction = self.decoder(z)
            # Element-wise cross-entropy, summed over the features of each
            # sample and averaged over the batch. The loss-level
            # `binary_crossentropy` already averages over the last axis, so
            # summing its output over axis=-1 added up the batch instead and
            # made this term scale with the batch size.
            reconstruction_loss = tf.reduce_mean(
                tf.reduce_sum(
                    tf.keras.backend.binary_crossentropy(
                        tf.cast(data, reconstruction.dtype), reconstruction
                    ),
                    axis=-1,
                )
            )
            # KL divergence between N(z_mean, exp(z_log_var)) and N(0, 1),
            # summed over latent dimensions and averaged over the batch.
            kl_loss = -0.5 * (1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var))
            kl_loss = tf.reduce_mean(tf.reduce_sum(kl_loss, axis=1))
            total_loss = reconstruction_loss + kl_loss

        trainable = self.encoder.trainable_weights + self.decoder.trainable_weights
        grads = tape.gradient(total_loss, trainable)
        self.optimizer.apply_gradients(zip(grads, trainable))

        self.total_loss_tracker.update_state(total_loss)
        self.reconstruction_loss_tracker.update_state(reconstruction_loss)
        self.kl_loss_tracker.update_state(kl_loss)
        return {
            "loss": self.total_loss_tracker.result(),
            "reconstruction_loss": self.reconstruction_loss_tracker.result(),
            "kl_loss": self.kl_loss_tracker.result(),
        }

    def call(self, data):
        """Encode ``data``, sample z, and return the decoder's reconstruction."""
        z_mean, z_log_var, z = self.encoder(data)
        y_pred = self.decoder(z)
        return y_pred

### get model

In [None]:
# Size of the latent space; the latent-space plots further down use exactly
# two components (z_0 and z_1).
latent_dim = 2

# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    encoder = get_encoder(original_dim, latent_dim)
    decoder = get_decoder(original_dim, latent_dim)
    model = vae(encoder, decoder)
    # No loss is passed to compile: the loss is computed in vae.train_step.
    model.compile(optimizer=tf.keras.optimizers.Adam(1.e-3))
    #model.compile(optimizer=tf.keras.optimizers.RMSprop())


### train model

In [None]:
def get_lr_callback():
    """Return a LearningRateScheduler with a ramp-up / hold / exponential-decay schedule.

    The rate rises linearly from ``lr_start`` to ``lr_max`` over ``lr_ramp_ep``
    epochs, stays at ``lr_max`` for ``lr_sus_ep`` epochs, then decays towards
    ``lr_min`` by a factor of ``lr_decay`` per epoch.
    """
    lr_start, lr_max, lr_min = 0.000001, 0.01, 0.000001
    lr_ramp_ep, lr_sus_ep = 5, 10
    lr_decay = 0.8

    def lrfn(epoch):
        # Linear warm-up.
        if epoch < lr_ramp_ep:
            return (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start
        # Plateau at the maximum rate.
        if epoch < lr_ramp_ep + lr_sus_ep:
            return lr_max
        # Exponential decay towards lr_min.
        return (lr_max - lr_min) * lr_decay**(epoch - lr_ramp_ep - lr_sus_ep) + lr_min

    return tf.keras.callbacks.LearningRateScheduler(lrfn, verbose = True)

checkpoint_path = "checkpoints/tsg_vae weights.{epoch:05d}.hdf5"
# Make sure the target directory exists before the first checkpoint is written.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Save the model's weights after every epoch. With save_best_only=False the
# `monitor`/`mode` settings do not affect which epochs are saved.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 monitor = 'val_loss',
                                                 save_weights_only=True,
                                                 save_best_only=False,
                                                 mode = 'min',
                                                 verbose=1)

num_epochs = 20

# x_train_dataset is a tf.data.Dataset that is already batched, so `batch_size`
# must not be passed to fit, and fit's `shuffle` flag is ignored for datasets;
# both arguments are therefore omitted.
history = model.fit(
    x_train_dataset,
    epochs=num_epochs,
    callbacks=[cp_callback]
    #callbacks=[cp_callback, get_lr_callback()]
)

In [None]:
# NOTE(review): the model is compiled without a loss and vae defines no
# test_step -- confirm the values reported by evaluate are meaningful.
model.evaluate(x_test)
# Reconstructions of the held-out set (output of vae.call).
predictions = model.predict(x_test)

In [None]:
# Show which quantities were recorded, then plot the total training loss per epoch.
print(history.history.keys())
plt.plot(history.history["loss"])
# Optional extras: the individual loss terms and a log-scaled y axis.
# plt.plot(history.history["kl_loss"])
# plt.plot(history.history["reconstruction_loss"])
# plt.yscale('log')
plt.show()

# Loading a model from a checkpoint

In [None]:
# Rebuild the same architecture and restore the weights saved after epoch 20.
encoder = get_encoder(original_dim, latent_dim)
decoder = get_decoder(original_dim, latent_dim)
model = vae(encoder, decoder)
model.compile(optimizer=tf.keras.optimizers.Adam(1.e-3))
# The model is run once before load_weights -- presumably so that the
# subclassed model is built before HDF5 weights are loaded; verify.
model.evaluate(x_test)
model.load_weights("checkpoints/tsg_vae weights.00020.hdf5")

### Plotting the test data errors

In [None]:
# Per-event reconstruction error on the held-out set: mean absolute error
# over the features of each event.
predict = model.predict(x_test)
err = np.mean(np.abs(predict - x_test), axis=1)
print(np.shape(err))

# There are two humps, so we're plotting those separately to visualize well.
# The event indices of each hump are selected with vectorised masks instead of
# a Python loop over every event; the thresholds are unchanged.
first_peaks = np.flatnonzero(err < 0.025)
second_peaks = np.flatnonzero((err >= 0.033) & (err < 0.05))

low_err = err[first_peaks]
high_err = err[second_peaks]

low_err_test = x_test[first_peaks]
high_err_test = x_test[second_peaks]

In [None]:
# Histogram of the per-event error over the whole test set.
bins = np.linspace(0, 0.4, 1000)
# plt.hist(err, density=True)
plt.hist(err, density=True, bins=bins)
plt.xlabel("Mean Absolute Error")
plt.ylabel("Number of events (density)")
# The title is fixed text: the test set is drawn from all QCD files combined.
plt.title("Aggregate QCD Data")
plt.show()


# There are two humps, so we're plotting those separately to visualize well
plt.hist(low_err, density=True, bins=bins, alpha=0.5, label="Lower error data")
plt.hist(high_err, density=True, bins=bins, alpha=0.5, label="Higher error data")
plt.xlabel("Mean Absolute Error")
plt.ylabel("Number of events (density)")
# Same fixed title as above.
plt.title("Aggregate QCD Data")
plt.legend()
plt.show()

In [None]:
# Column positions of pt, eta and phi in the (sorted) feature list; these are
# also the column positions in the feature matrices.
pt_idx = features.index('FatJet_pt')
eta_idx = features.index('FatJet_eta')
phi_idx = features.index('FatJet_phi')

# Sanity check: print the names found at those positions.
print(features[pt_idx])
print(features[eta_idx])
print(features[phi_idx])

# Getting the data to plot eta, phi, and pt for both high and low error data
# (values are the scaled features, not physical units).
high_err_pt = high_err_test[:, pt_idx]
low_err_pt = low_err_test[:, pt_idx]

high_err_eta = high_err_test[:, eta_idx]
low_err_eta = low_err_test[:, eta_idx]

high_err_phi = high_err_test[:, phi_idx]
low_err_phi = low_err_test[:, phi_idx]

In [None]:
# Scaled pt, eta and phi for the two error populations. The legend labels now
# match the plotted arrays: previously the high-error arrays were labelled
# "Lower error data" and vice versa.
bins = np.linspace(0, 0.6, 1000)
plt.hist(low_err_pt, density=True, bins=bins, alpha=0.5, label="Lower error data")
plt.hist(high_err_pt, density=True, bins=bins, alpha=0.5, label="Higher error data")
plt.xlabel("Transverse Momentum")
plt.ylabel("Number of events (density)")
plt.title("Aggregate QCD Data")
plt.legend()
plt.show()

bins = np.linspace(0.9, 1, 1000)
plt.hist(low_err_eta, density=True, bins=bins, alpha=0.5, label="Lower error data")
plt.hist(high_err_eta, density=True, bins=bins, alpha=0.5, label="Higher error data")
plt.xlabel("Eta")
plt.ylabel("Number of events (density)")
plt.title("Aggregate QCD Data")
plt.legend()
plt.show()

# NOTE(review): phi reuses the eta binning above (0.9 to 1) because the line
# below is commented out -- confirm that zoomed range is intended for phi.
# bins = np.linspace(0, 1, 1000)
plt.hist(low_err_phi, density=True, bins=bins, alpha=0.5, label="Lower error data")
plt.hist(high_err_phi, density=True, bins=bins, alpha=0.5, label="Higher error data")
plt.xlabel("Phi")
plt.ylabel("Number of events (density)")
plt.title("Aggregate QCD Data")
plt.legend()
plt.show()

## Plotting Latent Space Distributions

In [None]:
# Encode the two error populations; the encoder returns the latent mean,
# the latent log-variance and a sampled latent vector for every event.
trained_encoder = model.encoder
z_mean_high, z_log_var_high, z_high = trained_encoder.predict(high_err_test)
z_mean_low, z_log_var_low, z_low = trained_encoder.predict(low_err_test)

In [None]:
print(np.shape(z_high))
# Only the first tenth of each population is drawn in the scatter plot.
num_high = np.size(z_high, axis=0) // 10
num_low = np.size(z_low, axis=0) // 10
print(num_high)
# Sampled latent vectors, first component against second component.
plt.scatter(z_high[:num_high, 0], z_high[:num_high, 1], s=1, alpha=0.5, label="high error data")
plt.scatter(z_low[:num_low, 0], z_low[:num_low, 1], s=1, alpha=0.5, label="low error data")
plt.title("latent space representation")
plt.xlabel("z_0")
plt.ylabel("z_1")
plt.legend()
plt.show()

In [None]:
# One density histogram per latent component, comparing the sampled latent
# values of the high- and low-error populations on a shared binning.
bins = np.linspace(-4, 4, 500)
for latent_axis in (0, 1):
    plt.hist(z_high[:, latent_axis], density=True, bins=bins, alpha=0.4, label="high error data")
    plt.hist(z_low[:, latent_axis], density=True, bins=bins, alpha=0.4, label="low error data")
    plt.legend()
    plt.xlabel(f"z_{latent_axis}")
    plt.ylabel("density")
    plt.show()

In [None]:
# One figure per input feature: scaled values of the low-error and high-error
# test events overlaid, each histogram with 200 automatically placed bins.
# bins = np.linspace(0, 0.6, 1000)
for idx, feat in enumerate(features):
    # Column `idx` of the feature matrices corresponds to features[idx].
    high_err_feat = high_err_test[:, idx]
    low_err_feat = low_err_test[:, idx]
    
#     plt.hist(low_err_feat, density=True, bins=bins, alpha=0.5, label="Lower error data")
#     plt.hist(high_err_feat, density=True, bins=bins, alpha=0.5, label="Higher error data")
    plt.hist(low_err_feat, density=True, bins=200, alpha=0.5, label="Lower error data")
    plt.hist(high_err_feat, density=True, bins=200, alpha=0.5, label="Higher error data")
    plt.xlabel(feat)
    plt.ylabel("Number of events (density)")
    # The feature name doubles as the figure title.
    plt.title(feat)
    plt.legend()
    plt.show()

In [None]:
# Every feature whose name contains "sv", listed and counted.
sv_features = [name for name in features if "sv" in name]
for name in sv_features:
    print(name)

print(len(sv_features))

# Look at the first of them in detail.
feat_name = sv_features[0]
print(f"\n\nWorking on {feat_name}")
idx = features.index(feat_name)
high_err_feat, low_err_feat = high_err_test[:, idx], low_err_test[:, idx]

# Alternative binnings that were tried:
# bins=200
# bins=np.linspace(0, 1, 100)
# bins=np.linspace(0.85, 1, 100)
# bins=np.linspace(0, 0.25, 100)
bins = np.linspace(0, 0.05, 100)
for values, legend_label in (
    (low_err_feat, "Lower error data"),
    (high_err_feat, "Higher error data"),
):
    plt.hist(values, density=True, bins=bins, alpha=0.5, label=legend_label)
plt.xlabel(feat_name)
plt.ylabel("Number of events (density)")
# The feature name doubles as the figure title.
plt.title(feat_name)
plt.legend()
plt.show()

# Checking the features of the training data

In [None]:
# Per-event mean absolute reconstruction error on the training set.
# Note: this rebinds `predict` and `err`, which earlier held the test-set values.
predict = model.predict(x_train)
err = np.mean(np.abs(predict - x_train), axis=1)
print(np.shape(err))

In [None]:
# Histogram of the training-set error (raw counts, not a density).
bins = np.linspace(0, 0.4, 1000)
plt.hist(err, bins=bins)
plt.xlabel("Error (MAE)")
plt.ylabel("Number of Events")
plt.title("Training data")
plt.show()

### Selecting a new dataset and plotting the error

In [None]:
root_dir = "/eos/user/a/afriberg/datasets/vae_err/"
# Keep only ROOT files (same ".root" test as the QCD loader) so that other
# directory entries are not handed to uproot, and sort them so the files are
# processed in a reproducible order.
files = sorted(name for name in os.listdir(root_dir) if ".root" in name)

# Parallel lists: err_names[i] is the sample title for errors[i].
err_names = []
errors = []

print(files)
for new_file in files:
    print(f"now working on {new_file}")
    # inputfile = 'QCD_HT500to700.root'
    df = get_df(root_dir + new_file, '*')

    df.dropna(inplace=True)
    df = df[features]

    new_X = df.to_numpy()
    new_X = new_X.astype("float32")

    # Scale our data using a MinMaxScaler that will scale
    # each number so that it will be between 0 and 1
    # NOTE(review): a new scaler is fitted on every file, so each sample is
    # scaled with its own min/max rather than the transform used for training,
    # and the global `scaler` is rebound -- confirm this is intended.
    scaler = MinMaxScaler()
    new_data = scaler.fit_transform(new_X)
    predict = model.predict(new_data)
    # Per-event mean absolute reconstruction error for this file.
    err = np.mean(np.abs(predict - new_data), axis=1)
    print(np.shape(err))

    bins = np.linspace(0, 0.4, 1000)
    # plt.hist(err, density=True)
    plt.hist(err, density=True, bins=bins)
    plt.xlabel("Mean Absolute Error")
    plt.ylabel("Number of events (density)")
    # Title: the file name without directory and extension.
    title = new_file.rpartition('/')[-1].rpartition('.')[0]
    plt.title(title)
#     plt.savefig("images/" + title)
    plt.show()

    err_names.append(title)
    errors.append(err)

# Data with even more categories

In [None]:
root_dir = "/eos/user/a/afriberg/datasets/stripped/"
new_file = "ZH_HToBB_ZToLL_M125_13TeV_powheg_pythia8.root"
df = get_df(root_dir + new_file, '*')

df.dropna(inplace=True)
df = df[features]

new_X = df.to_numpy()
new_X = new_X.astype("float32")

# Scale our data using a MinMaxScaler that will scale
# each number so that it will be between 0 and 1
# NOTE(review): the scaler is fitted on this sample rather than reusing the
# transform applied to the training data -- confirm this is intended.
scaler = MinMaxScaler()
new_data = scaler.fit_transform(new_X)
new_predict = model.predict(new_data)
# Per-event mean absolute reconstruction error for this sample.
new_err = np.mean(np.abs(new_predict - new_data), axis=1)
# Report and plot the error of *this* sample; the original used the stale
# `err` left over from an earlier cell.
print(np.shape(new_err))

bins = np.linspace(0, 0.4, 1000)
# plt.hist(new_err, density=True)
plt.hist(new_err, density=True, bins=bins)
plt.xlabel("Mean Absolute Error")
plt.ylabel("Number of events (density)")
# Title: the file name without directory and extension.
title = new_file.rpartition('/')[-1].rpartition('.')[0]
plt.title(title)
# plt.savefig("images/" + title)
plt.show()

# err_names.append(title)
# errors.append(new_err)

In [None]:
# There are two humps, so we're plotting those separately to visualize well.
# The indices are derived from `new_err`, so they must index `new_err` and
# `new_data` (the original indexed the unrelated `err` and `x_test` arrays).
first_peaks = np.flatnonzero(new_err < 0.10)
second_peaks = np.flatnonzero((new_err >= 0.17) & (new_err < 0.22))

new_low_err = new_err[first_peaks]
new_high_err = new_err[second_peaks]

# Scaled feature rows of the events in each hump.
new_low_err_test = new_data[first_peaks]
new_high_err_test = new_data[second_peaks]

In [None]:
# Error distribution of the two humps. The x axis is the mean absolute error,
# so the per-event error arrays are histogrammed; the original passed the 2-D
# feature matrices (`new_low_err_test` / `new_high_err_test`) instead.
plt.hist(new_low_err, density=True, bins=bins, alpha=0.5, label="Lower error data")
plt.hist(new_high_err, density=True, bins=bins, alpha=0.5, label="Higher error data")
plt.xlabel("Mean Absolute Error")
plt.ylabel("Number of events (density)")
# Title: the file name without directory and extension.
title = new_file.rpartition('/')[-1].rpartition('.')[0]
plt.title(title)
plt.legend()
# plt.savefig("images/" + title)
plt.show()

# Latent space representation of the new data

In [None]:
# Encode the new sample; the encoder returns the latent mean, the latent
# log-variance and a sampled latent vector for every event.
trained_encoder = model.encoder
new_z_mean, new_z_log_var, new_z = trained_encoder.predict(new_data)

In [None]:
# Latent-space scatter of the new sample overlaid on the two QCD error
# populations (z_high, z_low, num_high and num_low come from earlier cells).
alpha = 0.7
# The legend entry for the new sample is the first 10 characters of `title`.
plt.scatter(new_z[:, 0], new_z[:, 1], s=1, alpha=alpha, label=title[:10])
plt.scatter(z_high[:num_high, 0], z_high[:num_high, 1], s=1, alpha=alpha, label="high error data")
plt.scatter(z_low[:num_low, 0], z_low[:num_low, 1], s=1, alpha=alpha, label="low error data")
plt.title("latent space representation")
plt.xlabel("z_0")
plt.ylabel("z_1")
plt.legend()
plt.show()

# The same three populations, each drawn in its own figure.
plt.scatter(new_z[:, 0], new_z[:, 1], s=1, alpha=alpha, label=title[:10])
plt.legend()
plt.show()
plt.scatter(z_high[:num_high, 0], z_high[:num_high, 1], s=1, alpha=alpha, label="high error data")
plt.legend()
plt.show()
plt.scatter(z_low[:num_low, 0], z_low[:num_low, 1], s=1, alpha=alpha, label="low error data")
plt.legend()
plt.show()