In [None]:
import json
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_curve
import matplotlib.pyplot as plt
import matplotlib

import h5py
import os
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
import tqdm
import gc
import seaborn as sns
# Colour palette indexed by the plotting cells (cols[0], cols[1], cols[2], ...).
cols = [
    "#DB4437", "#4285F4", "#F4B400", "#0F9D58",
    "purple", "goldenrod", "peru", "coral", "turquoise",
    "gray", "navy", "m", "darkgreen", "fuchsia", "steelblue",
]

## Style the plots

In [None]:
# rcParams dictionary handed to plt.style.use() below. It is assembled one
# theme at a time; keys are inserted in the same order as a flat literal would
# list them.
ROOT = {}

# Fonts and mathtext
ROOT.update({
    # 'font.sans-serif': ['TeX Gyre Heros', 'Helvetica', 'Arial'],
    'font.family': 'sans-serif',
    'mathtext.fontset': 'custom',
    'mathtext.rm': 'TeX Gyre Heros',
    'mathtext.bf': 'TeX Gyre Heros:bold',
    'mathtext.sf': 'TeX Gyre Heros',
    'mathtext.it': 'TeX Gyre Heros:italic',
    'mathtext.tt': 'TeX Gyre Heros',
    'mathtext.cal': 'TeX Gyre Heros',
    'mathtext.default': 'regular',
})

# Figure size, text sizes and legend
ROOT.update({
    'figure.figsize': (10.0, 10.0),
    'font.size': 26,
    # 'text.usetex': True,
    'axes.labelsize': 'medium',
    'axes.unicode_minus': False,
    'xtick.labelsize': 'small',
    'ytick.labelsize': 'small',
    'legend.fontsize': 'small',
    'legend.handlelength': 1.5,
    'legend.borderpad': 0.5,
})

# x-axis ticks: inward, drawn on both the bottom and top edges
ROOT.update({
    'xtick.direction': 'in',
    'xtick.major.size': 12,
    'xtick.minor.size': 6,
    'xtick.major.pad': 6,
    'xtick.top': True,
    'xtick.major.top': True,
    'xtick.major.bottom': True,
    'xtick.minor.top': True,
    'xtick.minor.bottom': True,
    'xtick.minor.visible': True,
})

# y-axis ticks: inward, drawn on both the left and right edges
ROOT.update({
    'ytick.direction': 'in',
    'ytick.major.size': 12,
    'ytick.minor.size': 6.0,
    'ytick.right': True,
    'ytick.major.left': True,
    'ytick.major.right': True,
    'ytick.minor.left': True,
    'ytick.minor.right': True,
    'ytick.minor.visible': True,
})

# Grid, axes frame and saved-figure background
ROOT.update({
    'grid.alpha': 0.8,
    'grid.linestyle': ':',
    'axes.linewidth': 2,
    'savefig.transparent': False,
})

plt.style.use(ROOT)

## Load the results

In [None]:
# Read every dataset of the results file fully into memory as a NumPy array.
# Datasets are read in groups; the read order within the file handle is the
# same as listing them one per line.
with h5py.File('results_ji_latch_norm_100epochs_v2.h5', 'r') as file:
    print(file.keys())

    # Per-jet variables and labels (labels 0 / 1 are plotted as QCD / W/Z below)
    vars_test, top_vars, labels = (
        np.array(file[key]) for key in ('vars_test', 'top_vars', 'labels'))

    # Activations for the test sample and for the out-of-distribution sample
    activations, activations_ood = (
        np.array(file[key]) for key in ('activations', 'activations_ood'))

    # Distances stored under the '*_ml_distance' keys
    top_distance, qcd_distance, wz_distance = (
        np.array(file[key])
        for key in ('top_ml_distance', 'qcd_ml_distance', 'wz_ml_distance'))

    # Logits (and the '*_sm' variants) for both samples
    logits, logits_ood, logits_sm, logits_sm_ood = (
        np.array(file[key])
        for key in ('logits', 'logits_ood', 'logits_sm', 'logits_sm_ood'))

    # Input images for both samples
    images, images_ood = (
        np.array(file[key]) for key in ('images', 'images_ood'))

    # Distances stored under the '*_distance_lg' keys
    top_distance_lg, qcd_distance_lg, wz_distance_lg = (
        np.array(file[key])
        for key in ('top_distance_lg', 'qcd_distance_lg', 'wz_distance_lg'))

In [None]:
from sklearn.decomposition import PCA, KernelPCA

# The downstream cells work on the raw activations. Previously a KernelPCA was
# always fitted and applied here and its output then immediately overwritten by
# the raw activations, which paid for an n_samples x n_samples kernel matrix for
# nothing. The projection is now opt-in.
USE_KERNEL_PCA = False

if USE_KERNEL_PCA:
    # 2-component kernel PCA fitted on the test activations and applied to both
    # the test and the out-of-distribution activations.
    pca = KernelPCA(n_components=2).fit(activations)
    activations_pca = pca.transform(activations)
    activations_ood_pca = pca.transform(activations_ood)
else:
    # Default: pass the activations through unchanged (same result as before).
    activations_pca = activations
    activations_ood_pca = activations_ood

In [None]:
def _mahalanobis_sq(samples, mean, cov_inv):
    """Quadratic form (x - mean) @ cov_inv @ (x - mean) for every row x of `samples`.

    Parameters
    ----------
    samples : 2-D array, one sample per row.
    mean : 1-D array with one entry per column of `samples`.
    cov_inv : square matrix (inverse covariance).

    Returns
    -------
    1-D array with one value per row of `samples`.
    """
    centered = samples - mean
    # Row-wise quadratic form without a Python loop.
    return np.sum((centered @ cov_inv) * centered, axis=1)


# Split the activations by class once, instead of re-applying the boolean mask
# on every loop iteration.
qcd_activations = activations_pca[labels == 0]
wz_activations = activations_pca[labels == 1]

# Covariance of all test activations; its inverse is used for both classes.
# NOTE(review): the per-class covariances below are computed but never used in
# the distance; confirm that the pooled covariance is the intended metric.
act_cov = np.cov(activations_pca.T)
act_cov_inv = np.linalg.inv(act_cov)

qcd_act_conv = np.cov(qcd_activations.T)
qcd_act_cov_inv = act_cov_inv
qcd_act_mean = np.mean(qcd_activations, axis=0)

wz_act_conv = np.cov(wz_activations.T)
wz_act_cov_inv = act_cov_inv
wz_act_mean = np.mean(wz_activations, axis=0)

# Distance of each sample to the QCD class mean.
top_qdistance = _mahalanobis_sq(activations_ood_pca, qcd_act_mean, qcd_act_cov_inv)
qcd_qdistance = _mahalanobis_sq(qcd_activations, qcd_act_mean, qcd_act_cov_inv)
# Fixed: this used the W/Z mean, which made it identical to wz_wzdistance.
wz_qdistance = _mahalanobis_sq(wz_activations, qcd_act_mean, qcd_act_cov_inv)

# Distance of each sample to the W/Z class mean.
top_wzdistance = _mahalanobis_sq(activations_ood_pca, wz_act_mean, wz_act_cov_inv)
qcd_wzdistance = _mahalanobis_sq(qcd_activations, wz_act_mean, wz_act_cov_inv)
wz_wzdistance = _mahalanobis_sq(wz_activations, wz_act_mean, wz_act_cov_inv)

_ = plt.hist(top_qdistance, bins=100, alpha=0.5, label='top')
_ = plt.hist(qcd_qdistance, bins=100, alpha=0.5, label='qcd')
_ = plt.hist(wz_qdistance, bins=100, alpha=0.5, label='wz')
plt.legend()
plt.show()

_ = plt.hist(top_wzdistance, bins=100, alpha=0.5, label='top')
_ = plt.hist(qcd_wzdistance, bins=100, alpha=0.5, label='qcd')
_ = plt.hist(wz_wzdistance, bins=100, alpha=0.5, label='wz')
plt.legend()
plt.show()

# Column 0: distance to QCD, column 1: distance to W/Z.
# These replace the distance arrays that were read from the results file.
top_distance = np.concatenate([(top_qdistance[:,None]), (top_wzdistance[:,None])], axis=1)
qcd_distance = np.concatenate([(qcd_qdistance[:,None]), (qcd_wzdistance[:,None])], axis=1)
wz_distance = np.concatenate([(wz_qdistance[:,None]), (wz_wzdistance[:,None])], axis=1)

## Plots of the latent representation

In [None]:
# 2x2 grid of 2-D histograms of the first two representation dimensions.
fig = plt.figure(figsize=(20, 20))
spec = fig.add_gridspec(ncols=2, nrows=2, )

# Selection on column 48 of the OOD variables (150 < value < 200); `mask` is
# reused by later cells. `&` is the explicit logical AND for boolean arrays.
mask = (top_vars[:, 48] > 150) & (top_vars[:, 48] < 200)

# (grid position, data, colour map, title) for each panel.
panels = [
    ((0, 0), activations_pca[labels == 0], 'Blues', 'QCD'),
    ((0, 1), activations_pca[labels == 1], 'Reds', 'W/Z'),
    ((1, 0), activations_ood_pca, 'Greys', 'Top'),
    # Fixed: this panel was also titled 'Top' although it only shows the
    # selected subset.
    ((1, 1), activations_ood_pca[mask], 'Greys', 'Top (150 < M < 200)'),
]

axes = []
for (row, col), data, cmap, title in panels:
    # Every panel after the first shares its axes with the first one.
    shared = axes[0] if axes else None
    ax = fig.add_subplot(spec[row, col], sharex=shared, sharey=shared)
    _ = ax.hist2d(data[:, 0], data[:, 1], bins=100, cmap=cmap)
    ax.set_title(title)
    axes.append(ax)

ax1, ax2, ax3, ax4 = axes
plt.show()


In [None]:
import seaborn as sns
import matplotlib.lines as  mlines

# KDE contours of the first two representation dimensions for QCD, W/Z and the
# selected OOD (Top) sample. Relies on `mask` from the previous cell.
fig = plt.figure(figsize=(10, 10))
contour_sets = [
    (activations_pca[labels == 0], 'QCD', cols[0]),
    (activations_pca[labels == 1], 'W/Z', cols[1]),
    (activations_ood_pca[mask], 'Top (150 < M < 200)', cols[2]),
]
for points, kde_label, color in contour_sets:
    sns.kdeplot(x=points[:, 0], y=points[:, 1], fill=False, label=kde_label,
                levels=5, color=color, alpha=0.5)

# The legend is built from explicit proxy lines rather than from the contour
# artists.
handles = [mlines.Line2D([], [],color=cols[0], label="QCD"),
           mlines.Line2D([], [],color=cols[1], label="W/Z"),
           mlines.Line2D([], [],color=cols[2], label="Top [OOD]")]
plt.legend(handles=handles)
# Fixed typo in the axis labels ("Represntation").
plt.xlabel('Representation dim. #1')
plt.ylabel('Representation dim. #2')
# plt.show()
# plt.savefig('PCA_representation.pdf')

In [None]:
# Figure 1: per-sample maximum over the two class distances, all OOD samples.
bins = np.linspace(-0, 200, 100)
fig = plt.figure()
for scores, name in ((top_distance, 'top'),
                     (qcd_distance, 'qcd'),
                     (wz_distance, 'wz')):
    _ = plt.hist(scores.max(axis=1), bins=bins, alpha=0.5, label=name,
                 density=True, histtype='step', lw=2)
plt.yscale('log')
plt.ylim(0.0001, 1)
plt.legend()
plt.show()

# Figure 2: per-sample minimum over the two class distances; the OOD sample is
# restricted with `mask` (defined in an earlier cell).
fig = plt.figure()
bins = np.linspace(-0, 200, 100)
for scores, name in ((top_distance[mask], 'Top [OOD Sample]'),
                     (qcd_distance, 'QCD'),
                     (wz_distance, 'WZ')):
    _ = plt.hist(scores.min(axis=1), bins=bins, alpha=0.5, label=name,
                 density=True, histtype='step', lw=2)
plt.yscale('log')
plt.ylim(0.0001, 1)
plt.legend()
plt.show()


In [None]:
# ROC of the max-distance score: selected OOD sample (label 1) vs QCD (label 0).
signal_scores = np.max(top_distance[mask], axis=1)
background_scores = np.max(qcd_distance, axis=1)

pred = np.concatenate([signal_scores, background_scores])
y = np.concatenate([np.ones(len(signal_scores)),
                    np.zeros(len(background_scores))])

fpr, tpr, thresholds = roc_curve(y, pred)
auc_roc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_roc)
plt.legend(loc='lower right')
plt.show()

In [None]:
# Rebuild the two-column distance arrays: column 0 is the "q" distance,
# column 1 the "wz" distance of each sample.
top_distance = np.column_stack((top_qdistance, top_wzdistance))
qcd_distance = np.column_stack((qcd_qdistance, qcd_wzdistance))
wz_distance = np.column_stack((wz_qdistance, wz_wzdistance))

In [None]:
# Read the VAE results: per-jet variables for both samples and the two loss
# arrays, each loaded fully into memory.
with h5py.File('results_vae.h5','r') as file:
    vars_test_vae, top_vars_vae, qcd_loss, top_loss = (
        np.array(file[key])
        for key in ('vars_test', 'top_vars', 'qcd_loss', 'top_loss'))


In [None]:
# Quick sanity check on the QCD distance array
# (compare against vars_test[labels == 0].shape if needed).
np.shape(qcd_distance)

In [None]:
# Selections on column 48 (150 < value < 200) for the OOD, QCD and W/Z samples.
# `&` is the explicit logical AND for boolean arrays.
mask = (top_vars[:, 48] > 150) & (top_vars[:, 48] < 200)
mask_qcd = (vars_test[labels == 0, 48] > 150) & \
    (vars_test[labels == 0, 48] < 200)
mask_wz = (vars_test[labels == 1, 48] > 150) & \
    (vars_test[labels == 1, 48] < 200)
# NOTE(review): mask_qcd / mask_wz are computed but not applied to the QCD and
# W/Z histograms below; confirm whether they should be.

bins = np.linspace(-2, 150, 100)

# Fixed: the figure is saved as 'max_mahalanobis_distance.pdf' and the ROC
# built from this score uses the per-sample maximum, but the histograms were
# filled with the per-sample minimum.
# NOTE(review): confirm that max (not min) is the intended OOD score.
for scores, label, color in ((top_distance[mask], 'Top [OOD Sample]', cols[2]),
                             (qcd_distance, 'QCD', cols[0]),
                             (wz_distance, 'WZ', cols[1])):
    _ = plt.hist(np.max(scores, axis=1), bins=bins, alpha=0.5,
                 label=label, density=True, histtype='step', lw=2, color=color)
plt.yscale('log')
plt.xlabel('Mahalanobis Distance')
plt.legend()
plt.savefig('max_mahalanobis_distance.pdf', bbox_inches='tight')
plt.show()


In [None]:
# ROC comparison of three OOD scores, each with the OOD (Top) sample as the
# positive class and QCD as the negative class.

# --- Score 1: maximum of the two class distances --------------------------
md_signal = np.max(top_distance[mask], axis=1)
md_background = np.max(qcd_distance, axis=1)
pred = np.concatenate([md_signal, md_background])
y = np.concatenate([np.ones(len(md_signal)), np.zeros(len(md_background))])
fpr_, tpr_, thresholds = roc_curve(y, pred)
auc_roc = auc(fpr_, tpr_)
plt.plot(fpr_, tpr_, color=cols[1],
         label='Mahalanobis Distance (AUC = %0.2f)' % auc_roc)

# --- Score 2: VAE loss ----------------------------------------------------
# NOTE(review): `mask` is not applied to top_loss here, unlike the other two
# scores; confirm whether the VAE sample is already restricted.
n_top_vae = top_loss.shape[0]
n_qcd_vae = qcd_loss.shape[0]
true_label_top = np.concatenate((np.ones(n_top_vae), np.zeros(n_qcd_vae)))
predicted_label_top = np.concatenate((top_loss, qcd_loss))
fpr_loss, tpr_loss, threshold_loss = roc_curve(true_label_top, predicted_label_top)
auc_loss = auc(fpr_loss, tpr_loss)
plt.plot(fpr_loss, tpr_loss, color=cols[0],
         label='%s (AUC = %0.2f)' % ('Variational Autoencoder', auc_loss))

# --- Score 3: maximum logit -----------------------------------------------
qcd_max_lg = np.max(logits[labels==0],axis=1)
wz_max_lg = np.max(logits[labels==1],axis=1)
top_max_lg = np.max(logits_ood,axis=1)

lg_signal = top_max_lg[mask]
pred = np.concatenate([lg_signal, qcd_max_lg])
y = np.concatenate([np.ones(len(lg_signal)), np.zeros(len(qcd_max_lg))])
fpr, tpr, thresholds = roc_curve(y, pred)
auc_roc = auc(fpr, tpr)
plt.plot(fpr, tpr, color=cols[2],
         label='Max Logits curve (area = %0.2f)' % auc_roc)

# Diagonal reference line, then cosmetics and output.
plt.plot([1e-10,1],[1e-10,1], '--', color='0.75')
plt.legend(loc='lower right')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.xlim(1e-5,1)
plt.ylim(1e-5,1)
# plt.yscale('log')
# plt.xscale('log')
plt.savefig('roc_curve_v2.pdf', bbox_inches='tight')
plt.show()



In [None]:
def _significance_improvement(tpr_values, fpr_values):
    """Return (tpr, tpr / sqrt(fpr)) restricted to ROC points with fpr > 0.

    A ROC curve starts at fpr = 0, where tpr / sqrt(fpr) is 0/0 or x/0. Those
    points are dropped so that the curve and its maximum stay finite.
    """
    nonzero = fpr_values > 0
    return tpr_values[nonzero], tpr_values[nonzero] / np.sqrt(fpr_values[nonzero])


# (legend label, colour, tpr, fpr) for each ROC computed in the previous cell.
curves = [
    ('Max Mahalanobis Distance', cols[1], tpr_, fpr_),
    ('Max Logits', cols[2], tpr, fpr),
    ('Variational Autoencoder', cols[0], tpr_loss, fpr_loss),
]

best_improvement = []
for label, color, tpr_curve, fpr_curve in curves:
    efficiency, improvement = _significance_improvement(tpr_curve, fpr_curve)
    plt.plot(efficiency, improvement, color=color, label=label)
    best_improvement.append(np.max(improvement))

plt.legend()
plt.xlabel('True Positive Rate')
# Fixed: \sqrt needs a braced argument in mathtext.
plt.ylabel(r'$ S_{eff} / \sqrt{B_{eff}}$')
plt.savefig('sb_curve.pdf', bbox_inches='tight')
# Maximum of each curve, in the order Mahalanobis, max logits, VAE.
print(*best_improvement)

In [None]:
# Normalised distributions of the maximum logit for QCD, W/Z and the selected
# OOD sample (`mask` comes from an earlier cell).
for values, label, color in ((qcd_max_lg, 'QCD', cols[0]),
                             (wz_max_lg, 'W/Z', cols[1]),
                             (top_max_lg[mask], 'Top [OOD Sample]', cols[2])):
    plt.hist(values, bins=100, alpha=0.5, label=label, density=True,
             histtype='step', lw=2, color=color)
plt.xlabel('Max Logits')
plt.ylabel('Density')
plt.legend()
plt.savefig('max_logits.pdf', bbox_inches='tight')

In [None]:
# Mass distribution of QCD before and after keeping the upper half of each
# score. Column 48 is plotted as the mass.
qcd_mass = vars_test[labels == 0, 48]
# NOTE(review): assumes every row of vars_test_vae pairs with an entry of
# qcd_loss - confirm against the file that produced results_vae.h5.
qcd_mass_vae = vars_test_vae[:, 48]

# Per-score medians used as selection thresholds.
qcd_distance_max = np.max(qcd_distance, axis=1)
th = np.quantile(qcd_distance_max, 0.5)
th_vae = np.quantile(qcd_loss, 0.5)
th_qcd_lg = np.quantile(qcd_max_lg, 0.5)

bins = np.linspace(0, 300, 60)

# Histograms are drawn in this fixed order so the default colour cycle assigns
# the same colour to each curve; only the bin contents are kept.
hist_qcd = plt.hist(qcd_mass, bins=bins, alpha=0.5, label='QCD Mass Dist.',
                    density=True, histtype='step', lw=4)[0]
hist_md = plt.hist(qcd_mass[qcd_distance_max > th], bins=bins, alpha=0.5,
                   label='After NuRD selection', density=True,
                   histtype='step', lw=2)[0]
hist_vae = plt.hist(qcd_mass_vae[qcd_loss > th_vae], bins=bins, alpha=0.5,
                    label='After VAE selection', density=True,
                    histtype='step', lw=2)[0]
hist_ml = plt.hist(qcd_mass[qcd_max_lg > th_qcd_lg], bins=bins, alpha=0.5,
                   label='After Logits selection', density=True,
                   histtype='step', lw=2)[0]

plt.xlabel('Mass [GeV]')
plt.ylabel('Normalized Counts')
plt.legend()
plt.yscale('log')
# plt.savefig('mass_dist_log_v2.pdf', bbox_inches='tight')
plt.show()

In [None]:
from scipy.stats import wasserstein_distance

# Wasserstein distance between the QCD mass distribution before and after each
# selection.
#
# Fixed: the histogram bin heights were passed as if they were sample values,
# which compares the distributions of bin heights rather than of mass. The
# histograms are now used as weights over the mass bin centres.
# `bins` must still be the edges used to fill hist_qcd / hist_md / hist_vae /
# hist_ml.
mass_centers = 0.5 * (bins[:-1] + bins[1:])
assert len(mass_centers) == len(hist_qcd), \
    "`bins` no longer matches the mass histograms; re-run the mass-distribution cell"

for tag, selected_hist in (('NuRD', hist_md), ('VAE', hist_vae), ('Logits', hist_ml)):
    distance = wasserstein_distance(mass_centers, mass_centers,
                                    u_weights=hist_qcd, v_weights=selected_hist)
    # Fixed message: the comparison is QCD before vs after selection, not QCD vs Top.
    print('Wasserstein distance between QCD mass before and after selection (%s):' % tag,
          distance)

In [None]:
from scipy.spatial.distance import jensenshannon

# Jensen-Shannon divergence (square of the distance returned by scipy) between
# the QCD mass histogram before and after each selection.
# Fixed message: the comparison is QCD before vs after selection, not QCD vs Top.
for tag, selected_hist in (('NuRD-MD', hist_md), ('VAE', hist_vae), ('NuRD-ML', hist_ml)):
    print('Jensen-Shannon divergence between QCD mass before and after selection (%s):' % tag,
          jensenshannon(hist_qcd, selected_hist)**2)

In [None]:

# NOTE(review): `bins` is whatever an earlier cell left behind (the override
# below is disabled); confirm that range suits these scores.
# bins = np.linspace(0, 1000, 50)

# Minimum over the two class distances for every sample (no selection on the
# OOD sample here).
for scores, name in ((top_distance, 'Top [OOD Sample]'),
                     (qcd_distance, 'QCD'),
                     (wz_distance, 'WZ')):
    _ = plt.hist(scores.min(axis=1), bins=bins, alpha=0.5, label=name,
                 density=True, histtype='step', lw=2)
plt.yscale('log')
plt.ylabel('Fraction')
plt.xlabel('OOD Score')
plt.legend()
plt.show()

# Disabled variant: same histograms with bins = np.linspace(-4, 10, 100) and
# the OOD sample restricted to top_distance[mask].

# ROC of the max-distance score, all OOD samples (label 1) vs QCD (label 0).
# NOTE(review): the histogram above shows the minimum, the ROC uses the maximum.
signal_scores = np.max(top_distance, axis=1)
background_scores = np.max(qcd_distance, axis=1)
pred = np.concatenate([signal_scores, background_scores])
y = np.concatenate([np.ones(len(signal_scores)), np.zeros(len(background_scores))])
fpr, tpr, thresholds = roc_curve(y, pred)
auc_roc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_roc)
plt.legend(loc='lower right')
plt.show()


In [None]:
# NOTE(review): despite the names, `qcd_mass` holds every variable column of
# the QCD rows here, and `qcd_min_dist` holds the per-sample *maximum* distance.
qcd_rows = labels == 0
qcd_mass = vars_test[qcd_rows]
qcd_min_dist = qcd_distance.max(axis=1)


In [None]:
# QCD mass (column 48) before and after keeping the 25% of jets with the
# largest max-distance score.
qcd_mass = vars_test[labels==0][:,48]
qcd_distance_max = np.max(qcd_distance, axis=1)
th = np.quantile(qcd_distance_max, 0.75)

# Fixed: with bins=100 each histogram picked its own range, so the two curves
# were drawn on different bin edges. Both now share the 100 edges derived from
# the full QCD sample (identical to the previous binning of the first curve).
mass_edges = np.histogram_bin_edges(qcd_mass, bins=100)

# Fixed typo in the legend label ("Evetyhing").
plt.hist(qcd_mass, bins=mass_edges, alpha=0.5, label='QCD Everything', density=True, histtype='step', lw=2)
plt.hist(qcd_mass[qcd_distance_max>th], bins=mass_edges, alpha=0.5, label='QCD after OOD selection', density=True, histtype='step', lw=2)
plt.xlabel('Jet Mass[GeV]')
plt.legend()
plt.show()