In [1]:
#Standard library
import os
import shutil
from copy import copy

#Python/Data
import matplotlib.pyplot as plt
from matplotlib import gridspec
import matplotlib.colors as mcolors
import numpy as np
import h5py as h5

#ML
import energyflow as ef
from energyflow.archs import PFN
from energyflow.utils import data_split
import tensorflow as tf

**Note: Image HDF5 files are deprecated as of Sep 14, 2022.**

|        |        |        |  Calo Image  |           |           |
|:------:|:------:|:------:|:------------:|:---------:|:---------:|
| Cell E | Cell X | Cell Y | Cell 'Depth' | Layer 1 Z | Layer 2 Z |
|    0   |    1   |    2   |       3      |     4     |     5     |

| Truth   |        |
|:-------:|:------:|
|Truth $E$|Truth $\theta$|

### Load the Data

In [2]:
# Input HDF5 file. Previously used inputs, kept for reference:
#   'rec_pionplus_300k_simple.hdf5', 'rec_pionplus_300k.hdf5',
#   'LogE_300k_normed.hdf5', 'hcal_only_images.hdf5',
#   'rec_pionplus_300ke_1i.hdf5'
filename = '2M_piplus.hdf5'

# Run tag: names the output directory and is stamped onto some figures.
# Replace with your own variation!
label = "2M_piplus_hcal"
path = "./" + label

In [3]:
# Create the output directory for this run. exist_ok=True keeps the cell
# re-runnable (Restart & Run All) when the directory is already present;
# without it the second execution raises FileExistsError.
os.makedirs(path, exist_ok=True)

In [4]:
# --- Optimisation ---
learning_rate = 1e-4   # initial Adam learning rate
batch_size = 1000
N_Epochs = 400         # upper bound on training epochs
patience = 20          # handed to the EarlyStopping callback

# --- Model ---
N_Latent = 128         # width of the last Phi layer
dropout_rate = 0.1     # used for both latent_dropout and F_dropouts

# --- Data handling ---
shuffle_split = True   # Turn FALSE for images!
train_shuffle = True   # Turn TRUE for images!
Y_scalar = True        # standardise the regression target before training

# --- Loss ---
# alternative: 'mae' (original note: try 'swish')
loss = 'mse'


In [5]:
# Open the input read-only. The handle is intentionally left open: `images`
# and `truth` are h5py datasets that later cells still read from.
h5_file = h5.File(filename, mode='r')
print(list(h5_file.keys()))
images, truth = h5_file['hcal'], h5_file['mc']

# Dataset shapes first ...
for dataset in (images, truth):
    print(dataset.shape)
# ... then the leading chunk dimension of each (should be = batch_size)
for dataset in (images, truth):
    print(dataset.chunks[0])

['hcal', 'mc']
(2000000, 4, 1861)
(2000000, 10, 29)
100
100


In [6]:
# Number of events to use; defaults to every event in the file.
# Smaller values that were used for quick tests: 120_000, 30_000.
N_Events = images.shape[0]

# Slicing the h5py datasets reads the selected events into NumPy arrays.
event_sel = slice(None, N_Events)
X = images[event_sel]
# Regression target: entry [0, 0] of each 'mc' record
# (presumably the truth energy -- TODO confirm against the file layout).
Y = truth[event_sel, 0, 0]

In [None]:
# Event selection.
# Every enabled cut is built as a boolean mask over the *same* set of events
# and the masks are AND-ed before X/Y are indexed once. Previously the energy
# cut shortened X/Y first and the theta mask (length N_Events) was then
# applied to the shortened arrays, which misaligns or fails when both cuts
# are switched on.
Cut_Energy = False
MinE = 60.          # lower bound on Y; only used when Cut_Energy is True

Cut_Theta = True
min_theta = 5.0
max_theta = 30.0    # NOTE(review): was defined but never applied before; now enforced

if Cut_Energy or Cut_Theta:
    selection = np.ones(len(Y), dtype=bool)

    if Cut_Energy:
        selection &= Y > MinE

    if Cut_Theta:
        # Entry [1, 0] of each 'mc' record (presumably truth theta -- TODO confirm)
        gen_theta = truth[:N_Events,1,0]
        if len(gen_theta) != len(Y):
            # Happens when this cell is re-run on already-filtered arrays.
            raise ValueError(
                "X/Y hold %d events but the file slice holds %d; "
                "re-run the loading cell before applying the cuts again"
                % (len(Y), len(gen_theta)))
        theta_mask = (gen_theta > min_theta) & (gen_theta < max_theta)
        selection &= theta_mask

    X = X[selection]
    Y = Y[selection]

In [None]:
# Standardise the target to zero mean / unit spread using NaN-aware statistics.
# Y_Mean and Y_StDev are kept: the predictions are mapped back to the original
# units with them after training.
# NOTE(review): re-running this cell standardises Y a second time and
# overwrites Y_Mean / Y_StDev with the statistics of the already-scaled values.
Y_Mean, Y_StDev = np.nanmean(Y), np.nanstd(Y)
if Y_scalar:
    Y = (Y - Y_Mean) / Y_StDev

In [None]:
# Sanity check: shapes of the input and target arrays at this point.
for arr in (X, Y):
    print(np.shape(arr))

In [None]:
# Split into train / validation / test; validation takes 0.2 and test 0.3.
# shuffle_split should be FALSE for images with the same parent G4 event.
split_arrays = data_split(X, Y, val=0.2, test=0.3, shuffle=shuffle_split)
X_train, X_val, X_test, Y_train, Y_val, Y_test = split_arrays

In [None]:
# count_nonzero over the last axis of feature 0 yields one entry per event,
# so the printed length is the number of events in X.
hits_per_event = np.count_nonzero(X[:,0,:], axis=-1)
print(len(np.ravel(hits_per_event)))

In [None]:
# Overview figure of the inputs: one histogram per cell feature, the number of
# non-zero cells per event, and the validation/test target distributions.
fig = plt.figure(figsize=(20,9))
# Set once on the figure instead of on every loop iteration.
fig.suptitle("Normalized Cell Input Data",fontsize=25)

# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap is the supported spelling and exists in older releases too.
cm = plt.get_cmap('plasma')
cell_vars = ["Energy","Cell X","Cell Y","Cell Depth","Layer 1 Position", "Layer 2 Position"]
# Fixed binnings per feature. They are defined here but not passed to
# plt.hist below, which uses Matplotlib's default binning.
bins = [np.linspace(-50,200,101),np.linspace(-500,751,100),
        np.linspace(-500,751,100),np.linspace(-10,10,100), #,np.linspace(-25,25,100),
        np.linspace(-275,350,100),np.linspace(-275,350,100)]
data=[]

n_features = X.shape[1]
n_preview = N_Events//10   # only the first tenth of the events is histogrammed

for i in range(n_features):
    ax = plt.subplot(2, 4, i+1)
    values = np.ravel(X[0:n_preview,i,:])
    data.append(values[values!=0]) #to suppress 0 pads
    plt.hist(data[i],color=cm(i/n_features))
    plt.title("%s"%(cell_vars[i]),fontsize=20)

#Plot 2 More Features

#Number of Hits:
ax = plt.subplot(2, 4, n_features+1)
for i in range(min(n_features, 4)): #only the first four features (while L1 and L2 are fixed)
    plt.hist(np.ravel(np.count_nonzero(X[:,i,:],axis=-1)),
             bins=1000,alpha=0.2,
             density=True, label=cell_vars[i])

plt.title("Number of Cell Hits",fontsize=20)
plt.legend()

#Target distributions of the validation and test splits
ax = plt.subplot(2,4,n_features+2)
plt.hist(Y_val,alpha=0.5,label="Validation Energy",density=True,color="red")
plt.hist(Y_test,alpha=0.5,label="Test Energy",density=True,color="blue")
plt.legend()
plt.title("Validation and Test Energy")

plt.savefig("%s/Normalized_Cell_Data.pdf"%(path))

In [None]:
def lr_decay(epoch, lr):
    """Step schedule for the learning rate.

    The rate is multiplied by 0.1 on every epoch that is a multiple of 40,
    but only once the epoch index is above 39 and while the current rate is
    still at least 1.01e-7. In every other case the rate is returned as is.

    Parameters
    ----------
    epoch : int
        Epoch index supplied by the scheduler callback.
    lr : float
        Learning rate currently in use.

    Returns
    -------
    float
        Learning rate to use for this epoch.
    """
    lr_floor = 1.01e-7     # stop decaying once the rate has dropped below this
    last_flat_epoch = 39   # no decay up to and including this epoch
    period = 40            # decay on multiples of this epoch count

    decay_allowed = epoch > last_flat_epoch and lr >= lr_floor
    if not decay_allowed:
        return lr
    return lr * 0.1 if epoch % period == 0 else lr

In [None]:
# Training callbacks: early stopping with the configured patience, and the
# step-wise learning-rate schedule implemented by lr_decay.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=patience)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(schedule=lr_decay,
                                                        verbose=0)

In [None]:
# Build the Particle Flow Network regressor.
# Layer widths of the Phi and F networks; the last Phi width is N_Latent.
# (Smaller alternative that was tried: Phi (20, 20, N_Latent), F (20, 20, 20).)
Phi_sizes = (100, 100, N_Latent)
F_sizes = (100, 100, 100)
output_act, output_dim = 'linear', 1

# `loss` is taken from the hyper-parameter cell. It used to be re-assigned to
# 'mse' right here, which silently discarded whatever had been configured.
#
# NOTE(review): input_dim is the size of the LAST axis of X. Confirm that the
# array layout matches what PFN expects for its (event, particle, feature) input.
pfn = PFN(input_dim=X.shape[-1],
          Phi_sizes=Phi_sizes,
          F_sizes=F_sizes,
          output_act=output_act,
          output_dim=output_dim,
          loss=loss,
          latent_dropout=dropout_rate,
          F_dropouts=dropout_rate,
          optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

In [None]:
# Train the network. The returned History object is kept because the loss
# curves are plotted from the_fit.history further down.
fit_options = dict(
    epochs=N_Epochs,
    batch_size=batch_size,
    shuffle=train_shuffle,
    validation_data=(X_val, Y_val),
    callbacks=[lr_scheduler, early_stopping],
    verbose=1,
)
the_fit = pfn.fit(X_train, Y_train, **fit_options)

In [None]:
# Persist the trained model, predict on the test split and store everything
# needed to redo the evaluation plots without retraining.
# (A bare `pfn.layers` statement used to sit here; it was not the last
# expression of the cell, so it displayed nothing and has been dropped.)
pfn.save("%s/energy_regression.h5"%(path))
mypreds = pfn.predict(X_test,batch_size=400)

# Undo the target standardisation so predictions and truth are in the
# original units again.
# NOTE(review): Y_test is rescaled in place, so executing this cell twice
# without re-creating Y_test applies the inverse transform twice.
if (Y_scalar):
    mypreds = mypreds*Y_StDev + Y_Mean
    Y_test  =  Y_test*Y_StDev + Y_Mean

np.save("%s/predictions.npy"%(path),mypreds)
np.save("%s/y_test.npy"%(path),Y_test)
np.save("%s/x_test.npy"%(path),X_test)

___

In [None]:
# Reload the stored test-set arrays from the run directory.
# To look at a different run, point `label` / `path` at it first, e.g.
#   label = "single_layering_300k_dropout_10"; path = "./"+label
#   path = "./single_layering_300k"
#   path = "./single_layering_300k_dropout_10"
mypreds = np.load("%s/predictions.npy"%(path))
X_test = np.load("%s/x_test.npy"%(path))
Y_test = np.load("%s/y_test.npy"%(path))

In [None]:
# Show which run directory the arrays were loaded from and figures are written to.
print(path)

In [None]:
# Two-panel summary: predicted vs. true target (left) and the training /
# validation loss curves (right). The right panel needs `the_fit` from the
# training cell of the current session.
fig = plt.figure(figsize=(28,10))

# --- Left: prediction vs. truth scatter ---
ax = plt.subplot(1, 2, 1)
ax.scatter(Y_test,mypreds,alpha=0.1)
ax.set_xlabel("Y Test [GeV]",fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.tick_params(direction='in',right=True,top=True,length=10)
ax.set_ylim(-10,110)   # previously tried: (-0.01, 100.01)
ax.set_ylabel("Y Pred [GeV]",fontsize=22)
ax.set_title("Prediction vs. Test",fontsize=26)

# --- Right: loss history ---
ax = plt.subplot(1, 2, 2)
for history_key in ('loss', 'val_loss'):
    ax.plot(the_fit.history[history_key])
ax.set_title('Model Loss vs. Epoch',fontsize=26)
# Optional annotation that was used before:
#ax.text(0.73,0.73,"Step Decayed \n Learning Rate \n {:.1e} to {:.1e}".format(learning_rate,1e-6),
#        transform=ax.transAxes,fontsize=20)
ax.text(0.67,-0.08,label,transform=ax.transAxes,fontsize=10)   # run tag below the axes
ax.set_ylabel('Loss',fontsize=22)
ax.set_yscale('log')
ax.set_xlabel('epoch',fontsize=22)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.tick_params(direction='in',right=True,top=True,length=10)
ax.tick_params(direction='in',right=True,top=True,which='minor')
ax.set_xlim([-1,150])
ax.legend(['train', 'validation'], loc='upper right',fontsize=22)

plt.savefig("%s/varphi_StepDecay_Prediction_Test.png"%(path))

In [None]:
# 2D histogram of predicted vs. generated energy on a logarithmic colour scale.
# `copy` and `mcolors` come from the import cell at the top of the notebook
# rather than being imported in the middle of the analysis.
fig, axes = plt.subplots(nrows=1, figsize=(14, 10), constrained_layout=True)

# Work on a copy so the registered 'plasma' colormap is not modified; masked
# (empty) bins are drawn in the colormap's lowest colour.
cmap = copy(plt.cm.plasma)
cmap.set_bad(cmap(0))

edges=np.linspace(-10,110,121)
# mypreds has one column; take it so both inputs are 1-D.
h, xedges, yedges = np.histogram2d(Y_test, mypreds[:,0], bins=[edges, edges])
pcm = axes.pcolormesh(xedges, yedges, h.T, cmap=cmap,
                      norm=mcolors.LogNorm(vmin=1.0e-2,vmax=1.0e2), rasterized=True)

cb = fig.colorbar(pcm, ax=axes, pad=0)
cb.set_label("Counts",fontsize=22)
cb.ax.tick_params(labelsize=20)

axes.set_xlabel("Generated Energy",fontsize=22)
plt.yticks(fontsize=20)
plt.xticks(fontsize=20)
axes.set_ylim(-1,110)
axes.set_xlim(0,101)
axes.set_ylabel("Predicted Energy",fontsize=25)
axes.set_title("Predicted vs. Generated Energy",fontsize=30)
plt.savefig("%s/Gen_vs_Pred.pdf"%(path))

In [None]:
# Reference curves drawn on the resolution plot, written as
# (energy, resolution) pairs and unpacked into the lists the plot consumes.
star_points = [(12, 0.18), (16, 0.16), (20, 0.15), (25, 0.14),
               (30, 0.13), (50, 0.098), (60, 0.092), (70, 0.090)]
ECCE_points = [(10, 0.15), (20, 0.127), (30, 0.117), (40, 0.121),
               (50, 0.106), (60, 0.102), (80, 0.092), (100, 0.098)]

star_energies, star_res = (list(column) for column in zip(*star_points))
ECCE_energies, ECCE_res = (list(column) for column in zip(*ECCE_points))

In [None]:
#Binning: N edges on [0, E_Max]. Rows 0..N-2 of every per-bin array are the
#regular bins; row N-1 collects events with truth >= E_Max (overflow).
N = 51
E_Max = 100
E_Bins = np.linspace(0,E_Max,N)

# Flatten so every event contributes one scalar (the model output has a
# trailing axis of length 1; writing such 1-element arrays into scalar slots
# is deprecated in recent NumPy).
truth_E = np.ravel(Y_test)
pred_E = np.ravel(mypreds)

#Goal: slices defined by bin of truthE, filled with prediction distributions
indecies = np.digitize(truth_E,E_Bins)-1
# Index -1 means truth below the first edge. Such events are left out: as a
# row index -1 would wrap around to the last row, and np.bincount rejects
# negative values.
in_range = indecies >= 0
bin_of_event = indecies[in_range]
truth_in = truth_E[in_range]
pred_in = pred_E[in_range]

counter = np.bincount(bin_of_event, minlength=N)
max_count = int(counter.max())

# NaN-padded table: row b holds the predictions of bin b in event order.
slices = np.full((N,max_count), np.nan)
for b in range(N):
    slices[b, :counter[b]] = pred_in[bin_of_event == b]

# Per-bin sums in one pass each (replaces the per-event Python loop).
truth_sum = np.bincount(bin_of_event, weights=truth_in, minlength=N)
with np.errstate(divide='ignore', invalid='ignore'):
    ratio_sum = np.bincount(bin_of_event, weights=pred_in/truth_in, minlength=N)

# Empty bins stay NaN instead of triggering 0/0 warnings.
filled = counter > 0
avg_truth = np.full(N, np.nan)
pred_over_truth = np.full(N, np.nan)
stdev_pred = np.full(N, np.nan)

avg_truth[filled] = truth_sum[filled]/counter[filled]
pred_over_truth[filled] = ratio_sum[filled]/counter[filled]
stdev_pred[filled] = np.nanstd(slices[filled],axis=1)

#Resolution: stdev(pred)/avg_truth
with np.errstate(divide='ignore', invalid='ignore'):
    resolution = stdev_pred/avg_truth

In [None]:
# Energy resolution (sigma_pred / E_truth) per truth-energy bin, compared with
# the reference curves.
# LaTeX labels are raw strings: sequences such as "\s" and "\m" in a normal
# string literal are invalid escapes and produce warnings on current Python.
fig, ax = plt.subplots(figsize=(14,10))
ax.set_title("AI Codesign Resolution",fontsize=25)
ax.set_ylabel(r"$(\sigma_{E,\mathrm{Pred}}/E_\mathrm{Truth})$",fontsize=24)
ax.set_xlabel(r"$E_\mathrm{Truth}$ [GeV]",fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.tick_params(direction='in',right=True,top=True,length=10)
# Only the last of the former duplicate limit calls had any effect; these are
# those final values. Earlier choices: ylim (-0.02,0.4) / (0,2), xlim (-0.01,100.01).
ax.set_ylim(0,.32)
ax.set_xlim(5.0,100)

# Statistical uncertainty of a standard deviation: sigma/sqrt(2N-2). It is
# applied to the plotted quantity (sigma/E), so the bars carry the same
# dimensionless units as the y axis. The bars used to be computed in units of
# the prediction and then overwritten with zeros, contradicting the annotation.
# Bins with fewer than two events get no error bar.
dof = 2.0*counter - 2.0
errors = np.full(len(counter), np.nan)
has_dof = dof > 0
errors[has_dof] = resolution[has_dof]/np.sqrt(dof[has_dof])

first_bin = 0
last_bin = N

ax.text(0.8,-0.08,label,transform=ax.transAxes,fontsize=10)   # run tag
ax.errorbar(avg_truth[first_bin:last_bin],resolution[first_bin:last_bin],yerr=errors[first_bin:last_bin],
            linestyle="-",linewidth=2.0,capsize=4,capthick=1.2,elinewidth=1.2,ecolor='black',marker="o",color='dodgerblue',alpha=0.7,label="ML")
ax.text(0.7,0.93,r"Stat. Error: $\dfrac{\sigma}{\sqrt{2N-2} } $",transform=ax.transAxes,fontsize=20)

ax.plot(ECCE_energies,ECCE_res,"-o",label = "EIC Ref. [ECCE?]",color="limegreen")
ax.plot(star_energies,star_res,"-o",label = "STAR",color="deeppink")
ax.legend(fontsize=15,loc="upper left")
plt.savefig("%s/resolution_plot.pdf"%(path))

In [None]:
# One panel per truth-energy bin showing the distribution of predictions in
# that bin. Only the first (N//10)*10 bins are drawn, 10 panels per row.
n_cols = 10
n_rows = N//n_cols
# squeeze=False keeps `axs` two-dimensional even when there is a single row.
fig,axs = plt.subplots(n_rows,n_cols, figsize=(32, 16),sharex=False,sharey=True,
                       constrained_layout=True,squeeze=False)

# Per-bin mean prediction, computed once instead of twice per panel.
# Rows of `slices` that are entirely NaN (empty bins) stay NaN.
has_data = ~np.all(np.isnan(slices),axis=1)
mean_pred = np.full(slices.shape[0], np.nan)
mean_pred[has_data] = np.nanmean(slices[has_data],axis=-1)

for i in range(n_rows*n_cols):
    ax = axs[i//n_cols, i%n_cols]

    if (i%n_cols==0):
        ax.set_ylabel("Counts",fontsize=15)

    ax.set_title(r"%1.1f $ < E_\mathrm{Truth} < $%1.1f [GeV]"%(E_Bins[i],E_Bins[i+1]))
    ax.set_xlabel("Predicted Energy")
    ax.tick_params(direction='in',right=True,top=True,length=5)

    if not has_data[i]:
        # Histogramming an all-NaN row raises (no finite range); leave the panel empty.
        continue

    row = slices[i]
    ax.hist(row[~np.isnan(row)],label="Predicted Energies")   # drop the NaN padding
    ax.axvline(x=avg_truth[i],color='red',alpha=0.3,linestyle="--",label=r"Avg. $E_\mathrm{Truth} = %1.2f$"%(avg_truth[i]))
    ax.axvline(x=mean_pred[i],color='cyan',alpha=0.3,linestyle="--",label=r"Avg. $E_\mathrm{Pred} = %1.2f$"%(mean_pred[i]))
    ax.legend(fontsize=7.5)

plt.savefig("%s/resolutions_slices.pdf"%(path))

In [None]:
# Energy scale: mean of E_pred / E_truth per truth-energy bin.
# LaTeX labels are raw strings so "\m", "\d", "\s" are not parsed as
# (invalid) escape sequences.
fig, ax = plt.subplots(figsize=(14,10))
ax.set_title("AI Codesign Resolution",fontsize=25)
ax.set_ylabel(r"$(E_\mathrm{Pred}/E_\mathrm{Truth})$",fontsize=24)
ax.set_xlabel(r"$E_\mathrm{Truth}$ [GeV]",fontsize=24)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
ax.tick_params(direction='in',right=True,top=True,length=10)
ax.axhline(y=1.0, color='r', linestyle='--',alpha=0.5)
ax.set_ylim(0.5,1.5)   # previously tried: (-0.02,0.4)
ax.set_xlim(-1,101)

# Error bars: standard error of the mean ratio, approximated per bin as
# (sigma_pred / <E_truth>) / sqrt(N). The former bars were sigma_pred/sqrt(2N-2),
# i.e. the uncertainty of a standard deviation in units of the prediction,
# drawn on a dimensionless axis. Empty bins get no error bar.
# NOTE(review): the approximation treats E_truth as constant within a bin.
errors = np.full(len(counter), np.nan)
populated = counter > 0
errors[populated] = (stdev_pred[populated]/avg_truth[populated])/np.sqrt(counter[populated])

first_bin = 0
last_bin = N
ax.errorbar(avg_truth[first_bin:last_bin],pred_over_truth[first_bin:last_bin],yerr=errors[first_bin:last_bin],
            linestyle="-",linewidth=2.0,capsize=4,capthick=1.2,elinewidth=1.2,ecolor='black',marker="o",color='dodgerblue',alpha=0.7)
ax.text(0.7,0.93,r"Stat. Error: $\dfrac{\sigma/E}{\sqrt{N} } $",transform=ax.transAxes,fontsize=20)
plt.savefig("%s/scale_plot.pdf"%(path))

In [None]:
# Locate the most populated prediction bin and zoom in around it.
fig = plt.figure(figsize=(18,5))
plt.suptitle("Predictions",fontsize=25)

# Left: all predictions in 299 bins between 0 and 100.
ax = plt.subplot(1, 2, 1)
plt.xlabel("Predicted Energy [GeV]",fontsize=18)
bins = np.linspace(0,100,300)
freq = plt.hist(mypreds, bins=bins)

# Right: a +/-0.01 window centred on the LOWER EDGE of the fullest bin.
ax = plt.subplot(1, 2, 2)
plt.xlabel("Predicted Energy [GeV]",fontsize=18)
plt.text(0.03,0.9,"[Zoomed In]",transform=ax.transAxes,fontsize=18)

maxbin = np.argmax(freq[0])
window_lo = bins[maxbin]-0.01
window_hi = bins[maxbin]+0.01
zoom = np.linspace(window_lo,window_hi,100)

# Row indices of the events whose prediction falls inside the window;
# later cells use `mask` to pick the matching rows of X_test.
inside_window = np.logical_and(mypreds>=window_lo, mypreds<=window_hi)
mask = np.where(inside_window)[0]
_ = plt.hist(mypreds[mask],alpha=.3,color="black",bins=zoom)

n_selected, n_total = len(mask), len(mypreds)
print("%i / %i Events"%(n_selected,n_total),"[{:.3%}]".format( (n_selected/n_total) ) )

In [None]:
# Compare the number of non-zero entries (counted along the last axis) for the
# whole test set against the events selected by `mask`.
# Earlier attempt: plt.hist(np.count_nonzero(np.ravel(X_test[:,1,:])))
weird_data = X_test[mask]
bins = np.linspace(0,1000,100)
fig = plt.figure(figsize=(18,5))

panels = (("Entire Image Dataset", X_test), ("Weird Cells", weird_data))
for position, (panel_title, sample) in enumerate(panels, start=1):
    ax = plt.subplot(1,2,position)
    plt.hist(np.ravel(np.count_nonzero(sample,axis=2)),bins=bins)
    plt.title(panel_title,fontsize=20)

plt.suptitle("Number of Cell Hits",fontsize=25)

In [None]:
# Per-feature histograms of the non-zero inputs for the events selected by `mask`.
# plt.cm.get_cmap was deprecated in Matplotlib 3.7 and removed in 3.9;
# plt.get_cmap is the supported spelling.
cm = plt.get_cmap('plasma')
cell_vars = ["Energy","Cell X","Cell Y","Cell Depth","Layer 1 Position", "Layer 2 Position"]
bins = [np.linspace(0.01,500,200),np.linspace(-500,500,100),
        np.linspace(-500,1300,100),np.linspace(-1.5,2.5,100),
        np.linspace(-275,350,100),np.linspace(-275,350,100)]
data=[]
weird_data=X_test[mask]
# Take the feature count from the array being plotted, so the cell does not
# depend on the HDF5 file handle still being open.
n_features = weird_data.shape[1]

fig = plt.figure(figsize=(18,9))
fig.suptitle("'Weird' Normalized Cell Input Data",fontsize=25)   # once, not per panel
for i in range(n_features):
    ax = plt.subplot(2, 3, i+1)
    values = np.ravel(weird_data[:,i,:])
    data.append(values[values!=0])   # drop the zero padding
    plt.hist(data[i],bins=bins[i],color=cm(i/n_features))
    plt.title("%s"%(cell_vars[i]),fontsize=20)
    #plt.xlabel('%s [Normalized]'%(cell_vars[i]),fontsize=15)
plt.savefig("%s/weird_Cell_Data.pdf"%(path))

In [None]:
# Scatter plots of consecutive feature pairs (feature i vs. feature i+1) for
# the events selected by `mask`.
# The colormap and the binning list that used to be re-created here were never
# used in this cell and have been dropped (the colormap lookup also relied on
# plt.cm.get_cmap, which recent Matplotlib releases no longer provide).
cell_vars = ["Energy","Cell X","Cell Y","Cell Depth","Layer 1 Position", "Layer 2 Position"]
weird_data=X_test[mask]
# Feature count from the plotted array, independent of the open HDF5 handle.
n_features = weird_data.shape[1]

fig = plt.figure(figsize=(18,9))
fig.suptitle("'Weird' Normalized Cell Scatter Plots",fontsize=25)
for i in range(n_features-1):
    ax = plt.subplot(2, 3, i+1)
    plt.scatter(np.ravel(weird_data[:,i,:]),np.ravel(weird_data[:,i+1,:]),s=2)
    plt.xlabel(cell_vars[i])
    plt.ylabel(cell_vars[i+1])
# Own output file: this figure used to be written to weird_Cell_Data.pdf and
# thereby overwrote the histogram figure saved under that name.
plt.savefig("%s/weird_Cell_Scatter.pdf"%(path))

In [None]:
# Show the HDF5 chunk shape of the 'hcal' dataset (wrapped in a list for printing).
print([images.chunks])

In [None]:
# Stream batches from the HDF5 file through tf.data.
# One batch spans four HDF5 chunks along the event axis.
batch = images.chunks[0]*4
shape = [batch,images.chunks[1],images.chunks[2]] #TF is quite particular...

# The comma after the first argument was missing, which made this cell a
# SyntaxError. The element type and shape are now passed by keyword.
# NOTE(review): `generator` is not defined in the visible part of the
# notebook. from_generator needs a callable, so generator(h5_file) must
# return one (e.g. an object with __call__) -- confirm.
ds = tf.data.Dataset.from_generator(
    generator(h5_file),
    output_types=tf.float32,
    output_shapes=tf.TensorShape(shape),
)
