### Notebook to make Fast Point-Cloud Diffusion distributions Discrete, like G4

In [1]:
import h5py
import numpy as np
from tqdm import tqdm

In [28]:
geant4_name = "improved_200cells_FPCD.hdf5"
g4 = h5py.File(geant4_name, 'r')
print(g4.keys())
chunk_size=2000
nevts=100_000

<KeysViewHDF5 ['cluster', 'hcal_cells']>


In [31]:
def get_bin_edges(g4_cell_data):
    centers = np.unique(g4_cell_data)
    if (centers[0] == 0):
        centers = centers[1:]
    width = np.round(centers[1] - centers[0],2)

    edges = centers - width/2
    max_edge = centers[-1] + width/2
    edges = (np.append(edges,max_edge))
    
    return centers, edges

In [32]:
bin_dict = {}
var_str = ["E","X","Y","Z"]

for var in range(1,4):
    g4_data = g4['hcal_cells'][:nevts,:,var]
    centers, edges = get_bin_edges(g4_data)
    bin_dict[f"centers{var_str[var]}"],bin_dict[f"edges{var_str[var]}"] = centers, edges 

print(bin_dict.keys())

dict_keys(['centersX', 'edgesX', 'centersY', 'edgesY', 'centersZ', 'edgesZ'])


In [33]:
print((bin_dict["centersZ"]))

[3821.5 3844.9 3868.3 3891.7 3915.1 3938.5 3961.9 3985.3 4008.7 4032.1
 4055.5 4078.9 4102.3 4125.7 4149.1 4172.5 4195.9 4219.3 4242.7 4266.1
 4289.5 4312.9 4336.3 4359.7 4383.1 4406.5 4429.9 4453.3 4476.7 4500.1
 4523.5 4546.9 4570.3 4593.7 4617.1 4640.5 4663.9 4687.3 4710.7 4734.1
 4757.5 4780.9 4804.3 4827.7 4851.1 4874.5 4897.9 4921.3 4944.7 4968.1
 4991.5 5014.9 5038.3 5061.7 5085.1]


In [35]:
diffusion_name = "GSGM_nospikes_300Epochs.h5"
dfsn = h5py.File(diffusion_name,'r')
print(dfsn.keys())

digit_dict = {}
var_str = ["E","X","Y","Z"]
for var in range(1,4):
    diffusion_data = dfsn['cell_features'][:,:,var]
    digits = np.digitize(diffusion_data,bin_dict[f"edges{var_str[var]}"])
    print(var_str[var],": ",digits[100,:10])
    digit_dict[f"digits{var_str[var]}"] = digits - 1  # -1 for indices

<KeysViewHDF5 ['cell_features', 'cluster_features']>
X :  [16 14 16 16 16 16 16 16 16 16]
Y :  [25 26 26 27 28 30 28 27 28 28]
Z :  [10  4 10  4 16 26  3  4  8  9]


In [36]:
np.shape(dfsn['cluster_features'][1])

(2,)

In [43]:
test = 0
nevents = np.shape(dfsn['cell_features'])[0]
ncells = np.shape(dfsn['cell_features'])[1]
nvar = np.shape(dfsn['cell_features'])[2]
ncluster_var = np.shape(dfsn['cluster_features'])[1]
chunk_size = 100
with h5py.File(f'GSGM_Discrete.h5', 'w') as newfile:
    # create empty data set
    dset = newfile.create_dataset('cell_features', 
                                shape=(np.shape(dfsn['cell_features'])),
                                maxshape=(np.shape(dfsn['cell_features'])), 
                                chunks=(chunk_size, ncells, nvar),
                                dtype=np.float32)

    cluster_dset = newfile.create_dataset('cluster_features', data=dfsn['cluster_features'])
    
    
    dset[:,:,0] = dfsn['cell_features'][:,:,0]
    for var in range(1,4):
        
        g4_centers = bin_dict[f"centers{var_str[var]}"]  # what the data will be set to
        n_bins = len(bin_dict[f"centers{var_str[var]}"]) 
        var_mask =  digit_dict[f"digits{var_str[var]}"]  #which data will be edited
        #print("var mask shape = ",np.shape(var_mask))
        diffusion_data = dfsn['cell_features'][:,:,var]
        #print("var_mask = ",var_mask[10])
        for evt in tqdm(range(nevents)):
            for ibin in range(n_bins):
                bin_mask = var_mask[evt]==ibin
                #print("BIN NUMBER",ibin)
                #print("Var Mask sample",var_mask[evt,25:35])
                #print("Data = ",diffusion_data[evt,25:35][var_mask[evt,25:35]==ibin])
                #print("BIN CENTER = ",g4_centers[ibin])
                diffusion_data[evt][bin_mask] = g4_centers[ibin]
                #print("Data Discr= ",diffusion_data[evt,25:35][var_mask[evt,25:35]==ibin])
            
                
        dset[:,:,var] = np.round(diffusion_data,2)
        print(np.round(diffusion_data[100,25:35],2))
        print(f"Done with {var_str[var]}")

100%|██████████| 10000/10000 [00:01<00:00, 6690.70it/s]


[-1100. -1200. -1200. -1200. -1100. -1100. -1300. -1100. -1500. -1100.]
Done with X


100%|██████████| 10000/10000 [00:01<00:00, 7947.17it/s]


[-100. -100. -200. -100. -200.  100.  200.    0. -100. -200.]
Done with Y


100%|██████████| 10000/10000 [00:01<00:00, 7459.80it/s]


[4032.1 3915.1 4149.1 4359.7 3915.1 4172.5 4500.1 3891.7 4102.3 4078.9]
Done with Z


In [45]:
with h5py.File(f'GSGM_Discrete.h5', 'r') as disc:
    print(disc["cell_features"][10,:,3])

[3844.9 4266.1 4195.9 4406.5 4078.9 4125.7 4078.9 3961.9 3844.9 4149.1
 4055.5 4219.3 4266.1 4125.7 3961.9 4266.1 3868.3 3868.3 3985.3 4266.1
 4172.5 3961.9 3938.5 4149.1 3891.7 4008.7 3915.1 3985.3 3915.1 3938.5
 3961.9 3821.5 4102.3 4008.7 3891.7 3938.5 4266.1 3891.7 4055.5 4195.9
 3985.3 4266.1 4102.3 4149.1 4172.5 4102.3 3938.5 3985.3 3985.3 4102.3
 4078.9 3891.7 3961.9 4008.7 3961.9 4289.5 4125.7 4219.3 4195.9 3961.9
 4008.7 4078.9 4242.7 4219.3 3821.5 3961.9 4055.5 3938.5 4078.9 3938.5
 4289.5 4195.9 4008.7 3961.9 4008.7 4055.5 4125.7 3961.9 3985.3 3915.1
 4102.3 3891.7 4195.9 4242.7 3891.7 3985.3 4219.3 3844.9 3821.5 3938.5
 4149.1 3938.5 3985.3 4055.5 3821.5 3961.9 3961.9 3938.5 4172.5 4055.5
 4195.9 4172.5 3985.3 4032.1 3985.3 3938.5 3844.9 3985.3 3844.9 3821.5
 4429.9 3985.3 3985.3 4172.5 3961.9 4149.1 4008.7 3938.5 4149.1 4102.3
 4336.3 3961.9 4055.5 3938.5 4078.9 4078.9 4055.5 4102.3 3985.3 4032.1
 4195.9 3961.9 4032.1    0.     0.     0.     0.     0.     0.     0.
    0. 

In [None]:
print(g4["hcal_cells"][100,:10,3])