### Notebook to make Fast Point-Cloud Diffusion distributions Discrete, like G4

In [15]:
import h5py
import numpy as np
from tqdm import tqdm

In [16]:
geant4_name = "improved_200cells_FPCD.hdf5"
g4 = h5py.File(geant4_name, 'r')
print(g4.keys())
chunk_size=2000
nevts=100_000

<KeysViewHDF5 ['cluster', 'hcal_cells']>


In [17]:
def get_bin_edges(g4_cell_data):
    centers = np.unique(g4_cell_data)
    if (centers[0] == 0):
        centers = centers[1:]
    width = np.round(centers[1] - centers[0],2)

    edges = centers - width/2
    max_edge = centers[-1] + width/2
    edges = (np.append(edges,max_edge))
    
    return centers, edges

In [18]:
bin_dict = {}
var_str = ["E","X","Y","Z"]

for var in range(1,4):
    g4_data = g4['hcal_cells'][:nevts,:,var]
    centers, edges = get_bin_edges(g4_data)
    bin_dict[f"centers{var_str[var]}"],bin_dict[f"edges{var_str[var]}"] = centers, edges 

print(bin_dict.keys())

dict_keys(['centersX', 'edgesX', 'centersY', 'edgesY', 'centersZ', 'edgesZ'])


In [19]:
print((bin_dict["centersX"]))

[-2700. -2600. -2500. -2400. -2300. -2200. -2100. -2000. -1900. -1800.
 -1700. -1600. -1500. -1400. -1300. -1200. -1100. -1000.  -900.  -800.
  -700.  -600.  -500.  -400.  -300.  -200.  -100.     0.   100.   200.
   300.   400.   500.   600.   700.   800.   900.  1000.  1100.  1200.
  1300.  1400.  1500.  1600.  1700.  1800.  1900.  2000.  2100.  2200.
  2300.  2400.  2500.  2600.  2700.]


In [53]:
diffusion_name = "GSGM.h5"
dfsn = h5py.File(diffusion_name,'r')
print(dfsn.keys())

digit_dict = {}
var_str = ["E","X","Y","Z"]
for var in range(1,4):
    diffusion_data = dfsn['cell_features'][:,:,var]
    digits = np.digitize(diffusion_data,bin_dict[f"edges{var_str[var]}"])
    print(var_str[var],": ",digits[100,:10])
    digit_dict[f"digits{var_str[var]}"] = digits - 1  # -1 for indices

<KeysViewHDF5 ['cell_features', 'cluster_features']>
X :  [16 19 15 17 16 14 18 16 16 14]
Y :  [29 27 29 28 29 27 27 29 29 28]
Z :  [ 6 22 16  8  9 20 28 12 10 23]


In [21]:
np.shape(dfsn['cluster_features'][1])

(2,)

In [80]:
test = 0
nevents = np.shape(dfsn['cell_features'])[0]
ncells = np.shape(dfsn['cell_features'])[1]
nvar = np.shape(dfsn['cell_features'])[2]
ncluster_var = np.shape(dfsn['cluster_features'])[1]
chunk_size = 100
with h5py.File(f'GSGM_Discrete.h5', 'w') as newfile:
    # create empty data set
    dset = newfile.create_dataset('cell_features', 
                                shape=(np.shape(dfsn['cell_features'])),
                                maxshape=(np.shape(dfsn['cell_features'])), 
                                chunks=(chunk_size, ncells, nvar),
                                dtype=np.float32)
    cluster_dset = newfile.create_dataset('cluster_features', 
                            shape=(np.shape(dfsn['cluster_features'])),
                            maxshape=(np.shape(dfsn['cluster_features'])), 
                            chunks=(chunk_size, 2),
                            dtype=np.float32,
                            data=dfsn['cluster_features'])
    
    for var in range(1,4):
        
        g4_centers = bin_dict[f"centers{var_str[var]}"]  # what the data will be set to
        n_bins = len(bin_dict[f"centers{var_str[var]}"]) 
        var_mask =  digit_dict[f"digits{var_str[var]}"]  #which data will be edited
        #print("var mask shape = ",np.shape(var_mask))
        diffusion_data = dfsn['cell_features'][:,:,var]
        #print("var_mask = ",var_mask[10])
        for evt in tqdm(range(nevents)):
            for ibin in range(n_bins):
                bin_mask = var_mask[evt]==ibin
                #print("BIN NUMBER",ibin)
                #print("Var Mask sample",var_mask[evt,25:35])
                #print("Data = ",diffusion_data[evt,25:35][var_mask[evt,25:35]==ibin])
                diffusion_data[evt][bin_mask] = g4_centers[ibin]
                #print("Data Discr= ",diffusion_data[evt,25:35][var_mask[evt,25:35]==ibin])
            
                
        dset[:,:,var] = diffusion_data
        print(f"Done with {var_str[var]}")

100%|██████████| 100000/100000 [00:11<00:00, 8355.65it/s]


Done with X


100%|██████████| 100000/100000 [00:11<00:00, 8371.58it/s]


Done with Y


100%|██████████| 100000/100000 [00:12<00:00, 7718.22it/s]


Done with Z


In [83]:
with h5py.File(f'GSGM_Discrete.h5', 'r') as disc:
    print(disc["cell_features"][1000,:,3])

[3891.7 3821.5 3868.3 3868.3 3821.5 3844.9 3938.5 3915.1 3821.5 3844.9
 3868.3 3844.9 3844.9 4008.7 4312.9 4219.3 3961.9 3891.7 4219.3 4032.1
 3868.3 3961.9 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1 4149.1
 4149.

In [32]:
print(g4["hcal_cells"][100,:10,1])

[ 200.  200.  200.  200.  200.  200.  200.    0. -100.  100.]
