In [7]:
import pandas as pd
import numpy as np
import h5py

In [2]:
def getNumberOfEvents(f):
  """Return the number of events in `f`.

  The count is the length of the first axis of the `event_id` dataset
  stored under the `event_table` group.
  """
  event_ids = f['event_table']['event_id']
  return event_ids.shape[0]

In [3]:
# Some code modified from:
# https://github.com/vhewes/pynuml/blob/main/pynuml/io/file.py

# Datasets that buildDataframe loads by default (keys=None) for each group.
possibleKeys = {
  "event_table": [
    "event_id", "is_cc", "lep_energy", "nu_dir", "nu_energy",
    "nu_pdg", "nu_vtx", "nu_vtx_corr", "nu_vtx_wire_pos", "nu_vtx_wire_time",
  ],
  "hit_table": [
    "hit_id", "integral", "local_plane", "local_time", "local_wire", "rms", "tpc",
  ],
  "particle_table": [
    "category", "end_position", "end_position_corr", "end_process",
    "end_wire_pos", "end_wire_time", "g4_id", "g4_pdg", "instance",
    "momentum", "parent_id", "start_position", "start_position_corr",
    "start_process", "start_wire_pos", "start_wire_time",
  ],
  "edep_table": ["energy", "energy_fraction", "g4_id", "hit_id"],
  "spacepoint_table": ["hit_id", "position", "spacepoint_id"],
}

# Explicit DataFrame column names for multi-column datasets, per group.
# Datasets not listed here get their names from the fallback in cols().
# Most entries are 3-vectors expanded to <name>_x, <name>_y, <name>_z.
colmap = {
  "event_table": {
    "event_id": ["run", "subrun", "event"],
    **{name: [name + "_x", name + "_y", name + "_z"]
       for name in ("nu_dir", "nu_vtx", "nu_vtx_corr")},
  },
  "particle_table": {
    name: [name + "_x", name + "_y", name + "_z"]
    for name in ("start_position", "end_position",
                 "start_position_corr", "end_position_corr")
  },
  "spacepoint_table": {
    # hit_id is suffixed _u/_v/_y rather than _x/_y/_z
    "hit_id": ["hit_id_u", "hit_id_v", "hit_id_y"],
    "position": ["position_x", "position_y", "position_z"],
  },
  "pandoraPrimary_table": {
    "vtx": ["vtx_x", "vtx_y", "vtx_z"],
  },
}

def cols(f, group, key):
  """Return the list of DataFrame column names for dataset `key` of `group`.

  Parameters
  ----------
  f : mapping of group name -> mapping of dataset name -> object with `.shape`
      (e.g. an open h5py.File).
  group : str
      Name of the table (group) the dataset lives in.
  key : str
      Name of the dataset.

  Returns
  -------
  list of str
      The names from `colmap` when an explicit mapping exists; otherwise
      `[key]` for a single-column dataset, or `key_0, key_1, ...` with one
      name per entry along the dataset's second axis.
  """
  if group in colmap and key in colmap[group]:
    return colmap[group][key]

  shape = f[group][key].shape
  # A 1-D dataset has no second axis; treat it as a single column instead
  # of failing on shape[1].
  if len(shape) < 2 or shape[1] == 1:
    return [key]
  return [key + "_" + str(c) for c in range(shape[1])]

def buildDataframe(f, group, startevt=0, nevts=None, keys=None):
  """Load a range of events from one table of the file into a DataFrame.

  Parameters
  ----------
  f : open h5py.File (or an equivalent nested mapping of array-likes)
  group : str
      Table to read. One of 'event_table', 'hit_table', 'particle_table',
      'edep_table', 'spacepoint_table'.
  startevt : int, default 0
      First event index to include.
  nevts : int or None, default None
      Number of consecutive event indices to include, starting at
      `startevt`. None selects everything from `startevt` up to the last
      event in the file.
  keys : list of str or None, default None
      Datasets of `group` to load. None loads `possibleKeys[group]`.

  Returns
  -------
  pandas.DataFrame
      One column per name given by `cols()` for each key, plus an
      `evt_idx` column holding the event index of every row. The frame is
      empty (but has all columns) when no row falls in the requested range.

  Raises
  ------
  AssertionError
      If `group` is missing from the file or unsupported, or if `nevts` is
      smaller than 1 or larger than the number of events in the file.
  """
  supported_groups = ['event_table', 'hit_table', 'particle_table', 'edep_table', 'spacepoint_table']

  # Get table
  assert(group in list(f.keys())), "Error: %s not in file's keys"%(str(group))
  assert(group in supported_groups), "Error: Code currently not equipped to handle %s group. Please select from the following options: %s"%(str(group), ", ".join("'%s'"%g for g in supported_groups))
  table = f[group]
  if keys is None:
    keys = possibleKeys[group]

  # A few sanity checks to set nevts
  if nevts is not None:
    assert(nevts >= 1), "Error: nevts must be at least 1"
    assert(nevts <= getNumberOfEvents(f)), "Error: nevts must not exceed the total number of events in file"
  else:
    # Use the file-wide event count. The table's own seq_cnt only has rows
    # for events that have entries in this table, so its length would
    # under-count and cut off the highest event indices.
    nevts = getNumberOfEvents(f) - startevt
  endevt = startevt + nevts

  # Read the (event index, row count) pairs once instead of hitting the file
  # for every use.
  seq_cnt = table['event_id.seq_cnt'][:]
  evt_ids, counts = seq_cnt[:, 0], seq_cnt[:, 1]

  # Not every event index has rows in every table, so select by mask rather
  # than by a plain startevt:endevt slice of seq_cnt.
  mask = (evt_ids >= startevt) & (evt_ids < endevt)

  if group != 'event_table':
    selected = np.flatnonzero(mask)
    if selected.size == 0:
      # No event in range has rows in this table: produce an empty frame.
      start_table_indx = end_table_indx = 0
    else:
      # NOTE(review): assumes seq_cnt is ordered by event index, so the
      # selected entries form one contiguous run -- confirm for new files.
      newstart, newend = selected[0], selected[-1] + 1
      # Rows belonging to earlier events give the offset of the first row.
      start_table_indx = int(counts[:newstart].sum())
      end_table_indx = start_table_indx + int(counts[newstart:newend].sum())
  else:
    # The event table is sliced directly by event index.
    start_table_indx, end_table_indx = startevt, endevt

  # Create individual pandas dataframes for each column and then merge them
  #   Some columns are expanded from their form in the table, the function cols(f, group, key) provides the mapping
  #   e.g. for group=='event_table', key='event_id' maps to 3 columns in the dataframe ['run', 'subrun', 'event']
  dfs = [ pd.DataFrame(np.array(table[key][start_table_indx:end_table_indx]), columns=cols(f, group, key)) for key in keys ]
  df = pd.concat(dfs, axis="columns")

  # Add the event index of every row: each selected index repeated by its row count
  df['evt_idx'] = np.repeat(evt_ids[mask], counts[mask])

  return df

In [8]:
# Read the spacepoints belonging to event indices 0-99 from the merged file.
with h5py.File('../eta-pi-data/merged_Eta_update_sequenced.h5', mode='r') as f:
    df = buildDataframe(f, group='spacepoint_table', nevts=100)

# The frame is fully in memory, so it can be shown after the file is closed.
print(df)

       hit_id_u  hit_id_v  hit_id_y  position_x  position_y   position_z  \
0           426      1293        -1  210.103897   40.460758  1019.099976   
1           426      1293        -1  210.103897   40.460758  1019.299988   
2           426      1294        -1  210.079239   40.633965  1019.400024   
3           436      1293        -1  210.143066   40.287552  1019.400024   
4           426      1295        -1  210.053909   40.807167  1019.700012   
...         ...       ...       ...         ...         ...          ...   
51272       355        -1      1093   77.738907   20.542173  1030.400024   
51273       354        -1      1093   77.650505   20.888584  1030.400024   
51274       357        -1      1094   77.281883   20.022558  1030.699951   
51275       359        -1      1094   77.348061   19.676149  1030.699951   
51276       360        -1      1094   77.363998   19.502943  1030.800049   

       spacepoint_id  evt_idx  
0                  0        3  
1                  1   