In [68]:
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import h5py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import preprocessing

In [13]:
train_dir = "/global/ml4hep/spss/mfong/transfer_learning/delphes_train/"
train_file_names = os.listdir(train_dir)

In [14]:
train_file_names

['train_10.h5',
 'train_5.h5',
 'train_0.h5',
 'train_8.h5',
 'train_14.h5',
 'train_4.h5',
 'train_1.h5',
 'train_6.h5',
 'train_12.h5',
 'train_7.h5',
 'train_9.h5',
 'train_2.h5',
 'train_13.h5',
 'train_11.h5',
 'train_3.h5']

In [72]:
class H5Dataset(Dataset):
    def __init__(self, file_paths, batch_size, transform=None):
        self.file_paths = file_paths
        self.batch_size = batch_size
        self.transform = transform
        self.sample_indices = []  # Store the indices of samples within each file
        self.FEATURE_KEYS = ['fjet_clus_eta', 'fjet_clus_phi', 'fjet_clus_pt', 'fjet_clus_E']

        for file_path in file_paths:
            with h5py.File(file_path, 'r') as file:
                num_samples = len(file[self.FEATURE_KEYS[0]])  # 'data' is the dataset name in your HDF5 file
                indices = list(range(num_samples))
                self.sample_indices.extend([(file_path, idx) for idx in indices])

    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx):
        file_path, sample_idx = self.sample_indices[idx]
        
        with h5py.File(file_path, 'r') as file:
            # data = file['data'][sample_idx]  # Load a single sample
            data = {k: v[sample_idx:sample_idx+1] for k, v in file.items() if k in self.FEATURE_KEYS}

        if self.transform:
            data = self.transform(data)

        return data

In [73]:
dataset = H5Dataset([train_dir + filename for filename in train_file_names[0:1]], batch_size=256, transform=lambda x: preprocessing.constituent(x, 200))

In [74]:
len(dataset)

70871110

In [83]:
dataset[0]

array([[[ 0.00000000e+00, -0.00000000e+00,  1.32332802e+01, ...,
         -1.59273541e+00, -1.59765470e+00,  0.00000000e+00],
        [ 3.72529030e-09, -1.13548025e-01,  1.28406582e+01, ...,
         -1.98535752e+00, -1.98184884e+00,  1.13548025e-01],
        [ 1.40889362e-03, -1.13734171e-01,  1.25467110e+01, ...,
         -2.27930427e+00, -2.27592731e+00,  1.13742895e-01],
        ...,
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
          0.00000000e+00,  0.00000000e+00,  0.00000000e+00]]],
      dtype=float32)

In [22]:
f = h5py.File(train_dir + train_file_names[0], 'r')

In [62]:
[a for a in f.items()]

[('fjet_clus_E',
  <HDF5 dataset "fjet_clus_E": shape (5000000, 200), type "<f4">),
 ('fjet_clus_eta',
  <HDF5 dataset "fjet_clus_eta": shape (5000000, 200), type "<f4">),
 ('fjet_clus_phi',
  <HDF5 dataset "fjet_clus_phi": shape (5000000, 200), type "<f4">),
 ('fjet_clus_pt',
  <HDF5 dataset "fjet_clus_pt": shape (5000000, 200), type "<f4">),
 ('fjet_eta', <HDF5 dataset "fjet_eta": shape (5000000,), type "<f4">),
 ('fjet_m', <HDF5 dataset "fjet_m": shape (5000000,), type "<f4">),
 ('fjet_phi', <HDF5 dataset "fjet_phi": shape (5000000,), type "<f4">),
 ('fjet_pt', <HDF5 dataset "fjet_pt": shape (5000000,), type "<f4">),
 ('labels', <HDF5 dataset "labels": shape (5000000,), type "<i4">),
 ('training_weights',
  <HDF5 dataset "training_weights": shape (5000000,), type "<f4">)]

In [49]:
f['fjet_clus_E'][0:1]

array([[5.6002412e+05, 3.8137597e+05, 2.8420822e+05, 2.0101989e+05,
        1.8412698e+05, 1.5626641e+05, 8.8647594e+04, 7.8707836e+04,
        7.1114000e+04, 6.3091988e+04, 6.2576590e+04, 5.3210391e+04,
        5.2817859e+04, 5.0586125e+04, 3.8327656e+04, 3.8165348e+04,
        3.5620992e+04, 3.4259398e+04, 3.3245188e+04, 3.1260291e+04,
        2.6229201e+04, 2.2826189e+04, 1.8487025e+04, 1.7112283e+04,
        1.5398617e+04, 1.3776995e+04, 1.2058328e+04, 1.0108385e+04,
        9.4241221e+03, 9.4153037e+03, 9.4462227e+03, 8.8406191e+03,
        8.3386689e+03, 7.6338198e+03, 6.1374756e+03, 5.4986523e+03,
        4.9362178e+03, 4.6161245e+03, 4.2351646e+03, 4.1940547e+03,
        4.5491201e+03, 4.7815537e+03, 3.4107178e+03, 3.2732021e+03,
        3.2599397e+03, 3.1238564e+03, 2.9079082e+03, 2.8839543e+03,
        2.5343967e+03, 2.2693206e+03, 2.1025007e+03, 2.1537783e+03,
        1.7450792e+03, 2.2507239e+03, 1.7162832e+03, 1.5913851e+03,
        1.0824010e+03, 1.5985997e+03, 9.7579523e

In [34]:
f["fjet_clus_E"]

<HDF5 dataset "fjet_clus_E": shape (5000000, 200), type "<f4">