In [1]:
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from typing import List
import h5py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import preprocessing
from tqdm import tqdm

In [None]:
train_dir = "/global/ml4hep/spss/mfong/transfer_learning/delphes_train/"
train_dir_preprocess = "/clusterfs/ml4hep/mfong/delphes_train_preprocessed/"
train_filepaths = [train_dir + x for x in os.listdir(train_dir)]

# preprocess and save everything
def preprocess_data(train_filepaths: List[str], target_dir: str, force=False):
    """Preprocesses h5 data from train_filepaths using the preprocess.py script and stores it as h5 files in target_dir. Use absolute paths"""
    target_dir_filepaths = [target_dir + x for x in os.listdir(target_dir)]
    for filepath in tqdm(train_filepaths):
        filename = filepath.split("/")[-1]
        print(f"Starting preprocessing on {filepath}")
        target_filepath = f"{target_dir}preprocessed_{filename}"
        # check if this file was already preprocessed
        if not force and target_filepath in target_dir_filepaths:
            print(f"{target_dir}preprocessed_{filename} is already in target_dir, skipping this file")
            continue
        with h5py.File(filepath, 'r') as file:
            processed_data = preprocessing.constituent(file, 200)
            labels = file["labels"]
        print(f"Saving preprocessed data to {target_filepath}")
        with h5py.File(target_filepath, "w") as file:
            dset = file.create_dataset("data", processed_data.shape)
            dset[:] = processed_data
        with h5py.File(target_filepath, "w") as file:
            labels_dset = file.create_dataset("labels", labels.shape)
            labels_dset[:] = labels


target_dir = "/clusterfs/ml4hep/mfong/delphes_train_preprocessed/"
preprocess_data(train_filepaths, target_dir, force=True)
train_preprocess_file_names = os.listdir(train_dir_preprocess)

  0%|          | 0/16 [00:00<?, ?it/s]

Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/delphes_train/train_10.h5


In [38]:
f = h5py.File(train_dir_preprocess + "preprocessed_train_10.h5", "w")
f

<HDF5 file "preprocessed_train_10.h5" (mode r+)>

In [39]:
labels_dset = f.create_dataset("labels", (5000000,))

In [40]:
dset = f.create_dataset("data", (5000000, 1400))

In [41]:
f.create_dataset("asdfasdf", (5000000, 1400))

<HDF5 dataset "asdfasdf": shape (5000000, 1400), type "<f4">

In [42]:
f.create_dataset("zzzzz", (5000000, 1400))

<HDF5 dataset "zzzzz": shape (5000000, 1400), type "<f4">

In [44]:
f["data"]

<HDF5 dataset "data": shape (5000000, 1400), type "<f4">

In [45]:
f.close()

In [149]:
class H5Dataset(Dataset):
    def __init__(self, file_paths: List[str], transform=None):
        self.file_paths = file_paths
        self.transform = transform
        self.sample_indices = []  # Store the indices of samples within each file
        # self.FEATURE_KEYS = ['fjet_clus_eta', 'fjet_clus_phi', 'fjet_clus_pt', 'fjet_clus_E']
        self.DATA_KEY = "data"
        self.LABEL_KEY = "labels"
        self.current_loaded_file_idx = None     # self.file_paths index that is currently loaded into memory and processed
        self.current_data = None

        for file_path_idx, file_path in enumerate(file_paths):
            with h5py.File(file_path, 'r') as file:
                num_samples = len(file[self.FEATURE_KEYS[0]])  # 'data' is the dataset name in your HDF5 file
                indices = list(range(num_samples))
                self.sample_indices.extend([(filepath_idx, idx) for idx in indices])
 
    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx: str):
        file_path_idx, sample_idx = self.sample_indices[idx]
        
        if file_path_idx != self.current_loaded_file_idx
        
        file_path = self.file_paths[file_path_idx]
        
        # with h5py.File(file_path, 'r') as file:
        #     # data = file['data'][sample_idx]  # Load a single sample
        #     data = {k: v[sample_idx:sample_idx+1] for k, v in file.items() if k in self.FEATURE_KEYS}
        #     label = file[self.LABEL_KEY][sample_idx]

        if self.transform:
            x = self.transform(data).reshape(-1)

        return x, label

In [133]:
dataset = H5Dataset(train_preprocess_file_names[0:2], batch_size=256, transform=lambda x: preprocessing.constituent(x, 200))

In [134]:
dataloader = DataLoader(dataset, batch_size=256, shuffle=False)

In [151]:
len(dataset)

5000000

In [139]:
dataset[0][0].shape

  log_pt = np.log(pt)
  log_energy = np.log(energy)
  lognorm_pt = np.log(pt / sum_pt[:,np.newaxis])
  lognorm_energy = np.log(energy / sum_energy[:,np.newaxis])


(1400,)

In [144]:
x, y = next(iter(dataloader))

  log_pt = np.log(pt)
  log_energy = np.log(energy)
  lognorm_pt = np.log(pt / sum_pt[:,np.newaxis])
  lognorm_energy = np.log(energy / sum_energy[:,np.newaxis])


In [None]:
for x, y in dataloader:
    print(x.shape)
    print(y.shape)

In [141]:
x.shape

torch.Size([256, 1400])

In [143]:
y.shape

torch.Size([256])

In [153]:
with h5py.File(train_dir + train_file_names[0]) as file:
    data = preprocessing.constituent(file, 200)

  log_pt = np.log(pt)
  log_energy = np.log(energy)
  lognorm_pt = np.log(pt / sum_pt[:,np.newaxis])
  lognorm_energy = np.log(energy / sum_energy[:,np.newaxis])


In [155]:
data.shape.reshape()

(5000000, 200, 7)