In [1]:
import sys
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

from typing import List
import h5py
from torch.utils.data import Dataset, DataLoader
import numpy as np
import preprocessing
from tqdm import tqdm

In [67]:
train_dir = "/global/ml4hep/spss/mfong/transfer_learning/delphes_train/"
train_dir_preprocess = "/global/ml4hep/spss/mfong/transfer_learning/delphes_train_processed/"
# train_dir_preprocess = "/clusterfs/ml4hep/mfong/delphes_train_preprocessed/"
train_filepaths = [train_dir + x for x in os.listdir(train_dir)]

# preprocess and save everything
def preprocess_data(train_filepaths: List[str], target_dir: str, force=False):
    """Preprocesses h5 data from train_filepaths using the preprocess.py script and stores it as h5 files in target_dir. Use absolute paths"""
    target_dir_filepaths = [target_dir + x for x in os.listdir(target_dir)]
    for filepath in tqdm(train_filepaths):
        filename = filepath.split("/")[-1]
        print(f"Starting preprocessing on {filepath}")
        target_filepath = f"{target_dir}preprocessed_{filename}"
        # check if this file was already preprocessed
        if not force and target_filepath in target_dir_filepaths:
            print(f"{target_filepath} is already in target_dir, skipping this file")
            continue
        with h5py.File(filepath, 'r') as old_file, h5py.File(target_filepath, "w") as new_file:
            processed_data = preprocessing.constituent(old_file, 200)
            labels = old_file["labels"]
            print(f"Saving preprocessed data to {target_filepath}")
            dset = new_file.create_dataset("data", processed_data.shape)
            dset[:] = processed_data
            labels_dset = new_file.create_dataset("labels", labels.shape)
            labels_dset[:] = labels

In [71]:
preprocess_data(train_filepaths, train_dir_preprocess, force=False)
train_preprocess_file_names = os.listdir(train_dir_preprocess)
train_preprocess_filepaths = [train_dir_preprocess + name for name in train_preprocess_file_names]

100%|██████████| 15/15 [00:00<00:00, 48657.82it/s]

Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/delphes_train/train_10.h5
/global/ml4hep/spss/mfong/transfer_learning/delphes_train_processed/preprocessed_train_10.h5 is already in target_dir, skipping this file
Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/delphes_train/train_5.h5
/global/ml4hep/spss/mfong/transfer_learning/delphes_train_processed/preprocessed_train_5.h5 is already in target_dir, skipping this file
Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/delphes_train/train_0.h5
/global/ml4hep/spss/mfong/transfer_learning/delphes_train_processed/preprocessed_train_0.h5 is already in target_dir, skipping this file
Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/delphes_train/train_8.h5
/global/ml4hep/spss/mfong/transfer_learning/delphes_train_processed/preprocessed_train_8.h5 is already in target_dir, skipping this file
Starting preprocessing on /global/ml4hep/spss/mfong/transfer_learning/




In [51]:
class H5Dataset(Dataset):
    def __init__(self, filepaths: List[str], transform=None):
        self.filepaths = filepaths
        self.transform = transform
        self.sample_indices = []  # Store the indices of samples within each file
        # self.FEATURE_KEYS = ['fjet_clus_eta', 'fjet_clus_phi', 'fjet_clus_pt', 'fjet_clus_E']
        self.DATA_KEY = "data"
        self.LABEL_KEY = "labels"
        # self.current_loaded_file_idx = None     # self.filepaths index that is currently loaded into memory
        # self.current_data = None

        for filepath_idx, file_path in enumerate(filepaths):
            with h5py.File(file_path, 'r') as file:
                num_samples = len(file[self.LABEL_KEY])
                indices = list(range(num_samples))
                self.sample_indices.extend([(filepath_idx, idx) for idx in indices])
 
    def __len__(self):
        return len(self.sample_indices)

    def __getitem__(self, idx):
        filepath_idx, sample_idx = self.sample_indices[idx]              
        filepath = self.filepaths[filepath_idx]

        with h5py.File(filepath, 'r') as file:
            data = file[self.DATA_KEY][sample_idx]  # Load a single sample
            labels = file[self.LABEL_KEY][sample_idx]

        if self.transform:
            data = self.transform(data)
            
        return data, labels

In [52]:
dataset = H5Dataset(train_preprocess_filepaths, transform=None)

In [53]:
dataloader = DataLoader(dataset, batch_size=256, shuffle=False)

In [54]:
len(dataset)

15871110

In [60]:
dataset[0:5][0].shape

ValueError: too many values to unpack (expected 2)

In [61]:
x, y = next(iter(dataloader))

In [66]:
# for x, y in dataloader:
#     print(x.shape)
#     print(y.shape)

In [141]:
x.shape

torch.Size([256, 1400])

In [143]:
y.shape

torch.Size([256])

In [153]:
with h5py.File(train_dir + train_file_names[0]) as file:
    data = preprocessing.constituent(file, 200)

  log_pt = np.log(pt)
  log_energy = np.log(energy)
  lognorm_pt = np.log(pt / sum_pt[:,np.newaxis])
  lognorm_energy = np.log(energy / sum_energy[:,np.newaxis])
