In [1]:
import uproot
import awkward as ak

In [2]:
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
from pathlib import Path
import h5py

### set I/O path
# Everything lives under ../data, relative to the current working directory.
data_dir = Path.cwd().parent / 'data'
root_dir = data_dir / 'root'
h5_dir = data_dir / 'hdf5'
# Create the HDF5 folder (and any missing parents); no error if it already exists.
h5_dir.mkdir(parents=True, exist_ok=True)

# Paths of the ROOT files.
root_train_path = root_dir / 'train_50k.root'
root_test_path = root_dir / 'test_40k.root'

# Paths of the HDF5 files that are read and normalized further down.
h5_train_path = h5_dir / 'train.h5'
h5_test_path = h5_dir / 'test.h5'

# Paths of the normalized HDF5 files written further down.
h_norm_train_path = h5_dir / 'train_norm.h5'
h_norm_test_path = h5_dir / 'test_norm.h5'

In [4]:
def _append_rows(hf, name, rows, n_init, row_shape):
    """Write ``rows`` at the end of dataset ``name`` in the open file ``hf``.

    If the dataset does not exist it is created with ``n_init`` rows (growable
    along axis 0) and ``rows`` fills its last ``rows.shape[0]`` rows; otherwise
    the dataset is grown by ``rows.shape[0]`` and the new rows are filled.
    """
    n_rows = rows.shape[0]
    if name not in hf:
        if n_init < n_rows:
            raise ValueError(
                f"initial size N={n_init} is smaller than the {n_rows} rows "
                f"to be written into new dataset '{name}'"
            )
        dset = hf.create_dataset(
            name,
            (n_init,) + tuple(row_shape),
            maxshape=(None,) + tuple(row_shape),
            dtype='f',
            chunks=True,
        )
    else:
        dset = hf[name]
        dset.resize(dset.shape[0] + n_rows, axis=0)

    # Use an explicit start offset: a negative slice `[-n_rows:]` would select
    # the WHOLE dataset when n_rows == 0 and make the assignment fail.
    if n_rows:
        dset[dset.shape[0] - n_rows:] = rows


def npy2h5(h5, N, x_name, x_npy, y_name, y_npy, x_shape=(3, 256, 256), y_shape=(5,)):
    """Append one chunk of features and labels to an HDF5 file.

    The file is opened in append mode, so repeated calls keep extending the
    same datasets; call it on a fresh file to avoid duplicating rows.

    Parameters
    ----------
    h5 : path-like
        HDF5 file to create or extend.
    N : int
        Number of rows to allocate when a dataset is first created. Must be at
        least the number of rows in the corresponding array; normally equal to it.
    x_name, y_name : str
        Dataset names for the features and the labels.
    x_npy, y_npy : array
        Chunks to append; axis 0 is the row axis.
    x_shape, y_shape : tuple of int, optional
        Per-row shapes used when the datasets are created. The defaults are the
        shapes this function always used.

    Raises
    ------
    ValueError
        If a dataset has to be created and ``N`` is smaller than the chunk.
    """
    with h5py.File(h5, 'a') as hf:
        _append_rows(hf, x_name, x_npy, N, x_shape)
        _append_rows(hf, y_name, y_npy, N, y_shape)
    return

In [5]:
# Normalization constants, applied further down as (X - mean) / std.
# NOTE(review): how these values were obtained is not shown in this notebook —
# presumably statistics of the un-normalized training images; confirm.
mean = 0.37174486416699054
std = 4439.282558540287

In [6]:
# Open the un-normalized training file explicitly read-only: it is only read
# from in this notebook, and without a mode argument older h5py releases
# opened files writable.
# The handle is kept open on purpose; the normalization loop reads from it.
h_train = h5py.File(h5_train_path, 'r')
print(h_train['X_train'].shape, h_train['y_train'].shape)

(46865, 3, 256, 256) (46865, 5)


In [7]:
# Read the sample count from the opened file instead of hard-coding the number
# printed above, so it cannot go stale if train.h5 is regenerated.
N_train = h_train['X_train'].shape[0]
assert h_train['y_train'].shape[0] == N_train, "X_train and y_train have different lengths"

In [14]:
N_loop = 20
# Ceiling division: with a floored chunk size, N_loop chunks cover only
# N_loop * floor(N_train / N_loop) rows and the remainder is silently dropped.
interval = max(1, (N_train + N_loop - 1) // N_loop)
N_processed = 0

# npy2h5 opens the output in append mode and grows existing datasets, so a file
# left over from a previous run of this cell would end up with duplicated rows.
# Start from a clean output to keep the cell re-runnable.
if h_norm_train_path.exists():
    h_norm_train_path.unlink()

for start in tqdm(range(0, N_train, interval)):
    # The last chunk may be shorter than `interval`.
    end = min(start + interval, N_train)

    # Normalize the images of this chunk; labels are copied unchanged.
    X = (h_train['X_train'][start:end]-mean)/std
    y = h_train['y_train'][start:end]

    N_proccessed_per_it = end-start

    npy2h5(h_norm_train_path, N_proccessed_per_it, 'X_train', X, 'y_train', y)
    N_processed += N_proccessed_per_it

# Every input row must have been written exactly once.
assert N_processed == N_train, f"processed {N_processed} of {N_train} rows"

100%|██████████| 20/20 [03:24<00:00, 10.21s/it]


In [6]:
# Open the un-normalized test file explicitly read-only: it is only read from
# in this notebook, and without a mode argument older h5py releases opened
# files writable.
# The handle is kept open on purpose; the normalization loop reads from it.
h_test = h5py.File(h5_test_path, 'r')
print(h_test['X_test'].shape, h_test['y_test'].shape)

(37523, 3, 256, 256) (37523, 5)


In [7]:
# Read the sample count from the opened file instead of hard-coding the number
# printed above, so it cannot go stale if test.h5 is regenerated.
N_test = h_test['X_test'].shape[0]
assert h_test['y_test'].shape[0] == N_test, "X_test and y_test have different lengths"

In [8]:
N_loop = 20
# Ceiling division: with a floored chunk size, N_loop chunks cover only
# N_loop * floor(N_test / N_loop) rows and the remainder is silently dropped.
interval = max(1, (N_test + N_loop - 1) // N_loop)
N_processed = 0

# npy2h5 opens the output in append mode and grows existing datasets, so a file
# left over from a previous run of this cell would end up with duplicated rows.
# Start from a clean output to keep the cell re-runnable.
if h_norm_test_path.exists():
    h_norm_test_path.unlink()

for start in tqdm(range(0, N_test, interval)):
    # The last chunk may be shorter than `interval`.
    end = min(start + interval, N_test)

    # Normalize the images of this chunk; labels are copied unchanged.
    X = (h_test['X_test'][start:end]-mean)/std
    y = h_test['y_test'][start:end]

    N_proccessed_per_it = end-start

    npy2h5(h_norm_test_path, N_proccessed_per_it, 'X_test', X, 'y_test', y)
    N_processed += N_proccessed_per_it

# Every input row must have been written exactly once.
assert N_processed == N_test, f"processed {N_processed} of {N_test} rows"

100%|██████████| 20/20 [03:11<00:00,  9.59s/it]
