In [1]:
import numpy as np
import pandas as pd

from icecube import constants

# ---------------------------------------------------------------------------
# Dataset selection
# ---------------------------------------------------------------------------
train_batch_id_first = 54
train_batch_id_last = 95

# Batch ids to process; both end points are included.
train_batch_ids = range(train_batch_id_first, train_batch_id_last + 1)

max_pulse_count = 128  # pulses kept per event (longer events are trimmed by read_event)
bin_num = 8
n_features = 9  # time, charge, aux, x, y, z, r_err, z_err, rank


# ---------------------------------------------------------------------------
# Sensor geometry table (one row per sensor)
# ---------------------------------------------------------------------------
# NOTE(review): absolute, machine-specific path.
sensor_geometry_df = pd.read_csv(
    "/media/eden/sandisk/projects/icecube/input/icecube/icecube-neutrinos-in-deep-ice/sensor_geometry.csv"
)

# Detector layout. The code below assumes the geometry rows are ordered string
# by string, with `doms_per_string` consecutive rows per string.
doms_per_string = 60
string_num = 86

# String indices grouped by layout.
inner_long_strings = np.array([25, 26, 34, 35, 36, 44, 45])
inner_short_strings = np.array([78, 79, 80, 81, 82, 83, 84, 85])
outer_long_strings = np.concatenate(
    [np.arange(0, 25), np.arange(27, 34), np.arange(37, 44), np.arange(46, 78)]
)

# Known specs: half of the sensor spacing is used as the position error.
outer_xy_resolution = 125.0 / 2
inner_xy_resolution = 70.0 / 2
long_z_resolution = 17.0 / 2
short_z_resolution = 7.0 / 2

sensor_x = sensor_geometry_df.x
sensor_y = sensor_geometry_df.y
sensor_z = sensor_geometry_df.z

all_inner_strings = np.concatenate([inner_long_strings, inner_short_strings])

# Horizontal error: one value per string, expanded to every DOM on that string.
r_err_per_string = np.ones(string_num)
r_err_per_string[outer_long_strings] = outer_xy_resolution
r_err_per_string[all_inner_strings] = inner_xy_resolution
sensor_r_err = np.repeat(r_err_per_string, doms_per_string)

# Vertical error: outer strings get the long resolution everywhere.
z_err_per_string = np.ones(string_num)
z_err_per_string[outer_long_strings] = long_z_resolution
sensor_z_err = np.repeat(z_err_per_string, doms_per_string)

# On inner strings, only DOMs with z below -156.0 or strictly between 95.5 and
# 191.5 get the short resolution.
# NOTE(review): every other inner-string DOM keeps the placeholder value 1.0,
# which is smaller than both resolutions - confirm this is intended and that
# long_z_resolution was not meant as the fallback.
string_is_inner = np.zeros(string_num, dtype=bool)
string_is_inner[all_inner_strings] = True
dom_on_inner_string = np.repeat(string_is_inner, doms_per_string)

dom_z = sensor_z.values
dom_in_short_window = (dom_z < -156.0) | ((dom_z > 95.5) & (dom_z < 191.5))
sensor_z_err[dom_on_inner_string & dom_in_short_window] = short_z_resolution

# Attach the per-sensor errors to the geometry table.
sensor_geometry_df["r_err"] = sensor_r_err
sensor_geometry_df["z_err"] = sensor_z_err

# ---------------------------------------------------------------------------
# Detector constants
# ---------------------------------------------------------------------------
c_const = 0.299792458  # speed of light [m/ns]

x_min, x_max = sensor_x.min(), sensor_x.max()
y_min, y_max = sensor_y.min(), sensor_y.max()
z_min, z_max = sensor_z.min(), sensor_z.max()

# Diagonal of the axis-aligned bounding box around all sensors, and the time
# needed to cross it at c_const.
extent_x = x_max - x_min
extent_y = y_max - y_min
extent_z = z_max - z_min
detector_length = np.sqrt(extent_x**2 + extent_y**2 + extent_z**2)
t_valid_length = detector_length / c_const

print("t_valid_length: ", t_valid_length, " ns")


# read single event from batch_meta_df
def read_event(event_idx, batch_meta_df, max_pulse_count, batch_df, train=True):
    """Build the per-pulse feature records (and optionally the label) of one event.

    Reads the module-level ``sensor_geometry_df`` (columns x, y, z, r_err,
    z_err, looked up by sensor id) and ``t_valid_length``.

    Parameters
    ----------
    event_idx : int
        Positional row of the event inside ``batch_meta_df``.
    batch_meta_df : pandas.DataFrame
        Metadata with columns ``batch_id``, ``first_pulse_index`` and
        ``last_pulse_index`` (plus ``azimuth`` and ``zenith`` when ``train``).
    max_pulse_count : int
        Maximum number of pulses kept for the event.
    batch_df : pandas.DataFrame
        Pulse table with columns ``sensor_id``, ``time``, ``charge`` and
        ``auxiliary``; its index identifies the event each pulse belongs to.
    train : bool
        When true, the (azimuth, zenith) label is returned as well.

    Returns
    -------
    tuple
        ``(event_id, pulse_count, event_x, event_y)`` when ``train`` is true,
        otherwise ``(event_id, pulse_count, event_x)``. ``event_x`` is a
        structured array with one record per kept pulse and ``event_y`` is a
        float16 array ``[azimuth, zenith]``.

    Raises
    ------
    AssertionError
        If the selected pulse rows carry more than one index value.

    Notes
    -----
    NOTE(review): every field except ``rank`` is float16, which cannot hold
    magnitudes above 65504 - confirm relative times stay below that.
    NOTE(review): ``rank`` is only filled for events longer than
    ``max_pulse_count``; shorter events keep rank 0 for all pulses.
    """
    meta_row = batch_meta_df.iloc[event_idx]

    # Inclusive row range of this event's pulses inside batch_df.
    _batch_id, first_pulse_index, last_pulse_index = meta_row[
        ["batch_id", "first_pulse_index", "last_pulse_index"]
    ].astype("int")

    # An integer slice on a DataFrame selects rows.
    pulses = batch_df[first_pulse_index:last_pulse_index + 1]
    index_values = pulses.index.unique()
    assert len(index_values) == 1
    event_id = index_values[0]

    # One structured record per pulse.
    float_fields = ("time", "charge", "auxiliary", "x", "y", "z", "r_err", "z_err")
    record_dtype = [(name, "float16") for name in float_fields] + [("rank", "short")]
    event_x = np.zeros(last_pulse_index - first_pulse_index + 1, record_dtype)

    # Pulse columns; time is made relative to the earliest pulse of the event.
    event_x["time"] = pulses["time"].values - pulses["time"].min()
    for field in ("charge", "auxiliary"):
        event_x[field] = pulses[field].values

    # Sensor position and position error, looked up by sensor id.
    sensor_id = pulses["sensor_id"]
    for field in ("x", "y", "z", "r_err", "z_err"):
        event_x[field] = sensor_geometry_df[field][sensor_id].values

    # Too many pulses: keep only the max_pulse_count most important ones.
    if len(event_x) > max_pulse_count:
        pulse_time = event_x["time"]

        # Pulses strictly within t_valid_length of the highest-charge pulse.
        t_peak = pulse_time[event_x["charge"].argmax()]
        t_valid = (pulse_time > t_peak - t_valid_length) & (
            pulse_time < t_peak + t_valid_length
        )

        # Non-auxiliary pulses add 2, pulses inside the time window add 1.
        event_x["rank"] = 2 * (1 - event_x["auxiliary"]) + t_valid

        # Ascending sort puts the most important pulses last; keep the tail,
        # then restore chronological order.
        by_importance = np.sort(event_x, order=["rank", "charge"])
        event_x = np.sort(by_importance[-max_pulse_count:], order="time")

    # Test data: features only.
    if not train:
        return event_id, len(event_x), event_x

    # Train data: attach the direction label.
    azimuth, zenith = meta_row[["azimuth", "zenith"]].astype("float16")
    event_y = np.array([azimuth, zenith], dtype="float16")

    return event_id, len(event_x), event_x, event_y

t_valid_length:  6199.700247193777  ns


In [2]:
# NOTE(review): the paths below are absolute and machine-specific.
# Template for the per-batch pulse files; `{batch_id:d}` is filled in with
# str.format for each batch id.
train_format = '/media/eden/sandisk/projects/icecube/input/icecube/icecube-neutrinos-in-deep-ice/train/batch_{batch_id:d}.parquet'
# HDF5 file that the preprocessed dataset is written to.
hdf5_file = "/media/eden/sandisk/projects/icecube/input/icecube/hdf5/dataset.hdf5"

# Metadata table loaded from train_meta.parquet; it is filtered by `batch_id`
# when the batches are processed.
df_meta = pd.read_parquet("/media/eden/sandisk/projects/icecube/input/icecube/icecube-neutrinos-in-deep-ice/train_meta.parquet")

In [3]:
import h5py
import gc
import multiprocessing
from tqdm import tqdm


# Convert every training batch into per-event HDF5 datasets:
#   X_train/<batch_id>/<event_id> -> (max_pulse_count, n_features) float16 matrix
#   y_train/<batch_id>/<event_id> -> float16 [azimuth, zenith]
#
# Column order of the feature matrix; must stay in sync with n_features.
feature_fields = ("time", "charge", "auxiliary", "x", "y", "z", "r_err", "z_err", "rank")
assert len(feature_fields) == n_features

# Mode 'w' creates the file and truncates an existing one.
with h5py.File(hdf5_file, 'w') as h5:
    group_train_data = h5.create_group("X_train")
    group_train_label = h5.create_group("y_train")

    for batch_id in train_batch_ids:
        batch_train = group_train_data.create_group(str(batch_id))
        batch_label = group_train_label.create_group(str(batch_id))

        print("Reading batch ", batch_id, "... ", end="")
        # get batch meta data and data
        batch_meta_df = df_meta[df_meta.batch_id == batch_id]
        batch_df = pd.read_parquet(train_format.format(batch_id=batch_id))

        # Single-argument wrapper so the pool can map over event positions.
        # NOTE(review): the workers must be able to see this function and the
        # current batch frames, which presumably relies on the fork start
        # method - confirm before running where spawn is the default.
        def read_event_local(event_idx):
            return read_event(event_idx, batch_meta_df, max_pulse_count, batch_df, train=True)

        # scan events in parallel
        with multiprocessing.Pool() as pool:
            events = pool.map(read_event_local, range(len(batch_meta_df)))

        # Write hdf5
        for event_id, pulse_count, event_x, event_y in events:
            # A fresh matrix per event: the previous code reused one buffer, so
            # every stored event aliased the same array and stale rows leaked
            # from longer events into shorter ones.
            padded_x = np.zeros((max_pulse_count, n_features), dtype="float16")
            padded_x[:, 2] = -1  # padding rows carry auxiliary = -1

            for column, field in enumerate(feature_fields):
                padded_x[:pulse_count, column] = event_x[field]

            # Features and labels go to their own groups under the same event id.
            batch_train.create_dataset(str(event_id), data=padded_x)
            batch_label.create_dataset(str(event_id), data=event_y)

        # Release the batch before loading the next one.
        del batch_meta_df, batch_df, events
        gc.collect()

        # save
        print(" DONE!")


Reading batch  54 ...  DONE!
Reading batch  55 ...  DONE!
Reading batch  56 ...  DONE!
Reading batch  57 ... 

KeyboardInterrupt: 