# Preprocessing

In [1]:
import os

import dask.array as da
import h5py
import numpy as np

Input files and datasets:

In [2]:
# Open the signal and background track files read-only and pick the "track"
# dataset from each. The handles are closed explicitly at the end of the
# notebook.
sig_path = "sig1P_track_shuffle.h5"
bkg_path = "bkg1P_track_shuffle.h5"
track_key = "track"

sigf = h5py.File(sig_path, mode="r")
bkgf = h5py.File(bkg_path, mode="r")
sig_ds, bkg_ds = sigf[track_key], bkgf[track_key]

In [3]:
# Wrap each HDF5 dataset in a dask array, reusing the dataset's own chunk
# shape as the dask chunking.
sig, bkg = (
    da.from_array(dataset, chunks=dataset.chunks)
    for dataset in (sig_ds, bkg_ds)
)

Slimmed list of variables:

In [4]:
# Names of the record fields that are kept when the samples are slimmed.
# The order here is the order of the variables in the converted array.
invars = [
    "TauTracks.qOverP",
    "TauTracks.z0sinThetaTJVA",
    "TauTracks.d0",
    "TauTracks.dRJetSeedAxis",
    "TauTracks.rConvII",
    "TauTracks.nInnermostPixelHits",
    "TauTracks.nPixelHits",
    "TauTracks.nSiHits",
    "TauTracks.eProbabilityHT",
]

Fraction of each sample to use in total, and the share of that subset held out for validation:

In [5]:
# Tunable settings: share of the smaller sample to use, share of that subset
# reserved for validation, and number of leading tracks kept per entry.
total_frac = 0.2
val_frac = 0.25
n_tracks = 10

# Both samples are cut to the same length, so size everything from the
# smaller of the two.
n_common = min(len(sig), len(bkg))
idx = int(total_frac * n_common)

# Rows [0, val_split) are used for training, rows [val_split, idx) for validation.
val_split = int((1.0 - val_frac) * idx)

print("[0:{}] for RNN-training".format(val_split))
print("[{}:{}] for validation".format(val_split, idx))

[0:988859] for RNN-training
[988859:1318479] for validation


## Loading, slimming, splitting, merging

In [6]:
# Fraction of entire dataset: the first `idx` entries along axis 0 and the
# first `n_tracks` entries along axis 1 of each sample.
head = (slice(None, idx), slice(None, n_tracks))
sig_split, bkg_split = sig[head], bkg[head]

In [7]:
# Slim variables: keep only the record fields listed in `invars`.
sig_split, bkg_split = (sample[invars] for sample in (sig_split, bkg_split))

In [8]:
def _fields_to_array(records, names):
    """Stack the named fields of a structured dask array into a float32 array.

    Each field is selected on its own and the results are stacked along a new
    trailing axis, so the output has shape ``records.shape + (len(names),)``
    with the variables in the order given by ``names``.

    This replaces ``records.view(np.float32).reshape(...)``, which depends on
    how the selected fields are laid out in memory. Newer NumPy releases
    return multi-field selections as views that keep the original record size
    and padding, so a raw reinterpretation would also pick up bytes belonging
    to fields that were dropped.
    """
    columns = [records[name] for name in names]
    stacked = da.stack(columns, axis=-1).astype(np.float32)
    # Keep all variables of a track together in one chunk rather than one
    # chunk per variable.
    return stacked.rechunk({stacked.ndim - 1: len(names)})


# Convert structured array to ndarray
sig_split = _fields_to_array(sig_split, invars)
bkg_split = _fields_to_array(bkg_split, invars)

In [9]:
# Set tracks with all zeros to nan, for signal and background alike
for tracks in (sig_split, bkg_split):
    is_zero = tracks == 0

    # True where every variable (axis 2) of a single track is zero
    track_is_empty = da.all(is_zero, axis=2, keepdims=True)

    # Expand the per-track flag back over the variable axis
    fill_mask = da.broadcast_to(track_is_empty, tracks.shape)

    # Overwrite the flagged entries in place
    tracks[fill_mask] = np.nan

In [10]:
# Row ranges: [0, val_split) for training, [val_split, idx) for validation
train_rows = slice(None, val_split)
val_rows = slice(val_split, idx)

sig_train, bkg_train = sig_split[train_rows], bkg_split[train_rows]
sig_val, bkg_val = sig_split[val_rows], bkg_split[val_rows]

# Signal rows first, background rows after, joined along axis 0
train = da.vstack((sig_train, bkg_train))
validation = da.vstack((sig_val, bkg_val))

# Labels in the same order as the rows: ones for signal, zeros for background
label_opts = dict(dtype=np.float32, chunks=1000000)
n_val = idx - val_split

train_label = da.concatenate(
    (da.ones(val_split, **label_opts), da.zeros(val_split, **label_opts))
)
validation_label = da.concatenate(
    (da.ones(n_val, **label_opts), da.zeros(n_val, **label_opts))
)

## Subtract mean, divide by stddev

In [11]:
# Statistics are taken over axis 0 of the training sample only, skipping NaN
# entries; the same values are then applied to the validation sample.
offset = da.nanmean(train, axis=0)
scale = da.nanstd(train, axis=0)

train = (train - offset) / scale
validation = (validation - offset) / scale

## Save array and auxiliary info

In [13]:
# Path of the HDF5 file that the following cells write the arrays and
# auxiliary datasets to.
outf = "split1P.h5"

In [14]:
# Check if exists
# The metadata cell further down opens `outf` in append mode and creates its
# datasets unconditionally, so a file left over from an earlier run makes the
# notebook fail only after the arrays have been written. Stop here instead.
if os.path.exists(outf):
    raise IOError(
        "Output file '{}' already exists; remove or rename it before "
        "re-running the notebook.".format(outf)
    )

In [15]:
# Arrays to store, keyed by their dataset path inside the output file
outputs = {
    "/train": train,
    "/train_label": train_label,
    "/validation": validation,
    "/validation_label": validation_label,
    "/offset": offset,
    "/scale": scale,
}

# Write everything in one call, gzip-compressed at level 9
da.to_hdf5(outf, outputs, compression="gzip", compression_opts=9)


This code may break in numpy 1.13 because this will return a view instead of a copy -- see release notes for details.
  return function(*args2, **kwargs)


In [16]:
# Auxiliary datasets stored next to the arrays: the variable names (as byte
# strings) and the two split boundaries [val_split, idx].
metadata = (
    ("/variables", np.array(invars, dtype="S")),
    ("/splits", np.array([val_split, idx], dtype=np.int32)),
)

with h5py.File(outf, "a") as f:
    for name, data in metadata:
        # create_dataset raises if the name is already present, which would
        # make this cell fail when it is run a second time on the same file.
        if name in f:
            del f[name]
        f.create_dataset(name, data=data)

In [17]:
# Release both input file handles now that everything has been written
for handle in (sigf, bkgf):
    handle.close()