# Create TFRecord Input X for just one file

In [17]:
import matplotlib.pyplot as plt

In [18]:
#====================
# Load Utils ========
#====================

import numpy as np
import uproot as ur
import awkward as ak
import time as t
import os
print("Awkward version: "+str(ak.__version__))
print("Uproot version: "+str(ur.__version__))

Awkward version: 1.4.0
Uproot version: 4.0.11


In [19]:
#====================
# Metadata ==========
#====================
track_branches = ['trackEta_EMB1', 'trackPhi_EMB1', 'trackEta_EMB2', 'trackPhi_EMB2', 'trackEta_EMB3', 'trackPhi_EMB3',
                  'trackEta_TileBar0', 'trackPhi_TileBar0', 'trackEta_TileBar1', 'trackPhi_TileBar1',
                  'trackEta_TileBar2', 'trackPhi_TileBar2']

event_branches = ["cluster_nCells", "cluster_cell_ID", "cluster_cell_E", 'cluster_nCells', "nCluster", "eventNumber",
                  "nTrack", "nTruthPart", "truthPartPdgId", "cluster_Eta", "cluster_Phi", 'trackPt', 'trackP',
                  'trackMass', 'trackEta', 'trackPhi', 'truthPartE', 'cluster_ENG_CALIB_TOT', "cluster_E", 'truthPartPt']

ak_event_branches = ["cluster_nCells", "cluster_cell_ID", "cluster_cell_E", "cluster_nCells",
                  "nTruthPart", "truthPartPdgId", "cluster_Eta", "cluster_Phi", "trackPt", "trackP",
                  "trackMass", "trackEta", "trackPhi", "truthPartE", "cluster_ENG_CALIB_TOT", "cluster_E", "truthPartPt"]

np_event_branches = ["nCluster", "eventNumber", "nTrack", "nTruthPart"]

geo_branches = ["cell_geo_ID", "cell_geo_eta", "cell_geo_phi", "cell_geo_rPerp", "cell_geo_sampling"]


# The functions

In [20]:
def dict_from_tree(tree, branches=None, np_branches=None):
    ''' Loads branches as default awkward arrays and np_branches as numpy arrays. '''
    dictionary = dict()
    if branches is not None:
        for key in branches:
            branch = tree.arrays()[key]
            dictionary[key] = branch
            
    if np_branches is not None:
        for np_key in np_branches:
            np_branch = np.ndarray.flatten(tree.arrays()[np_key].to_numpy())
            dictionary[np_key] = np_branch
    
    if branches is None and np_branches is None:
        raise ValueError("No branches passed to function.")
        
    return dictionary

In [21]:
def find_max_dim_tuple(events, event_dict):
    nEvents = len(events)
    max_clust = 0
    
    for i in range(nEvents):
        event = events[i,0]
        track_nums = events[i,1]
        clust_nums = events[i,2]
        
        clust_num_total = 0
        # set this to six for now to handle single track events, change later
        track_num_total = 6
        
        # Check if there are clusters, None type object may be associated with it
        if clust_nums is not None:
            # Search through cluster indices
            for clst_idx in clust_nums:
                nInClust = len(event_dict['cluster_cell_ID'][event][clst_idx])
                # add the number in each cluster to the total
                clust_num_total += nInClust

        total_size = clust_num_total + track_num_total
        if total_size > max_clust:
            max_clust = total_size
    
    # 6 for energy, eta, phi, rperp, track flag, sample layer
    return (nEvents, max_clust, 6)

In [22]:
def find_index_1D(values, dictionary):
    ''' Use a for loop and a dictionary. values are the IDs to search for. dict must be in format 
    (cell IDs: index) '''
    idx_vec = np.zeros(len(values), dtype=np.int32)
    for i in range(len(values)):
        idx_vec[i] = dictionary[values[i]]
    return idx_vec

# The data files

In [23]:
#====================
# File setup ========
#====================
# user.angerami.24559744.OutputStream._000001.root
# Number of files
Nfile = 1
fileNames = []
file_prefix = 'user.angerami.24559744.OutputStream._000'
for i in range(1,Nfile+1):
    endstring = f'{i:03}'
    fileNames.append(file_prefix + endstring + '.root')


In [24]:
#====================
# Load Data Files ===
#====================

## GEOMETRY DICTIONARY ##
geo_file = ur.open('/fast_scratch/atlas_images/v01-45/cell_geo.root')
CellGeo_tree = geo_file["CellGeo"]
geo_dict = dict_from_tree(tree=CellGeo_tree, branches=None, np_branches=geo_branches)

# cell geometry data
cell_geo_ID = geo_dict['cell_geo_ID']
cell_ID_dict = dict(zip(cell_geo_ID, np.arange(len(cell_geo_ID))))

# for event dictionary
events_prefix = '/fast_scratch/atlas_images/v01-45/pipm/'

# Use this to compare with the dimensionality of new events
firstArray = True

# Loop over files: 

In [25]:
k = 1 # tally used to keep track of file number
tot_nEvts = 0 # used for keeping track of total number of events
max_nPoints = 0 # used for keeping track of the largest 'point cloud'
t_tot = 0 # total time

In [26]:
#for currFile in fileNames:
#    
#    # Check for file, a few are missing
#    if not os.path.isfile(events_prefix+currFile):
#        print()
#        print('File '+events_prefix+currFile+' not found..')
#        print()
##        k += 1
#        continue
#    
#    else:
#        print()
#        print('Working on File: '+str(currFile)+' - '+str(k)+'/'+str(Nfile))
#        k += 1

Just test one file

In [27]:
currFile = 'user.angerami.24559744.OutputStream._000001.root'

In [28]:
## EVENT DICTIONARY ##
event = ur.open(events_prefix+currFile)
event_tree = event["EventTree"]
event_dict = dict_from_tree(tree=event_tree, branches=ak_event_branches, np_branches=np_event_branches)

ak_event_branches -> Cluster, tracks, etcs. arrays per event

np_branches -> Event-level variables. one value per event

In [29]:
ak_event_branches

['cluster_nCells',
 'cluster_cell_ID',
 'cluster_cell_E',
 'cluster_nCells',
 'nTruthPart',
 'truthPartPdgId',
 'cluster_Eta',
 'cluster_Phi',
 'trackPt',
 'trackP',
 'trackMass',
 'trackEta',
 'trackPhi',
 'truthPartE',
 'cluster_ENG_CALIB_TOT',
 'cluster_E',
 'truthPartPt']

In [30]:
np_event_branches

['nCluster', 'eventNumber', 'nTrack', 'nTruthPart']

In [31]:
import collections
elements_count = collections.Counter(event_dict['nCluster'])
for key, value in elements_count.items():
    if key == 0:
       print(f"Events with {key} clusters: {value}")
print("Total number of events: ", len(event_dict['nCluster']))

Events with 0 clusters: 6597
Total number of events:  20000


# Event level cuts

In [32]:
#===================
# APPLY CUTS =======
#===================
# create ordered list of events to use for index slicing
nEvents = len(event_dict['eventNumber'])
all_events = np.arange(0,nEvents,1,dtype=np.int32) #array with event index

In [33]:
nCluster = event_dict['nCluster']

In [34]:
filtered_event_mask = nCluster != 0

In [35]:
filtered_event = all_events[filtered_event_mask]

In [36]:
print("* selected events: ", len(filtered_event), "/", len(all_events))

* selected events:  13403 / 20000


First event does not have clusters and was filtered out:

Second event have one cluster with 105 cells

# Loop over events to create an index array:

In [37]:
#============================================#
## CREATE INDEX ARRAY FOR  CLUSTERS ##
#============================================#
event_indices = []
t0 = t.time()

In [38]:

    for evt in filtered_event:
        # pull cluster number, don't need zero index as it's loaded as a np array
        nClust = event_dict["nCluster"][evt]
        cluster_idx = np.arange(nClust)

        ## Cluster properties
        clusterEs = event_dict["cluster_E"][evt].to_numpy()
        clus_phi = event_dict["cluster_Phi"][evt].to_numpy()
        clus_eta = event_dict["cluster_Eta"][evt].to_numpy()

        ## ENERGY SELECTION AND ETA SELECTION
        eta_mask = abs(clus_eta) < 0.7
        e_mask = clusterEs > 0.5
        selection = eta_mask & e_mask
        selected_clusters = cluster_idx[selection]

        ## CREATE LIST ##
        if np.count_nonzero(selection) > 0:
            event_indices.append((evt, 0, selected_clusters))
            if (evt % 1000==0):
                print("** Event ", evt, " has ", np.count_nonzero(selection), "/" ,nClust," selected clusters")

** Event  3000  has  1 / 2  selected clusters
** Event  5000  has  3 / 4  selected clusters
** Event  11000  has  3 / 3  selected clusters
** Event  13000  has  1 / 1  selected clusters


In [40]:
event_indices = np.array(event_indices, dtype=np.object_)

# Max dimension of X array:

In [44]:
    #=========================#
    ## DIMENSIONS OF X ARRAY ##
    #=========================#
    max_dims = find_max_dim_tuple(event_indices, event_dict)
    evt_tot = max_dims[0]
    tot_nEvts += max_dims[0]
    # keep track of the largest point cloud to use for saving later
    if max_dims[1] > max_nPoints:
        max_nPoints = max_dims[1]

    print('* Events with selected clusters: '+str(evt_tot))
    print('* Total number of cells: '+str(max_dims[1]))
    print('* Dim of largest point cloud: '+str(max_nPoints))

    # Create arrays
    Y_new = np.zeros((max_dims[0],3))
    X_new = np.zeros(max_dims)

* Events with selected clusters: 3722
* Total number of cells: 942
* Dim of largest point cloud: 942


# Fill the entires

In [45]:
for i in range(max_dims[0]):
        # pull all relevant indices
        evt = event_indices[i,0]
        track_idx = event_indices[i,1]
        # recall this now returns an array
        cluster_nums = event_indices[i,2]

        ##############
        ## CLUSTERS ##
        ##############
        # set up to have no clusters, further this with setting up the same thing for tracks
        target_ENG_CALIB_TOT = -1
        if cluster_nums is not None:

            # find averaged center of clusters
            cluster_Eta = event_dict['cluster_Eta'][evt].to_numpy()
            cluster_Phi = event_dict['cluster_Phi'][evt].to_numpy()
            av_Eta = np.mean(cluster_Eta)
            av_Phi = np.mean(cluster_Phi)

            nClust_current_total = 0
            target_ENG_CALIB_TOT = 0

            for c in cluster_nums:            
                # cluster data
                target_ENG_CALIB_TOT += event_dict['cluster_ENG_CALIB_TOT'][evt][c]
                cluster_cell_ID = event_dict['cluster_cell_ID'][evt][c].to_numpy()
                nInClust = len(cluster_cell_ID)
                cluster_cell_E = event_dict['cluster_cell_E'][evt][c].to_numpy()            
                cell_indices = find_index_1D(cluster_cell_ID, cell_ID_dict)

                cluster_cell_Eta = geo_dict['cell_geo_eta'][cell_indices]
                cluster_cell_Phi = geo_dict['cell_geo_phi'][cell_indices]
                cluster_cell_rPerp = geo_dict['cell_geo_rPerp'][cell_indices]
                cluster_cell_sampling = geo_dict['cell_geo_sampling'][cell_indices]

                # input all the data
                # note here we leave the fourth entry zeros (zero for flag!!!)
                low = nClust_current_total
                high = low + nInClust
                X_new[i,low:high,0] = np.log(cluster_cell_E)
                # Normalize to average cluster centers
                X_new[i,low:high,1] = cluster_cell_Eta - av_Eta #cluster_cell_Eta - event_dict['cluster_Eta'][evt][c]
                X_new[i,low:high,2] = cluster_cell_Phi - av_Phi #cluster_cell_Phi -event_dict['cluster_Phi'][evt][c]
                X_new[i,low:high,3] = cluster_cell_rPerp
                X_new[i,low:high,5] = cluster_cell_sampling * 0.1

                nClust_current_total += nInClust

In [47]:
X_new.shape

(3722, 942, 6)

In [49]:
#####################
## TARGET ENERGIES ##
#####################
# this is needed for energy regression
Y_new[i,0] = event_dict['truthPartE'][evt][0]
Y_new[i,1] = event_dict['truthPartPt'][evt][track_idx]
Y_new[i,2] = target_ENG_CALIB_TOT

# Create TFRecord

https://medium.com/mostly-ai/tensorflow-records-what-they-are-and-how-to-use-them-c46bc4bbb564

Use tf.train.Example

If your dataset consist of features, where each feature is a list of values of the same type

In [66]:
import tensorflow as tf

2021-07-26 08:34:03.065275: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


## Features

Order in X_new[:,:,i]: i= 0:energy, 1:eta, 2:phi, 3:rperp, 4:sampleID

- *tf.train.Feature* wraps a list of data of a specific type so Tensorflow can understand it. 


single attribute: union of bytes_list/float_list/int64_list.
the stored list can be of type tf.train.BytesList, tf.train.FloatList, or tf.train.Int64List 

In [89]:
def _floats_feature(value):
  return tf.train.Feature(float_list=tf.train.FloatList(value=value))

Features: Energy, $\eta$, $\phi$, sampleID

In [92]:
ene_arr_flat= ak.flatten(X_new[:,:,0].astype(np.float64))
eta_arr_flat= ak.flatten(X_new[:,:,1].astype(np.float64))
phi_arr_flat= ak.flatten(X_new[:,:,2].astype(np.float64))
sam_arr_flat= ak.flatten(X_new[:,:,4].astype(np.float64))

- *tf.train.Features* is a collection of named features.

Single attribute: feature that expects a dictionary where the key is the name of the features and the value a tf.train.Feature.

In [94]:
features_dic = {
    "ene": _floats_feature(ene_arr_flat),
    "eta": _floats_feature(eta_arr_flat),
    "phi": _floats_feature(phi_arr_flat),
    "sam": _floats_feature(sam_arr_flat),
}

In [95]:
classif_features = tf.train.Features(feature=features_dic)

- *tf.train.Example* stores features in a single attribute features of type tf.train.Features.


Note: 
- *tf.train.SequenceExample* In contrast to tf.train.Example, it does not store a list of bytes, floats or int64s, but a *list of lists* of bytes, floats or int64s

In [97]:
example = tf.train.Example(features=classif_features)

In [96]:
#Just for eta
example_eta = tf.train.Example(features=tf.train.Features(feature={"eta": _floats_feature(eta_arr_flat)}))
print(example)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



## Write

- *tf.python_io.TFRecordWriter* to write

Serialize and store the data

For tf2: tf.compat.v1.python_io.TFRecordWriter instead of tf.python_io.TFRecordWriter

In [100]:
with tf.compat.v1.python_io.TFRecordWriter('/data/atlas/dportill/X_1file.tfrecord') as writer:
  writer.write(example.SerializeToString())

Let's check the serialized example:

In [118]:
serialized_example = example.SerializeToString()

To decode the message use the tf.train.Example.FromString method.

In [119]:
example_proto = tf.train.Example.FromString(serialized_example)

In [None]:
example_proto

## Read

Initialize TFRecordDataset for the TFRecord file

In [130]:
#Initilizaing the TFRecordDataset for train and test TFRecord file
tfrecord_dataset=tf.data.TFRecordDataset('/data/atlas/dportill/X_1file.tfrecord')

1) Create the feature dictionary describing the features using tf.FixedLenFeature and tf.VarLenFeature. This should match with the feature names used while writing the data to the TFRecord file.

2) Extract the dictionary object using parse_single_example for each of the data records.

In [128]:
def read_tfrecord(serialized_example):
    feature_description={
        'ene':  tf.io.FixedLenFeature((), tf.float32),
        'eta':  tf.io.FixedLenFeature((), tf.float32),
        'phi':  tf.io.FixedLenFeature((), tf.float32),
        'sam':  tf.io.FixedLenFeature((), tf.float32),               
    }
    example= tf.io.parse_single_example(serialized_example, feature_description)
    
    return example

In [132]:
dataset = tfrecord_dataset.map(read_tfrecord)

## Create an input pipeline

Prefetch, shuffle create a batch of records from the training dataset but skip shuffling for the test dataset.
(haven't dive it yet into train/test)

In [137]:
dataset.prefetch(tf.data.experimental.AUTOTUNE)
dataset = dataset.shuffle(True)
dataset = dataset.batch(10)

# Initialise a Dataset


it seems like it might be possible to avoid the TFRecords and use the numpy files directly via tf.data.Dataset.from_generator ? That would let us use the Dataset formalism (and the shuffling etc. it allows) without worrying about TFRecords

tf.data.Dataset.from_generator ?

* Importing data from numpy:

In [143]:
dataset_X_new = tf.data.Dataset.from_tensor_slices(X_new)

In [1]:
dataset_X_new

NameError: name 'dataset_X_new' is not defined

* Importing data from generator:

In [147]:
sequence = X_new

In [148]:
def generator():
    for el in sequence:
        yield el

In [150]:
dataset_generator = tf.data.Dataset().batch(1).from_generator(generator)

TypeError: Can't instantiate abstract class DatasetV2 with abstract methods _inputs, element_spec

In [114]:
## Before TensorFlow 2:
"""
# Read and print data:
sess = tf.InteractiveSession()

# Read TFRecord file
reader = tf.TFRecordReader()
filename_queue = tf.train.string_input_producer(['/data/atlas/dportill/X_1file.tfrecord'])

_, serialized_example = reader.read(filename_queue)

# Define features

sequence_features = {
    'ene': tf.FixedLenSequenceFeature([], dtype=tf.float32),
}

# Extract features from serialized data
sequence_data = tf.parse_single_sequence_example(
    serialized=serialized_example, sequence_features=sequence_features)

# Many tf.train functions use tf.train.QueueRunner,
# so we need to start it before we read
tf.train.start_queue_runners(sess)


print('\nData')
for name, tensor in sequence_data.items():
    print('{}: {}'.format(name, tensor.eval()))
    
"""

"\n# Read and print data:\nsess = tf.InteractiveSession()\n\n# Read TFRecord file\nreader = tf.TFRecordReader()\nfilename_queue = tf.train.string_input_producer(['/data/atlas/dportill/X_1file.tfrecord'])\n\n_, serialized_example = reader.read(filename_queue)\n\n# Define features\n\nsequence_features = {\n    'ene': tf.FixedLenSequenceFeature([], dtype=tf.float32),\n}\n\n# Extract features from serialized data\nsequence_data = tf.parse_single_sequence_example(\n    serialized=serialized_example, sequence_features=sequence_features)\n\n# Many tf.train functions use tf.train.QueueRunner,\n# so we need to start it before we read\ntf.train.start_queue_runners(sess)\n\n\nprint('\nData')\nfor name, tensor in sequence_data.items():\n    print('{}: {}'.format(name, tensor.eval()))\n    \n"