# Parse JSON Data

In [None]:
# Input/output locations, expressed as relative path fragments that are
# concatenated as-is (each directory fragment carries its trailing slash).
PATH_PREFIX = '../'
DATA_DIR = 'InclusiveEvents_Iteration1/'
PARSED_DATA_DIR = 'InclusiveEvents_Iteration1_Parsed/'

# Template for the output file path; the remaining '{:04d}' placeholder is
# filled in later with a zero-padded file number.
PARSED_FILE_NAME = ''.join((PATH_PREFIX, PARSED_DATA_DIR, 'dataset_{:04d}.json'))

# Upper bound on the number of parser threads kept alive at once.
NUMBER_OF_THREADS = 20

In [None]:
from __future__ import division
import os
import bz2
import glob
import ujson
import threading

def ParseTracks(raw_data):
    """Extract the rescaled hit coordinates of every valid truth track.

    Args:
        raw_data: decoded JSON dict with an "Events" list. Each event is read
            for "MetaData" (pixel index counts of Layer0..Layer2),
            "RawHit"/"MVTXHits" (hits carrying an "ID" record) and
            "TruthHit"/"TruthTracks" (lists of hit sequence numbers).

    Returns:
        A list with one ``{'tracks': {...}}`` dict per event. The inner dict
        maps the 1-based position of a truth track in the event to the set of
        ``(half_layer_index, z, phi)`` tuples of its hits, where z and phi are
        pixel indices rescaled to a 1024 x 1024 grid. Tracks rejected by
        ``ValidTrack`` are omitted. An input without events yields ``[]``.

    Raises:
        KeyError: if a track references a hit sequence number that is not
            present in the event's MVTXHits, or an expected key is missing.
    """

    def ValidTrack(track, id_map, layer_threshold=3):
        """Return True if the track hits at least `layer_threshold` distinct
        (Layer, HalfLayer) slots."""
        # One flag per Layer * 2 + HalfLayer slot
        # (presumably 3 layers x 2 half-layers -- TODO confirm).
        valid = [0] * 6
        for hit_id in track:
            hit = id_map[hit_id]
            valid[hit["Layer"] * 2 + hit["HalfLayer"]] = 1
        return sum(valid) >= layer_threshold

    events = raw_data["Events"]
    if not events:
        # Nothing to parse; the metadata lookup below needs a first event.
        return []

    # Pixel index counts are taken from the first event's metadata and applied
    # to every event.
    meta = events[0]["MetaData"]
    ZIndex_max = meta["Layer0"]["PixelZIndex_Count"]
    phi_counts = [
        meta["Layer0"]["PixelPhiIndexInLayer_Count"],
        meta["Layer1"]["PixelPhiIndexInLayer_Count"],
        meta["Layer2"]["PixelPhiIndexInLayer_Count"],
    ]

    # Target grid size: a fixed 1024 bins rather than the native Layer0 phi
    # count. The ratios rely on true division (the file imports it from
    # __future__ for Python 2).
    Phi = 1024
    Z = 1024
    Phi_ratio = [Phi / count for count in phi_counts]
    Z_ratio = Z / ZIndex_max

    dataset = []
    for event in events:
        # Index the hit ID records by their sequence number within the event.
        id_map = {}
        for hit in event["RawHit"]["MVTXHits"]:
            id_map[hit["ID"]["HitSequenceInEvent"]] = hit["ID"]

        tracks = {}
        for i, track in enumerate(event["TruthHit"]["TruthTracks"]):
            hit_ids = track["HitSequenceInEvent"]
            if not ValidTrack(hit_ids, id_map, 3):
                continue
            for hit_id in hit_ids:
                hit = id_map[hit_id]
                hit_layer = hit["Layer"] * 2 + hit["HalfLayer"]
                layer_and_hit_coordinate = (
                    hit_layer,
                    int(hit["PixelZIndex"] * Z_ratio),
                    int(hit["PixelPhiIndexInLayer"] * Phi_ratio[hit["Layer"]]),
                )
                # Track keys are 1-based positions in the TruthTracks list.
                tracks.setdefault(i + 1, set()).add(layer_and_hit_coordinate)
        dataset.append({'tracks': tracks})
    return dataset

In [None]:
# Shared state for the worker threads: a lock that serialises console output
# and the list of threads that have been started but not yet joined.
lock = threading.Lock()
thread_pool = []

# Output files are numbered starting at 1, in sorted order of the inputs.
file_number = 1
data_dir = sorted(glob.glob('{}{}/*.bz2'.format(PATH_PREFIX, DATA_DIR)))

def parsing_thread(zip_file, file_number):
    """Parse one bz2-compressed JSON file and write the result as JSON.

    Args:
        zip_file: path of the .bz2 archive to read.
        file_number: integer substituted into PARSED_FILE_NAME to build the
            output path.

    The progress messages are printed while holding the module-level `lock`
    so that lines from concurrent workers do not interleave.
    """
    with lock:
        print('parsing data file {}'.format(os.path.basename(zip_file)))

    # The archive is binary data: it must be read in 'rb' mode so that
    # bz2.decompress receives the raw bytes unmodified.
    with open(zip_file, 'rb') as z:
        data = ParseTracks(ujson.loads(bz2.decompress(z.read())))

    file_name = PARSED_FILE_NAME.format(file_number)
    # The context manager closes the output file even if serialisation fails.
    with open(file_name, 'w') as f:
        ujson.dump(data, f)

    with lock:
        print('parsed file written to {}'.format(file_name))
    
# Start one worker per input archive while keeping at most NUMBER_OF_THREADS
# un-joined threads: when the pool is full, wait for the oldest worker first.
for zip_file in data_dir:
    if len(thread_pool) == NUMBER_OF_THREADS:
        thread_pool.pop(0).join()
    new_thread = threading.Thread(
        target=parsing_thread,
        args=(zip_file, file_number),
    )
    thread_pool.append(new_thread)
    new_thread.start()
    file_number = file_number + 1

# Block until every worker still in the pool has finished.
for thread in thread_pool:
    thread.join()