In [10]:
import ast
import numpy as np
import re
import glob 
import pickle

In [11]:
pattern_new_jet = r"""NEWJET: ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) ([-+]?\d*\.\d+([Ee][-+]?\d+)?|[-+]?\d+) (.*) (Z\:([01]))"""
pattern_jet_tree = r"""JETHISTORYTREE: (.*)"""
pattern_jet_tree_content = r"""JETHISTORYCONTENT: (.*)"""

def load_from_text(filename, take="first"):
    with open(filename, "r") as f:
        jets = []
        skip_rest = True
        
        for i, line in enumerate(f):
            if line.startswith("NEWEVENT"):
                skip_rest = False
            
            elif line.startswith("NEWJET"):
                jet = {}
                try:
                    m = re.match(pattern_new_jet, line).groups()
                    jet["root_id"] = int(m[0])
                    jet["radius"] = float(m[2])
                    jet["pt"] = float(m[4])
                    jet["eta"] = float(m[6])
                    jet["phi"] = float(m[8])
                    jet["energy"] = float(m[10])
                    jet["tau21"] = float(m[12])
                    jet["tau32"] = float(m[14])
                    jet["d2"] = float(m[16])
                    jet["z"] = int(m[20])
                except:
                    skip_rest = True
                
            elif line.startswith("JETHISTORYTREE"):
                if not skip_rest:
                    m = re.match(pattern_jet_tree, line).groups()
                    jet["tree"] = ast.literal_eval(m[0])
                
            elif line.startswith("JETHISTORYCONTENT"):
                if not skip_rest:
                    m = re.match(pattern_jet_tree_content, line).groups()
                    jet["content"] = ast.literal_eval(m[0])
                
                    if take == "first":
                        jets.append(jet)
                        skip_rest = True
                        
                    elif take == "all":
                        jets.append(jet)
                        
                    elif take == "z" and jet["z"] == 1:
                        jets.append(jet)
                        skip_rest = True
                
    for jet in jets:
        tree = np.array([jet["tree"][n] for n in sorted(jet["tree"])], dtype=np.int)
        tree[tree == -2] = -1
        content = np.array([jet["content"][n][1:5] for n in sorted(jet["content"])], dtype=np.float)
        jet["tree"] = tree
        jet["content"] = content
    
    return jets

In [14]:
background = []

for f in glob.glob("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/*.dat"):
    print(f)
    background.extend(load_from_text(f, take="first"))
    
fd = open("../data/z/kt-background.pickle", "wb")

for jet in background:
    pickle.dump(jet, fd, protocol=pickle.HIGHEST_PROTOCOL)

fd.close()

/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_43.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_37.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_40.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_15.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_29.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_SinglePhoton_noPtSlice_CVetoBVeto_KT_2.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/background/MC15.361039.Sherpa_CT10_Si

In [16]:
signal = []

for f in glob.glob("/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/*.dat"):
    print(f)
    signal.extend(load_from_text(f, take="z"))
    
fd = open("../data/z/kt-mH700.pickle", "wb")

for jet in signal:
    pickle.dump(jet, fd, protocol=pickle.HIGHEST_PROTOCOL)

fd.close()

/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_17.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_50.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_44.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_12.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_6.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8EvtGen_CT10_AZNLOCTEQ6L1_ggH700_Zqqgam_KT_25.dat
/home/gilles/gdrive/research/sandbox/learning-qcd-rnn/data/z/large/KT/mH700/MC15.343581.PowhegPythia8