In [14]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import uproot
import glob

import sklearn
from sklearn import model_selection

In [17]:
! ls ../datafiles/TaggerInput

[34mTaggerInputFeb9_2017[m[m [34mTaggerInput_2016APV[m[m  [34mTaggerInput_2018[m[m
[34mTaggerInput_2016[m[m     [34mTaggerInput_2017[m[m     [34mremove[m[m


# Postprocessing

In [3]:
def rename_vars(df):
    
    df = df.rename(columns={
        "mt_lep_met": "lep_met_mt", 
        "lep_dR_fj": "lep_fj_dr",
    })
    return df

# we must add the "fj_isVBF" & "fj_isggF" labels that we forgot to include in the processor
def postprocess(df, sample):
    if "HToWW" in sample:        
        if "VBF" in sample:
            df["fj_isggF"] = 0
            df["fj_isVBF"] = 1
        elif "GluGluHToWW" in sample:
            df["fj_isggF"] = 1
            df["fj_isVBF"] = 0      
    else:
        df["fj_isggF"] = 0
        df["fj_isVBF"] = 0  

    df = rename_vars(df)
    return df

In [13]:
! ls ../datafiles/new/

In [4]:
OUTPATH = "../datafiles/new/"

for sample in os.listdir(OUTPATH):
    if "DS_Store" in sample:
        continue
            
    print(f"Processing {sample}")

    outdir = f"{OUTPATH}/{sample}/outfiles/"        
    df = pd.read_parquet(f"{outdir}/out.parquet")
    df.drop(columns=["fj_genH_pt"], inplace=True)

    df_train, df_test = sklearn.model_selection.train_test_split(df, test_size=0.4)

    print("making train directory")
    os.system(f"mkdir -p {outdir}/train")
    with uproot.recreate(f"{outdir}/train/out.root", compression=uproot.LZ4(4)) as rfile:
        rfile["Events"] = ak.Array(df_train.to_dict(orient="list", index=True))

    print("making test directory")        
    os.system(f"mkdir -p {outdir}/test")            
    with uproot.recreate(f"{outdir}/test/out.root", compression=uproot.LZ4(4)) as rfile:
        rfile["Events"] = ak.Array(df_test.to_dict(orient="list", index=True))

    print("--------------------------") 

Processing WJetsToLNu_2J
making train directory
making test directory
--------------------------
Processing WJetsToLNu_1J
making train directory
making test directory
--------------------------
Processing JHUVariableWMass_part3
making train directory
making test directory
--------------------------
Processing JHUVariableWMass_part2
making train directory
making test directory
--------------------------


In [None]:
for key in df:
    print(key)

In [13]:
df.drop(columns=["fj_genH_jet"], inplace=True)

In [14]:
df["fj_genH_jet"]

KeyError: 'fj_genH_jet'

In [5]:
OUTPATH = "../datafiles/TaggerInput/"

for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in dir_TaggerInput:
        continue    

    for sample in os.listdir(f"{OUTPATH}/{dir_TaggerInput}"):
        if "DS_Store" in sample:
            continue    
        if "run_skimmer" in sample:
            continue    
        if "inputprocessor" in sample:
            continue            

        print(f"Processing {sample} from {dir_TaggerInput}")
            
        outdir = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles"        
        df = pd.read_parquet(glob.glob(f"{outdir}/*.parquet"))
        df = postprocess(df, sample)

        df_train, df_test = sklearn.model_selection.train_test_split(df, test_size=0.4)

        os.system(f"mkdir -p {outdir}/train")
        with uproot.recreate(f"{outdir}/train/out.root", compression=uproot.LZ4(4)) as rfile:
            rfile["Events"] = ak.Array(df_train.to_dict(orient="list", index=True))

        os.system(f"mkdir -p {outdir}/test")            
        with uproot.recreate(f"{outdir}/test/out.root", compression=uproot.LZ4(4)) as rfile:
            rfile["Events"] = ak.Array(df_test.to_dict(orient="list", index=True))

        print("--------------------------")    

Processing TTToSemiLeptonic from TaggerInput_2018


NameError: name 'postprocess' is not defined

# Convert parquets to root

In [125]:
OUTPATH = "../datafiles/ntuples/"

for sample in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in sample:
        continue    
    if "run_skimmer" in sample:
        continue    
    if "inputprocessor" in sample:
        continue            

    for year in os.listdir(f"{OUTPATH}/{sample}/"):
        
        if "DS_Store" in year:
            continue
        
        # train dataset
        outdir = f"{OUTPATH}/{sample}/{year}/train"
        df = pd.read_parquet(glob.glob(f"{outdir}/*.parquet"))

        with uproot.recreate(f"{outdir}/out.root", compression=uproot.LZ4(4)) as rfile:
            rfile["Events"] = ak.Array(df.to_dict(orient="list", index=True))

        # test dataset
        outdir = f"{OUTPATH}/{sample}/{year}/test"
        df = pd.read_parquet(glob.glob(f"{outdir}/*.parquet"))

        with uproot.recreate(f"{outdir}/out.root", compression=uproot.LZ4(4)) as rfile:
            rfile["Events"] = ak.Array(df.to_dict(orient="list", index=True))
                             
        print("--------------------------")    

--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------


In [197]:
for key in df:
    print(key)

fj_eta
fj_phi
fj_mass
fj_pt
fj_msoftdrop
fj_lsf3
fj_genjetmass
fj_genRes_pt
fj_genRes_eta
fj_genRes_phi
fj_genRes_mass
fj_genH_pt
fj_genH_jet
fj_genV_dR
fj_genVstar
genV_genVstar_dR
fj_isHVV
fj_isHVV_Matched
fj_isHVV_4q
fj_isHVV_elenuqq
fj_isHVV_munuqq
fj_isHVV_taunuqq
fj_isHVV_Vlepton
fj_isHVV_Vstarlepton
fj_nquarks
fj_lepinprongs
fj_isV
fj_isV_Matched
fj_isV_2q
fj_isV_elenu
fj_isV_munu
fj_isV_taunu
fj_nprongs
fj_ncquarks
fj_isV_lep
fj_isTop
fj_isTop_Matched
fj_Top_numMatched
fj_isTop_W_lep_b
fj_isTop_W_lep
fj_isTop_W_ele_b
fj_isTop_W_ele
fj_isTop_W_mu_b
fj_isTop_W_mu
fj_isTop_W_tau_b
fj_isTop_W_tau
fj_Top_nquarksnob
fj_Top_nbquarks
fj_Top_ncquarks
fj_Top_nleptons
fj_Top_nele
fj_Top_nmu
fj_Top_ntau
fj_Top_taudecay
fj_isQCD
fj_isQCD_Matched
fj_isQCDb
fj_isQCDbb
fj_isQCDc
fj_isQCDcc
fj_isQCDothers
met_pt
met_relpt
met_fj_dphi
abs_met_fj_dphi
mt_lep_met
lep_dR_fj
lep_pt
lep_pt_ratio
lep_reliso
lep_miso
n_bjets_L
n_bjets_M
n_bjets_T
rec_W_lnu_pt
rec_W_lnu_m
rec_W_qq_pt
rec_W_qq_m
rec_higg

# Inspecting NaNs

In [9]:
OUTPATH = "../datafiles/TaggerInput/"

for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in dir_TaggerInput:
        continue
    if "remove" in dir_TaggerInput:
        continue        

    for sample in os.listdir(f"{OUTPATH}/{dir_TaggerInput}"):

        if "DS_Store" in sample:
            continue    
        if "run_skimmer" in sample:
            continue    
        if "inputprocessor" in sample:
            continue

        for file in os.listdir(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles//train/"):
            if "root" not in file:
                continue
            print(f"Inspecting file {OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/train/{file}")
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/train/{file}")["Events"]

            for var in events.keys():
                nans = (np.isnan(events[var].array().to_numpy())).sum()
                if nans != 0:
                    print(file, nans, f"nan {var} values")


        for file in os.listdir(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/"):
            if "root" not in file:
                continue
            print(f"Inspecting file {OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/{file}")                
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/{file}")["Events"]

            for var in events.keys():
                nans = (np.isnan(events[var].array().to_numpy())).sum()
                if nans != 0:
                    print(file, nans, f"nan {var} values")

        print("--------------------------")

Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/TTToSemiLeptonic/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/TTToSemiLeptonic/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_2J/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_2J/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_1J/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_1J/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/QCD_Pt_600to800/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/QCD_Pt_600to800/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/QCD_Pt_300to470/outfiles/train/out.root
Inspecting 

In [12]:
! ls ../datafiles/TaggerInput/TaggerInput_2017

[34mJHUVariableWMass_part1[m[m [34mJHUVariableWMass_part2[m[m [34mJHUVariableWMass_part3[m[m


# Check Number of events in ntuples

In [9]:
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
#     "VBFHToWWToLNuQQ_M-125_withDipoleRecoil",
#     "JHUVariableWMass_part1",
#     "JHUVariableWMass_part2",
#     "JHUVariableWMass_part3",    
#     "TTToSemiLeptonic",
#     "WJetsToLNu_1J",
#     "WJetsToLNu_2J",    
#     "WJetsToLNu_HT-200To400",
#     "WJetsToLNu_HT-400To600",
#     "WJetsToLNu_HT-600To800",
#     "QCD_Pt_170to300",
#     "QCD_Pt_300to470",
#     "QCD_Pt_470to600",
#     "QCD_Pt_600to800",
]


OUTPATH = "../datafiles/TaggerInput/"

num_events = {}


for sample in samples:

    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue    
        if "remove" in dir_TaggerInput:
            continue            
            
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/train/out.root")["Events"]
            num += events.num_entries
        except:
            continue
    print(sample, "train", num)


    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue    
            
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/out.root")["Events"]
            num += events.num_entries
        except:
            continue
    print(sample, "test", num)
    
    print("--------------------------")

GluGluHToWW_Pt-200ToInf_M-125 train 58345
GluGluHToWW_Pt-200ToInf_M-125 test 38898
--------------------------


# Check Number of events after selection

In [13]:
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",    
#     "VBFHToWWToLNuQQ_M-125_withDipoleRecoil",
#     "JHUVariableWMass_part1",
#     "JHUVariableWMass_part2",
#     "JHUVariableWMass_part3",    
    "TTToSemiLeptonic",
#     "WJetsToLNu_1J",
#     "WJetsToLNu_2J",    
    "WJetsToLNu_HT-200To400",
    "WJetsToLNu_HT-400To600",
    "WJetsToLNu_HT-600To800",
    "QCD_Pt_170to300",
    "QCD_Pt_300to470",
    "QCD_Pt_470to600",
    "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput/"

num_events = {}


for sample in samples:

    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue 
            
        if "remove" in dir_TaggerInput:
            continue             
                        
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/train/out.root")["Events"]
            ### put selection below
            selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8)
            selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)
                    
            num += ak.sum(selection1 & selection2)

        except:
#             print("no file found for", f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles")
            continue
    print(sample, "train", num)


    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue   
        if "remove" in dir_TaggerInput:
            continue                         
            
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/out.root")["Events"]
            ### put selection below
            selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8)
            selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)
            
            num += ak.sum(selection1 & selection2)
        except:
            continue
    print(sample, "test", num)
    
    print("--------------------------")        

GluGluHToWW_Pt-200ToInf_M-125 train 56005
GluGluHToWW_Pt-200ToInf_M-125 test 37342
--------------------------
TTToSemiLeptonic train 794799
TTToSemiLeptonic test 529868
--------------------------
WJetsToLNu_HT-200To400 train 130212
WJetsToLNu_HT-200To400 test 86852
--------------------------
WJetsToLNu_HT-400To600 train 393385
WJetsToLNu_HT-400To600 test 262543
--------------------------
WJetsToLNu_HT-600To800 train 296645
WJetsToLNu_HT-600To800 test 197694
--------------------------
QCD_Pt_170to300 train 257404
QCD_Pt_170to300 test 171220
--------------------------
QCD_Pt_300to470 train 301000
QCD_Pt_300to470 test 200713
--------------------------
QCD_Pt_470to600 train 532099
QCD_Pt_470to600 test 354596
--------------------------
QCD_Pt_600to800 train 94978
QCD_Pt_600to800 test 63220
--------------------------


In [45]:
! ls  "../datafiles/TaggerInput/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/train"

out.root


In [52]:
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",    
#     "VBFHToWWToLNuQQ_M-125_withDipoleRecoil",
#     "JHUVariableWMass_part1",
#     "JHUVariableWMass_part2",
#     "JHUVariableWMass_part3",    
    "TTToSemiLeptonic",
#     "WJetsToLNu_1J",
#     "WJetsToLNu_2J",    
    "WJetsToLNu_HT-200To400",
    "WJetsToLNu_HT-400To600",
    "WJetsToLNu_HT-600To800",
    "QCD_Pt_170to300",
    "QCD_Pt_300to470",
    "QCD_Pt_470to600",
    "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput/"

num_events = {}


for sample in samples:

    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue 
            
        if "rm_" in dir_TaggerInput:
            continue             
            
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/train/out.root")["Events"]
            ### put selection below
            selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8)
            selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)
                    
            num += ak.sum(selection1 & selection2)

        except:
            continue
    print(sample, "train", num)


    num = 0    
    for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):   
        if "DS_Store" in dir_TaggerInput:
            continue   
        if "rm_" in dir_TaggerInput:
            continue                         
            
        try:
            events = uproot.open(f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/test/out.root")["Events"]
            ### put selection below
            selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8)
            selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)
            
            num += ak.sum(selection1 & selection2)
        except:
            continue
    print(sample, "test", num)
    
    print("--------------------------")        

GluGluHToWW_Pt-200ToInf_M-125 train 56005
GluGluHToWW_Pt-200ToInf_M-125 test 37342
--------------------------
TTToSemiLeptonic train 794799
TTToSemiLeptonic test 529868
--------------------------
WJetsToLNu_HT-200To400 train 130212
WJetsToLNu_HT-200To400 test 86852
--------------------------
WJetsToLNu_HT-400To600 train 393385
WJetsToLNu_HT-400To600 test 262543
--------------------------
WJetsToLNu_HT-600To800 train 296645
WJetsToLNu_HT-600To800 test 197694
--------------------------
QCD_Pt_170to300 train 257404
QCD_Pt_170to300 test 171220
--------------------------
QCD_Pt_300to470 train 301000
QCD_Pt_300to470 test 200713
--------------------------
QCD_Pt_470to600 train 532099
QCD_Pt_470to600 test 354596
--------------------------
QCD_Pt_600to800 train 94978
QCD_Pt_600to800 test 63220
--------------------------
