In [1]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import uproot
import glob

import sklearn
from sklearn import model_selection

In [2]:
! ls ../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles

ls: ../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles: No such file or directory


In [3]:
! ls ../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/240-250/parquet

ls: ../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/240-250/parquet: No such file or directory


In [4]:
pd.read_parquet(../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/240-250/parquet)

SyntaxError: invalid syntax (4258556986.py, line 1)

In [None]:
# for dir_ in os.listdir("../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/"):
#     if ".parquet" in dir_:
#         continue
            
#     print(f"../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/{dir_}/parquet")
    
#     a = pd.read_parquet(f"../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/{dir_}/parquet/")
    
#     a.to_parquet(f"../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/{dir_}.parquet")

# Postprocessing

In [20]:
def rename_vars(df):
    
    df = df.rename(columns={
        "mt_lep_met": "lep_met_mt", 
        "lep_dR_fj": "lep_fj_dr",
    })
    return df

# we must add the "fj_isVBF" & "fj_isggF" labels that we forgot to include in the processor
def postprocess(df, sample):
    if "HToWW" in sample:        
        if "VBF" in sample:
            df["fj_isggF"] = 0
            df["fj_isVBF"] = 1
        elif "GluGluHToWW" in sample:
            df["fj_isggF"] = 1
            df["fj_isVBF"] = 0      
    else:
        df["fj_isggF"] = 0
        df["fj_isVBF"] = 0  

    df = rename_vars(df)
    return df

In [21]:
OUTPATH = "../datafiles/TaggerInputFeb9_2017/"

for sample in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in sample:
        continue    
    if "run_skimmer" in sample:
        continue    
    if "inputprocessor" in sample:
        continue            

    print(f"Processing {sample}")

    outdir = f"{OUTPATH}/{sample}/outfiles"        
    df = pd.read_parquet(glob.glob(f"{outdir}/*.parquet"))
    df = postprocess(df, sample)

    df_train, df_test = sklearn.model_selection.train_test_split(df, test_size=0.4)

    os.system(f"mkdir -p {outdir}/train")
    with uproot.recreate(f"{outdir}/train/out.root", compression=uproot.LZ4(4)) as rfile:
        rfile["Events"] = ak.Array(df_train.to_dict(orient="list", index=True))

    os.system(f"mkdir -p {outdir}/test")            
    with uproot.recreate(f"{outdir}/test/out.root", compression=uproot.LZ4(4)) as rfile:
        rfile["Events"] = ak.Array(df_test.to_dict(orient="list", index=True))

    print("--------------------------")    

Processing GluGluHToWW_Pt-200ToInf_M-125
--------------------------


In [22]:
! ls ../datafiles/TaggerInputFeb9_2017/GluGluHToWW_Pt-200ToInf_M-125/outfiles/

0-10.parquet      1540-1550.parquet 210-220.parquet   480-490.parquet
10-20.parquet     1550-1560.parquet 2100-2110.parquet 490-500.parquet
100-110.parquet   1560-1570.parquet 2110-2120.parquet 50-60.parquet
1000-1010.parquet 1570-1580.parquet 2120-2130.parquet 500-510.parquet
1010-1020.parquet 1580-1590.parquet 2130-2140.parquet 510-520.parquet
1020-1030.parquet 1590-1600.parquet 2140-2150.parquet 520-530.parquet
1030-1040.parquet 160-170.parquet   2150-2160.parquet 530-540.parquet
1040-1050.parquet 1600-1610.parquet 2160-2170.parquet 540-550.parquet
1050-1060.parquet 1610-1620.parquet 2170-2180.parquet 550-560.parquet
1060-1070.parquet 1620-1630.parquet 2180-2190.parquet 560-570.parquet
1070-1080.parquet 1630-1640.parquet 2190-2200.parquet 570-580.parquet
1080-1090.parquet 1640-1650.parquet 220-230.parquet   580-590.parquet
1090-1100.parquet 1650-1660.parquet 2200-2210.parquet 590-600.parquet
110-120.parquet   1660-1670.parquet 2210-2220.parquet 60-70.parquet
1100-1110.

# Inspecting NaNs

In [23]:
OUTPATH = "../datafiles/TaggerInputFeb9_2017/"     

for sample in os.listdir(f"{OUTPATH}"):

    if "DS_Store" in sample:
        continue    
    if "run_skimmer" in sample:
        continue    
    if "inputprocessor" in sample:
        continue

    for file in os.listdir(f"{OUTPATH}/{sample}/outfiles//train/"):
        if "root" not in file:
            continue
        print(f"Inspecting file {OUTPATH}/{sample}/outfiles/train/{file}")
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/train/{file}")["Events"]

        for var in events.keys():
            nans = (np.isnan(events[var].array().to_numpy())).sum()
            if nans != 0:
                print(file, nans, f"nan {var} values")


    for file in os.listdir(f"{OUTPATH}/{sample}/outfiles/test/"):
        if "root" not in file:
            continue
        print(f"Inspecting file {OUTPATH}/{sample}/outfiles/test/{file}")                
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/test/{file}")["Events"]

        for var in events.keys():
            nans = (np.isnan(events[var].array().to_numpy())).sum()
            if nans != 0:
                print(file, nans, f"nan {var} values")

    print("--------------------------")

Inspecting file ../datafiles/TaggerInputFeb9_2017//GluGluHToWW_Pt-200ToInf_M-125/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInputFeb9_2017//GluGluHToWW_Pt-200ToInf_M-125/outfiles/test/out.root
--------------------------


# Check Number of events in ntuples

In [24]:
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
]


OUTPATH = "../datafiles/TaggerInputFeb9_2017/"     

num_events = {}


for sample in samples:

    num = 0    
    try:
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/train/out.root")["Events"]
        num += events.num_entries
    except:
        continue
    print(sample, "train", num)


    num = 0    

    try:
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/test/out.root")["Events"]
        num += events.num_entries
    except:
        continue
    print(sample, "test", num)
    
    print("--------------------------")

GluGluHToWW_Pt-200ToInf_M-125 train 302203
GluGluHToWW_Pt-200ToInf_M-125 test 201470
--------------------------


# Check Number of events after selection

In [34]:
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
]

OUTPATH = "../datafiles/TaggerInput/TaggerInputFeb9_2017"

num_events = {}


for sample in samples:

    num = 0    

    try:
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/train/out.root")["Events"]
        ### put selection below
#         selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8) & (events["n_bjets_T"].array()==0)
#         selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)


        selection2 = events["fj_pt"].array()>190
        num += ak.sum(selection2)

    except:
        continue
    print(sample, "train", num)


    num = 0    

    try:
        events = uproot.open(f"{OUTPATH}/{sample}/outfiles/test/out.root")["Events"]
        ### put selection below
        selection1 = (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8) & (events["n_bjets_T"].array()==0)
        selection2 = ( (events["fj_genRes_mass"].array()==125) & (events["lep_fj_dr"].array()>0.03) & (events["lep_fj_dr"].array()<0.8) & (events["fj_isHVV_Matched"].array()==1) & (events["fj_lepinprongs"].array()==1) & (events["fj_nquarks"].array()==2) & ((events["fj_isHVV_elenuqq"].array()==1) | (events["fj_isHVV_munuqq"].array()==1)) ) | (events["fj_genRes_mass"].array()!=125)

        num += ak.sum(selection1 & selection2)
    except:
        continue
    print(sample, "test", num)
    
    print("--------------------------")        

GluGluHToWW_Pt-200ToInf_M-125 train 302203
GluGluHToWW_Pt-200ToInf_M-125 test 35245
--------------------------
