In [135]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

import uproot
import glob

import sklearn
from sklearn import model_selection

In [195]:
# Shell escape: list the contents of every entry under ../datafiles/TaggerInput/
# to see which sample folders exist in each one.
! ls ../datafiles/TaggerInput/*

../datafiles/TaggerInput/TaggerInput_2016:
GluGluHToWW_Pt-200ToInf_M-125          VBFHToWWToLNuQQ_M-125_withDipoleRecoil

../datafiles/TaggerInput/TaggerInput_2016APV:
GluGluHToWW_Pt-200ToInf_M-125          VBFHToWWToLNuQQ_M-125_withDipoleRecoil

../datafiles/TaggerInput/TaggerInput_2017:
GluGluHToWW_Pt-200ToInf_M-125          VBFHToWWToLNuQQ_M-125_withDipoleRecoil

../datafiles/TaggerInput/TaggerInput_2018:
GluGluHToWW_Pt-200ToInf_M-125          TTToSemiLeptonic
QCD_Pt_170to300                        VBFHToWWToLNuQQ_M-125_withDipoleRecoil
QCD_Pt_300to470                        WJetsToLNu_HT-200To400
QCD_Pt_470to600                        WJetsToLNu_HT-400To600
QCD_Pt_600to800                        WJetsToLNu_HT-600To800


# Postprocessing

In [137]:
# we must add the "fj_isVBF" & "fj_isggF" labels that we forgot to include in the processor
def postprocess(df, sample):
    """Add the ``fj_isggF`` and ``fj_isVBF`` label columns to ``df``.

    The labels are derived from the sample name only:

    * name contains "HToWW" and "VBF"                  -> fj_isVBF = 1, fj_isggF = 0
    * name contains "HToWW" and "GluGluHToWW" (no VBF) -> fj_isggF = 1, fj_isVBF = 0
    * anything else                                    -> both columns are 0

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label. It is modified in place: the two columns are added
        (or overwritten if they already exist).
    sample : str
        Sample name, e.g. "GluGluHToWW_Pt-200ToInf_M-125".

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now carrying both columns.
    """
    is_hww = "HToWW" in sample
    is_vbf = is_hww and "VBF" in sample
    is_ggf = is_hww and not is_vbf and "GluGluHToWW" in sample

    # Always write both columns. Previously an "HToWW" sample that was neither
    # VBF nor GluGlu fell through without any label column, so the frames
    # produced for different samples did not share the same set of columns.
    df["fj_isggF"] = int(is_ggf)
    df["fj_isVBF"] = int(is_vbf)
    return df

In [175]:
# Read every sample's parquet output, attach the ggF/VBF labels, split it
# 60/40 into train/test and write each part to <outfiles>/<split>/out.root.
OUTPATH = "../datafiles/TaggerInput/"

# Fixed seed so that re-running this cell reproduces the same train/test split.
SPLIT_SEED = 42

for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in dir_TaggerInput:
        continue

    for sample in os.listdir(f"{OUTPATH}/{dir_TaggerInput}"):
        # skip directory entries that are not sample folders
        if any(tag in sample for tag in ("DS_Store", "run_skimmer", "inputprocessor")):
            continue

        outdir = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles"

        # sorted() makes the row order (and therefore the seeded split) deterministic
        parquet_files = sorted(glob.glob(f"{outdir}/*.parquet"))
        if not parquet_files:
            # read_parquet cannot handle an empty file list; report and move on
            print(f"No parquet files found in {outdir}, skipping")
            continue

        df = pd.read_parquet(parquet_files)
        df = postprocess(df, sample)

        df_train, df_test = sklearn.model_selection.train_test_split(
            df, test_size=0.4, random_state=SPLIT_SEED
        )

        for split, df_split in (("train", df_train), ("test", df_test)):
            # exist_ok=True lets the cell be re-run; os.mkdir raised FileExistsError
            os.makedirs(f"{outdir}/{split}", exist_ok=True)
            with uproot.recreate(f"{outdir}/{split}/out.root", compression=uproot.LZ4(4)) as rfile:
                rfile["Events"] = ak.Array(df_split.to_dict(orient="list", index=True))

        print("--------------------------")

--------------------------
--------------------------


# Convert parquet files to ROOT

In [125]:
# For every sample/year, merge the parquet files of the train and test
# folders into a single out.root (tree "Events") written next to them.
OUTPATH = "../datafiles/ntuples/"

for sample in os.listdir(f"{OUTPATH}/"):
    # skip directory entries that are not sample folders
    if any(tag in sample for tag in ("DS_Store", "run_skimmer", "inputprocessor")):
        continue

    for year in os.listdir(f"{OUTPATH}/{sample}/"):
        if "DS_Store" in year:
            continue

        # train and test are handled identically, so loop instead of copy-pasting
        for split in ("train", "test"):
            outdir = f"{OUTPATH}/{sample}/{year}/{split}"

            # sorted() gives a deterministic row order in the output tree
            parquet_files = sorted(glob.glob(f"{outdir}/*.parquet"))
            if not parquet_files:
                # read_parquet cannot handle an empty file list; report and move on
                print(f"No parquet files found in {outdir}, skipping")
                continue

            df = pd.read_parquet(parquet_files)

            with uproot.recreate(f"{outdir}/out.root", compression=uproot.LZ4(4)) as rfile:
                rfile["Events"] = ak.Array(df.to_dict(orient="list", index=True))

        print("--------------------------")

--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------
--------------------------


# Inspecting NaNs

In [180]:
# Scan every branch of the "Events" tree in each train/test ROOT file and
# report the branches that contain NaN values.
OUTPATH = "../datafiles/TaggerInput/"

for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
    if "DS_Store" in dir_TaggerInput:
        continue

    for sample in os.listdir(f"{OUTPATH}/{dir_TaggerInput}"):
        # skip directory entries that are not sample folders
        if any(tag in sample for tag in ("DS_Store", "run_skimmer", "inputprocessor")):
            continue

        # train and test are inspected identically, so loop instead of copy-pasting
        for split in ("train", "test"):
            split_dir = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/{split}"

            for file in os.listdir(split_dir):
                # match the extension; a plain substring test would also accept
                # names such as "root_notes.txt"
                if not file.endswith(".root"):
                    continue
                print(f"Inspecting file {split_dir}/{file}")

                # context manager closes the file handle once the scan is done
                with uproot.open(f"{split_dir}/{file}") as rfile:
                    events = rfile["Events"]

                    for var in events.keys():
                        nans = (np.isnan(events[var].array().to_numpy())).sum()
                        if nans != 0:
                            print(file, nans, f"nan {var} values")

        print("--------------------------")

Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/VBFHToWWToLNuQQ_M-125_withDipoleRecoil/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/VBFHToWWToLNuQQ_M-125_withDipoleRecoil/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/TTToSemiLeptonic/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/TTToSemiLeptonic/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_HT-200To400/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/WJetsToLNu_HT-200To400/outfiles/test/out.root
--------------------------
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/GluGluHToWW_Pt-200ToInf_M-125/outfiles/train/out.root
Inspecting file ../datafiles/TaggerInput//TaggerInput_2018/GluGluHToWW_Pt-200ToInf_M-125/outfiles/test/out.root
--------------------------
Inspecting fil

# Check Number of events in ntuples

In [184]:
# Count the entries of the "Events" tree per sample, summed over every
# TaggerInput directory, separately for the train and test files.
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
    "TTToSemiLeptonic",
    "WJetsToLNu_HT-200To400",
    "WJetsToLNu_HT-400To600",
    "WJetsToLNu_HT-600To800",
    "QCD_Pt_170to300",
    "QCD_Pt_300to470",
    "QCD_Pt_470to600",
    "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput/"

# num_events[sample][split] -> total number of entries
num_events = {}


for sample in samples:
    num_events[sample] = {}

    for split in ("train", "test"):
        num = 0
        for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
            if "DS_Store" in dir_TaggerInput:
                continue

            path = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/{split}/out.root"
            try:
                # context manager closes the file handle after reading the count
                with uproot.open(path) as rfile:
                    num += rfile["Events"].num_entries
            except FileNotFoundError:
                # the sample was not produced in this directory: nothing to add
                continue
            except Exception as err:
                # anything else (corrupt file, missing tree, ...) would silently
                # undercount, so report it instead of swallowing it
                print(f"WARNING: could not read {path}: {err!r}")
                continue

        num_events[sample][split] = num
        print(sample, split, num)

    print("--------------------------")

GluGluHToWW_Pt-200ToInf_M-125 train 230992
GluGluHToWW_Pt-200ToInf_M-125 test 153996
--------------------------
TTToSemiLeptonic train 1013771
TTToSemiLeptonic test 675848
--------------------------
WJetsToLNu_HT-200To400 train 403144
WJetsToLNu_HT-200To400 test 268764
--------------------------
WJetsToLNu_HT-400To600 train 780325
WJetsToLNu_HT-400To600 test 520218
--------------------------
WJetsToLNu_HT-600To800 train 547966
WJetsToLNu_HT-600To800 test 365312
--------------------------
QCD_Pt_170to300 train 496477
QCD_Pt_170to300 test 330986
--------------------------
QCD_Pt_300to470 train 593239
QCD_Pt_300to470 test 395494
--------------------------
QCD_Pt_470to600 train 1098982
QCD_Pt_470to600 test 732655
--------------------------
QCD_Pt_600to800 train 203296
QCD_Pt_600to800 test 135532
--------------------------


# Check Number of events after selection

In [186]:
# Count the events passing a selection per sample, summed over every
# TaggerInput directory, separately for the train and test files.
samples = [
    "GluGluHToWW_Pt-200ToInf_M-125",
    "TTToSemiLeptonic",
    "WJetsToLNu_HT-200To400",
    "WJetsToLNu_HT-400To600",
    "WJetsToLNu_HT-600To800",
    "QCD_Pt_170to300",
    "QCD_Pt_300to470",
    "QCD_Pt_470to600",
    "QCD_Pt_600to800",
]

OUTPATH = "../datafiles/TaggerInput/"

# num_events[sample][split] -> number of events passing the selection
num_events = {}


for sample in samples:
    num_events[sample] = {}

    for split in ("train", "test"):
        num = 0
        for dir_TaggerInput in os.listdir(f"{OUTPATH}/"):
            if "DS_Store" in dir_TaggerInput:
                continue

            path = f"{OUTPATH}/{dir_TaggerInput}/{sample}/outfiles/{split}/out.root"
            try:
                # context manager closes the file handle after the branch is read
                with uproot.open(path) as rfile:
                    # read the branch once instead of once per comparison
                    fj_pt = rfile["Events"]["fj_pt"].array()
                ### put selection below
                selection = (fj_pt > 300) & (fj_pt < 400)
                num += ak.sum(selection)
            except FileNotFoundError:
                # the sample was not produced in this directory: nothing to add
                continue
            except Exception as err:
                # anything else (corrupt file, missing branch, ...) would silently
                # undercount, so report it instead of swallowing it
                print(f"WARNING: could not read {path}: {err!r}")
                continue

        num_events[sample][split] = int(num)
        print(sample, split, num)

    print("--------------------------")

GluGluHToWW_Pt-200ToInf_M-125 train 56305
GluGluHToWW_Pt-200ToInf_M-125 test 37348
--------------------------
TTToSemiLeptonic train 213367
TTToSemiLeptonic test 141642
--------------------------
WJetsToLNu_HT-200To400 train 38310
WJetsToLNu_HT-200To400 test 25504
--------------------------
WJetsToLNu_HT-400To600 train 176549
WJetsToLNu_HT-400To600 test 117179
--------------------------
WJetsToLNu_HT-600To800 train 238595
WJetsToLNu_HT-600To800 test 158614
--------------------------
QCD_Pt_170to300 train 57234
QCD_Pt_170to300 test 38501
--------------------------
QCD_Pt_300to470 train 329364
QCD_Pt_300to470 test 219841
--------------------------
QCD_Pt_470to600 train 66515
QCD_Pt_470to600 test 44313
--------------------------
QCD_Pt_600to800 train 5362
QCD_Pt_600to800 test 3647
--------------------------
