In [1]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List, Optional

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### schema
import uproot
from coffea import nanoevents, processor
from coffea.analysis_tools import PackedSelection, Weights
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory
from coffea.nanoevents.methods import candidate, vector

import mplhep as hep

plt.style.use(hep.style.CMS)

### awkward 1.10.0
sys.path.append("../")

nanoevents.PFNanoAODSchema.mixins["PFCands"] = "PFCand"
nanoevents.PFNanoAODSchema.mixins["SV"] = "PFCand"

warnings.filterwarnings("ignore", message="Found duplicate branch ")
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", message="Missing cross-reference index ")
warnings.filterwarnings("ignore", message="divide by zero encountered in log")
np.seterr(invalid="ignore")



{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [2]:
%load_ext autoreload
%autoreload 2

In [189]:
! ls ../datafiles/new/*

../datafiles/ntuples/inputprocessor.py [31m../datafiles/ntuples/run_skimmer.sh[m[m

../datafiles/ntuples/GluGluHToWW_Pt-200ToInf_M-125:
[34m2016[m[m    [34m2016APV[m[m [34m2017[m[m    [34m2018[m[m

../datafiles/ntuples/QCD_Pt_170to300:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_300to470:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_470to600:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_600to800:
[34m2018[m[m

../datafiles/ntuples/TTToSemiLeptonic:
[34m2018[m[m

../datafiles/ntuples/VBFHToWWToLNuQQ_M-125_withDipoleRecoil:
[34m2016[m[m    [34m2016APV[m[m [34m2017[m[m    [34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-200To400:
[34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-400To600:
[34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-600To800:
[34m2018[m[m


# Higgs (ggF)

In [None]:
events = uproot.open("../datafiles/ntuples/GluGluHToWW_Pt-200ToInf_M-125/2018/train/out.root")["Events"]


In [9]:
! ls ../datafiles/TaggerInput/TaggerInput_2017/JHUVariableWMass_part1/outfiles/train/out.root

../datafiles/TaggerInput/TaggerInput_2017/JHUVariableWMass_part1/outfiles/train/out.root


In [10]:
events = uproot.open("../datafiles/TaggerInput/TaggerInput_2017/JHUVariableWMass_part1/outfiles/train/out.root")["Events"]
events.keys()

['fj_eta',
 'fj_phi',
 'fj_mass',
 'fj_pt',
 'fj_msoftdrop',
 'fj_lsf3',
 'fj_genjetmass',
 'fj_isggF',
 'fj_isVBF',
 'fj_isJHUVariableWMass',
 'fj_isBulkGraviton',
 'fj_genRes_pt',
 'fj_genRes_eta',
 'fj_genRes_phi',
 'fj_genRes_mass',
 'fj_genH_jet',
 'fj_genV_dR',
 'fj_genVstar',
 'genV_genVstar_dR',
 'fj_isHVV',
 'fj_isHVV_Matched',
 'fj_isHVV_4q',
 'fj_isHVV_elenuqq',
 'fj_isHVV_munuqq',
 'fj_isHVV_taunuqq',
 'fj_isHVV_Vlepton',
 'fj_isHVV_Vstarlepton',
 'fj_nquarks',
 'fj_lepinprongs',
 'fj_isV',
 'fj_isV_Matched',
 'fj_isV_2q',
 'fj_isV_elenu',
 'fj_isV_munu',
 'fj_isV_taunu',
 'fj_nprongs',
 'fj_ncquarks',
 'fj_isV_lep',
 'fj_isTop',
 'fj_isTop_Matched',
 'fj_Top_numMatched',
 'fj_isTop_W_lep_b',
 'fj_isTop_W_lep',
 'fj_isTop_W_ele_b',
 'fj_isTop_W_ele',
 'fj_isTop_W_mu_b',
 'fj_isTop_W_mu',
 'fj_isTop_W_tau_b',
 'fj_isTop_W_tau',
 'fj_Top_nquarksnob',
 'fj_Top_nbquarks',
 'fj_Top_ncquarks',
 'fj_Top_nleptons',
 'fj_Top_nele',
 'fj_Top_nmu',
 'fj_Top_ntau',
 'fj_Top_taudecay',


In [190]:
events = uproot.open("../datafiles/ntuples/GluGluHToWW_Pt-200ToInf_M-125/2018/train/out.root")["Events"]
events.keys()

['fj_eta',
 'fj_phi',
 'fj_mass',
 'fj_pt',
 'fj_msoftdrop',
 'fj_lsf3',
 'fj_genjetmass',
 'fj_genRes_pt',
 'fj_genRes_eta',
 'fj_genRes_phi',
 'fj_genRes_mass',
 'fj_genH_pt',
 'fj_genH_jet',
 'fj_genV_dR',
 'fj_genVstar',
 'genV_genVstar_dR',
 'fj_isHVV',
 'fj_isHVV_Matched',
 'fj_isHVV_4q',
 'fj_isHVV_elenuqq',
 'fj_isHVV_munuqq',
 'fj_isHVV_taunuqq',
 'fj_isHVV_Vlepton',
 'fj_isHVV_Vstarlepton',
 'fj_nquarks',
 'fj_lepinprongs',
 'fj_isV',
 'fj_isV_Matched',
 'fj_isV_2q',
 'fj_isV_elenu',
 'fj_isV_munu',
 'fj_isV_taunu',
 'fj_nprongs',
 'fj_ncquarks',
 'fj_isV_lep',
 'fj_isTop',
 'fj_isTop_Matched',
 'fj_Top_numMatched',
 'fj_isTop_W_lep_b',
 'fj_isTop_W_lep',
 'fj_isTop_W_ele_b',
 'fj_isTop_W_ele',
 'fj_isTop_W_mu_b',
 'fj_isTop_W_mu',
 'fj_isTop_W_tau_b',
 'fj_isTop_W_tau',
 'fj_Top_nquarksnob',
 'fj_Top_nbquarks',
 'fj_Top_ncquarks',
 'fj_Top_nleptons',
 'fj_Top_nele',
 'fj_Top_nmu',
 'fj_Top_ntau',
 'fj_Top_taudecay',
 'fj_isQCD',
 'fj_isQCD_Matched',
 'fj_isQCDb',
 'fj_isQCDb

In [11]:
(events["fj_isggF"].array()==0).to_numpy().sum()   # all is ggF

861108

In [14]:
(events["fj_isVBF"].array()==1).to_numpy().sum()   # no vbf

0

In [15]:
(events["fj_genRes_mass"].array()!=125).to_numpy().sum()  # all fj_genRes_mass is 125

861108

In [28]:
events["fj_ParT_hidNeuron001"].array()

<Array [-4.17, -0.562, -3.86, ... -1.21, -1.74] type='861108 * float64'>

In [16]:
ishww = events["fj_isHVV"].array()
print("ishww:", ak.sum(ishww))
print("~ishww:", ak.sum(~ishww))

ishww: 861108
~ishww: 0


In [18]:
matched_higgs = events["fj_isHVV_Matched"].array()
print("isHVV_Matched:", ak.sum(matched_higgs))
print("~isHVV_Matched:", ak.sum(~matched_higgs))

isHVV_Matched: 838170
~isHVV_Matched: 22938


In [38]:
print("isHVV_Matched:", ak.sum(events["fj_isHVV_4q"].array()))

isHVV_Matched: 0


In [29]:
one_lep = events["fj_isHVV_elenuqq"].array() | events["fj_isHVV_munuqq"].array()
ak.sum(one_lep)

0

In [31]:
for key in events:
    print(key)

<TBranch 'fj_eta' at 0x00016eb50490>
<TBranch 'fj_phi' at 0x00017f947fa0>
<TBranch 'fj_mass' at 0x00017f10cd90>
<TBranch 'fj_pt' at 0x00017f112ca0>
<TBranch 'fj_msoftdrop' at 0x00016ec06c10>
<TBranch 'fj_lsf3' at 0x00016ec20460>
<TBranch 'fj_genjetmass' at 0x00016ec20eb0>
<TBranch 'fj_isggF' at 0x00016eb36970>
<TBranch 'fj_isVBF' at 0x00016ec04760>
<TBranch 'fj_isJHUVariableWMass' at 0x00016ec04fd0>
<TBranch 'fj_isBulkGraviton' at 0x00016ec08700>
<TBranch 'fj_genRes_pt' at 0x00016ec08ee0>
<TBranch 'fj_genRes_eta' at 0x00016eb3f760>
<TBranch 'fj_genRes_phi' at 0x00016ec38a90>
<TBranch 'fj_genRes_mass' at 0x0001037b62e0>
<TBranch 'fj_genH_jet' at 0x00016eb56610>
<TBranch 'fj_genV_dR' at 0x00016ec322b0>
<TBranch 'fj_genVstar' at 0x00016ec32a00>
<TBranch 'genV_genVstar_dR' at 0x00016eb522e0>
<TBranch 'fj_isHVV' at 0x00016eb52e50>
<TBranch 'fj_isHVV_Matched' at 0x00017f2442e0>
<TBranch 'fj_isHVV_4q' at 0x00017f244a90>
<TBranch 'fj_isHVV_elenuqq' at 0x00016ec32160>
<TBranch 'fj_isHVV_munuqq'

In [26]:
one_lep = events["fj_isHVV_elenuqq"].array() | events["fj_isHVV_munuqq"].array()
one_lep_inprongs = events["fj_lepinprongs"].array()==1
two_quarks = events["fj_nquarks"].array()==2

ak.sum(one_lep & one_lep_inprongs & two_quarks)

0

In [20]:
ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs)

0

In [21]:
dr = events["lep_dR_fj"].array()<0.8

ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs & dr)

KeyInFileError: not found: 'lep_dR_fj'

    Available keys: 'lep_fj_dr', 'lep_pt', 'lep_met_mt', 'lep_miso', 'lep_reliso', 'fj_pt', 'mjj', 'fj_eta', 'fj_phi', 'fj_genV_dR', 'fj_isV', 'met_pt', 'lep_pt_ratio', 'deta', 'j1_m', 'j2_m', 'ht', 'fj_mass', 'fj_lsf3', 'met_fj_dphi'...

in file ../datafiles/TaggerInput/TaggerInput_2017/JHUVariableWMass_part1/outfiles/train/out.root
in object /Events;1

# Higgs (VBF)

In [139]:
events = uproot.open("../datafiles/ntuples/VBFHToWWToLNuQQ_M-125_withDipoleRecoil/2018/train/out.root")["Events"]

In [143]:
(events["fj_isVBF"].array()==0).to_numpy().sum()   # all is vbf

0

In [144]:
(events["fj_isggF"].array()==1).to_numpy().sum()   # no ggF

0

In [146]:
(events["fj_genRes_mass"].array()!=125).to_numpy().sum()  # MOST fj_genRes_mass is 125

5

In [147]:
ishww = events["fj_isHVV"].array()
print("ishww:", ak.sum(ishww))
print("~ishww:", ak.sum(~ishww))

ishww: 1238
~ishww: 0


In [148]:
matched_higgs = events["fj_isHVV_Matched"].array()
print("isHVV_Matched:", ak.sum(matched_higgs))
print("~isHVV_Matched:", ak.sum(~matched_higgs))

isHVV_Matched: 832
~isHVV_Matched: 406


In [149]:
one_lep = events["fj_isHVV_elenuqq"].array() | events["fj_isHVV_munuqq"].array()
one_lep_inprongs = events["fj_lepinprongs"].array()==1
two_quarks = events["fj_nquarks"].array()==2

ak.sum(one_lep & one_lep_inprongs & two_quarks)

450

In [150]:
ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs)

450

In [151]:
dr = events["lep_dR_fj"].array()<0.8

ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs & dr)

450

# QCD file

In [154]:
events = uproot.open("../datafiles/ntuples/QCD_Pt_300to470/2018/train/out.root")["Events"]

In [156]:
(events["fj_genRes_mass"].array()!=0).to_numpy().sum()

0

In [159]:
(events["fj_isQCD"].array()!=1).to_numpy().sum()

0

# WJets file

In [197]:
events = uproot.open("../datafiles/ntuples/WJetsToLNu_HT-200To400/2018/train/out.root")["Events"]

In [198]:
(events["fj_genRes_mass"].array()!=0).to_numpy().sum()

0

In [199]:
(events["fj_isV"].array()!=1).to_numpy().sum()

0

In [200]:
matched_higgs = events["fj_isV_Matched"].array()
print("isV_Matched:", ak.sum(matched_higgs))
print("~isV_Matched:", ak.sum(~matched_higgs))

isV_Matched: 7531
~isV_Matched: 4698


In [202]:
one_lep = events["fj_isV_elenu"].array() | events["fj_isV_munu"].array()
one_tau = events["fj_isV_taunu"].array()
one_lep_inprongs = events["fj_lepinprongs"].array()==1
two_quarks = events["fj_isV_2q"].array()

ak.sum( (one_lep & one_lep_inprongs) | two_quarks)

7513

In [219]:
print(ak.sum((two_quarks | one_lep | one_tau)==1))
print(ak.sum((two_quarks | one_lep | one_tau)!=1))

12229
0


In [None]:
 ( (fj_isV_2q==1) | (fj_isV_munu==1) | (fj_isV_elenu==1) | (fj_isV_taunu==1) )

In [211]:
(two_quarks | one_lep | one_tau)

<Array [1, 1, 1, 1, 1, 1, ... 1, 1, 1, 1, 1, 1] type='12229 * int64'>

In [221]:
ak.sum(two_quarks)

0

# Top file

In [222]:
events = uproot.open("../datafiles/ntuples/TTToSemiLeptonic/2018/train/out.root")["Events"]

In [223]:
(events["fj_isTop"].array()!=1).to_numpy().sum()

0

In [179]:
matched_higgs = events["fj_isTop_Matched"].array()
print("isTop_Matched:", ak.sum(matched_higgs))
print("~isTop_Matched:", ak.sum(~matched_higgs))

isTop_Matched: 124432
~isTop_Matched: 25366


In [181]:
print("numMatched==0:", ak.sum(events["fj_Top_numMatched"].array()==0))
print("numMatched==1:", ak.sum(events["fj_Top_numMatched"].array()==1))
print("numMatched==2:", ak.sum(events["fj_Top_numMatched"].array()==2))
print("numMatched==3:", ak.sum(events["fj_Top_numMatched"].array()==3))

numMatched==0: 25108
numMatched==1: 119101
numMatched==2: 5589
numMatched==3: 0


In [226]:
print("Top_Wlep==1:", (events["fj_isTop_W_lep"].array()==1).to_numpy().sum())
print("Top_Wlep_b==1:", (events["fj_isTop_W_lep_b"].array()==1).to_numpy().sum())

Top_Wlep==1: 84508
Top_Wlep_b==1: 62895


In [230]:
ak.sum((events["fj_isTop_W_lep"].array()==1) | (events["fj_isTop_W_lep_b"].array()==1))

84508