In [72]:
import argparse
import json
import os
import pathlib
import pickle as pkl
import shutil
import sys
import time
import warnings
from collections import defaultdict
from typing import Dict, List, Optional

import awkward as ak
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

### schema
import uproot
from coffea import nanoevents, processor
from coffea.analysis_tools import PackedSelection, Weights
from coffea.nanoevents import BaseSchema, NanoAODSchema, NanoEventsFactory
from coffea.nanoevents.methods import candidate, vector

import mplhep as hep

# Use the CMS style shipped with mplhep for every matplotlib figure.
plt.style.use(hep.style.CMS)

### awkward 1.10.0
# Put the parent directory on the module search path.
sys.path.append("../")

# Map the "PFCands" and "SV" collections onto the "PFCand" mixin.
for _collection in ("PFCands", "SV"):
    nanoevents.PFNanoAODSchema.mixins[_collection] = "PFCand"

# Silence the warnings this notebook does not want to see; each entry is the
# keyword argument that identifies one filter.
for _filter_kwargs in (
    {"message": "Found duplicate branch "},
    {"category": DeprecationWarning},
    {"message": "Missing cross-reference index "},
    {"message": "divide by zero encountered in log"},
):
    warnings.filterwarnings("ignore", **_filter_kwargs)

# Kept as the last expression: the cell echoes the value returned by np.seterr.
np.seterr(invalid="ignore")

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}

In [73]:
# Load IPython's autoreload extension and select mode 2, so that imported
# modules are re-imported before each cell execution (see IPython docs).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [74]:
# Shell out to list everything matched by ../datafiles/ntuples/*.
! ls ../datafiles/ntuples/*

../datafiles/ntuples/inputprocessor.py [31m../datafiles/ntuples/run_skimmer.sh[m[m

../datafiles/ntuples/GluGluHToWW_Pt-200ToInf_M-125:
[34m2016[m[m    [34m2016APV[m[m [34m2017[m[m    [34m2018[m[m

../datafiles/ntuples/QCD_Pt_170to300:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_300to470:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_470to600:
[34m2018[m[m

../datafiles/ntuples/QCD_Pt_600to800:
[34m2018[m[m

../datafiles/ntuples/TTToSemiLeptonic:
[34m2018[m[m

../datafiles/ntuples/VBFHToWWToLNuQQ_M-125_withDipoleRecoil:
[34m2016[m[m    [34m2016APV[m[m [34m2017[m[m    [34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-200To400:
[34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-400To600:
[34m2018[m[m

../datafiles/ntuples/WJetsToLNu_HT-600To800:
[34m2018[m[m


# Higgs (ggF)

In [127]:
# Open the 2018 ggF H->WW training ntuple and keep its "Events" tree in `events`;
# the cells below read their branches from it.
ggf_file = uproot.open("../datafiles/ntuples/GluGluHToWW_Pt-200ToInf_M-125/2018/train/out.root")
events = ggf_file["Events"]

# Show every branch name available in the tree.
events.keys()

['fj_eta',
 'fj_phi',
 'fj_mass',
 'fj_pt',
 'fj_msoftdrop',
 'fj_lsf3',
 'fj_genjetmass',
 'fj_genRes_pt',
 'fj_genRes_eta',
 'fj_genRes_phi',
 'fj_genRes_mass',
 'fj_genH_pt',
 'fj_genH_jet',
 'fj_genV_dR',
 'fj_genVstar',
 'genV_genVstar_dR',
 'fj_isHVV',
 'fj_isHVV_Matched',
 'fj_isHVV_4q',
 'fj_isHVV_elenuqq',
 'fj_isHVV_munuqq',
 'fj_isHVV_taunuqq',
 'fj_isHVV_Vlepton',
 'fj_isHVV_Vstarlepton',
 'fj_nquarks',
 'fj_lepinprongs',
 'fj_isV',
 'fj_isV_Matched',
 'fj_isV_2q',
 'fj_isV_elenu',
 'fj_isV_munu',
 'fj_isV_taunu',
 'fj_nprongs',
 'fj_ncquarks',
 'fj_isV_lep',
 'fj_isTop',
 'fj_isTop_Matched',
 'fj_Top_numMatched',
 'fj_isTop_W_lep_b',
 'fj_isTop_W_lep',
 'fj_isTop_W_ele_b',
 'fj_isTop_W_ele',
 'fj_isTop_W_mu_b',
 'fj_isTop_W_mu',
 'fj_isTop_W_tau_b',
 'fj_isTop_W_tau',
 'fj_Top_nquarksnob',
 'fj_Top_nbquarks',
 'fj_Top_ncquarks',
 'fj_Top_nleptons',
 'fj_Top_nele',
 'fj_Top_nmu',
 'fj_Top_ntau',
 'fj_Top_taudecay',
 'fj_isQCD',
 'fj_isQCD_Matched',
 'fj_isQCDb',
 'fj_isQCDb

In [129]:
# Number of entries whose fj_isggF flag equals 0.
# A result of 0 means every entry in this file is flagged as ggF.
is_ggf_flag = events["fj_isggF"].array().to_numpy()
(is_ggf_flag == 0).sum()

0

In [131]:
# Number of entries whose fj_isVBF flag equals 1.
# A result of 0 means no entry in this file is flagged as VBF.
is_vbf_flag = events["fj_isVBF"].array().to_numpy()
(is_vbf_flag == 1).sum()

0

In [132]:
# Number of entries whose fj_genRes_mass is not exactly 125.
# A result of 0 means every entry stores 125.
# NOTE(review): this is an exact comparison; confirm the branch stores exact values.
gen_res_mass = events["fj_genRes_mass"].array().to_numpy()
(gen_res_mass != 125).sum()

0

In [133]:
# Count entries with the fj_isHVV flag set, and entries with it unset.
ishww = events["fj_isHVV"].array()
for label, mask in (("ishww:", ishww), ("~ishww:", ~ishww)):
    print(label, ak.sum(mask))

ishww: 52107
~ishww: 0


In [134]:
# Count entries with the fj_isHVV_Matched flag set, and entries with it unset.
matched_higgs = events["fj_isHVV_Matched"].array()
for label, mask in (("isHVV_Matched:", matched_higgs), ("~isHVV_Matched:", ~matched_higgs)):
    print(label, ak.sum(mask))

isHVV_Matched: 32978
~isHVV_Matched: 19129


In [135]:
# Build the three masks of the selection, then count entries passing all of them.
is_elenuqq = events["fj_isHVV_elenuqq"].array()
is_munuqq = events["fj_isHVV_munuqq"].array()

# Either the elenuqq or the munuqq flag is set.
one_lep = is_elenuqq | is_munuqq
# fj_lepinprongs equals 1.
one_lep_inprongs = events["fj_lepinprongs"].array() == 1
# fj_nquarks equals 2.
two_quarks = events["fj_nquarks"].array() == 2

ak.sum(one_lep & one_lep_inprongs & two_quarks)

10783

In [136]:
# Same selection as the previous cell, additionally requiring fj_isHVV_Matched.
selection_matched = one_lep & one_lep_inprongs & two_quarks & matched_higgs
ak.sum(selection_matched)

10782

In [137]:
# Require lep_dR_fj below 0.8 on top of the matched selection.
# NOTE(review): 0.8 presumably corresponds to the jet radius; confirm.
lep_dr_fj = events["lep_dR_fj"].array()
dr = lep_dr_fj < 0.8

ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs & dr)

10782

# Higgs (VBF)

In [139]:
# Rebind `events` to the "Events" tree of the 2018 VBF H->WW training ntuple;
# the cells below read their branches from it.
vbf_file = uproot.open("../datafiles/ntuples/VBFHToWWToLNuQQ_M-125_withDipoleRecoil/2018/train/out.root")
events = vbf_file["Events"]

In [143]:
# Number of entries whose fj_isVBF flag equals 0.
# A result of 0 means every entry in this file is flagged as VBF.
is_vbf_flag = events["fj_isVBF"].array().to_numpy()
(is_vbf_flag == 0).sum()

0

In [144]:
# Number of entries whose fj_isggF flag equals 1.
# A result of 0 means no entry in this file is flagged as ggF.
is_ggf_flag = events["fj_isggF"].array().to_numpy()
(is_ggf_flag == 1).sum()

0

In [146]:
# Number of entries whose fj_genRes_mass is not exactly 125.
# A non-zero result means some entries store a different value.
# NOTE(review): this is an exact comparison; confirm the branch stores exact values.
gen_res_mass = events["fj_genRes_mass"].array().to_numpy()
(gen_res_mass != 125).sum()

5

In [147]:
# Count entries with the fj_isHVV flag set, and entries with it unset.
ishww = events["fj_isHVV"].array()
n_hww = ak.sum(ishww)
n_not_hww = ak.sum(~ishww)
print("ishww:", n_hww)
print("~ishww:", n_not_hww)

ishww: 1238
~ishww: 0


In [148]:
# Count entries with the fj_isHVV_Matched flag set, and entries with it unset.
matched_higgs = events["fj_isHVV_Matched"].array()
n_matched = ak.sum(matched_higgs)
n_unmatched = ak.sum(~matched_higgs)
print("isHVV_Matched:", n_matched)
print("~isHVV_Matched:", n_unmatched)

isHVV_Matched: 832
~isHVV_Matched: 406


In [149]:
# Build the three masks of the selection, then count entries passing all of them.
is_elenuqq = events["fj_isHVV_elenuqq"].array()
is_munuqq = events["fj_isHVV_munuqq"].array()

# Either the elenuqq or the munuqq flag is set.
one_lep = is_elenuqq | is_munuqq
# fj_lepinprongs equals 1.
one_lep_inprongs = events["fj_lepinprongs"].array() == 1
# fj_nquarks equals 2.
two_quarks = events["fj_nquarks"].array() == 2

ak.sum(one_lep & one_lep_inprongs & two_quarks)

450

In [150]:
# Same selection as the previous cell, additionally requiring fj_isHVV_Matched.
selection_matched = one_lep & one_lep_inprongs & two_quarks & matched_higgs
ak.sum(selection_matched)

450

In [151]:
# Require lep_dR_fj below 0.8 on top of the matched selection.
# NOTE(review): 0.8 presumably corresponds to the jet radius; confirm.
lep_dr_fj = events["lep_dR_fj"].array()
dr = lep_dr_fj < 0.8

ak.sum(one_lep & one_lep_inprongs & two_quarks & matched_higgs & dr)

450

# QCD file

In [154]:
# Rebind `events` to the "Events" tree of the 2018 QCD_Pt_300to470 training ntuple.
qcd_file = uproot.open("../datafiles/ntuples/QCD_Pt_300to470/2018/train/out.root")
events = qcd_file["Events"]

In [156]:
# Number of entries with a non-zero fj_genRes_mass.
# A result of 0 means the branch is 0 for every entry in this file.
gen_res_mass = events["fj_genRes_mass"].array().to_numpy()
(gen_res_mass != 0).sum()

0

In [159]:
# Number of entries whose fj_isQCD flag is not 1.
# A result of 0 means every entry in this file is flagged as QCD.
is_qcd_flag = events["fj_isQCD"].array().to_numpy()
(is_qcd_flag != 1).sum()

0

# WJets file

In [161]:
# Rebind `events` to the "Events" tree of the 2018 WJetsToLNu_HT-200To400 training ntuple.
wjets_file = uproot.open("../datafiles/ntuples/WJetsToLNu_HT-200To400/2018/train/out.root")
events = wjets_file["Events"]

In [163]:
# Number of entries with a non-zero fj_genRes_mass.
# A result of 0 means the branch is 0 for every entry in this file.
gen_res_mass = events["fj_genRes_mass"].array().to_numpy()
(gen_res_mass != 0).sum()

0

In [164]:
# Number of entries whose fj_isV flag is not 1.
# A result of 0 means every entry in this file is flagged as V.
is_v_flag = events["fj_isV"].array().to_numpy()
(is_v_flag != 1).sum()

0

In [168]:
# Count entries with the fj_isV_Matched flag set, and entries with it unset.
# NOTE(review): the variable keeps the name `matched_higgs` used by the Higgs
# cells, but here it holds the V-matching flag.
matched_higgs = events["fj_isV_Matched"].array()
for label, mask in (("isV_Matched:", matched_higgs), ("~isV_Matched:", ~matched_higgs)):
    print(label, ak.sum(mask))

isV_Matched: 7531
~isV_Matched: 4698


In [170]:
# Build the masks, then count entries that are either leptonic with
# fj_lepinprongs == 1, or flagged fj_isV_2q.
is_elenu = events["fj_isV_elenu"].array()
is_munu = events["fj_isV_munu"].array()

# Either the elenu or the munu flag is set.
one_lep = is_elenu | is_munu
# fj_lepinprongs equals 1.
one_lep_inprongs = events["fj_lepinprongs"].array() == 1
# Unlike the Higgs cells, this mask is the fj_isV_2q flag itself.
two_quarks = events["fj_isV_2q"].array()

lep_in_prongs = one_lep & one_lep_inprongs
ak.sum(lep_in_prongs | two_quarks)

7513

# Top file

In [176]:
# Rebind `events` to the "Events" tree of the 2018 TTToSemiLeptonic training ntuple.
top_file = uproot.open("../datafiles/ntuples/TTToSemiLeptonic/2018/train/out.root")
events = top_file["Events"]

In [178]:
# Number of entries whose fj_isTop flag is not 1.
# A result of 0 means every entry in this file is flagged as Top.
is_top_flag = events["fj_isTop"].array().to_numpy()
(is_top_flag != 1).sum()

0

In [179]:
# Count entries with the fj_isTop_Matched flag set, and entries with it unset.
# NOTE(review): the variable keeps the name `matched_higgs` used by the Higgs
# cells, but here it holds the top-matching flag.
matched_higgs = events["fj_isTop_Matched"].array()
for label, mask in (("isTop_Matched:", matched_higgs), ("~isTop_Matched:", ~matched_higgs)):
    print(label, ak.sum(mask))

isTop_Matched: 124432
~isTop_Matched: 25366


In [181]:
# Read the fj_Top_numMatched branch once and reuse it for every count; the
# previous version re-read the same branch from the file for each printed line.
top_num_matched = events["fj_Top_numMatched"].array()

# Print how many entries have numMatched equal to 0, 1, 2 and 3.
for n_matched in range(4):
    print(f"numMatched=={n_matched}:", ak.sum(top_num_matched == n_matched))

numMatched==0: 25108
numMatched==1: 119101
numMatched==2: 5589
numMatched==3: 0


In [186]:
# For each flag branch, print how many entries have the flag equal to 1.
for label, branch_name in (
    ("Top_Wlep==1:", "fj_isTop_W_lep"),
    ("Top_Wlep_b==1:", "fj_isTop_W_lep_b"),
):
    flag_is_set = events[branch_name].array() == 1
    print(label, flag_is_set.to_numpy().sum())

Top_Wlep==1: 84508
Top_Wlep_b==1: 62895
