In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os

import awkward as ak
import uproot
import numpy as np
import glob
from coffea.nanoevents import NanoEventsFactory, BaseSchema, NanoAODSchema
from coffea import hist, processor
# register our candidate behaviors
from coffea.nanoevents.methods import candidate
ak.behavior.update(candidate.behavior)

from functools import partial

from tools.helpers import get_four_vec_fromPtEtaPhiM, match

from yahist import Hist1D, Hist2D

import json

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import mplhep as hep

plt.style.use(hep.style.CMS)  # or ATLAS/LHCb

In [None]:
from tools.helpers import dasWrapper
from analysis.tagger import desired_output

redirector_ucsd = 'root://xcache-redirector.t2.ucsd.edu:2042/'

# maybe we'll need the number of events for weighting, but not at the moment
def get_nevents(name):
    """Return the 'nevents' value that the DAS summary query reports for `name`.

    `dasWrapper` is called with ``query='summary'``; the first element of its
    result is parsed as JSON, and the ``'nevents'`` field of the first record
    in that JSON document is returned.
    """
    summary_records = json.loads(dasWrapper(name, query='summary')[0])
    first_record = summary_records[0]
    return first_record['nevents']


In [None]:
samples = [
    #'/ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM',
    '/ZJetsToNuNu_HT-100To200_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-200To400_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-400To600_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-600To800_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-800To1200_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-1200To2500_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/ZJetsToNuNu_HT-2500ToInf_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
]

tt_samples = [
    #'/TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/TTTo2L2Nu_TuneCP5_13TeV-powheg-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
]

QCD_samples = [
    #'/QCD_bEnriched_HT100to200_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT200to300_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT300to500_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT500to700_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT700to1000_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT1000to1500_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT1500to2000_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/QCD_bEnriched_HT2000toInf_TuneCP5_13TeV-madgraph-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
]

# W+jets datasets (W0Jets ... W4Jets); only the W0Jets entry is currently enabled.
# Every entry ends with a comma: without it, enabling the following line would
# make Python concatenate two adjacent string literals into one bogus dataset name.
W_samples = [
    '/W0JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/W1JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/W2JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/W3JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM',
    #'/W4JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v2/NANOAODSIM',
]

In [None]:
# Map each dataset's first path component (s.split('/')[1]) to a list of file
# locations, each prefixed with the UCSD xcache redirector URL.
# NOTE(review): dasWrapper(s) presumably returns the file paths of dataset s — confirm in tools.helpers.
fileset = {s.split('/')[1]: [redirector_ucsd+p for p in dasWrapper(s)] for s in samples}

In [None]:
# Same mapping as `fileset`, built separately for the QCD, W+jets and tt sample lists.
# For tt only the first 10 entries of each dataset's file list are kept ([:10]).
# With every entry of QCD_samples and tt_samples commented out, those two dicts are empty.
fileset_QCD = {s.split('/')[1]: [redirector_ucsd+p for p in dasWrapper(s)] for s in QCD_samples}
fileset_W = {s.split('/')[1]: [redirector_ucsd+p for p in dasWrapper(s)] for s in W_samples}
fileset_tt = {s.split('/')[1]: [redirector_ucsd+p for p in dasWrapper(s)][:10] for s in tt_samples}

In [None]:
dasWrapper('/W1JetsToLNu_TuneCP5_13TeV-madgraphMLM-pythia8/RunIISummer20UL18NanoAODv9-106X_upgrade2018_realistic_v16_L1v1-v1/NANOAODSIM')

In [None]:
fileset_W

In [None]:
# Executor and its options; both are handed to processor.run_uproot_job
# by the job cells of this notebook.
exe = processor.futures_executor
exe_args = dict(
    workers=12,
    schema=NanoAODSchema,
    skipbadfiles=True,
)

## Get the efficiencies

In [None]:
from analysis.tagger import measure_eff

# Run the measure_eff processor over the "Events" tree of every file in `fileset`.
# get_efficiency() reads the resulting global `output`.
# NOTE: `output` is rebound by the apply_eff job in the "Apply the efficiency"
# section; once that cell has run, get_efficiency() reads that result instead.
output = processor.run_uproot_job(
            fileset,
            "Events",
            measure_eff(accumulator=desired_output),
            exe,
            exe_args,
            chunksize=500000,
        )

In [None]:
def _pt_eta_hist2d(coffea_hist, mass_range):
    """Integrate/sum a coffea histogram down to (pt, eta) and wrap it as a yahist Hist2D."""
    plane = coffea_hist.integrate('mass', int_range=mass_range).sum('phi', 'dataset')
    return Hist2D.from_bincounts(
        # .T: the value array is transposed before it is handed to yahist.
        # NOTE(review): presumably because yahist stores counts as (y, x) — confirm.
        plane.values()[()].T,
        (
            coffea_hist.axis('pt').edges(),
            coffea_hist.axis('eta').edges(),
        )
    )


def get_efficiency(region, process, mass_range=slice(100,150), histograms=None):
    """Return the tagged / inclusive ratio in bins of (pt, eta) as a yahist Hist2D.

    Both histograms are rebinned to pt edges [200, 300, 400, 600, 800] and
    3 eta bins between 0 and 2.4, integrated over `mass_range`, and summed over
    'phi' and 'dataset' before the division.

    Parameters
    ----------
    region : str
        Key of the inclusive histogram; the tagged histogram is looked up
        under ``region + '_tagged'``.
    process : str
        Dataset name used to slice both histograms.
    mass_range : slice
        Range of the 'mass' axis that is integrated over (default 100-150).
    histograms : mapping, optional
        Accumulator to read the histograms from. Defaults to the notebook-global
        `output`, which is what this function always used. Passing it explicitly
        protects against `output` having been rebound by a later job.

    Returns
    -------
    The result of ``tagged.divide(inclusive)`` on the two Hist2D objects.
    """
    if histograms is None:
        # Backward-compatible default: fall back to the notebook-global accumulator.
        histograms = output

    pt_bins  = hist.Bin('pt', r'$p_{T} \ (GeV)$', [200, 300, 400, 600, 800])
    eta_bins = hist.Bin('eta', r'$\eta$', 3, 0, 2.4)

    inclusive = histograms[region][process].rebin('pt', pt_bins).rebin('eta', eta_bins)
    tagged = histograms[region+'_tagged'][process].rebin('pt', pt_bins).rebin('eta', eta_bins)

    denominator = _pt_eta_hist2d(inclusive, mass_range)
    numerator = _pt_eta_hist2d(tagged, mass_range)

    return numerator.divide(denominator)

In [None]:
# check that stuff actually ran

output['0b'].sum('pt', 'eta', 'phi', 'mass').values()

In [None]:
# Compute the efficiency map of every dataset in `fileset` for each tag category
# and store it as JSON, one file per (dataset, category).
# The target directory is created first so that writing does not fail merely
# because ../data/htag does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.expandvars("../data/htag"), exist_ok=True)
for s in fileset.keys():
    print (s)
    for b in ['0b', '1b', '2b', '1h']:
        # mass range 0-500 instead of get_efficiency's default 100-150 window
        h = get_efficiency(b, s, mass_range=slice(0,500))
        h.to_json(os.path.expandvars("../data/htag/eff_%s_%s.json"%(s,b)))
    
# Plot the '2b' efficiency map of one dataset, integrated over mass 0-500.
# NOTE(review): this dataset is commented out in tt_samples and `fileset` is built
# from `samples` only, so `output` may not contain it on a fresh run — confirm
# before relying on this plot.
h = get_efficiency(
    '2b',
    'TTTo2L2Nu_TuneCP5_13TeV-powheg-pythia8',
    mass_range=slice(0,500),
)
fig, ax = plt.subplots(1,1,figsize=(7,7))
h.plot()

In [None]:
# Plot the '2b' efficiency map of one dataset, integrated over mass 0-500.
# NOTE(review): this dataset is commented out in tt_samples and `fileset` is built
# from `samples` only, so `output` may not contain it on a fresh run — confirm
# before relying on this plot.
h = get_efficiency(
    '2b',
    'TTToSemiLeptonic_TuneCP5_13TeV-powheg-pythia8',
    mass_range=slice(0,500),
)
fig, ax = plt.subplots(1,1,figsize=(7,7))
h.plot()

## Apply the efficiency

In [None]:
# Read the stored efficiency maps back into a nested dict: effs[dataset][category].
effs = {}
for s in fileset.keys():
    print (s)
    per_category = effs.setdefault(s, {})
    for b in ['0b', '1b', '2b', '1h']:
        json_path = os.path.expandvars("../data/htag/eff_%s_%s.json"%(s,b))
        per_category[b] = Hist2D.from_json(json_path)

In [None]:
effs['ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8']['1h'].plot()

In [None]:
np.isnan(sum(sum(effs['ZJetsToNuNu_HT-2500ToInf_TuneCP5_13TeV-madgraphMLM-pythia8']['1h'].counts)))

In [None]:
from tools.helpers import yahist_2D_lookup
yahist_2D_lookup(
    effs['ZJetsToNuNu_HT-2500ToInf_TuneCP5_13TeV-madgraphMLM-pythia8']['0b'],
    ak.Array([[700]]),
    ak.Array([[2.]]),
)

In [None]:
from analysis.tagger import apply_eff, desired_output

# Run the apply_eff processor, configured with the loaded efficiency maps `effs`,
# over the "Events" tree of every file in `fileset`.
# NOTE: this rebinds the global `output` that the measure_eff job filled earlier;
# get_efficiency() reads that same global, so from here on it sees this result.
output = processor.run_uproot_job(
            fileset,
            "Events",
            apply_eff(
                accumulator=desired_output,
                effs = effs,
            ),
            exe,
            exe_args,
            chunksize=500000,
        )

### Closure and sanity checks

Make sure that the method closes in $p_T$ and gives reasonable agreement in the mass distribution.


In [None]:
# if inclusive number below is nan we have a bug in applying the efficiencies.
output['inclusive'].sum('pt', 'eta', 'phi', 'mass').values()

In [None]:
# inclusive and tagged numbers should agree within percent level
output['tagged'].sum('pt', 'eta', 'phi', 'mass').values()

In [None]:
# Closure in pt: overlay the pt projections of the 'tagged' and 'inclusive'
# histograms for the ttH dataset.
fig, ax = plt.subplots(1,1,figsize=(7,7))

tth = 'ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'
tagged_tth = output['tagged'][tth]
inclusive_tth = output['inclusive'][tth]

h1 = Hist1D.from_bincounts(
    tagged_tth.sum('phi', 'mass', 'eta', 'dataset').values()[()],
    tagged_tth.axis('pt').edges(),
)
h2 = Hist1D.from_bincounts(
    inclusive_tth.sum('phi', 'mass', 'eta', 'dataset').values()[()],
    inclusive_tth.axis('pt').edges(),
)

h1.plot()
h2.plot()

In [None]:
# Mass comparison: overlay the mass projections of the 'tagged' and 'inclusive'
# histograms for the ttH dataset.
tth = 'ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'
tagged_tth = output['tagged'][tth]
inclusive_tth = output['inclusive'][tth]

h1 = Hist1D.from_bincounts(
    tagged_tth.sum('phi', 'pt', 'eta', 'dataset').values()[()],
    tagged_tth.axis('mass').edges(),
)
h2 = Hist1D.from_bincounts(
    inclusive_tth.sum('phi', 'pt', 'eta', 'dataset').values()[()],
    inclusive_tth.axis('mass').edges(),
)

h1.plot()
h2.plot()

In [None]:
# Overlay the 'NH_true' and 'NH_weight' multiplicity histograms for the ttH
# dataset on a logarithmic y-axis.
fig, ax = plt.subplots(1,1,figsize=(7,7))

tth = 'ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'
nh_true_tth = output['NH_true'][tth]
nh_weight_tth = output['NH_weight'][tth]

h1 = Hist1D.from_bincounts(
    nh_true_tth.sum('dataset').values()[()],
    nh_true_tth.axis('multiplicity').edges(),
)
h2 = Hist1D.from_bincounts(
    nh_weight_tth.sum('dataset').values()[()],
    nh_weight_tth.axis('multiplicity').edges(),
)

h1.plot()
h2.plot()

ax.set_yscale('log')

### Compare prediction of tagged jets in the interesting mass window

In [None]:
output['inclusive']['ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'].integrate('mass', int_range=slice(100,150)).sum('phi', 'pt', 'eta', 'dataset').values()[()]

In [None]:
output['tagged']['ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'].integrate('mass', int_range=slice(100,150)).sum('phi', 'pt', 'eta', 'dataset').values()[()]

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the outputs of the two
# cells above (mass window 100-150) — confirm which is inclusive and which is
# tagged, and recompute from `output` instead of pasting values.
16166.0/14888.0

In [None]:
# Toy check of the combination formula 1 - prod_i(1 - e_i), evaluated per inner list (axis=1).
# NOTE(review): presumably e_i are per-jet tag efficiencies and the result is the
# probability that at least one jet in the event is tagged — confirm.
# The third entry is empty; ak.prod over an empty list should give 1, hence a result of 0 — confirm.
test_eff = ak.Array([[0.1,0.2], [0.15], []])
1-ak.prod(1-test_eff, axis=1)

In [None]:
np.zeros_like(ak.num(test_eff))

In [None]:
output['NH_true']['ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'].values()

In [None]:
output['NH_weight']['ttHTobb_M125_TuneCP5_13TeV-powheg-pythia8'].values()

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the 'NH_weight' and
# 'NH_true' outputs of the two cells above — confirm and recompute from `output`
# instead of pasting values.
54281.68960112/(5.3884000e+04+1.7600000e+02)

## WIP

In [None]:
def compute_darkness(r, g, b, a=1.0):
    """Return the alpha-scaled darkness of an RGBA colour (darkness = 1 - luminance).

    Luminance is the weighted sum 0.299*r + 0.587*g + 0.114*b; the result is
    a * (1 - luminance).

    Taken from Nick Amin: https://github.com/aminnj/yahist
    Version from Jonathan Guiang: https://gist.github.com/jkguiang/279cb4d2e68e64148afc62274df09f18
    """
    luminance = 0.299 * r + 0.587 * g + 0.114 * b
    darkness = 1.0 - luminance
    return a * darkness

def bin_text(counts, x_edges, y_edges, axes, cbar, errors=None, size=10, fmt=":0.2e"):
    """Write each non-zero bin's content as text on top of a 2D histogram.

    Taken from Nick Amin: https://github.com/aminnj/yahist
    Version from Jonathan Guiang: https://gist.github.com/jkguiang/279cb4d2e68e64148afc62274df09f18

    Parameters
    ----------
    counts : numpy array
        Bin contents. x is the fastest-varying index after flattening, so the
        array is read as counts[iy, ix].
    x_edges, y_edges : numpy arrays
        Bin edges; labels are placed at the bin centres.
    axes : object with a matplotlib-style ``text(x, y, s, **kwargs)`` method.
    cbar : object whose ``mappable.to_rgba(values)`` gives the RGBA colour of
        each bin; it is used to choose a readable text colour.
    errors : numpy array, optional
        Same shape as `counts`. When given, a second line "±<err>%" with two
        decimals is appended to each label.
    size : font size of the labels.
    fmt : format spec, including the leading ':', applied to the bin content.

    Returns
    -------
    None
    """
    show_errors = errors is not None
    x_centers = x_edges[1:]-(x_edges[1:]-x_edges[:-1])/2
    y_centers = y_edges[1:]-(y_edges[1:]-y_edges[:-1])/2

    label_template = r"{0"+fmt+"}"
    if show_errors:
        # "\n" has to be a real newline while the LaTeX part keeps its backslashes,
        # so only the LaTeX part is a raw string. This avoids the invalid "\p" and
        # "\%" escape sequences that a single non-raw literal would contain.
        label_template += "\n" + r"$\pm{1:0.2f}\%$"
    else:
        # Placeholder so that the column stack below always has four columns.
        errors = np.zeros(counts.shape)

    flat_counts = counts.flatten()
    # One row per bin: (x centre, y centre, count, error); rows of empty bins are dropped.
    xyz = np.c_[
        np.tile(x_centers, len(y_centers)),
        np.repeat(y_centers, len(x_centers)),
        flat_counts,
        errors.flatten()
    ][flat_counts != 0]

    # Black text by default, white text on cells whose colour is dark.
    r, g, b, a = cbar.mappable.to_rgba(xyz[:, 2]).T
    colors = np.zeros((len(xyz), 3))
    colors[compute_darkness(r, g, b, a) > 0.45] = 1

    for (x, y, count, err), color in zip(xyz, colors):
        axes.text(
            x,
            y,
            label_template.format(count, err),
            color=color,
            ha="center",
            va="center",
            fontsize=size,
            wrap=True,
        )

In [None]:
h2.divide(h1).counts

In [None]:
h2.edges[0]