# Main notebook for ParticleNet AK15 cc-tagger SF derivation 
## (`coffea`+`awkward` workflow)

The notebook aims to
 - Make the ROOT-format **templates** for fit
 - Produce **data/MC comparison plots** under some given event selection
 - Produce **H->cc signal and g->cc proxy jets comparison plots** on various jet observables
 
We adopt the `coffea`+`awkward` non-processor workflow in this notebook, illustrated as follows:

    Input files (flat ROOT-tuples derived from analysis NanoAOD)
    -> use the `coffea` event factory to load the branches as awkward arrays (in the lazy way)
    -> manipulate the awkward arrays
    -> produce histograms (`boost_histogram`)
    -> (1) convert to TH1D for ROOT template; or (2) plot with `mplhep` using `matplotlib` as backend
    
An earlier notebook using the `uproot`+`pandas` workflow is given in `ak15_sf_main_pd.ipynb`. The two notebooks have exactly the same goal and have a block-to-block correspondence.

# Make templates for fit

In [None]:
from coffea.nanoevents import NanoEventsFactory, TreeMakerSchema, BaseSchema
import awkward1 as ak
import uproot4 as uproot
import numpy as np
import math
import os

In [None]:
import numpy as np
import boost_histogram as bh
import matplotlib.pyplot as plt
import mplhep as hep
## Set to True to draw plots with Helvetica; the font has to be installed on the system
use_helvet = True
if not use_helvet:
    plt.style.use(hep.style.CMS)
else:
    ## NOTE: CMShelvet is the same object as hep.style.CMS, so the font override below also changes hep.style.CMS
    CMShelvet = hep.style.CMS
    CMShelvet['font.sans-serif'] = ['Helvetica', 'Arial']
    plt.style.use(CMShelvet)

import matplotlib as mpl
from cycler import cycler

def get_hist(array, bins=10, xmin=None, xmax=None, underflow=False, overflow=False, mergeflowbin=True, normed=False,
            weights=None, **kwargs):
    r"""Build a weighted 1D histogram from the input array.

    Arguments:
        array (np.ndarray): input array.
        bins (int, list or tuple of numbers, np.ndarray, bh.axis): number of regular bins,
            explicit bin edges, or an axis object that is used as is.
        xmin, xmax (None or number): axis range when `bins` is an int; default to `array.min()` / `array.max()`.
        underflow, overflow (bool): whether the axis gets an underflow / overflow bin.
        mergeflowbin (bool): if True, the axis is shortened by one bin on every side whose flow bin is enabled
            (one bin width for integer `bins`, one edge for explicit edges).
        weights (None, or np.ndarray): weights; unit weights are used if None.
        normed (bool): deprecated, ignored.
        **kwargs: accepted for call compatibility, ignored.

    Returns:
        hist (boost_histogram.Histogram) using `bh.storage.Weight()`.
    """
    if isinstance(bins, int):
        if xmin is None:
            xmin = array.min()
        if xmax is None:
            xmax = array.max()
        width = 1.*(xmax-xmin)/bins
        if mergeflowbin and underflow:
            xmin += width
            bins -= 1
        ## the upper side is controlled by `overflow` (previously this tested `underflow` a second time)
        if mergeflowbin and overflow:
            xmax -= width
            bins -= 1
        bins = bh.axis.Regular(bins, xmin, xmax, underflow=underflow, overflow=overflow)
    elif isinstance(bins, (list, tuple, np.ndarray)):
        if mergeflowbin and underflow:
            bins = bins[1:]
        if mergeflowbin and overflow:
            bins = bins[:-1]
        bins = bh.axis.Variable(bins, underflow=underflow, overflow=overflow)

    hist = bh.Histogram(bins, storage=bh.storage.Weight())
    if weights is None:
        weights = np.ones_like(array)
    hist.fill(array, weight=weights)
    return hist


def plot_hist(hists, normed=False, **kwargs):
    r"""Plot one or several boost_histogram histograms with `hep.histplot`.

    Arguments:
        hists (bh.Histogram, or list/tuple of them): histogram(s) to draw; values are read with flow bins included.
        normed (bool): if True, each histogram (and its errors) is divided by the sum of its contents.
        **kwargs: `bins` overrides the edges taken from the first histogram's first axis,
            `yerr` overrides the errors (default: sqrt of the variances);
            everything else is forwarded to `hep.histplot`.
    """

    if not isinstance(hists, (list, tuple)):
        hists = [hists]
    content = [h.view(flow=True).value for h in hists]
    bins = kwargs.pop('bins', hists[0].axes[0].edges)
    if 'yerr' in kwargs:
        yerr = kwargs.pop('yerr')
    else:
        yerr = [np.sqrt(h.view(flow=True).variance) for h in hists]
    if normed:
        for i, values in enumerate(content):
            contsum = sum(values)
            ## Divide out-of-place: `values` is a view into the histogram storage, so an in-place
            ## division would rescale the caller's histogram itself
            content[i] = values / contsum
            yerr[i] = yerr[i] / contsum
    if len(hists) == 1:
        content, yerr = content[0], yerr[0]
    hep.histplot(content, bins=bins, yerr=yerr, **kwargs)

## 1. Load files

Load the ROOT files as lazy awkward arrays using the `coffea` `NanoEventsFactory`.

In [None]:
year = 2018  ## config me! options: 2016, 2017, 2018
lumi = {2016: 35.92, 2017: 41.53, 2018: 59.74}

## Sample name -> tree file, relative to the per-year ntuple directory
_tree_dir = f'samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}'
_tree_files = {
    'qcd-mg-noht': 'mc/qcd-mg_tree.root',
    'qcd-herwig-noht': 'mc/qcd-herwig_tree.root',
    'top-noht': 'mc/top_tree.root',
    'v-qq-noht': 'mc/v-qq_tree.root',
    'jetht-noht': 'data/jetht_tree.root',
}

## Read every ROOT file into a lazy awkward array
arr = {}
for _sam, _relpath in _tree_files.items():
    arr[_sam] = NanoEventsFactory.from_file(f'{_tree_dir}/{_relpath}', schemaclass=BaseSchema).events()

## 2. Pre-processing

For data: apply the OR of all HT triggers to enhance statistics.

For MC: no HT trigger is applied; we call this strategy "MC substitute".

We define an attribute `maskdict` in each sample that stores masks corresponding to different selections.

In [None]:
def eval_expr(ak_array, expr, mask=None):
    """Evaluate the expression string `expr` on the fields of an awkward array, imitating `eval` in pandas.

    Every bare name in `expr` (except names starting with '_' and the module aliases
    'awkward', 'ak', 'np', 'numpy', 'math') is looked up as `ak_array[name]`.
    If `mask` is given, each looked-up field is replaced by `field.mask[mask]` before evaluation.

    NOTE: the expression is passed to Python's `eval`; only use it with trusted strings.
    """
    import ast

    module_aliases = {'awkward', 'ak', 'np', 'numpy', 'math'}
    referenced = {node.id for node in ast.walk(ast.parse(expr)) if isinstance(node, ast.Name)}
    field_names = sorted(name for name in referenced
                         if not name.startswith('_') and name not in module_aliases)

    namespace = {}
    for name in field_names:
        field = ak_array[name]
        namespace[name] = field if mask is None else field.mask[mask]
    namespace.update(math=math, numpy=np, np=np, awkward=ak, ak=ak)
    return eval(expr, namespace)

In [None]:
### ================ Pre-processing for data  ===================

## Baseline selection applied to data.
## Note that we use the OR of all HT triggers (some are pre-scaled triggers)

hlt_branches = {  ## the HLT_PFHT* branches to OR together depend on the year
    2016: ['HLT_PFHT125', 'HLT_PFHT200', 'HLT_PFHT250', 'HLT_PFHT300', 'HLT_PFHT350', 'HLT_PFHT400', 'HLT_PFHT475', 'HLT_PFHT600', 'HLT_PFHT650', 'HLT_PFHT800', 'HLT_PFHT900'],
    2017: ['HLT_PFHT180', 'HLT_PFHT250', 'HLT_PFHT370', 'HLT_PFHT430', 'HLT_PFHT510', 'HLT_PFHT590', 'HLT_PFHT680', 'HLT_PFHT780', 'HLT_PFHT890', 'HLT_PFHT1050', 'HLT_PFHT350'],
    2018: ['HLT_PFHT180', 'HLT_PFHT250', 'HLT_PFHT370', 'HLT_PFHT430', 'HLT_PFHT510', 'HLT_PFHT590', 'HLT_PFHT680', 'HLT_PFHT780', 'HLT_PFHT890', 'HLT_PFHT1050', 'HLT_PFHT350'],
}
htcut_incl = f"({' | '.join(hlt_branches[year])})"
basesel_ext_noht_prep = "passmetfilters & (fj_x_pt>200) & fj_x_is_qualified"
sl_prep = ['jetht-noht']

for sam in sl_prep:
    assert 'noht' in sam
    _events = arr[sam]
    _events.maskdict = {}
    _events.maskdict['hlt'] = eval_expr(_events, htcut_incl)
    for i in ['1','2']:
        ## Baseline selection for data: trigger OR combined with the per-jet selection
        print('baseline selection for data: ', sam, f'jet{i}')
        _jet_sel = basesel_ext_noht_prep.replace('fj_x', f'fj_{i}')
        _events.maskdict[f'fj_{i}_base'] = _events.maskdict['hlt'] & eval_expr(_events, _jet_sel)

In [None]:
## FOR TEST: inspect xsecWeight of the MG sample and the upper genWeight quantiles of the Herwig sample
## (to spot extremely large values)
from collections import Counter
print(Counter(np.array(arr['qcd-mg-noht'].xsecWeight)),'\n')
_herwig_genweight = np.array(arr['qcd-herwig-noht'].genWeight)
for i in [0.96, 0.98, 0.99]:
    print(np.quantile(_herwig_genweight, q=i))

In [None]:
### ================ Pre-processing for MC substitute  ===================

## Baseline selection applied to MC.
## No HT trigger is applied, based on the "MC substitute" strategy
basesel_noht_prep_subst = "passmetfilters & (fj_x_pt>200) & fj_x_is_qualified"
sl_prep_subst = ['subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht']  ## mark sample name with "subst_" as a reminder of MC substitute
for sam in sl_prep_subst:
    assert 'noht' in sam
    ## The "subst_" key is an alias: it points to the same array object as the un-prefixed key,
    ## so the maskdict set below is shared by both keys
    arr[sam] = arr[sam.replace('subst_','')]  ## use the name subst_ as a ref
    arr[sam].maskdict = {}
    for i in ['1','2']:
        print('baseline selection for: ', sam, f'jet{i}')
        arr[sam].maskdict[f'fj_{i}_base_subst'] = eval_expr(arr[sam], basesel_noht_prep_subst.replace('fj_x', f'fj_{i}'))
        ## Drop MG events with extremely large xsecWeight (coming from low HT sample in the HT-binned MG list)
        if sam == 'subst_qcd-mg-noht':
            arr[sam].maskdict[f'fj_{i}_base_subst'] = arr[sam].maskdict[f'fj_{i}_base_subst'] & eval_expr(arr[sam], 'xsecWeight<5.')
        ## Drop Herwig events with extremely large genWeight (above the 96% quantile of the whole sample)
        if sam == 'subst_qcd-herwig-noht':
            arr[sam].maskdict[f'fj_{i}_base_subst'] = arr[sam].maskdict[f'fj_{i}_base_subst'] & eval_expr(arr[sam], 'genWeight<{}'.format(np.quantile(np.array(arr[sam].genWeight), q=0.96)))
    ## Fix a 2016 bug: Herwig sample xsec is mistaken
    ## NOTE(review): the `xsecWeight_is_normed` field presumably guards against applying the factor twice when the cell is re-run -- confirm
    if year == 2016 and sam == 'subst_qcd-herwig-noht' and not hasattr(arr[sam], 'xsecWeight_is_normed'):
        arr[sam]['xsecWeight'] = arr[sam]['xsecWeight'] * 2400.
        arr[sam]['xsecWeight_is_normed'] = True

## Produce new variables used for fit (evaluated with the baseline mask of the respective sample type applied)
for sam in sl_prep + sl_prep_subst:
    for i in ['1','2']:
        ## data samples use the `fj_{i}_base` mask, MC substitute samples use `fj_{i}_base_subst`
        _mask = arr[sam].maskdict[f'fj_{i}_base'] if sam in sl_prep else arr[sam].maskdict[f'fj_{i}_base_subst']
        print('calculating new vars for: ', sam, f'jet{i}')
        ## mSV12_ptmax: sj1_sv1_masscor if sj1_sv1_pt is the larger one, otherwise sj2_sv1_masscor
        arr[sam][f'fj_{i}_mSV12_ptmax'] = eval_expr(arr[sam], f'(fj_{i}_sj1_sv1_pt>fj_{i}_sj2_sv1_pt)*fj_{i}_sj1_sv1_masscor + (fj_{i}_sj1_sv1_pt<=fj_{i}_sj2_sv1_pt)*fj_{i}_sj2_sv1_masscor', mask=_mask)
        arr[sam][f'fj_{i}_mSV12_ptmax_log'] = eval_expr(arr[sam], f'np.log(fj_{i}_mSV12_ptmax)', mask=_mask)
        ## mSV12_dxysig: same choice, but made on sj*_sv1_dxysig instead of sj*_sv1_pt
        arr[sam][f'fj_{i}_mSV12_dxysig'] = eval_expr(arr[sam], f'(fj_{i}_sj1_sv1_dxysig>fj_{i}_sj2_sv1_dxysig)*fj_{i}_sj1_sv1_masscor + (fj_{i}_sj1_sv1_dxysig<=fj_{i}_sj2_sv1_dxysig)*fj_{i}_sj2_sv1_masscor', mask=_mask)
        arr[sam][f'fj_{i}_mSV12_dxysig_log'] = eval_expr(arr[sam], f'np.log(fj_{i}_mSV12_dxysig)', mask=_mask)

## 3. Obtain reweight factors

We extract the following reweight factors. The first two sets are used in the nominal fit. The other two are for validation.

 1. **MC substitute-to-data reweight factor**: on the HT variable based on (pT, jet index) bins. The goal is to bring the shape of MC substitute back to the data shape in the inclusive region. Remember that the raw MC substitute yield is always much larger than data, because most HT triggers applied to data are pre-scaled triggers. New variables have the name `htwgt_(|herwig)`.

 2. **sfBDT reweight factor**: based on (pT, jet index) bins, to further reweight MC substitute back to data shape on the sfBDT variable. Since sfBDT>0.9 is imposed in the final fit region, the sfBDT shape discrepancy between the "reweighted MC substitute" and data may again cause $N_{total}$ difference for MC and data, after setting sfBDT>0.9 in the fit region. Therefore, we calculate the overall factor `sfbdtwgt_g90_(|herwig)_incl` in each (pT, jet index) bin, used in the nominal shape template; and the binned factor `sfbdtwgt_g90_(|herwig)_binned` used in the shape uncertainty extraction brought by the sfBDT shape mismodeling

 3. **Additional MC substitute-to-data reweight factor on $p_{T}$ only**: A possible replacement of the first two factors combined. This factor is only used in the validation fit. The goal for this validation is to check if different reweighting schemes may affect the SF fit results. New variables have the name `ad_ptwgt_(|herwig)`.
 
 4. **Proxy-to-signal reweight factor on $m_{SD}$ / $p_{T}$ / $\tau_{21}$**: based on the shape of "reweighted MC substitute (after the first two steps)" and the H->cc signal jet shape in the inclusive region. The factor is only used in the validation fit, in which we apply such reweight factor to both MC substitute and data to check if the SF results are affected. New variables have the name `(mass|pt|tau21)datamcwgt_(|herwig)`

In [None]:
def mask_and(arr, mask_list):
    """Return the element-wise AND of the masks in `arr.maskdict` named by `mask_list`."""
    selected_masks = [arr.maskdict[name] for name in mask_list]
    return np.logical_and.reduce(selected_masks)

def concat_array(arrdict, expr, sam_list, filter_list):
    """Evaluate `expr` on each sample, keep the entries passing all masks in `filter_list`,
    and concatenate the results over the samples into one numpy array.

    `sam_list` may be a single sample name or a list of names.
    """
    samples = sam_list if isinstance(sam_list, list) else [sam_list]
    selected = []
    for sam in samples:
        events = arrdict[sam]
        values = eval_expr(events, expr)
        selected.append(np.array(values[mask_and(events, filter_list)]))
    return np.concatenate(selected)

def mask_and_fj12(arr, mask_list):
    """Combine the `mask_and` results for fj_1 and fj_2.

    Every 'fj_x' in the mask names is replaced by 'fj_1', then by 'fj_2';
    the two resulting masks are concatenated (fj_1 part first).
    """
    per_jet = []
    for jet in ('fj_1', 'fj_2'):
        jet_mask_names = [name.replace('fj_x', jet) for name in mask_list]
        per_jet.append(mask_and(arr, jet_mask_names))
    return np.concatenate(per_jet)

def concat_array_fj12(arrdict, expr, sam_list, filter_list):
    """Combine the `concat_array` results for fj_1 and fj_2.

    'fj_x' is replaced by 'fj_1' (then 'fj_2') in both `expr` and the filter names;
    the two resulting arrays are concatenated (fj_1 part first).
    """
    per_jet = []
    for jet in ('fj_1', 'fj_2'):
        jet_expr = expr.replace('fj_x', jet)
        jet_filters = [name.replace('fj_x', jet) for name in filter_list]
        per_jet.append(concat_array(arrdict, jet_expr, sam_list, jet_filters))
    return np.concatenate(per_jet)

def calc_rwgt_akarray(arr, rwgt_edge, rwgt):
    """Map each value of the reweight variable to its reweight factor.

    `rwgt` holds one factor more than there are edges in `rwgt_edge`:
    `rwgt[0]` is used below the first edge, `rwgt[k+1]` inside [edge_k, edge_k+1),
    and `rwgt[-1]` at or above the last edge.
    """
    ## values below the first edge
    weights = (arr < rwgt_edge[0]) * rwgt[0]
    ## values inside one of the regular bins
    for k, (lo, hi) in enumerate(zip(rwgt_edge[:-1], rwgt_edge[1:]), start=1):
        inside = (arr >= lo) & (arr < hi)
        weights = weights + inside * rwgt[k]
    ## values at or above the last edge
    return weights + (arr >= rwgt_edge[-1]) * rwgt[-1]

In [None]:
### ================ 1. Reweight MC substitute to data: stored as variables "fj_x_htwgt" and "fj_x_htwgt_herwig" ===================

## Set to True to reuse the reweight factors stored in the pickle file written by a previous run of this cell,
## instead of recomputing them from the histograms
is_read_from_pickel = False

def extract_mc_to_data_ht_weight(arr, sl_rwgt, wgtstr_rwgt, wgtname):
    r"""Extract the "MC substitute to data" reweight factor on HT based on (pT, jet index) bins

    For each pT bin and jet index, the data and weighted MC histograms of HT are built
    (underflow and overflow bins included), their ratio is taken as the reweight factor,
    and the factor is accumulated into the column `wgtname` of every sample in `sl_rwgt`.
    The data and MC histograms are also plotted to `plots/wgtv5/`.

    If the module-level flag `is_read_from_pickel` is True, the histograms and factors are read
    from `plots/wgtv5/htwgt_{year}.pickle` instead of being recomputed.

    Arguments:
        arr: awkward array dict as input
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column ('fj_x' is replaced by 'fj_1' / 'fj_2')

    Returns:
        dict with keys 'ent_data', 'ent_mc', 'rwgt'; each is a dict keyed by pT label + jet label
        (e.g. 'pt200to250jet1') holding an array of length nbins+2
    """

    rwgt_var = 'ht'
    ## The binning info for (pT, HT) grid. Note that 2016 is different from 2017/18. The adopted HT grid is based on MC shape in each pT bin
    rwgt_edge_dic = {}
    rwgt_edge_dic[2016] = {
        'pt200to250': [300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100],
        'pt250to300': [350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1300],
        'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1350],
        'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt400to500': [600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt500toInf': [800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
    }
    rwgt_edge_dic[2017] = rwgt_edge_dic[2018] = {
    #         'pt200to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100, 1200], # deprecated
    #         'pt300to400': [500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1600], # deprecated
        'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900, 1000],
        'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100, 1200],
        'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1600],
        'pt400to500': [700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1800],
        'pt500toInf': [900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
    }

    ## (selection string, label) of every jet pT bin
    pt_bins = [
        ('(fj_x_pt>=200) & (fj_x_pt<250)', 'pt200to250'),
        ('(fj_x_pt>=250) & (fj_x_pt<300)', 'pt250to300'),
        ('(fj_x_pt>=300) & (fj_x_pt<350)', 'pt300to350'),
        ('(fj_x_pt>=350) & (fj_x_pt<400)', 'pt350to400'),
        ('(fj_x_pt>=400) & (fj_x_pt<500)', 'pt400to500'),
        ('(fj_x_pt>=500)', 'pt500toInf'),
    ]

    ## Initially fill the output column with 0, since we will fill the column iteratively for each pT bin
    for sam in sl_rwgt:
        for i in ['1','2']:
            arr[sam][wgtname.replace('fj_x', f'fj_{i}')] = ak.zeros_like(arr[sam][rwgt_var])

    if is_read_from_pickel: ## restore info from a previously stored pickle
        import pickle
        ## NOTE: pickle.load executes arbitrary code from the file; only read pickles written by this notebook
        with open(f'plots/wgtv5/htwgt_{year}.pickle', 'rb') as f:
            res = pickle.load(f)
            ## the pickle stores [MG result, Herwig result]
            res = res[0] if 'herwig' not in wgtname else res[1]
            ent_data, ent_mc, rwgt = res['ent_data'], res['ent_mc'], res['rwgt']
    else:
        ent_data, ent_mc, rwgt = {}, {}, {}

    ## Reweight separately on jet pT bins
    for ptsel, ptlab in pt_bins:
        ## Reweight separately for 1st or 2nd jet
        for i, lab in zip(['1','2'], ['jet1','jet2']):
            print (' -- ', ptsel, lab)
            rwgt_edge = rwgt_edge_dic[year][ptlab]

            ## The pT-bin masks are needed for the column assignment below in every mode,
            ## so they are built even when the factors themselves come from the pickle
            for sam in sl_rwgt+['jetht-noht']:
                arr[sam].maskdict[f'fj_{i}_{ptlab}'] = eval_expr(arr[sam], ptsel.replace('fj_x', f'fj_{i}'))

            ## Calculate the rwgt for the first time
            if not is_read_from_pickel:
                ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
                ent_data[ptlab+lab] = get_hist(concat_array(arr, expr=rwgt_var, sam_list=['jetht-noht'], filter_list=[f'fj_{i}_base', f'fj_{i}_{ptlab}']),
                                               bins=rwgt_edge, 
                                               weights=np.ones(np.sum(mask_and(arr['jetht-noht'], mask_list=[f'fj_{i}_base', f'fj_{i}_{ptlab}']))), 
                                               underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
                ent_mc[ptlab+lab]   = get_hist(concat_array(arr, expr=rwgt_var, sam_list=sl_rwgt, filter_list=[f'fj_{i}_base_subst', f'fj_{i}_{ptlab}']),
                                               bins=rwgt_edge,
                                               weights=concat_array(arr, expr=wgtstr_rwgt, sam_list=sl_rwgt, filter_list=[f'fj_{i}_base_subst', f'fj_{i}_{ptlab}']),
                                               underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
                ## Calculate the reweight factor
                rwgt[ptlab+lab] = np.nan_to_num(ent_data[ptlab+lab] / ent_mc[ptlab+lab], nan=0) # len=nbin+2
            print(ent_data[ptlab+lab], '\n', rwgt[ptlab+lab])

            ## Assign the reweight factor to the new column.
            ## Events outside this (pT, jet) selection are masked out and contribute 0 via fill_none
            for sam in sl_rwgt:
                _var = rwgt_var
                _wgtname = wgtname.replace('fj_x', f'fj_{i}')
                _mask = mask_and(arr[sam], mask_list=[f'fj_{i}_base_subst', f'fj_{i}_{ptlab}'])
                arr[sam][_wgtname] = arr[sam][_wgtname] + ak.fill_none(calc_rwgt_akarray(arr[sam][_var].mask[_mask], rwgt_edge, rwgt[ptlab+lab]), 0)
                print('midpoint: ', sam, _wgtname, arr[sam][_wgtname])

    # =========== plot ===========
    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    os.makedirs('plots/wgtv5', exist_ok=True)
    for _, ptlab in pt_bins:
        fig, ax = plt.subplots(figsize=(11,11))
        hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
        ## the flow bins are drawn as [0, first edge) and [last edge, 2500)
        plot_edges = [0]+list(rwgt_edge_dic[year][ptlab])+[2500]
        for lab in ['jet1', 'jet2']:
            hep.histplot(ent_data[ptlab+lab], bins=plot_edges, label=f'Data ({lab})')
            hep.histplot(ent_mc[ptlab+lab], bins=plot_edges, label=f'MC subst. ({lab})')
        ax.set_xlim(0, 2500); ax.set_xlabel('$H_{T}$ [GeV]', ha='right', x=1.0); ax.set_ylabel('Events / bin', ha='right', y=1.0); ax.legend()
        plt.savefig(f'plots/wgtv5/{year}_{ptlab}__{wgtname}.pdf')
        plt.savefig(f'plots/wgtv5/{year}_{ptlab}__{wgtname}.png')
    # ============================
    
    return {'ent_data':ent_data, 'ent_mc':ent_mc, 'rwgt':rwgt}

## Derive the HT reweight factors twice: once with the MG QCD sample list, once with the Herwig QCD sample list
_ht_wgtstr = f"{lumi[year]}*genWeight*xsecWeight*puWeight"
htwgt = extract_mc_to_data_ht_weight(
    arr,
    sl_rwgt=['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=_ht_wgtstr,
    wgtname='fj_x_htwgt',
)
htwgt_herwig = extract_mc_to_data_ht_weight(
    arr,
    sl_rwgt=['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=_ht_wgtstr,
    wgtname='fj_x_htwgt_herwig',
)

## On a first run, store both results so that a later run can read them back
if not is_read_from_pickel:
    import pickle
    with open(f'plots/wgtv5/htwgt_{year}.pickle', 'wb') as fw:
        pickle.dump([htwgt, htwgt_herwig], fw)

## Show the new column next to its inputs for the MG QCD events passing the jet-1 baseline selection
ak.to_pandas(arr['subst_qcd-mg-noht'][['ht', 'fj_1_pt', 'fj_1_htwgt']][arr['subst_qcd-mg-noht'].maskdict['fj_1_base_subst']])

In [None]:
### ================ 2. Extract the sfBDT>0.9 overall factor and binned fractor: stored as variable "sfbdtwgt_g90_incl", "sfbdtwgt_g90_binned"; similar for herwig ===================

def extract_further_sfbdt_weight(arr, sl_rwgt, wgtstr_rwgt, wgtname_binned, wgtname_incl):
    r"""Extract the "MC substitute to data" reweight factor (both overall and binned factor) further on sfBDT variable, after a sfBDT>0.9 selection

    For each jet pT bin, the data and weighted MC histograms of sfBDT (jet 1 and jet 2 combined) are built
    in the region sfBDT>0.9. Their bin-by-bin ratio gives the binned factor and the ratio of their sums
    gives the overall factor. The binned factor is assigned to events passing the baseline, pT-bin and
    sfBDT>0.9 masks; the overall factor to events passing the baseline and pT-bin masks.

    The pT-bin masks `fj_{i}_{ptlab}` are expected to already exist in `maskdict` of the MC samples
    and of 'jetht-noht'; this function does not create them.

    Arguments:
        arr: awkward array dict as input
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname_binned: the reweight name (the binned factors) stored as a new column
        wgtname_incl: the reweight name (the overall factor) stored as a new column
    """
    
    ## Initially fill the output column with 0, since we will fill the column iteratively for each pT bin
    for sam in sl_rwgt:
        for i in ['1','2']:
            arr[sam][wgtname_binned.replace('fj_x', f'fj_{i}')] = ak.zeros_like(arr[sam]['ht'])
            arr[sam][wgtname_incl.replace('fj_x', f'fj_{i}')]   = ak.zeros_like(arr[sam]['ht'])
    
    ## Reweight based on the sfBDT variable
    rwgt_var, nbin, xmin, xmax  = 'fj_x_sfBDT', 5, 0.9, 1.
    print('rwgt sfBDT bins: ', rwgt_var, nbin, xmin, xmax)
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    ## Requires the selection sfBDT>0.9 which is used in the fit region.
    ## The masks do not depend on the pT bin, so they are built once up front
    for sam in sl_rwgt+['jetht-noht']:
        arr[sam].maskdict['fj_1_sfBDT'] = eval_expr(arr[sam], 'fj_1_sfBDT>0.9')
        arr[sam].maskdict['fj_2_sfBDT'] = eval_expr(arr[sam], 'fj_2_sfBDT>0.9')
    
    ## Reweight separately on jet pT bins
    for ptlab in ['pt200to250', 'pt250to300', 'pt300to350', 'pt350to400', 'pt400to500', 'pt500toInf']:
        data_filters = ['fj_x_base', f'fj_x_{ptlab}', 'fj_x_sfBDT']
        mc_filters = ['fj_x_base_subst', f'fj_x_{ptlab}', 'fj_x_sfBDT']

        ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
        ## does not distinguish jet1 or jet2 on this reweighting
        ent_data = get_hist(concat_array_fj12(arr, expr=rwgt_var, sam_list=['jetht-noht'], filter_list=data_filters),
                            bins=rwgt_edge, 
                            weights=np.ones(np.sum(mask_and_fj12(arr['jetht-noht'], mask_list=data_filters))), 
                            underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ent_mc   = get_hist(concat_array_fj12(arr, expr=rwgt_var, sam_list=sl_rwgt, filter_list=mc_filters),
                            bins=rwgt_edge,
                            weights=concat_array_fj12(arr, expr=wgtstr_rwgt, sam_list=sl_rwgt, filter_list=mc_filters),
                            underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ## Calculate the reweight factor
        rwgt = np.nan_to_num(ent_data / ent_mc, nan=0) # len=nbin+2
        sum_data, sum_mc = sum(ent_data), sum(ent_mc)
        print (ent_data, rwgt, 'incl:', sum_data / sum_mc)

        ## Assign the reweight factor to the new column
        for sam in sl_rwgt:
            for i in ['1','2']:
                ## binned factor: events passing baseline, pT bin and sfBDT>0.9; all others contribute 0
                _var = rwgt_var.replace('fj_x', f'fj_{i}')
                _wgtname = wgtname_binned.replace('fj_x', f'fj_{i}')
                _mask = mask_and(arr[sam], mask_list=[f'fj_{i}_base_subst', f'fj_{i}_{ptlab}', f'fj_{i}_sfBDT'])
                arr[sam][_wgtname] = arr[sam][_wgtname] + ak.fill_none(calc_rwgt_akarray(arr[sam][_var].mask[_mask], rwgt_edge, rwgt), 0)
                
                ## overall factor: events passing baseline and pT bin (no sfBDT requirement)
                _wgtname = wgtname_incl.replace('fj_x', f'fj_{i}')
                _mask = mask_and(arr[sam], mask_list=[f'fj_{i}_base_subst', f'fj_{i}_{ptlab}'])
                arr[sam][_wgtname] = arr[sam][_wgtname] + ak.fill_none(_mask * sum_data / sum_mc, 0)
                print('midpoint: ', sam, _wgtname, arr[sam][_wgtname])

## Derive the sfBDT factors twice: once with the MG QCD sample list, once with the Herwig QCD sample list
for _qcd_sample, _tag in [('subst_qcd-mg-noht', ''), ('subst_qcd-herwig-noht', '_herwig')]:
    extract_further_sfbdt_weight(
        arr,
        sl_rwgt=[_qcd_sample, 'subst_top-noht', 'subst_v-qq-noht'],
        wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*fj_x_htwgt{_tag}",
        wgtname_binned=f'fj_x_sfbdtwgt_g90{_tag}_binned',
        wgtname_incl=f'fj_x_sfbdtwgt_g90{_tag}_incl',
    )

## Show the new column next to its inputs for the MG QCD events passing the jet-1 baseline selection
ak.to_pandas(arr['subst_qcd-mg-noht'][['fj_1_pt', 'fj_1_sfBDT', 'fj_1_sfbdtwgt_g90_incl']][arr['subst_qcd-mg-noht'].maskdict['fj_1_base_subst']])

In [None]:
### ================ 3. [additional] Reweight MC subsitute to data on pT: stored as variable "ad_ptwgt", "ad_ptwgt_herwig" ===================

def extract_mc_to_data_pt_weight(arr, sl_rwgt, wgtstr_rwgt, wgtname):
    r"""Extract the "MC substitute to data" reweight factor on pT as an optional choice

    For each jet index, the data and weighted MC histograms of the jet pT are built from events passing
    the baseline and `fj_{i}_sfBDT` masks; their ratio is the reweight factor. The factor is written to
    the column `wgtname` for events passing the MC baseline mask; other events are masked in that column.

    The `fj_{i}_sfBDT` masks are expected to already exist in `maskdict` of the MC samples
    and of 'jetht-noht'; this function does not create them.

    Arguments:
        arr: awkward array dict as input
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column ('fj_x' is replaced by 'fj_1' / 'fj_2')
    """
    
    # Apply simple 1D reweight to pT
    rwgt_var, nbin, xmin, xmax  = 'fj_x_pt', 20, 200., 1200.
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)
    
    ## Reweight separately on 1st/2nd jet
    for i, lab in zip(['1','2'], ['jet1','jet2']):
        ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
        ent_data = get_hist(concat_array(arr, expr=rwgt_var.replace('fj_x', f'fj_{i}'), sam_list=['jetht-noht'], filter_list=[f'fj_{i}_base', f'fj_{i}_sfBDT']),
                            bins=rwgt_edge, 
                            ## `mask_and` takes `mask_list` (passing `filter_list=` raised a TypeError)
                            weights=np.ones(np.sum(mask_and(arr['jetht-noht'], mask_list=[f'fj_{i}_base', f'fj_{i}_sfBDT']))), 
                            underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ent_mc   = get_hist(concat_array(arr, expr=rwgt_var.replace('fj_x', f'fj_{i}'), sam_list=sl_rwgt, filter_list=[f'fj_{i}_base_subst', f'fj_{i}_sfBDT']),
                            bins=rwgt_edge,
                            weights=concat_array(arr, expr=wgtstr_rwgt, sam_list=sl_rwgt, filter_list=[f'fj_{i}_base_subst', f'fj_{i}_sfBDT']),
                            underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ## Calculate the reweight factor
        rwgt = np.nan_to_num(ent_data / ent_mc, nan=0) # len=nbin+2
        print (ent_data, rwgt)
        
        ## assign the reweight factor to the new column
        for sam in sl_rwgt:
            _var = rwgt_var.replace('fj_x', f'fj_{i}')
            _wgtname = wgtname.replace('fj_x', f'fj_{i}')
            _mask = mask_and(arr[sam], mask_list=[f'fj_{i}_base_subst'])
            arr[sam][_wgtname] = calc_rwgt_akarray(arr[sam][_var].mask[_mask], rwgt_edge, rwgt)  ## fill the new column directly as a masked array
            print('midpoint: ', sam, _wgtname, arr[sam][_wgtname])
        
## Derive the pT reweight factors twice: once with the MG QCD sample list, once with the Herwig QCD sample list
_pt_wgtstr = f"{lumi[year]}*genWeight*xsecWeight*puWeight"
for _qcd_sample, _pt_wgtname in [('subst_qcd-mg-noht', 'fj_x_ad_ptwgt'), ('subst_qcd-herwig-noht', 'fj_x_ad_ptwgt_herwig')]:
    extract_mc_to_data_pt_weight(
        arr,
        sl_rwgt=[_qcd_sample, 'subst_top-noht', 'subst_v-qq-noht'],
        wgtstr_rwgt=_pt_wgtstr,
        wgtname=_pt_wgtname,
    )

## Show the weight columns derived so far for the MG QCD events passing the jet-1 baseline selection
ak.to_pandas(arr['subst_qcd-mg-noht'][['ht', 'fj_1_pt', 'fj_1_htwgt', 'fj_1_sfbdtwgt_g90_incl', 'fj_1_ad_ptwgt']][arr['subst_qcd-mg-noht'].maskdict['fj_1_base_subst']])

In [None]:
### ================ 4. [additional] Reweight MC (proxy jet) to H->cc signal jet on either mass/pT/tau21: stored as variable "(mass|pt|tau21)datamcwgt"; similar for herwig  ===================

# First load the h->cc signal ntuple and apply the selection used in the analysis
# NOTE(review): the path points to a 2016 production whatever `year` is set to -- confirm this is intended
arr['vhcc-2L'] = NanoEventsFactory.from_file('samples/trees/20200906_VH_extfillsv_2016_2L/mc/vhcc_tree.root', schemaclass=BaseSchema).events()

boosted = ' & '.join([
    "(v_pt>200)",
    "(ak15_pt>200)",
    "(dphi_V_ak15>2.5)",
    "(ak15_sdmass>50)",
    "(ak15_sdmass<200)",
])
basecut_vhcc_2L = ' & '.join([
    "(v_mass>75)",
    "(v_mass<105)",
    "(((np.abs(lep1_pdgId)==11) & passTrigEl) | ((np.abs(lep1_pdgId)==13) & passTrigMu))",
    boosted,
    "(n_ak4<3)",
])
arr['vhcc-2L'].maskdict = {}
arr['vhcc-2L'].maskdict['base'] = eval_expr(arr['vhcc-2L'], basecut_vhcc_2L)

def extract_mc_to_signal_weight(arr, sl_rwgt, wgtstr_rwgt, wgtname, rwgt_info):
    r"""Extract the "MC substitute (proxy) jet to H->cc signal jet" reweight factors on a given variable.

    The normalized h->cc signal histogram is divided bin-by-bin by the normalized MC proxy-jet
    histogram (underflow/overflow bins included). The ratio, clipped to [0, 50], is stored as a new
    column for every sample in `sl_rwgt` and for the data sample 'jetht-noht', for both jets (fj_1, fj_2).

    Arguments:
        arr: awkward array dict as input; modified in place (new weight columns are added)
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column ('fj_x' is replaced by 'fj_1' / 'fj_2')
        rwgt_info: tuple (rwgt_var, nbin, xmin, xmax, rwgt_var_nom), i.e. the proxy-jet variable,
            the uniform binning, and the name of the same variable in the signal ntuple

    Returns:
        None. The result is written into `arr`.
    """

    # Reweight info extracted from the function argument
    rwgt_var, nbin, xmin, xmax, rwgt_var_nom = rwgt_info
    print('rwgt info: ', rwgt_var, nbin, xmin, xmax)
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    ## Get MC and h->cc signal histogram. Note: consider underflow & overflow bins, hence len = nbins+2
    ## The fit-region selection enters only through the masks named in `filter_list`.
    ## NOTE(review): the 'fj_x_sfBDT' mask is defined outside this function; presumably it encodes sfBDT>0.9 -- confirm
    wgt_mc = concat_array_fj12(arr, expr=wgtstr_rwgt, sam_list=sl_rwgt, filter_list=['fj_x_base_subst', 'fj_x_sfBDT'])
    yield_mc = wgt_mc.sum()
    ent_mc = get_hist(concat_array_fj12(arr, expr=rwgt_var, sam_list=sl_rwgt, filter_list=['fj_x_base_subst', 'fj_x_sfBDT']),
                      bins=rwgt_edge,
                      weights=wgt_mc,
                      underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value

    wgt_hcc = concat_array(arr, expr='genWeight*xsecWeight*puWeight', sam_list=['vhcc-2L'], filter_list=['base'])
    yield_hcc = wgt_hcc.sum()
    ent_hcc = get_hist(concat_array(arr, expr=rwgt_var_nom, sam_list=['vhcc-2L'], filter_list=['base']),
                       bins=rwgt_edge,
                       weights=wgt_hcc,
                       underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value

    ## Calculate the reweight factors for the two normalized histograms, and clip to (0, 50).
    ## Empty MC bins give 0/0 (nan -> 0) or x/0 (inf -> huge finite value -> clipped to 50); these
    ## divisions are expected, so the corresponding numpy floating-point warnings are silenced here.
    with np.errstate(divide='ignore', invalid='ignore'):
        rwgt = np.nan_to_num((ent_hcc/yield_hcc) / (ent_mc/yield_mc), nan=0) # len=nbin+2
    rwgt = np.clip(rwgt, 0, 50)
    print (ent_hcc, rwgt)

    ## assign the reweight factor to the new column (to both MC and data)
    ## list(...) lets callers pass the sample list as a tuple as well
    for sam in list(sl_rwgt) + ['jetht-noht']:
        for i in ['1','2']:
            _var = rwgt_var.replace('fj_x', f'fj_{i}')
            _wgtname = wgtname.replace('fj_x', f'fj_{i}')
            ## MC uses the substitute base selection, data uses the plain base selection
            _mask = mask_and(arr[sam], mask_list=[f'fj_{i}_base_subst'] if sam!='jetht-noht' else [f'fj_{i}_base'])
            arr[sam][_wgtname] = calc_rwgt_akarray(arr[sam][_var].mask[_mask], rwgt_edge, rwgt)  ## fill the new column directly as a masked array
            print('midpoint: ', sam, _wgtname, arr[sam][_wgtname])
    
## For every reweight variable (mass, pt, tau21) derive two sets of factors:
## first with the MadGraph QCD sample list, then with the Herwig one ("_herwig" suffix)
_tosig_common_wgt = f"{lumi[year]}*genWeight*xsecWeight*puWeight"
for _tosig_var, _tosig_info in [
    ('mass',  ('fj_x_sdmass', 15, 50, 200, 'ak15_sdmass')),
    ('pt',    ('fj_x_pt', 20, 200, 1200, 'ak15_pt')),
    ('tau21', ('fj_x_tau21', 20, 0, 1, 'ak15_tau21')),
]:
    for _tosig_qcd, _tosig_suffix in [
        ('subst_qcd-mg-noht', ''),
        ('subst_qcd-herwig-noht', '_herwig'),
    ]:
        extract_mc_to_signal_weight(
            arr,
            sl_rwgt=[_tosig_qcd, 'subst_top-noht', 'subst_v-qq-noht'],
            wgtstr_rwgt=f"{_tosig_common_wgt}*fj_x_htwgt{_tosig_suffix}*fj_x_sfbdtwgt_g90{_tosig_suffix}_incl",
            wgtname=f'fj_x_{_tosig_var}datamcwgt{_tosig_suffix}',
            rwgt_info=_tosig_info,
        )

## Cell output: show the leading-jet variables next to their new weights for data events passing the base selection
ak.to_pandas(
    arr['jetht-noht'][
        ['fj_1_sdmass', 'fj_1_massdatamcwgt', 'fj_1_pt', 'fj_1_ptdatamcwgt', 'fj_1_tau21', 'fj_1_tau21datamcwgt']
    ][arr['jetht-noht'].maskdict['fj_1_base']]
)

## 4. Make ROOT templates

In this step we produce the ROOT templates from the awkward arrays prepared above. The outputs are ROOT files organized in a neat directory structure. After further reorganization, they can be used as the Higgs Combine input to perform the fit.

As a reference, we provide an example of the output files and their structure. 
E.g., for a **given fit variable**, **given tagger WP** and a **certain jet-pT bin** for **a single fit**, the output ROOT templates should include the pass and fail MC template in the B/C/L flavors, the data template, and the MC systematics for all specified shape uncertainties. The files are organized in the following structure:
```
─── 20201115_SF2017_AK15_qcd_subst_pst_ptw50_TP_msv12_dxysig_log_var22binsv2  [use variable: msv12_dxysig_log, Tight WP]
    └── Cards
        └── bdt900
            ├── pt200to250                 [given pT bin]
            │   ├── nominal                    [the nominal histograms]
            │   │   ├── inputs_fail.root           [include four TH1D: flvC, flvB, flvL, data_obs]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBDown                 [shape uncertainty plots]
            │   │   ├── inputs_fail.root           [include three TH1D: flvC_fracBBDown, flvB_fracBBDown, flvL_fracBBDown]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBUp                   [..]
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdKdeSystDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdKdeSystUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdSystDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdSystUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTFloAroundDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTFloAroundUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   └── sfBDTRwgtUp
            │       ├── inputs_fail.root
            │       └── inputs_pass.root
            ├── pt250to300
            │   ├── ...
```

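The template making is organized as a main launcher (`launch_maker`, depth 0) that drives three nested functions: `wrapperPt` (depth 1), `makeTemplatesWrapper` (depth 2) and `makeTemplates` (depth 3).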

In [None]:
#### ================================ Global parameters: config me! ================================ ####
# Template-making mode. The accepted values are listed in the note string right below and are
# validated by the consistency check that follows this configuration section.
g_make_template_mode = 'main'
r"""Options:
        main           : the main fit
        val_pt         : the validation fit -- to use an optional MC subsitute-to-data strategy, i.e. on pT variable only
        val_tosig_mass : the validation fit -- additionally reweight MC & data to h->cc signal jet on mass
        val_tosig_pt   : the validation fit -- additionally reweight MC & data to h->cc signal jet on pt  
        val_tosig_tau21: the validation fit -- additionally reweight MC & data to h->cc signal jet on tau21
        val_vary_sfbdt : the validation fit -- varying sfBDT cut value and drop sfBDT* uncertaint
        val_crop_bin   : the validation fit -- cropping the marginal bins for fit
"""

# Interpolated into every output directory path built by the lambdas stored in g_fitinfo.
# `year` is defined outside this cell.
g_outdir_prefix = f'20201115_SF{year}_AK15_qcd_subst_pst_ptw50_ak'
r"""Prefix for the output dir name """

# Switches read by makeTemplatesWrapper: an uncertainty type is processed only if its key is
# present here with a truthy value.
g_make_unce_types = {'nominal':True, 'pu':True, 'fracBB':True, 'fracCC':True, 'fracLight':True, 'psWeightIsr':False, 'psWeightFsr':False, 'sfBDTRwgt':True, 'sfBDTFloAround':True}
r"""The uncertainty types used in the fit. Use False or remove the key to disable an certain unce type
    Note: "qcdSyst" and "qcdKdeSyst" is not used in this verision. "psWeightIsr" and "psWeightFsr" works fine in 2018 while in 2016/17 one need to first garantee the 2018 histograms exist
          so the unce can be transferred.
""" # for test, we disable psWeightIsr/Fsr

# Maps a g_fitinfo key (fit variable id) to the list of tagger working points to process;
# launch_maker iterates over this dict.
g_do_fit_for = { # for test, we launch the main fit var (1) only
    1: ['TP', 'MP', 'LP'],
#     2: ['TP', 'MP', 'LP'],
#     3: ['TP', 'MP', 'LP'],
}
r""" Do fit for which variable and which WPs"""
#### =============================================================================================== ####

## Sanity checks on the global parameters
if g_make_template_mode not in ('main', 'val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_vary_sfbdt', 'val_crop_bin'):
    raise RuntimeError('Specified mode cannot be recognized.')

if g_make_template_mode == 'val_crop_bin':
    ## The cropped-bin validation runs on the cropped main variable (901) and nothing else
    if set(g_do_fit_for) != {901}:
        print('Warning: for validation fit on cropping the marginal bins, set the fit information to the cropped main variable (901) only')
        g_do_fit_for = {901: ['TP', 'MP', 'LP']}
elif g_make_template_mode != 'main':
    ## Every other validation mode runs on the main variable (1) and nothing else
    if set(g_do_fit_for) != {1}:
        print('Warning: for validation fit, set the fit information to the main variable (1) only')
        g_do_fit_for = {1: ['TP', 'MP', 'LP']}

## The list of sfBDT cut values to loop over.
## Note: the sfBDTFloAround uncertainty needs the nominal histograms for the cut values 0.85 and 0.95
## to exist first, which is why 0.9 comes last in the default list.
if g_make_template_mode == 'val_vary_sfbdt':
    ## When the sfBDT cut itself is varied, the sfBDT* uncertainties are dropped
    g_make_unce_types.pop('sfBDTRwgt', None)
    g_make_unce_types.pop('sfBDTFloAround', None)
    g_sfBDT_val_list = [0.84, 0.86, 0.88, 0.90, 0.92, 0.94] ## for validation: varying sfBDT
else:
    g_sfBDT_val_list = [0.85, 0.95, 0.9]

## Fit info: in the format of [ (fit var, nbins/edges, xmin/None, xmax/None, (underflow, overflow), label), outputdir lambda func ]
## The keys are the fit-variable ids referenced by g_do_fit_for. Each lambda takes
## (wp, bdt, pt_range, sys_name) and returns the output directory (with trailing slash) for that template set.
## NOTE(review): entry 901 builds exactly the same directory path as entry 1, so running both with the
## same g_outdir_prefix would write to the same files -- confirm this is intended.
g_fitinfo = {
    1: [ ##  main fit var
        ('fj_x_mSV12_dxysig_log', [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, (True, True), 'mSV12_dxysig_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    2: [ ## the other var for validation
        ('fj_x_mSV12_ptmax_log', [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, (True, True), 'mSV12_ptmax_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_ptmax_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    3: [ ## the other var for validation
        ('fj_x_btagcsvv2', [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, (True, True), 'CSVv2'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_csvv2_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    901: [ ## crop the marginal bins for the main var as a validation
        ('fj_x_mSV12_dxysig_log', [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8], None, None, (False, False), 'mSV12_dxysig_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
}
## Cache filled by makeTemplates in the qcdSystUp / qcdKdeSystUp routines, keyed by
## (sys_name, 'pass'/'fail', category), so the histogram can be recycled in the "Down" routine
g_hist_qcdsyst = {}


## Tagger values in use
## Per-WP lower thresholds; launch_maker turns them into the (low, high] tagger-score range of each WP
g_map_tagger_val = {'TP':0.95, 'MP':0.90, 'LP':0.80}

    
## Necessary KDE parameters used in qcdKdeSyst unce
## Both tables are indexed in makeTemplates by the fit variable name (first element of the g_fitinfo
## tuple, e.g. 'fj_x_mSV12_dxysig_log'), so every fit variable needs an 'fj_x_'-prefixed key.
## The un-prefixed mSV12 keys are kept as well for backward compatibility.
g_custom_kde_bw = {
    'fj_x_btagcsvv2': 15,
    'fj_x_mSV12_ptmax_log': 4,
    'fj_x_mSV12_dxysig_log': 4,
    'mSV12_ptmax_log': 4,
    'mSV12_dxysig_log': 4,
}
## Lower edges of the bins that are excluded from the KDE integration
g_custom_kde_binmask = {
    'fj_x_btagcsvv2': [0],
    'fj_x_mSV12_ptmax_log': [-0.4,1.8,2.5,3.2],
    'fj_x_mSV12_dxysig_log': [-0.8,-0.4,1.8,2.5],
    'mSV12_ptmax_log': [-0.4,1.8,2.5,3.2],
    'mSV12_dxysig_log': [-0.8,-0.4,1.8,2.5],
}

def launch_maker():
    r"""Depth 0: Entry point that makes all templates requested by the global parameters.

    Registers the flavor masks, the tagger pass/fail masks and the sfBDT-cut masks in the
    ``maskdict`` of each sample, then calls ``wrapperPt`` once per
    (fit variable, working point, sfBDT cut value) combination.
    Reads the module-level ``arr``, ``lumi``, ``year`` and the ``g_*`` configuration.
    """

    mc_samples = ['subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht']
    all_samples = mc_samples + ['jetht-noht']
    jet_ids = ['1', '2']

    ## Flavor masks for the MC samples, built from the b-/c-hadron counts of each jet
    flavor_exprs = {
        'flvB': 'fj_{i}_nbhadrons>=1',
        'flvC': '(fj_{i}_nbhadrons==0) & (fj_{i}_nchadrons>=1)',
        'flvL': '(fj_{i}_nbhadrons==0) & (fj_{i}_nchadrons==0)',
    }
    for sam in mc_samples:
        for i in jet_ids:
            for flv, template in flavor_exprs.items():
                arr[sam].maskdict[f'fj_{i}_{flv}'] = eval_expr(arr[sam], template.format(i=i))

    ## Tagger-score range (low, high] of each WP: a WP ends where the next tighter one starts
    tp_val, mp_val, lp_val = g_map_tagger_val['TP'], g_map_tagger_val['MP'], g_map_tagger_val['LP']
    tagger_range = {'TP': (tp_val, 1.0), 'MP': (mp_val, tp_val), 'LP': (lp_val, mp_val)}

    ## Extra weight column used by the "reweight to signal" validation modes (applied to MC and data)
    tosig_wgt = {
        'val_tosig_mass': 'fj_x_massdatamcwgt',
        'val_tosig_pt': 'fj_x_ptdatamcwgt',
        'val_tosig_tau21': 'fj_x_tau21datamcwgt',
    }

    for fit_id, wp_list in g_do_fit_for.items():
        for wp in wp_list:
            low, high = tagger_range[wp]

            ## Masks for passing / failing the tagger at this WP ("fail" is everything outside the range)
            for sam in all_samples:
                for i in jet_ids:
                    score = f'fj_{i}_ParticleNetMD_XccVsQCD'
                    arr[sam].maskdict[f'fj_{i}_tagger_pass'] = eval_expr(arr[sam], f'({score}>{low:.3f}) & ({score}<={high:.3f})')
                    arr[sam].maskdict[f'fj_{i}_tagger_fail'] = eval_expr(arr[sam], f'({score}<={low:.3f}) | ({score}>{high:.3f})')

            ## Fit variable description and the function building the output directory
            fitinfo, outdir_func = g_fitinfo[fit_id]

            for sfBDT_val in g_sfBDT_val_list:
                ## Arguments of the main fit; a fresh dict is built for every sfBDT value
                base_wgt = f'genWeight*xsecWeight*puWeight*{lumi[year]}'
                nominal_wgt = base_wgt + '*fj_x_htwgt*fj_x_sfbdtwgt_g90_incl'
                args = {
                    'wgtstr_dm': nominal_wgt,
                    'wgtstr_dm_data': None,
                    'sl_dm': ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                    'sl_dm_herwig': ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                    'categories_dm': ['flvL', 'flvB', 'flvC', 'data'],
                    'base_masks': {
                        'data': ['fj_x_base'],
                        'mc':   ['fj_x_base_subst'],
                    },
                }

                ## Validation modes replace the MC weight string (and possibly add a data weight)
                if g_make_template_mode == 'val_pt':
                    args['wgtstr_dm'] = base_wgt + '*fj_x_ad_ptwgt'
                    args['wgtstr_dm_data'] = None
                elif g_make_template_mode in tosig_wgt:
                    extra_wgt = tosig_wgt[g_make_template_mode]
                    args['wgtstr_dm'] = nominal_wgt + '*' + extra_wgt
                    args['wgtstr_dm_data'] = extra_wgt

                ## Masks for the sfBDT cut; the mask name is the cut expression itself
                for sam in all_samples:
                    for i in jet_ids:
                        cut = f'fj_{i}_sfBDT>{sfBDT_val}'
                        arr[sam].maskdict[cut] = eval_expr(arr[sam], cut)

                ## Fix WP and sfBDT value in the output-directory function before going one level down
                wrapperPt(
                    arr, fitinfo,
                    lambda pt_range, sys_name: outdir_func(wp, sfBDT_val, pt_range, sys_name),
                    sfBDT_val, args,
                    ext_masks=[f'fj_x_sfBDT>{sfBDT_val}'],
                )

In [None]:
def wrapperPt(arr, fitinfo, outdir_func, sfBDT_val, args, ext_masks):
    r"""Depth 1: Loop over the jet-pT bins and delegate all following steps to ``makeTemplatesWrapper``.

    For each bin, the ``pt_range`` argument of ``outdir_func`` is fixed to the bin edges and the
    mask name ``fj_x_<pt label>`` is appended to ``ext_masks``.
    """

    ## (bin edges, label used in the mask name)
    pt_bins = [
        ((200, 250), 'pt200to250'),
        ((250, 300), 'pt250to300'),
        ((300, 350), 'pt300to350'),
        ((350, 400), 'pt350to400'),
        ((400, 500), 'pt400to500'),
        ((500, 100000), 'pt500toInf'),
    ]

    for pt_range, ptlab in pt_bins:
        print ('pt range:', pt_range)

        def outdir_for_sys(sys_name):
            ## Only the systematic name is left open for the next level
            return outdir_func(pt_range, sys_name)

        masks_with_pt = ext_masks + [f'fj_x_{ptlab}']
        makeTemplatesWrapper(arr, fitinfo, outdir_for_sys, sfBDT_val, args, ext_masks=masks_with_pt)

In [None]:
def makeTemplatesWrapper(arr, fitinfo, outdir_func, sfBDT_val, args, ext_masks):
    r"""Depth 2: Decide which templates (nominal and/or shape uncertainties) to make, and make them.

    The nominal template is made whenever it is enabled. The uncertainty templates are made only
    for the last value of ``g_sfBDT_val_list`` (or for every value in the 'val_vary_sfbdt' mode).
    Each template is produced by ``makeTemplates`` with the systematic name and the MC weight
    string that belongs to it.
    """

    nominal_wgt = args['wgtstr_dm']

    def enabled(unce_type):
        ## Active only if the key exists in g_make_unce_types with a truthy value
        return bool(g_make_unce_types.get(unce_type))

    def produce(sys_name, wgtstr_dm_sys):
        makeTemplates(arr, fitinfo, outdir_func(sys_name), sys_name, wgtstr_dm_sys, args, ext_masks)

    def up_down(name, wgt_up, wgt_down):
        return [(name + 'Up', wgt_up), (name + 'Down', wgt_down)]

    def frac_wgt(scale, sel):
        ## Scale the jets passing `sel`, leave all the others at weight 1
        return f'{nominal_wgt}*({scale}*({sel}) + 1.0*(np.logical_not({sel})))'

    if enabled('nominal'):
        produce('nominal', nominal_wgt)

    ## The uncertainty templates are only needed in the sfBDT>0.9 case (except for the validation varying the sfBDT)
    if sfBDT_val != g_sfBDT_val_list[-1] and g_make_template_mode != 'val_vary_sfbdt':
        return

    sel_cc = '(fj_x_nbhadrons==0) & (fj_x_nchadrons>1)'
    sel_light = '(fj_x_nbhadrons==0) & (fj_x_nchadrons==0)'
    sfbdt_incl, sfbdt_binned = 'fj_x_sfbdtwgt_g90_incl', 'fj_x_sfbdtwgt_g90_binned'

    variations = [
        ## Uncertainties obtained purely by changing the weight string
        ('pu', up_down('pu',
                       nominal_wgt.replace('puWeight', 'puWeightUp'),
                       nominal_wgt.replace('puWeight', 'puWeightDown'))),
        ('fracBB', up_down('fracBB',
                           nominal_wgt + '*(1.2*(fj_x_nbhadrons>1) + 1.0*(fj_x_nbhadrons<=1))',
                           nominal_wgt + '*(0.8*(fj_x_nbhadrons>1) + 1.0*(fj_x_nbhadrons<=1))')),
        ('fracCC', up_down('fracCC', frac_wgt('1.2', sel_cc), frac_wgt('0.8', sel_cc))),
        ('fracLight', up_down('fracLight', frac_wgt('1.2', sel_light), frac_wgt('0.8', sel_light))),
        ## Uncertainties that keep the nominal weight string here and get their *special treatment*
        ## inside the depth-3 function
        ('qcdSyst', up_down('qcdSyst', nominal_wgt, nominal_wgt)),
        ('qcdKdeSyst', up_down('qcdKdeSyst', nominal_wgt, nominal_wgt)),
        ('psWeightIsr', up_down('psWeightIsr', nominal_wgt, nominal_wgt)),
        ('psWeightFsr', up_down('psWeightFsr', nominal_wgt, nominal_wgt)),
        ## Up: binned instead of inclusive sfBDT reweighting; Down: mirrored around the inclusive one
        ('sfBDTRwgt', up_down('sfBDTRwgt',
                              nominal_wgt.replace(sfbdt_incl, sfbdt_binned),
                              nominal_wgt.replace(sfbdt_incl, f'(2*{sfbdt_incl}-{sfbdt_binned})'))),
        ('sfBDTFloAround', up_down('sfBDTFloAround', nominal_wgt, nominal_wgt)),
    ]

    for unce_type, templates in variations:
        if not enabled(unce_type):
            continue
        for sys_name, wgtstr_dm_sys in templates:
            produce(sys_name, wgtstr_dm_sys)

In [None]:
def makeTemplates(arr, fitinfo, outputdir, sys_name, wgtstr_dm_sys, args, ext_masks):
    r"""Depth 3: The very base implementation that apply the final pass/fail cut and make the template
    """
    print(ext_masks)
    wgtstr_dm, wgtstr_dm_data, sl_dm, sl_dm_herwig, categories_dm, base_masks = args['wgtstr_dm'], args['wgtstr_dm_data'], args['sl_dm'], args['sl_dm_herwig'], args['categories_dm'], args['base_masks']
    
    if not os.path.exists(outputdir):
        os.makedirs(outputdir)

    ## Create the output root file
    print (fitinfo, outputdir, sys_name, wgtstr_dm_sys)
    
    ## Loop over pass and fail region
    import ROOT, array  ## use ROOT to write file...
    for b in ['pass', 'fail']:
        try:
            fw = ROOT.TFile(outputdir+f'inputs_{b}.root', 'recreate')
            vname, nbin, xmin, xmax, (underflow, overflow), vlabel = fitinfo
            mask_list_fin = {  # final mask list used for selection
                'data': base_masks['data']+ext_masks+[f'fj_x_tagger_{b}'],
                'mc':   base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}'],
            }
            
            ## Tranfer the {nbin, xmin, xmax} set to the real bin edge if necessary
            if not isinstance(nbin, int):
                edges = nbin
                nbin = len(edges)-1 # reset nbin to "real" nbin
                edges_inroot = (len(edges)-1, array.array('f', edges))
            else:
                edges = np.linspace(xmin, xmax, nbin+1)
                edges_inroot = (nbin, xmin, xmax)

            hv, hist = {}, {}
            hname_suf = '_'+sys_name if sys_name!='nominal' else ''  ## suffix to the hist name (the Higgs Combine syntax)
            print (' -- ', b)
            
            # Loop over categories: flvC/flvB/flvL/data
            for cat in categories_dm:
                ## hv[] holds the boosted-histogram type derived from the dataframe, hist[] holds the TH1D type to be stored in ROOT
                if cat=='data' and sys_name == 'nominal':
                    ## Get the data hist
                    _content = concat_array_fj12(arr, expr=vname, sam_list=[sl_dm[-1]], filter_list=base_masks['data']+ext_masks+[f'fj_x_tagger_{b}'])
                    _weights = np.ones_like(_content) if wgtstr_dm_data is None else concat_array_fj12(arr, expr=wgtstr_dm_data, sam_list=[sl_dm[-1]], filter_list=base_masks['data']+ext_masks+[f'fj_x_tagger_{b}'])
                    hv['data'] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow).view(flow=True)     
                    # Initialize the TH1D hist
                    hist['data'] = ROOT.TH1D('data_obs', 'data_obs;'+vname, *edges_inroot) 
                if cat!='data':
                    ## Get the MC hist for certain flavor
                    _content = concat_array_fj12(arr, expr=vname, sam_list=sl_dm[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])
                    _weights = concat_array_fj12(arr, expr=wgtstr_dm_sys, sam_list=sl_dm[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])
                    hv[cat] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist[cat] = ROOT.TH1D(cat+hname_suf, cat+hname_suf+';'+vname, *edges_inroot) # init TH1 hist
                    hist[cat].Sumw2()
            
                    ## For qcdSyst / qcdKdeSyst unce that is actually related to Herwig, hv[cat] is dummy here, 
                    ## and we mean to obtain hv[cat+'_herwig.value'] that will be later filled into hist[cat]
                    if sys_name=='qcdSystUp':
                        ## Get the Herwig fit for certain flavor
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g90','sfbdtwgt_g90_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        _content = concat_array_fj12(arr, expr=vname, sam_list=sl_dm_herwig[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])
                        _weights = concat_array_fj12(arr, expr=wgtstr_dm_sys_herwig, sam_list=sl_dm_herwig[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])                        
                        hv[cat+'_herwig.value'] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow).view(flow=True).value
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
                    
                    ## Extract the KDE shape directly from herwig shape
                    if sys_name=='qcdKdeSystUp':
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g90','sfbdtwgt_g90_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        _content = concat_array_fj12(arr, expr=vname, sam_list=sl_dm_herwig[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])
                        _weights = concat_array_fj12(arr, expr=wgtstr_dm_sys_herwig, sam_list=sl_dm_herwig[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])                        
                        hv_herwig_orig_value = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow).view(flow=True).value
                        
                        ## Calculate KDE shape, apply two times so that we specify a finer KDE bindwidth based on the first result
                        from scipy.stats import gaussian_kde
                        kde = gaussian_kde(_content, weights=np.clip(_weights, 0, +np.inf))
                        kde = gaussian_kde(_content, weights=np.clip(_weights, 0, +np.inf), bw_method=kde.factor/g_custom_kde_bw[vname])
                        kde_int = np.zeros([nbin, 2])
                        
                        ## Integrate the KDE function to obtain KDE histogram
                        for i, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
                            if low in g_custom_kde_binmask[vname]:
                                continue
                            kde_int[i] = [kde.integrate_box_1d(low, high), hv_herwig_orig_value[i]]
                        # print('rescale kde sum to original herwig sum: ', kde_int[:,1].sum() / kde_int[:,0].sum())
                        kde_int[:,0] *= kde_int[:,1].sum() / kde_int[:,0].sum()
                        
                        ## Fill with original madgraph hist if we plan to mask the bin for KDE. 
                        ## This is based on the fact that KDE cannot model the hist well in the marginal bins
                        hv[cat+'_herwig.value'] = np.array([kde_int[i][0] if kde_int[i][0]!=0 else hv[cat].value[i] for i in range(nbin)])
                        
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
            
                    ## Extract the PSWeight histogram
                    if 'psWeight' in sys_name:
                        if year==2018:  ## for 2018, calculate the hist by PSWeight vars 
                            ps_idx = {'psWeightIsrUp':2, 'psWeightIsrDown':0, 'psWeightFsrUp':3, 'psWeightFsrDown':1}
                            wgtstr_dm_sys_ps = wgtstr_dm_sys + f"*(PSWeight[:,{ps_idx[sys_name]}])"
                            _weights = concat_array_fj12(arr, expr=wgtstr_dm_sys_ps, sam_list=sl_dm[:-1], filter_list=base_masks['mc']+ext_masks+[f'fj_x_tagger_{b}', f'fj_x_{cat}'])
                            hv[cat] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow).view(flow=True)
                        else:  ## for 2016/17 extract the PSWeight hist based on 2018 result (transfer the ratio for PSWeight/nominal)
                            import re
                            outputdir_ps_18 = outputdir.replace(f'_SF{year}_', '_SF2018_')
                            hv_nom_18 = uproot.open(outputdir_ps_18.replace(sys_name, 'nominal')+f'inputs_{b}.root')[cat]
                            hv_ps_18 = uproot.open(outputdir_ps_18+f'inputs_{b}.root')[cat+'_'+sys_name]
                            hv[cat].value *= hv_ps_18.values()[1:-1] / hv_nom_18.values()[1:-1]
                        # print (hv[cat].value)
                    
                    ## Extract the sfBDTFloAround histogram.
                    ## Method: to utilize the nominal hist for sfbdt>0.95 or 0.85 and migrate the MC-to-data confidence level in the 0.90 case
                    if 'sfBDTFloAround' in sys_name:
                        from scipy.stats import chi2
                        hv_data = uproot.open(outputdir.replace(sys_name, 'nominal')+f'inputs_{b}.root')['data_obs'].values()[1:-1]  ## nominal data hist for 0.90
                        _bdtname = '95' if 'Up' in sys_name else '85'
                        fr = uproot.open(outputdir.replace(sys_name, 'nominal').replace(f'/bdt{int(g_sfBDT_val_list[-1]*1000)}/',f'/bdt{_bdtname}0/')+f'inputs_{b}.root')
                        fr_data, fr_mc = fr['data_obs'].values()[1:-1], fr['flvC'].values()[1:-1]+fr['flvB'].values()[1:-1]+fr['flvL'].values()[1:-1]  ## nominal data & MC hist for 0.95 or 0.85 (depends on Up or Down)
                        
                        ## For each bins, migrate the confidence level of MC yield F0 given data yield D0 to the target data yield D => F
                        hv_mc = []
                        for D, D0, F0 in zip(hv_data, fr_data, fr_mc):
                            ## The precise calculation
                            F = 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0+2), 2*D+2) if F0>D0 else 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0), 2*D)
                            if F == np.inf: ## in case the formula results in inf (may occur if F0 >> D0)
                                assert F0 > D0
                                sigD0 = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D0+2) - D0
                                sigD = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D+2) - D
                                F = D + sigD/sigD0*(F0-D0)
                            hv_mc.append(F)
                        
                        ## Obtain flavor template based on the flavor proportion in 0.95 or 0.85 region
                        hv[cat].value = np.nan_to_num(hv_mc * fr[cat].values()[1:-1] / fr_mc, nan=0)
                        
            ## Fill the hv[cat] (for qcd*, fill hv[cat+'_herwig.value']) into TH1D and save into ROOT
            for cat in hist.keys():
                ## Special handling for qcdSyst / qcdKdeSyst
                if 'qcd' in sys_name and 'SystUp' in sys_name:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat+'_herwig.value'][i])
                elif 'qcd' in sys_name and 'SystDown' in sys_name:
                    hv[cat+'_herwig.value'] = g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)]
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, 2 * hv[cat].value[i] - hv[cat+'_herwig.value'][i])
                    g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)] = None

                ## Normal routine
                else:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat].value[i])
                        hist[cat].SetBinError(i+1, np.sqrt(hv[cat].variance[i]))
                
                ## Fix some buggy points
                if cat!='data':
                    for i in range(nbin):
                        if hist[cat].GetBinContent(i+1) <= 1e-3:
                            hist[cat].SetBinContent(i+1, 1e-3)
                            hist[cat].SetBinError(i+1, 1e-3)
                        elif hist[cat].GetBinError(i+1) > hist[cat].GetBinContent(i+1):
                            hist[cat].SetBinError(i+1, hist[cat].GetBinContent(i+1))

                hist[cat].Write()
        ## Close the ROOT file if error occurs (otherwise the notebook is easily corrupted)
        finally:
            fw.Close()

Now we launch the template maker

In [None]:
## Run the template maker to produce the templates for the fit
launch_maker()

# Data/MC comparison plots

Based on the ak-array dict `arr`, this section makes data and MC plots, with the MC categorized into three flavors: C/B/L.
With the universal `make_data_mc_plots` function, one can specify any final selection and any sample list to produce the standard hist+ratio plot.

The recipe below makes a default set of plots.

In [None]:
### ================ configuration  ===================

def make_config_dm(sl_dm, wgtstr_dm):
    r"""Build the per-category configuration used for the data/MC plots.

    Arguments:
        sl_dm: sample list; all entries except the last one are used as the MC sample list
        wgtstr_dm: weight string attached to every MC category
    Returns:
        dict in the format name: (label, sample/sample list, weight string, cat selection),
        with the keys 'data', 'flvB', 'flvC', 'flvL' (in this order)
    """
    flavor_selections = (
        ('flvB', 'fj_x_nbhadrons>=1'),
        ('flvC', 'fj_x_nbhadrons==0 & fj_x_nchadrons>=1'),
        ('flvL', 'fj_x_nbhadrons==0 & fj_x_nchadrons==0'),
    )
    config = {'data': ('Data', 'jetht-noht', '1.0', '')}
    for flv, flv_sel in flavor_selections:
        ## slice inside the loop so that every MC category holds its own copy of the sample list
        config[flv] = (f'QCD ({flv})', sl_dm[:-1], wgtstr_dm, flv_sel)
    return config

## Categories processed by make_data_mc_plots; each name is looked up as a key of the config dict
categories_dm = ['flvL', 'flvB', 'flvC', 'data']

## Variables to plot. Each entry is (savename, vname, nbin, xmin, xmax, label):
##  - `nbin` is either an int (uniform binning between xmin and xmax) or an explicit list of bin edges;
##    in the latter case xmin/xmax are given as None and are derived from the edges in make_data_mc_plots
##  - a label ending with '-u' / '-o' / '-a' switches off the underflow / overflow / both in make_data_mc_plots
##    (the two-character suffix is stripped before the label is drawn)
bininfo_dm = [ #(savename, vname, nbin, xmin, xmax, label)
    ('ht', 'ht', 50, 0, 2000, r'$H_{T}$ [GeV]'),
    ('fj_x_pt', 'fj_x_pt', 20, 200, 800, r'$p_{T}(AK15)$ [GeV]'),
    ('fj_x_eta', 'fj_x_eta', 20, -2.5, 2.5, r'$\eta(AK15)$'),
    ('fj_x_sdmass', 'fj_x_sdmass', 15, 50, 200, r'$m_{SD}(AK15)$ [GeV]'),
    ('fj_x_sfBDT', 'fj_x_sfBDT', 50, 0.5, 1, r'$sfBDT(AK15)$'),

    ('fj_x_ParticleNetMD_XccVsQCD', 'fj_x_ParticleNetMD_XccVsQCD', 40, 0, 1, r'ParticleNetMD_XccVsQCD(AK15)'),
    ('fj_x_ParticleNetMD_XccVsQCD_08', 'fj_x_ParticleNetMD_XccVsQCD', 40, 0.8, 1, r'ParticleNetMD_XccVsQCD(AK15)-u'),
    
    ("fj_x_btagcsvv2", "fj_x_btagcsvv2", [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, r'$CSVv2$'),
    ("fj_x_mSV12_ptmax_log", "fj_x_mSV12_ptmax_log", [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, r'$log(m_{SV1,p_{T}\,max}\; /GeV)$'),
    ("fj_x_mSV12_dxysig_log", "fj_x_mSV12_dxysig_log", [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, r'$log(m_{SV1,d_{xy}sig\,max}\; /GeV)$'),
]

In [None]:
### ================ slim on cc-tagger, sfBDT, then make data/MC plots ===================

import seaborn as sns
def set_sns_color(*args):
    r"""Display a seaborn color palette and set it as the current palette.
    Arguments:
        *args: forwarded unchanged to both `sns.color_palette` and `sns.set_palette`
    """
    palette = sns.color_palette(*args)
    sns.palplot(palette)
    sns.set_palette(*args)

def calc_custom_masks(sl_dm, filter_list):
    r"""Make sure the per-jet (fj_1 / fj_2) version of every requested mask exists in the maskdict of each sample.

    For every sample and every mask name written with the `fj_x` placeholder, the names with `fj_x`
    replaced by `fj_1` and `fj_2` are looked up in `arr[sam].maskdict`; a missing one is evaluated with
    `eval_expr` and stored. Nothing is returned: the maskdicts are updated in place.

    Arguments:
        sl_dm: sample list. For every sample other than 'jetht-noht' the three flavor masks
            (fj_x_flvB / fj_x_flvC / fj_x_flvL) are requested in addition to `filter_list`
        filter_list: mask names using the `fj_x` placeholder. Recognized forms:
            - contains 'fj_x_pt' (e.g. 'fj_x_pt200to250', 'fj_x_pt500toInf'): a pT window, 'Inf' is replaced by 100000
            - contains 'fj_x_flv': the three flavor masks are (re)computed together
            - contains 'fj_x_sfBDT' or 'fj_x_ParticleNetMD_XccVsQCD': the mask name itself is evaluated as the expression
            Any other missing mask is only reported by the print below and is not computed.
    """
    import re
    for sam in sl_dm:
        maskdict = arr[sam].maskdict
        ## the flavor masks are requested for every sample except 'jetht-noht'
        requested = filter_list + ([] if sam=='jetht-noht' else ['fj_x_flvB', 'fj_x_flvC', 'fj_x_flvL'])
        for mask in requested:
            for i in ['1','2']:
                mask_i = mask.replace('fj_x', f'fj_{i}')
                if mask_i in maskdict:
                    continue
                print('new mask calculated (fj_x -> fj_1/2): ', mask_i)
                if 'fj_x_pt' in mask:
                    ## raw string: '\S' in a normal string literal is an invalid escape sequence
                    ptmin, ptmax = re.findall(r'fj_x_pt(\S+)to(\S+)', mask)[0]
                    ptmax = '100000' if ptmax=='Inf' else ptmax
                    maskdict[mask_i] = eval_expr(arr[sam], f'(fj_{i}_pt>={ptmin}) & (fj_{i}_pt<{ptmax})')
                elif 'fj_x_flv' in mask:
                    maskdict[f'fj_{i}_flvB'] = eval_expr(arr[sam], f'fj_{i}_nbhadrons>=1')
                    maskdict[f'fj_{i}_flvC'] = eval_expr(arr[sam], f'(fj_{i}_nbhadrons==0) & (fj_{i}_nchadrons>=1)')
                    maskdict[f'fj_{i}_flvL'] = eval_expr(arr[sam], f'(fj_{i}_nbhadrons==0) & (fj_{i}_nchadrons==0)')
                elif 'fj_x_sfBDT' in mask or 'fj_x_ParticleNetMD_XccVsQCD' in mask:
                    maskdict[mask_i] = eval_expr(arr[sam], mask_i)

    
def make_data_mc_plots(sl_dm, config_dm, filter_list, prefix, **kwargs):
    r"""To make standard hist+ratio plots based on the sample list and the final selection

    For every variable in `bininfo_dm` a stacked MC histogram (one component per MC category in
    `categories_dm`) is drawn together with the data, plus a data/MC ratio panel. The plots are
    saved as png and pdf under `plots/{g_dirname}_{year}/`. Nothing is returned.

    Arguments:
        sl_dm: sample list
        config_dm: configuration set for each categories in the plots, in the dict format. name: (label, sample/sample list, weight string, cat selection)
        filter_list: keys of maskdict. The corresponding selections are used to produce the plots
        prefix: prefix string used in the output plot title
        kwargs: optional settings
            plot_vars: if given, only the variables whose savename is contained are plotted
            g_do_kde_vars: dict savename -> bool. If True for a variable, additionally make the plot with the
                KDE-integrated MC histograms and a KDE/original shape comparison plot
            g_custom_kde_bw: dict savename -> number. The default KDE bandwidth factor is divided by this number
            custom_kde: dict savename -> {cat: (callable, normalization)}. If given for a variable, the callable
                is integrated per bin instead of fitting a gaussian_kde, and scaled by the normalization
    """
    
    calc_custom_masks(sl_dm, filter_list)
    for savename, vname, nbin, xmin, xmax, vlabel in bininfo_dm:
        if 'plot_vars' in kwargs and savename not in kwargs['plot_vars']:
            continue
        if not isinstance(nbin, int):
            ## explicit bin edges are given in place of the bin number: nbin is the number of bins, i.e. edges minus one
            edges, xmin, xmax, nbin = nbin, min(nbin), max(nbin), len(nbin)-1
        else:
            edges = np.linspace(xmin, xmax, nbin+1)

        label, hdm = {}, {}
        ## A '-u' / '-o' / '-a' suffix of the label switches off the underflow / overflow / both
        flow_tag = vlabel[-2:]
        underflow = flow_tag not in ['-u','-a']
        overflow  = flow_tag not in ['-o','-a']
        if flow_tag in ['-u','-o','-a']:
            vlabel = vlabel[:-2]
        
        do_kde = 'g_do_kde_vars' in kwargs and savename in kwargs['g_do_kde_vars'] and kwargs['g_do_kde_vars'][savename]==True
        if do_kde:
            kde = {}
            ## Integration range of each bin: the first (last) bin is extended to -inf (+inf) if the underflow (overflow) is included.
            ## The last bin has index nbin-1 (the former check against len(edges)-1 could never be satisfied)
            kde_ranges = [(-np.inf if (i==0 and underflow) else edges[i],
                           +np.inf if (i==nbin-1 and overflow) else edges[i+1]) for i in range(nbin)]
        
        ## Loop over categories to extract the hist for each flavor and data
        for cat in categories_dm:
            lab, sam, wgt, _ = config_dm[cat]  ## the cat selection string is not used here: the maskdict key fj_x_{cat} is applied instead
            label[cat] = lab
            if cat != 'data':
                _content = concat_array_fj12(arr, expr=vname, sam_list=sam, filter_list=['fj_x_base_subst']+filter_list+[f'fj_x_{cat}'])
                _weights = concat_array_fj12(arr, expr=wgt,   sam_list=sam, filter_list=['fj_x_base_subst']+filter_list+[f'fj_x_{cat}'])
                hdm[cat] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow)
                if do_kde:
                    from scipy.stats import gaussian_kde
                    from scipy import integrate
                    if 'custom_kde' in kwargs.keys() and savename in kwargs['custom_kde']:
                        kde[cat] = kwargs['custom_kde'][savename][cat]
                        kde_int_res = [integrate.quad(kde[cat][0], low, high) for low, high in kde_ranges]
                    else:
                        ## negative weights are clipped to 0 for the KDE
                        kdetmp = gaussian_kde(_content, weights=np.clip(_weights, 0, np.inf))
                        if 'g_custom_kde_bw' in kwargs.keys() and savename in kwargs['g_custom_kde_bw']:
                            kdetmp = gaussian_kde(_content, weights=np.clip(_weights, 0, np.inf), bw_method=kdetmp.factor/kwargs['g_custom_kde_bw'][savename])
                        kde[cat] = (kdetmp, _weights.sum())
                        kde_int_res = [(kde[cat][0].integrate_box_1d(low, high), 0.) for low, high in kde_ranges]
                    ## KDE histogram: per-bin integral scaled by the stored normalization, with zero variance
                    hdm[cat+'_kde'] = hdm[cat].copy()
                    hdm[cat+'_kde'].view(flow=True).value = np.array([res[0] for res in kde_int_res]) * kde[cat][1]
                    hdm[cat+'_kde'].view(flow=True).variance = np.zeros(nbin)
                        
            else: ## is data: no sel, weight=1
                _content = concat_array_fj12(arr, expr=vname, sam_list=sam, filter_list=['fj_x_base']+filter_list)
                _weights = np.ones_like(_content)
                hdm[cat] = get_hist(_content, bins=edges, weights=_weights, underflow=underflow, overflow=overflow)
                
        cat_sufs = ['']
        if do_kde:
            cat_sufs += ['_kde']
        cats_mc = [cat for cat in categories_dm if cat!='data']
        for cat_suf in cat_sufs:
            ## Draw the standard hist_ratio plot
            set_sns_color('cubehelix_r', 3) ## set the color palette
            f = plt.figure(figsize=(12,12))
            gs = mpl.gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0.05) 
            
            ## Upper histogram panel
            ax = f.add_subplot(gs[0])
            ## use the configured year, consistent with lumi[year] and the other label calls
            hep.cms.label(data=True, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            ax.set_xlim(xmin, xmax); ax.set_xticklabels([]); ax.set_ylabel('Events / bin', ha='right', y=1.0)

            plot_hist([hdm[cat+cat_suf] for cat in cats_mc], bins=edges, label=[label[cat] for cat in cats_mc], histtype='fill', edgecolor='k', linewidth=1, stack=True) ## draw stacked bkg
            ## Sum of all MC categories
            hdm_add = hdm[cats_mc[0]+cat_suf].copy()
            for cat in cats_mc[1:]:
                hdm_add += hdm[cat+cat_suf]
            bkgtot, bkgtot_err = hdm_add.view(flow=True).value, np.sqrt(hdm_add.view(flow=True).variance)
            ax.fill_between(edges, (bkgtot-bkgtot_err).tolist()+[0], (bkgtot+bkgtot_err).tolist()+[0], label='BKG unce.', step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            plot_hist(hdm['data'], bins=edges, label='Data', histtype='errorbar', color='k', markersize=15, elinewidth=1.5) ## draw data
            # ax.set_yscale('log')

            ax.legend()
            # ax.legend(loc='upper left'); ax.set_ylim(0, 1.4*ax.get_ylim()[1])
            
            ## Ratio panel
            ax1 = f.add_subplot(gs[1]); ax1.set_xlim(xmin, xmax); ax1.set_ylim(0.001, 1.999)
            ax1.set_xlabel(vlabel, ha='right', x=1.0); ax1.set_ylabel('Data / MC', ha='center')
            ax1.plot([xmin,xmax], [1,1], 'k'); ax1.plot([xmin,xmax], [0.5,0.5], 'k:'); ax1.plot([xmin,xmax], [1.5,1.5], 'k:')

            hr = hdm['data'].view(flow=True).value / hdm_add.view(flow=True).value
            # hr_err = hr * np.sqrt(hdm['data'].view(flow=True).variance/(hdm['data'].view(flow=True).value**2) + hdm_add.view(flow=True).variance/(hdm_add.view(flow=True).value**2))
            ## only the data uncertainty enters the error bar of the ratio points; the MC uncertainty is drawn as the hatched band
            hr_dataerr = hr * np.sqrt(hdm['data'].view(flow=True).variance/(hdm['data'].view(flow=True).value**2))
            ax1.fill_between(edges, ((bkgtot-bkgtot_err)/bkgtot).tolist()+[0], ((bkgtot+bkgtot_err)/bkgtot).tolist()+[0], step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            ## NaN ratio points are set to -1, i.e. below the lower y limit of the ratio panel
            hep.histplot(np.nan_to_num(hr, nan=-1), bins=edges, yerr=np.nan_to_num(hr_dataerr), histtype='errorbar', color='k', markersize=15, elinewidth=1) ## draw data in ratio plot

            filter_list_str = '_'.join(filter_list)
            print('save plot: ', f'plots/{g_dirname}_{year}/{prefix}__{filter_list_str}__{savename}{cat_suf}.png/pdf')
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}__{filter_list_str}__{savename}{cat_suf}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}__{filter_list_str}__{savename}{cat_suf}.pdf')

        ## kde/orig comparison plots
        if do_kde:
            mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green'])
            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            x_contin = np.linspace(xmin, xmax, 201)
            ## width of the bin in the middle of the range, used to scale the continuous KDE curve to the histogram
            bin_width = edges[int(nbin/2)+1] - edges[int(nbin/2)]
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab = config_dm[cat][0]
                ax.plot(x_contin, kde[cat][0](x_contin) * kde[cat][1] * bin_width, label=lab+' KDE', linestyle=':', color=color)
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab = config_dm[cat][0]
                hep.histplot(hdm[cat+'_kde'].view(flow=True).value, bins=edges, label=lab+' KDE integral', linestyle='--', color=color)
                plot_hist(hdm[cat], bins=edges, label=lab, normed=False, color=color)
            ax.set_xlim(xmin, xmax); ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); ax.legend()

            filter_list_str = '_'.join(filter_list)
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}:kde_shape__{filter_list_str}__{savename}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}:kde_shape__{filter_list_str}__{savename}.pdf')
            

## KDE settings handed to make_data_mc_plots: the variables for which the KDE plots are made,
## and the per-variable number passed as g_custom_kde_bw
g_do_kde_vars = {
    'fj_x_btagcsvv2': True,
    'fj_x_mSV12_ptmax_log': True,
    'fj_x_mSV12_dxysig_log': True,
}
g_custom_kde_bw = {
    'fj_x_btagcsvv2': 15,
    'fj_x_mSV12_ptmax_log': 4,
    'fj_x_mSV12_dxysig_log': 4,
}

g_dirname = 'test_datamc' ## config me
_plotdir_dm = f'plots/{g_dirname}_{year}'
if not os.path.exists(_plotdir_dm):
    os.makedirs(_plotdir_dm)

## Selections combined with the pT window in the calls below
_sel_sfbdt05 = 'fj_x_sfBDT>0.5'
_sel_sfbdt09 = 'fj_x_sfBDT>0.9'
_sel_xcc095 = 'fj_x_ParticleNetMD_XccVsQCD>0.95'
## Samples used together with each QCD sample; the last entry is the data sample
_sl_common = ['subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']

## Exclusive pT windows followed by the inclusive one
_pt_windows = [(200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 'Inf'), (200, 'Inf')]
for ptlab in [f'pt{_lo}to{_hi}' for _lo, _hi in _pt_windows]:
    _sel_pt = f'fj_x_{ptlab}'

    ## 1. With MadGraph sample list
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*fj_x_htwgt*fj_x_sfbdtwgt_g90_incl'
    sl_dm = ['subst_qcd-mg-noht'] + _sl_common
    for _sels in ([_sel_sfbdt05], [_sel_sfbdt09], [_sel_sfbdt09, _sel_xcc095]):
        make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), filter_list=[_sel_pt] + _sels, prefix='mg')

    ## 2. With MadGraph sample list, while using the optional MC-to-data reweight scheme (on pT)
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*fj_x_ad_ptwgt'
    sl_dm = ['subst_qcd-mg-noht'] + _sl_common
    make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), filter_list=[_sel_pt, _sel_sfbdt09], prefix='mg_ptwgt')

    ## 3. With Herwig sample list
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*fj_x_htwgt_herwig*fj_x_sfbdtwgt_g90_herwig_incl'
    sl_dm = ['subst_qcd-herwig-noht'] + _sl_common
    for _sels in ([_sel_sfbdt05], [_sel_sfbdt09]):
        make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), filter_list=[_sel_pt] + _sels, prefix='herwig')
    ## the tightest selection also makes the KDE plots
    make_data_mc_plots(
        sl_dm, make_config_dm(sl_dm, wgtstr_dm),
        filter_list=[_sel_pt, _sel_sfbdt09, _sel_xcc095], prefix='herwig',
        g_do_kde_vars=g_do_kde_vars, g_custom_kde_bw=g_custom_kde_bw,
    )

# Signal/proxy comparison plots

Based on the ak-array dict `arr`, the recipe below creates comparison plots of the proxy jets (from MC) and the H->cc signal jets on various jet observables.

In [None]:
## Load the hcc signal tree
## NOTE(review): the input path is hard-coded to a 2016 2L tree and does not follow `year` -- confirm this is intended
arr['vhcc-2L'] = NanoEventsFactory.from_file(f'samples/trees/20200906_VH_extfillsv_2016_2L/mc/vhcc_tree.root', schemaclass=BaseSchema).events()

## Selection strings for the signal tree: `boosted` is one part of the baseline cut `basecut_vhcc_2L`
boosted = "(v_pt>200) & (ak15_pt>200) & (dphi_V_ak15>2.5) & (ak15_sdmass>50) & (ak15_sdmass<200)"
basecut_vhcc_2L = "(v_mass>75) & (v_mass<105) & (((np.abs(lep1_pdgId)==11) & passTrigEl) | ((np.abs(lep1_pdgId)==13) & passTrigMu)) & " + boosted + " & (n_ak4<3)"
## Start from an empty mask dictionary for the signal sample and store the evaluated baseline cut under the key 'base'
arr['vhcc-2L'].maskdict = {}
arr['vhcc-2L'].maskdict['base'] = eval_expr(arr['vhcc-2L'], basecut_vhcc_2L)

## Named baseline selections, in the format name: (cut, label)
basesel = {}
basesel['sv'] = ("(fj_x_sj1_nsv>=1) & (fj_x_sj2_nsv>=1)", r'$N_{SV}^{match}\geq 1$')
basesel['tightsv'] = ("((fj_x_sj1_sv1_ntracks>2) & (np.abs(fj_x_sj1_sv1_dxy)<3) & (fj_x_sj1_sv1_dlensig>4) & (fj_x_sj2_sv1_ntracks>2) & (np.abs(fj_x_sj2_sv1_dxy)<3) & (fj_x_sj2_sv1_dlensig>4))", r'$N_{SV,tight}^{match}\geq 1$')

def func_basesel(name):
    r"""Translate a baseline cut name into its (cut, label) pair.
    Arguments:
        name: either a key of `basesel`, or 'sfbdt' followed by a number which is divided by 1000
            to obtain the sfBDT threshold (e.g. 'sfbdt900' -> fj_x_sfBDT>0.900)
    Returns:
        tuple (cut string, label string)
    Raises:
        RuntimeError: if the name is neither a key of `basesel` nor starts with 'sfbdt'
    """
    if name in basesel:
        return basesel[name]
    if name[:5] != 'sfbdt':
        raise RuntimeError('Baseline cut name not recognized.')
    threshold = float(name[5:]) / 1000.
    cut = '(fj_x_sfBDT>%.3f)' % threshold
    cut_label = r'$sfBDT>%.2f$' % threshold
    return (cut, cut_label)

In [None]:
## Variables for the signal/proxy comparison plots.
## Each entry is (vname, nbin, xmin, xmax, label, vname1): `vname` is the expression evaluated for the proxy jets
## (written with the fj_x placeholder) and `vname1` is the corresponding expression evaluated on the 'vhcc-2L' signal tree
bininfo = [ #(vname, nbin, xmin, xmax, label, *vname for nominal*)
    ('fj_x_ParticleNetMD_XccVsQCD', 20, 0, 1, 'ParticleNetMD_XccVsQCD (AK15)', 'ak15_ParticleNetMD_HccVsQCD'),
    ('fj_x_sdmass', 15, 50, 200, r'$m_{SD}$ (AK15)', 'ak15_sdmass'),
    ('fj_x_tau21', 20, 0, 1, r'$\tau_{21}$ (AK15)', 'ak15_tau21'), ## available
    
    ('fj_x_deltaR_sj12', 40, 0, 1.5, r'$\Delta R_{sj_{1},sj_{2}}$ (AK15)', 'ak15_deltaR_sj12'),
    ('fj_x_pt', 40, 0, 1000, r'$p_{T}$ (AK15)', 'ak15_pt'),
    ('fj_x_sj1_pt', 40, 0, 1000, r'$p_{T,sj_{1}}$ (AK15)', 'ak15_sj1_pt'),
    ('fj_x_sj1_rawmass', 40, 0, 200, r'$m_{sj_{1},raw}$ (AK15)', 'ak15_sj1_rawmass'), ## available
    ('fj_x_sj2_pt', 40, 0, 1000, r'$p_{T,sj_{2}}$ (AK15)', 'ak15_sj2_pt'),
    ('fj_x_sj2_rawmass', 40, 0, 200, r'$m_{sj_{2},raw}$ (AK15)', 'ak15_sj2_rawmass'), ## available
    
    ('fj_x_nsv', 10, 0, 10, r'$N_{SV}$ (AK15)', 'ak15_nlooseSV'), ## available
    ('fj_x_nsv_ptgt25', 8, 0, 8, r'$N_{SV,p_{T}\geq 25}$ (AK15)', 'ak15_nlooseSV_ptgt25'), ## available
    ('fj_x_nsv_ptgt50', 8, 0, 8, r'$N_{SV,p_{T}\geq 50}$ (AK15)', 'ak15_nlooseSV_ptgt50'), ## available
    ('fj_x_ntracks', 20, 0, 20, r'$N_{tracks}$ (AK15)', 'ak15_nlooseSV_ntracks'), ## available
    ('fj_x_ntracks_sv12', 20, 0, 20, r'$N_{tracks\;for\;SV_{1,2}}$ (AK15)', 'ak15_nlooseSV_ntracks_sv12'), ## available
    ('fj_x_sj1_nsv', 20, 0, 20, r'$N_{SV\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV'), ## available
    ('fj_x_sj1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV_ntracks'), ## available
    ('fj_x_sj1_sv1_pt', 20, 0, 200, r'$p_{T,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pt'),
    ('fj_x_sj1_sv1_mass', 20, 0, 50, r'$m_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_mass'), ## available
    ('fj_x_sj1_sv1_masscor', 20, 0, 50, r'$m_{cor\;for\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_masscor'),
    ('fj_x_sj1_sv1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_ntracks'),
    ('fj_x_sj1_sv1_dxy', 20, 0, 5, r'$d_{xy,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxy'),
    ('fj_x_sj1_sv1_dxysig', 20, 0, 20, r'$\sigma_{d_{xy},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxysig'),
    ('fj_x_sj1_sv1_dlen', 20, 0, 5, r'$d_{z,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlen'),
    ('fj_x_sj1_sv1_dlensig', 20, 0, 20, r'$\sigma_{d_{z},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlensig'),
    ('fj_x_sj1_sv1_chi2ndof', 20, 0, 5, r'$\chi^2 / Ndof_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_chi2ndof'),
    ('fj_x_sj1_sv1_pangle', 40, 0, 5, r'$pAngle_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pangle'),
]

In [None]:
g_dirname = 'test_sigpxy' ## config me
if not os.path.exists(f'plots/{g_dirname}_{year}'):
    os.makedirs(f'plots/{g_dirname}_{year}')

## Baseline selections applied to the proxy jets: '+'-joined cut names understood by func_basesel
sigpxy_ext_list = ['sv+sfbdt500', 'sv+sfbdt850', 'sv+sfbdt900', 'sv+sfbdt950']

## Compute the proxy-jet masks of the baseline selections once, WITHOUT the pT preselection.
## The mask key carries no pT information, so a mask that includes the pT window of one bin must not be cached
## and reused for the other bins (ANDed with the per-bin preselection mask it would select nothing).
## The pT window is applied separately through the '_tmp_fj_x_sigpxy_presel' mask below.
## The masks are always overwritten here so that no outdated mask from an earlier run is picked up.
for sam in ['subst_qcd-mg-noht']:
    for ext in sigpxy_ext_list:
        extcut = ' & '.join([func_basesel(cname)[0] for cname in ext.split('+')])
        for i in ['1','2']:
            arr[sam].maskdict[f'fj_{i}_sigpxy_{ext}'] = eval_expr(arr[sam], extcut.replace('fj_x', f'fj_{i}'))

## Make comparison plots for normal weight (MC adopt the same weight as in the fit), or with an additional data/MC weight on mass or pT
for wgtfac, pfwgt in zip(['1','fj_x_massdatamcwgt','fj_x_ptdatamcwgt'], ['nom', 'massdatamcwgt', 'ptdatamcwgt']):

    wgtstr = f'genWeight*xsecWeight*puWeight*fj_x_htwgt*fj_x_sfbdtwgt_g90_incl*{wgtfac}'
    wgtstr_vhcc_2L = 'genWeight*xsecWeight*puWeight'

    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    do_rwgt = 0  ## NOTE(review): `rwgt_ext_label` is not defined in this cell; it is only read if do_rwgt is switched on
    for ptmin, ptmax in [(200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 100000), (200, 100000)]:
        ## pT window for the proxy jets (presel) and for the signal jets (presel1); the masks are recomputed for every window
        presel, presel1 = f'(fj_x_pt>{ptmin}) & (fj_x_pt<{ptmax})', f'(ak15_pt>{ptmin}) & (ak15_pt<{ptmax})'
        label = {'subst_qcd-mg-noht': r'g(cc) (subst.)', 'vhcc-2L':r'$Z(\ell\ell)H(cc)$'}
        for i in ['1','2']:
            arr['subst_qcd-mg-noht'].maskdict[f'_tmp_fj_{i}_sigpxy_presel'] = eval_expr(arr['subst_qcd-mg-noht'], presel.replace('fj_x', f'fj_{i}'))
        arr['vhcc-2L'].maskdict['_tmp_sigpxy_presel'] = eval_expr(arr['vhcc-2L'], presel1)
        
        for vname, nbin, xmin, xmax, vlabel, vname1 in bininfo:
            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')

            ## Signal jets
            for sam in ['vhcc-2L']:
                _content = concat_array(arr, expr=vname1, sam_list=sam, filter_list=['base', '_tmp_sigpxy_presel'])
                _weights = concat_array(arr, expr=wgtstr_vhcc_2L, sam_list=sam, filter_list=['base', '_tmp_sigpxy_presel'])
                h = get_hist(_content, bins=np.linspace(xmin, xmax, nbin+1), weights=_weights)
                plot_hist(h, label=label[sam]+r' $N_{SV}^{match}\geq 1$' if sam=='qcd-mg' else label[sam], normed=True)

            ## Proxy jets, one histogram per baseline selection
            for sam in ['subst_qcd-mg-noht']:
                for ext in sigpxy_ext_list:
                    cutstr = ' & '.join(list(filter(None, [presel]+[func_basesel(cname)[0] for cname in ext.split('+')]))) ## join the cut string
                    print (cutstr)
                    ## full selection = baseline & flavor C & pT window & baseline selection `ext`
                    _content = concat_array_fj12(arr, expr=vname, sam_list=sam, filter_list=['fj_x_base_subst', 'fj_x_flvC', '_tmp_fj_x_sigpxy_presel', f'fj_x_sigpxy_{ext}'])
                    _weights = concat_array_fj12(arr, expr=wgtstr, sam_list=sam, filter_list=['fj_x_base_subst', 'fj_x_flvC', '_tmp_fj_x_sigpxy_presel', f'fj_x_sigpxy_{ext}'])
                    h = get_hist(_content, bins=np.linspace(xmin, xmax, nbin+1), weights=_weights)
                    plot_hist(h, label=label[sam]+' '+(rwgt_ext_label if do_rwgt else '')+' & '.join([func_basesel(cname)[1] for cname in ext.split('+')]), normed=True)

            ax.legend()
            ax.set_xlim(xmin, xmax)
            ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); 
            print('save plot: ', f'plots/{g_dirname}_{year}/{pfwgt}_{presel}__{vname}.png/pdf')
            plt.savefig(f'plots/{g_dirname}_{year}/{pfwgt}_{presel}__{vname}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{pfwgt}_{presel}__{vname}.pdf')