# Main notebook for ParticleNet AK15 cc-tagger SF derivation
## (`uproot`+`pandas` workflow)

The notebook aims to
 - Make the ROOT-format **templates** for fit
 - Produce **data/MC comparison plots** under some given event selection
 - Produce **H->cc signal and g->cc proxy jets comparison plots** on various jet observables
 
We adopt the `uproot`+`pandas`* workflow in this notebook, illustrated as follows:

    Input files (flat ROOT-tuples derived from analysis NanoAOD)
    -> load as `pandas` DataFrame (by `uproot`)
    -> manipulate the dataframe
    -> produce histograms (`boost_histogram`)
    -> (1) convert to TH1D for ROOT template; or (2) plot with `mplhep` using `matplotlib` as backend
    
 
(*) Note: this workflow suffers from large RAM usage at runtime. It may consume 10-30 GB of RAM when dealing with large datasets, which sets a requirement on the machine. 
A smarter workflow would be `coffea` (with lazy awkward-array as backend), to which the framework is planned to be migrated in the future.

(Updated Dec.24: The notebook using `coffea`+`awkward` workflow is available at `ak15_sf_main_ak.ipynb`)

# Make templates for fit

In [None]:
import uproot3 as uproot
from uproot3_methods import TLorentzVectorArray, TLorentzVector
import pandas as pd
import os

In [None]:
import numpy as np
import boost_histogram as bh
import matplotlib.pyplot as plt
import mplhep as hep
use_helvet = True  ## true: use helvetica for plots, make sure the system have the font installed
if use_helvet:
    ## Work on a copy: editing hep.style.CMS directly would silently change the
    ## library-wide CMS style for every later `plt.style.use(hep.style.CMS)` call
    CMShelvet = dict(hep.style.CMS)
    CMShelvet['font.sans-serif'] = ['Helvetica', 'Arial']
    plt.style.use(CMShelvet)
else:
    plt.style.use(hep.style.CMS)

import matplotlib as mpl
from cycler import cycler
mpl.use('AGG') # no rendering plots in the window

def get_hist(array, bins=10, xmin=None, xmax=None, underflow=False, overflow=False, mergeflowbin=True, normed=False,
            weights=None, **kwargs):
    r"""Fill a weighted 1D boost_histogram from an input array.

    Arguments:
        array (np.ndarray): input values.
        bins (int, list or tuple of numbers, np.ndarray, bh.axis): number of regular bins, explicit bin edges,
            or a ready-made axis (used as-is).
        xmin, xmax (None or number): axis range for integer `bins`; default to the min/max of `array`.
        underflow, overflow (bool): enable the underflow/overflow bin of the axis.
        mergeflowbin (bool): if True, the first (last) requested bin is converted into the underflow (overflow)
            bin, i.e. the regular axis is shortened by one bin on that side, so `view(flow=True)` keeps the
            requested number of entries.
        weights (None, or np.ndarray): per-entry weights; unit weights if None.
        normed (bool): deprecated, ignored.
        **kwargs: ignored.

    Returns:
        hist (boost_histogram.Histogram): histogram with `bh.storage.Weight()` storage.
    """
    if isinstance(bins, int):
        if xmin is None:
            xmin = array.min()
        if xmax is None:
            xmax = array.max()
        ## bin width of the *requested* binning; must be computed before shrinking the axis
        width = 1.*(xmax-xmin)/bins
        if mergeflowbin and underflow:
            xmin += width
            bins -= 1
        ## was guarded by `underflow` (copy-paste slip): the upper side must follow the `overflow` flag
        if mergeflowbin and overflow:
            xmax -= width
            bins -= 1
        bins = bh.axis.Regular(bins, xmin, xmax, underflow=underflow, overflow=overflow)
    elif isinstance(bins, (list, tuple, np.ndarray)):
        ## dropping the outermost edge turns the outermost bin into the flow bin
        if mergeflowbin and underflow:
            bins = bins[1:]
        if mergeflowbin and overflow:
            bins = bins[:-1]
        bins = bh.axis.Variable(bins, underflow=underflow, overflow=overflow)

    hist = bh.Histogram(bins, storage=bh.storage.Weight())
    if weights is None:
        weights = np.ones_like(array)
    hist.fill(array, weight=weights)
    return hist


def plot_hist(hists, normed=False, **kwargs):
    r"""Plot one or several boost_histogram histograms with `mplhep.histplot`.

    Arguments:
        hists (bh.Histogram, or list/tuple of them): histogram(s) to draw. Bin contents are read
            including the flow bins (`view(flow=True)`).
        normed (bool): if True, scale each histogram (and its errors) to unit sum.
        **kwargs: forwarded to `hep.histplot`. Two keys are intercepted:
            bins: bin edges to plot against (default: edges of the first histogram's axis);
            yerr: per-histogram errors, indexed like `hists` (default: sqrt of the variances).
    """

    if not isinstance(hists, (list, tuple)):
        hists = [hists]
    content = [h.view(flow=True).value for h in hists]
    bins = kwargs.pop('bins', hists[0].axes[0].edges)
    if 'yerr' in kwargs:
        yerr = kwargs.pop('yerr')
    else:
        yerr = [np.sqrt(h.view(flow=True).variance) for h in hists]
    if normed:
        ## Build new arrays instead of dividing in place: `.value` is a view onto the
        ## histogram storage, so `/=` would rescale the histogram itself (and a caller-supplied yerr)
        norms = [np.sum(c) for c in content]
        content = [c / norm for c, norm in zip(content, norms)]
        yerr = [np.asarray(err) / norm for err, norm in zip(yerr, norms)]
    if len(hists) == 1:
        content, yerr = content[0], yerr[0]
    hep.histplot(content, bins=bins, yerr=yerr, **kwargs)

In [None]:
## Load the config.yml
import yaml
## YAML is UTF-8 by specification; do not depend on the locale's default encoding
with open('config.yml', encoding='utf-8') as f:
    config = yaml.safe_load(f)

## 1. Load files

Load the ROOT files into pandas DataFrame

In [None]:
## The year is used below as an integer dictionary key and in `year==2018` comparisons,
## so accept a quoted year in the YAML as well
year = int(config['samples']['year'])

## Luminosity per year; shown with the fb^-1 unit in the plot labels
lumi = {2016: 35.92, 2017: 41.53, 2018: 59.74}
if year not in lumi:
    raise ValueError(f"Unsupported year {year!r} in config.yml; expected one of {sorted(lumi)}")

def _fatjet_branch_names(idx):
    r"""Return the list of per-jet branch names (`fj_<idx>_*`) read for the fat jet with index `idx`."""
    sv1_vars = ["pt", "mass", "masscor", "ntracks", "dxy", "dxysig", "dlen", "dlensig", "chi2ndof", "pangle"]
    subjet_vars = ["pt", "eta", "phi", "rawmass", "ntracks", "nsv"] + [f"sv1_{var}" for var in sv1_vars]
    jet_vars = [
        "dr_H", "dr_Z", "pt", "eta", "phi", "rawmass", "sdmass", "tau21", "btagcsvv2", "btagjp",
        "nsv", "nsv_ptgt25", "nsv_ptgt50", "ntracks", "ntracks_sv12", "deltaR_sj12",
    ]
    ## subjet 1 block, then subjet 2 block: same variables in the same order
    for isj in (1, 2):
        jet_vars.extend(f"sj{isj}_{var}" for var in subjet_vars)
    jet_vars.extend([
        "sj12_masscor_dxysig", "sfBDT", "nbhadrons", "nchadrons",
        "sj1_nbhadrons", "sj1_nchadrons", "sj2_nbhadrons", "sj2_nchadrons",
    ])
    return [f"fj_{idx}_{var}" for var in jet_vars]


## minimal set of branches read into the notebook:
## event-level info, then all fj_1 branches, then all fj_2 branches, then trigger/selection/weight branches
minimal_branches = (
    ["run", "luminosityBlock", "event", "genWeight", "jetR", "passmetfilters"]
    + _fatjet_branch_names(1)
    + _fatjet_branch_names(2)
    + ["passHTTrig", "ht", "nlep", "fj_1_is_qualified", "fj_2_is_qualified", "puWeight", "puWeightUp", "puWeightDown", "xsecWeight"]
)
## tagger discriminant branch for each of the two jets (config gives the name with the 'fj_x' placeholder)
minimal_branches.extend(config['tagger']['var'].replace('fj_x', f'fj_{idx}') for idx in (1, 2))

## extra HLT branches depend on year: HT thresholds of the HLT_PFHT* paths, in the order they are read
_hlt_ht_thresholds = {
    2016: (125, 200, 250, 300, 350, 400, 475, 600, 650, 800, 900),
    2017: (180, 250, 370, 430, 510, 590, 680, 780, 890, 1050, 350),
    2018: (180, 250, 370, 430, 510, 590, 680, 780, 890, 1050, 350),
}
ext_hlt_branches = {
    yr: [f'HLT_PFHT{thr}' for thr in thresholds]
    for yr, thresholds in _hlt_ht_thresholds.items()
}
minimal_branches.extend(ext_hlt_branches[year])
if year == 2018:
    ## extra PSWeight branches for 2018
    minimal_branches.extend(['nPSWeight', 'PSWeight'])

## Branches that are not read for data: generator/weight info and hadron-flavour counters
_mc_only_branches = {'genWeight', 'puWeight', 'puWeightUp', 'puWeightDown', 'xsecWeight', 'nPSWeight', 'PSWeight'}
_mc_only_branches.update(
    f'fj_{idx}_{var}'
    for idx in (1, 2)
    for var in ('dr_H', 'dr_Z', 'nbhadrons', 'nchadrons', 'sj1_nbhadrons', 'sj1_nchadrons', 'sj2_nbhadrons', 'sj2_nchadrons')
)
minimal_branches_for_data = set(minimal_branches).difference(_mc_only_branches)

## Read into pandas DataFrame
## Each MC tree "<name>_tree.root" is stored under the key "<name>-noht"; data (jetht) uses the reduced branch set
sample_prefix = f"{config['samples']['sample_prefix']}_{year}"
_df0 = {
    f"{name}-noht": uproot.open(f"{sample_prefix}/mc/{name}_tree.root")['Events'].pandas.df(minimal_branches, flatten=False)
    for name in ('qcd-mg', 'qcd-herwig', 'top', 'v-qq', 'qcd-mg-bflav')
}
_df0['jetht-noht'] = uproot.open(f"{sample_prefix}/data/jetht_tree.root")['Events'].pandas.df(minimal_branches_for_data, flatten=False)

## 2. Pre-processing

For data: apply OR of all HT trigger to enhance statistics.

For MC: apply no HT trigger, based on the strategy we name it "MC substitute".

The initial dataframe (`_df0`) is event-based, but for the purpose of fit we transform the dataframe to be jet-based. 
The new dataframe `df1` contains branches `fj_x_` that come from either `fj_1_` or `fj_2_`, whichever passes the corresponding jet-based criteria (pT>200, each subjet matched to >=1 SV, sfBDT>0.5) carried by `fj_?_is_qualified` (?=1,2).

In [None]:
### ================ Pre-processing for data  ===================

## Baseline selection applied to data.
## Note that we use the OR of all HT triggers (some are pre-scaled triggers)
htcut_incl = '(' + ' | '.join(ext_hlt_branches[year]) + ')'
basesel_noht_prep = ' & '.join(['passmetfilters', htcut_incl, 'fj_x_pt>200', 'fj_x_is_qualified'])
sl_prep = ['jetht-noht']
df1 = {}
for sam in sl_prep:
    assert 'noht' in sam
    ## Jet-level branch names with the unified 'fj_x' prefix: all fj_2_ branches except fj_2_is_qualified
    fj_branches = [
        key.replace('fj_2', 'fj_x')
        for key in _df0[sam].keys()
        if key.startswith('fj_2') and key != 'fj_2_is_qualified'
    ]
    ## Build one jet-based frame per jet index, then concatenate: events where fj_1 is qualified OR fj_2 is qualified
    jet_frames = []
    for this_jet, other_jet in (('1', '2'), ('2', '1')):
        jets = _df0[sam].query(basesel_noht_prep.replace('fj_x', f'fj_{this_jet}'))  ## events where this jet is qualified
        jets.drop(columns=[key.replace('fj_x', f'fj_{other_jet}') for key in fj_branches], inplace=True)  ## drop branches of the other jet
        jets.rename(columns={key.replace('fj_x', f'fj_{this_jet}'): key for key in fj_branches}, inplace=True)  ## fj_1/fj_2 -> unified fj_x
        jets.loc[:, 'fj_idx'] = int(this_jet)  ## label the jet index
        jets.loc[:, 'is_qcd'] = 'qcd' in sam
        jet_frames.append(jets)
    df1[sam] = pd.concat(jet_frames)
    del jets, jet_frames  # drop the per-jet frames
    del _df0[sam]  # to release memory usage if necessary

## Produce new variables used for fit:
## corrected SV mass of the subjet whose leading SV has the larger pT (or dxysig), and its log
for sam in sl_prep:
    frame = df1[sam]
    for column, expression in (
        ('mSV12_ptmax', '(fj_x_sj1_sv1_pt>fj_x_sj2_sv1_pt)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_pt<=fj_x_sj2_sv1_pt)*fj_x_sj2_sv1_masscor'),
        ('mSV12_ptmax_log', 'log(mSV12_ptmax)'),
        ('mSV12_dxysig', '(fj_x_sj1_sv1_dxysig>fj_x_sj2_sv1_dxysig)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_dxysig<=fj_x_sj2_sv1_dxysig)*fj_x_sj2_sv1_masscor'),
        ('mSV12_dxysig_log', 'log(mSV12_dxysig)'),
    ):
        frame[column] = frame.eval(expression)

In [None]:
# ## FOR TEST: to see data HT distributions passing different HT pre-scaled trigger
# for hlt in ext_hlt_branches[year]:
#     dftmp = _df0['jetht-noht'].query(hlt)
#     h = get_hist(dftmp['ht'].values, bins=np.linspace(0, 2000, 201), weights=np.ones(dftmp.shape[0]))
#     plot_hist(h, label=hlt)

In [None]:
# ## FOR TEST: check the xsecWeight for MG samples & genWeight for Herwig sample (to avoid extremely large values) 
# from collections import Counter
# print(Counter(_df0['qcd-mg-noht']['xsecWeight']),'\n')
# for i in [0.96, 0.98, 0.99]:
#     print(_df0['qcd-herwig-noht']['genWeight'].quantile(q=i))

In [None]:
### ================ Pre-processing for MC substitute  ===================

## Baseline selection applied to MC.
## No HT trigger is applied, based on the "MC substitute" strategy
basesel_noht_prep_subst = "passmetfilters & fj_x_pt>200 & fj_x_is_qualified"
sl_prep_subst = ['subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'subst_qcd-mg-bflav-noht']  ## mark sample name with "subst_" as a reminder of MC substitute
for sam in sl_prep_subst:
    assert 'noht' in sam
    sam_orig = sam.replace('subst_','')  ## key of the event-based input frame in _df0
    ## To concatenate event lists where fj_1 is qualified OR fj_2 is qualified. Same procedure here
    fj_branches = [key.replace('fj_2', 'fj_x') for key in _df0[sam_orig].keys() if (key.startswith('fj_2') and key!='fj_2_is_qualified')]  ## all fj_2_ branches except fj_2_is_qualified
    if sam == 'subst_qcd-herwig-noht':
        ## The genWeight cut is taken from the full (unselected) Herwig sample and does not depend on
        ## the jet index: compute the quantile once instead of once per jet
        herwig_genwgt_cut = _df0['qcd-herwig-noht']['genWeight'].quantile(q=0.96)
    for i, i_inv in zip(['1','2'], ['2','1']):
        df1[sam + i] = _df0[sam_orig].query(basesel_noht_prep_subst.replace('fj_x', f'fj_{i}'))  ## select events where fj_1/fj_2 is qualified
        df1[sam + i].drop(columns=[key.replace('fj_x', f'fj_{i_inv}') for key in fj_branches], inplace=True)  ## drop fj branches for the other index
        df1[sam + i].rename(columns={key.replace('fj_x', f'fj_{i}'): key for key in fj_branches}, inplace=True)  ## change branches name from fj_1/fj_2 to a unified name fj_x
        df1[sam + i].loc[:, 'fj_idx'] = int(i)  ## label the jet index
        df1[sam + i].loc[:, 'is_qcd'] = True if 'qcd' in sam else False
        if sam == 'subst_qcd-mg-noht':
            df1[sam + i].query('xsecWeight<5.', inplace=True)  ## drop MG events with extremely large xsecWeight (coming from low HT sample in the HT-binned MG list)
        if sam == 'subst_qcd-herwig-noht':
            df1[sam + i].query('genWeight<{}'.format(herwig_genwgt_cut), inplace=True)  ## drop Herwig events with extremely large genWeight
        if year == 2016 and sam == 'subst_qcd-herwig-noht':
            df1[sam + i].loc[:, 'xsecWeight'] = df1[sam + i]['xsecWeight'] * 2400.  ## fix a 2016 bug: Herwig sample xsec is mistaken
    df1[sam] = pd.concat([df1[sam + '1'], df1[sam + '2']])
    del df1[sam + '1'], df1[sam + '2']
    del _df0[sam_orig]  # to release memory usage if necessary

## Produce new variables used for fit
for sam in sl_prep_subst:
    df1[sam]['mSV12_ptmax'] = df1[sam].eval('(fj_x_sj1_sv1_pt>fj_x_sj2_sv1_pt)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_pt<=fj_x_sj2_sv1_pt)*fj_x_sj2_sv1_masscor')
    df1[sam]['mSV12_ptmax_log'] = df1[sam].eval('log(mSV12_ptmax)')
    df1[sam]['mSV12_dxysig'] = df1[sam].eval('(fj_x_sj1_sv1_dxysig>fj_x_sj2_sv1_dxysig)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_dxysig<=fj_x_sj2_sv1_dxysig)*fj_x_sj2_sv1_masscor')
    df1[sam]['mSV12_dxysig_log'] = df1[sam].eval('log(mSV12_dxysig)')

    ## PSWeight variables exclusive to 2018 datasets
    if year==2018:
        ## The first row decides the layout: a single PSWeight entry is copied to all four columns,
        ## otherwise every row must carry exactly four entries
        if df1[sam]['nPSWeight'].iloc[0] == 1:
            df1[sam]['PSWeight1'] = df1[sam]['PSWeight2'] = df1[sam]['PSWeight3'] = df1[sam]['PSWeight4'] = df1[sam]['PSWeight']
        else:
            assert all(df1[sam]['nPSWeight'] == 4)
            for i in range(4):
                df1[sam][f'PSWeight{i+1}'] = df1[sam]['PSWeight'].map(lambda x: x[i])

## 3. Obtain reweight factors

We need to extract some reweight factors as well as the BDT variation points specific to pT ranges. Steps 3-1 to 3-3 are necessary for the nominal fit routine. Factors obtained from steps 3-4 and 3-5 are for validation fits.

 3-1. **MC substitute-to-data reweight factor**: reweight based on the 3D (HT, pT, jet index) grid. The goal is to bring the shape of MC (without pre-selection on the jet-HT triggers) back to the data shape (passing the logical OR of prescaled jet-HT triggers). Remember that the raw MC yield is always much larger than that of data. New variables take the name `htwgt`, `htwgt_herwig`. (`htwgt_herwig` is derived using the Herwig QCD sample and is only used in the validation fit.)

 3-2. **sfBDT reweight factor**: reweight on the sfBDT variable based on (pT, jet index) bins. The reweight factors `sfbdtwgt_g50` are obtained, which is only used to derive the systematics shape templates in the nominal fit. `sfbdtwgt_g50_herwig` is derived as well using the Herwig sample, used in the validation fit.
 
 3-3. **sfBDT central point and variation range**: a set of sfBDT cut values which are specific for different pT range. The values are derived by judging the similarity of the tagger shape between the signal and proxy jet samples.

 3-4. **Additional MC substitute-to-data reweight factor on $p_{T}$ only**: A possible replacement of the factors in step 1. This factor is only used in the validation fit to check if different reweighting schemes may affect the SF fit results. New variables take the name `ad_ptwgt` and `ad_ptwgt_herwig`.
 
 3-5. **Proxy-to-signal reweight factor on $m_{SD}$ / $p_{T}$ / $\tau_{21}$**: based on the shape of MC after applying the MC-to-data factors in step 1 and the H->cc signal jet shape. The factor is only used in the validation fit, in which we apply such reweight factor to both MC and data to check if the SF results are affected. New variables take the name `(mass|pt|tau21)datamcwgt` and `(mass|pt|tau21)datamcwgt_herwig`

In [None]:
### ================ 3-1. Reweight MC substitute to data (stored as variable "htwgt", "htwgt_herwig") ===================

## True: if the block has run before, we can obtain the reweight factor from the previously stored pickle output
is_read_from_pickel = False

def extract_source_to_target_ht_weight(df1, sl_rwgt_source, wgtstr_rwgt_source, sl_rwgt_target, wgtstr_rwgt_target, wgtname, ext_sl_rwgt_source=None, presel='', do_plot=True):
    r"""Extract the "MC substitute to data" reweight factor on HT based on (pT, jet index) bins

    For every (pT bin, jet index) the HT histograms of the target and of the source are built (including
    the underflow and overflow bins), their ratio is taken as the reweight factor, and the factor of the HT
    bin each jet falls in is written to column `wgtname` of the source samples. The histograms and factors
    are stored to / restored from `plots/prep_pd/{wgtname}_{year}.pickle`, steered by the module-level
    flag `is_read_from_pickel`.

    Arguments:
        df1: dict of DataFrame as input (modified in place: column `wgtname` is added)
        sl_rwgt_(source|target): sample list for the source/target in this reweighting routine
        wgtstr_rwgt_(source|target): the weight string applied to the source/target to produce the histogram in this reweighting routine ('1' for unit weights)
        wgtname: the reweight name stored as a new column
        ext_sl_rwgt_source: extra source sample list for which we also calculate the reweight factors after extracting them (default: none)
        presel: additional pre-selection applied before reweighting
        do_plot: if store plots of reweighting

    Returns:
        dict with keys 'ent_target', 'ent_source', 'rwgt'; each maps `ptlab+lab` to an array of length nbins+2

    Raises:
        ValueError: if a sample in `sl_rwgt_source` is left with a NaN factor for jets passing `presel`
    """
    import pickle

    ext_sl_rwgt_source = [] if ext_sl_rwgt_source is None else list(ext_sl_rwgt_source)
    sl_apply = list(sl_rwgt_source) + ext_sl_rwgt_source  ## all samples that receive the new column

    rwgt_var = 'ht'
    ## The binning info for (pT, HT) grid. Note that 2016 is different from 2017/18. The adopted HT grid is based on MC shape in each pT bin
    rwgt_edge_dic = {}
    rwgt_edge_dic[2016] = {
        'jet1': {
            'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 750],
            'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900],
            'pt300to350': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1200],
            'pt350to400': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1300],
            'pt400to500': [350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1450],
            'pt500to100000': [400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
        },
        'jet2': {
            'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900, 1000],
            'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100, 1200],
            'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
            'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1600],
            'pt400to500': [600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1700],
            'pt500to100000': [650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
        },
    }
    rwgt_edge_dic[2017] = rwgt_edge_dic[2018] = {
        'jet1': {
            'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 750],
            'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900],
            'pt300to350': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1200],
            'pt350to400': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1300],
            'pt400to500': [350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
            'pt500to100000': [500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
        },
        'jet2': {
            'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900, 1000],
            'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100, 1200],
            'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
            'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1600],
            'pt400to500': [700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1800],
            'pt500to100000': [900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
        },
    }
    for sam in sl_apply:
        df1[sam][wgtname] = np.nan  ## initially fill the output column with NaN

    pickle_path = f'plots/prep_pd/{wgtname}_{year}.pickle'
    if is_read_from_pickel: ## restore info from a previously stored pickle
        ## NOTE: unpickling can execute arbitrary code; only read files written by this notebook
        with open(pickle_path, 'rb') as f:
            res = pickle.load(f)
        ent_target, ent_source, rwgt = res['ent_target'], res['ent_source'], res['rwgt']
    else:
        ent_target, ent_source, rwgt = {}, {}, {}

    df_target = df1[sl_rwgt_target[0]] if len(sl_rwgt_target)==1 else pd.concat([df1[sam] for sam in sl_rwgt_target])
    df_source = df1[sl_rwgt_source[0]] if len(sl_rwgt_source)==1 else pd.concat([df1[sam] for sam in sl_rwgt_source])
    if presel != '':
        df_target = df_target.query(presel)
        df_source = df_source.query(presel)
    ## Reweight separately on jet pT bins
    for ptsel, ptlab in zip(['fj_x_pt>=200 & fj_x_pt<250', 'fj_x_pt>=250 & fj_x_pt<300', 'fj_x_pt>=300 & fj_x_pt<350', 'fj_x_pt>=350 & fj_x_pt<400', 'fj_x_pt>=400 & fj_x_pt<500', 'fj_x_pt>=500'], 
                            ['pt200to250', 'pt250to300', 'pt300to350', 'pt350to400', 'pt400to500', 'pt500to100000']):
        ## Reweight separately for 1st or 2nd jet
        for sel, lab in zip(['fj_idx==1', 'fj_idx==2'], ['jet1', 'jet2']):
            print (' -- ', ptsel, sel)
            rwgt_edge = rwgt_edge_dic[year][lab][ptlab]
            if not is_read_from_pickel:
                ## Calculate the rwgt for the first time
                _df_target = df_target.query(f'{ptsel} & {sel}')
                _df_source = df_source.query(f'{ptsel} & {sel}')
                
                ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
                ent_target[ptlab+lab] = get_hist(
                    _df_target[rwgt_var].values, bins=rwgt_edge, 
                    weights=np.ones(_df_target.shape[0]) if wgtstr_rwgt_target=='1' else _df_target.eval(wgtstr_rwgt_target).values, 
                    underflow=True, overflow=True, mergeflowbin=False
                ).view(flow=True).value
                ent_source[ptlab+lab] = get_hist(
                    _df_source[rwgt_var].values, bins=rwgt_edge, 
                    weights=np.ones(_df_source.shape[0]) if wgtstr_rwgt_source=='1' else _df_source.eval(wgtstr_rwgt_source).values, 
                    underflow=True, overflow=True, mergeflowbin=False
                ).view(flow=True).value
                ## Calculate the reweight factor
                rwgt[ptlab+lab] = ent_target[ptlab+lab] / ent_source[ptlab+lab] # len=nbin+2
            print(ent_target[ptlab+lab], '\n', rwgt[ptlab+lab])
            
            ## assign the reweight factor to the new column
            ## The flow-inclusive bin index of a value is the number of edges <= value
            ## (0 = underflow, nbins+1 = overflow); searchsorted computes it for the whole column at once
            factors = np.asarray(rwgt[ptlab+lab])
            for sam in sl_apply:
                df1sel = df1[sam].eval(f'{ptsel} & {sel}')
                if not df1sel.any():
                    continue
                bin_idx = np.searchsorted(rwgt_edge, df1[sam].loc[df1sel, rwgt_var].values, side='right')
                df1[sam].loc[df1sel, wgtname] = factors[bin_idx]
    
    ## check all entries are filled with valid factors
    ## (NaN never compares equal to itself, so `== np.nan` cannot detect it: use isna().
    ##  Only jets entering the histograms, i.e. passing `presel`, are guaranteed a populated bin)
    for sam in sl_rwgt_source:
        checked = df1[sam] if presel == '' else df1[sam].query(presel)
        if checked[wgtname].isna().any():
            raise ValueError(f"Column '{wgtname}' of sample '{sam}' contains NaN reweight factors")
    
    ## store into pickle for the first run
    if not is_read_from_pickel: ## store the info for the first run
        os.makedirs('plots/prep_pd', exist_ok=True)
        with open(pickle_path, 'wb') as fw:
            pickle.dump({'ent_target':ent_target, 'ent_source':ent_source, 'rwgt':rwgt}, fw)

    # =========== plot ===========
    if do_plot:
        mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
        os.makedirs('plots/prep_pd', exist_ok=True)
        for ptlab in ['pt200to250', 'pt250to300', 'pt300to350', 'pt350to400', 'pt400to500', 'pt500to100000']:
            for lab, cm, cd in zip(['jet1', 'jet2'], ['blue', 'red'], ['royalblue', 'lightcoral']):
                ## the underflow/overflow bins are drawn as [0, first edge] and [last edge, 2500]
                plot_edges = [0]+list(rwgt_edge_dic[year][lab][ptlab])+[2500]
                f = plt.figure(figsize=(12,12))
                gs = mpl.gridspec.GridSpec(2, 1, height_ratios=[2, 1], hspace=0.04) 
                ## upper pad: source and target HT distributions
                ax = f.add_subplot(gs[0])
                hep.cms.label(data=True, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
                hep.histplot(ent_source[ptlab+lab], bins=plot_edges, label='Jet '+lab[-1]+' (MC)', color=cm, ax=ax)
                hep.histplot(ent_target[ptlab+lab], bins=plot_edges, label='Jet '+lab[-1]+' (Data)', color=cd, linestyle='--', ax=ax)

                ax.set_xlim(0, 2500); ax.set_xticklabels([]); 
                ax.set_yscale('log'); ax.set_ylabel('Events', ha='right', y=1.0)
                ax.legend()
                ## lower pad: reweight factor
                ax1 = f.add_subplot(gs[1]); 
                hep.histplot(rwgt[ptlab+lab], bins=plot_edges, label='Jet '+lab[-1], color=cm, ax=ax1)
                ax1.set_xlim(0, 2500); ax1.set_xlabel('$H_{T}$ [GeV]', ha='right', x=1.0);
                ax1.legend()
                ax1.set_yscale('log')
                ax1.set_ylim(5e-3, 2e0); ax1.set_ylabel('Rwgt factor', ha='right', y=1.0);  ax1.set_yticks([1e-2,1e-1,1e0,1e1]);
                ax1.plot([0, 2500], [1, 1], 'k:')

                f.savefig(f'plots/prep_pd/rwgtfac_{wgtname}_{year}_{ptlab}_{lab}.pdf')
                f.savefig(f'plots/prep_pd/rwgtfac_{wgtname}_{year}_{ptlab}_{lab}.png')
                plt.close(f)  ## release the figure; otherwise every call leaves 12 figures open
    # ============================
    
    return {'ent_target':ent_target, 'ent_source':ent_source, 'rwgt':rwgt}

## Calculate two sets of reweight factor: one for the MG sample list and another for Herwig sample list
## Both derive MC -> data factors and also evaluate them for the bflav sample; only the QCD sample differs
_mc_wgtstr = f"{lumi[year]}*genWeight*xsecWeight*puWeight"
_mc_to_data = dict(
    ext_sl_rwgt_source=['subst_qcd-mg-bflav-noht'],
    wgtstr_rwgt_source=_mc_wgtstr,
    sl_rwgt_target=['jetht-noht'],
    wgtstr_rwgt_target='1',
)
htwgt = extract_source_to_target_ht_weight(
    df1,
    sl_rwgt_source=['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtname='htwgt',
    **_mc_to_data,
)
htwgt_herwig = extract_source_to_target_ht_weight(
    df1,
    sl_rwgt_source=['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtname='htwgt_herwig',
    **_mc_to_data,
)

## Calculate bflav factors: reweight bflav sample to inclusive QCD (after b selection cut)
bflav_htwgt = extract_source_to_target_ht_weight(
    df1,
    sl_rwgt_source=['subst_qcd-mg-bflav-noht'],
    wgtstr_rwgt_source=_mc_wgtstr,
    sl_rwgt_target=['subst_qcd-mg-noht'],
    wgtstr_rwgt_target=_mc_wgtstr,
    wgtname='bflav_htwgt',
    presel='fj_x_nbhadrons>=1',
    do_plot=False,
)
## all other MC samples get a unit bflav factor
for sam in ('subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht'):
    df1[sam]['bflav_htwgt'] = 1.

df1['subst_qcd-mg-noht'][['ht', 'fj_x_pt', 'fj_idx', 'htwgt']]

In [None]:
### ================ 3-2. Extract the sfBDT>0.5 binned factor: stored as variable "sfbdtwgt_g50"; similar for herwig ===================

def extract_further_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname, rwgt_info, sl_ext_rwgt=None, presel='fj_x_sfBDT>=0.5'):
    r"""Extract the binned "MC substitute to data" reweight factor on a further variable (sfBDT in this notebook), after the `presel` selection (default: sfBDT>=0.5)

    For every pT range in `config['pt_range']['range']` the data ('jetht-noht') and MC histograms of the
    reweight variable are built (including underflow and overflow bins); their ratio is written to column
    `wgtname` for all jets of that pT range. The result is also stored to
    `plots/prep_pd/{wgtname}_{year}.pickle`.

    Arguments:
        df1: dict of DataFrame as input (modified in place: column `wgtname` is added)
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name (the binned factors) stored as a new column
        rwgt_info: info of the reweight variable, in the format of (var, nbin, xmin, xmax) or (var, edges list, None, None)
        sl_ext_rwgt: extra sample list for which we also calculate the reweight factors after extracting them (default: none)
        presel: pre-selection before reweighting

    Returns:
        dict with keys 'ent_data', 'ent_mc', 'rwgt'; each maps the pT range tuple to an array of length nbins+2
    """
    import pickle

    sl_ext_rwgt = [] if sl_ext_rwgt is None else list(sl_ext_rwgt)
    sl_apply = list(sl_rwgt) + sl_ext_rwgt  ## all samples that receive the new column

    for sam in sl_apply:
        df1[sam][wgtname] = np.nan  ## initially fill the output column with NaN

    ## Reweight based on given variable
    rwgt_var, nbin, xmin, xmax = rwgt_info
    if not isinstance(nbin, int):
        rwgt_edge = nbin  ## explicit bin edges were given
    else:
        rwgt_edge = np.linspace(xmin, xmax, nbin+1)
    print('rwgt info: ', rwgt_var, rwgt_edge)
    
    ## Reweight separately on jet pT bins
    ent_data, ent_mc, rwgt = {}, {}, {}
    for pt_range in config['pt_range']['range']:
        pt_range = tuple(pt_range)
        rwgt_presel = f'fj_x_pt>={pt_range[0]} & fj_x_pt<{pt_range[1]}'
        rwgt_sel = f'{presel} & {rwgt_presel}'; print(rwgt_sel)
        _dffdata = df1['jetht-noht'].query(rwgt_sel)
        _dffmc =  pd.concat([df1[sam].query(rwgt_sel) for sam in sl_rwgt])
        
        ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
        ent_data[pt_range] = get_hist(_dffdata[rwgt_var].values, bins=rwgt_edge, weights=np.ones(_dffdata.shape[0]), underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ent_mc[pt_range]  = get_hist(_dffmc[rwgt_var].values, bins=rwgt_edge, weights=_dffmc.eval(wgtstr_rwgt).values, underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ## Calculate the reweight factor
        ## NOTE(review): nan_to_num only maps 0/0 to 0 here; a bin with data but no MC (x/0 = inf)
        ## is turned into the largest finite float -- confirm this is intended
        rwgt[pt_range] = np.nan_to_num(ent_data[pt_range] / ent_mc[pt_range], nan=0) # len=nbin+2
        ## assign the reweight factor to the new column (for all jets in the pT range, not only those passing presel)
        ## The flow-inclusive bin index of a value is the number of edges <= value
        ## (0 = underflow, nbins+1 = overflow); searchsorted computes it for the whole column at once
        for sam in sl_apply:
            in_range = df1[sam].eval(rwgt_presel)
            if not in_range.any():
                continue
            bin_idx = np.searchsorted(rwgt_edge, df1[sam].loc[in_range, rwgt_var].values, side='right')
            df1[sam].loc[in_range, wgtname] = rwgt[pt_range][bin_idx]
        print (ent_data[pt_range], rwgt[pt_range])
    
    ## Store reweight factors
    os.makedirs('plots/prep_pd', exist_ok=True)
    with open(f'plots/prep_pd/{wgtname}_{year}.pickle', 'wb') as fw:
        pickle.dump({'ent_data':ent_data, 'ent_mc':ent_mc, 'rwgt':rwgt}, fw)
    
    return {'ent_data':ent_data, 'ent_mc':ent_mc, 'rwgt':rwgt}

## Calculate two sets of reweight factor: one for the MG sample list and another for Herwig sample list
## (QCD sample, HT reweight column entering the MC weight, name of the output column)
for _qcd_sample, _ht_wgtname, _out_wgtname in (
    ('subst_qcd-mg-noht', 'htwgt', 'sfbdtwgt_g50'),
    ('subst_qcd-herwig-noht', 'htwgt_herwig', 'sfbdtwgt_g50_herwig'),
):
    extract_further_weight(
        df1,
        sl_rwgt=[_qcd_sample, 'subst_top-noht', 'subst_v-qq-noht'],
        sl_ext_rwgt=['subst_qcd-mg-bflav-noht'],
        wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*{_ht_wgtname}",
        wgtname=_out_wgtname,
        rwgt_info=('fj_x_sfBDT', 25, 0.5, 1.),
    )

df1['subst_qcd-mg-noht'][['fj_x_pt', 'fj_idx', 'fj_x_sfBDT', 'sfbdtwgt_g50']]

In [None]:
### ================ 3-3. Determine the optimal sfBDT cut value for each pT range  ===================

# First load the h->cc signal ntuple. Adopt the selection used in the analysis
# (skipped when `df_comp` already exists, e.g. when the cell is re-run)
if 'df_comp' not in globals():  
    import re
    # The signal tree is located relative to the ".../trees" part of the sample prefix
    _trees_dir = re.search('^.+/trees', sample_prefix)
    if _trees_dir is None:
        raise ValueError(f"Cannot locate the signal ntuple: sample prefix '{sample_prefix}' contains no '/trees' directory")
    _df0['vhcc-2L'] = uproot.open(f"{_trees_dir[0]}/20210117_VH_extjetvar_{year}_2L/mc/vhcc_tree.root")['Events'].pandas.df()

    boosted = "v_pt>200 & ak15_pt>200 & dphi_V_ak15>2.5 & ak15_sdmass>50 & ak15_sdmass<200"
    basecut_vhcc_2L = "v_mass>75 & v_mass<105 & ((abs(lep1_pdgId)==11 & passTrigEl) | (abs(lep1_pdgId)==13 & passTrigMu)) & " + boosted + " & n_ak4<3"
    df_comp = {}
    df_comp['vhcc-2L'] = _df0['vhcc-2L'].query(basecut_vhcc_2L)

def extract_bdt_sequence(df1, sl_pxy, wgtstr, pxy_base_sel):
    r"""Derive the sfBDT cut sequence for each pT range from the signal/proxy similarity.

    Reads the module-level ``config``, ``df_comp`` and ``year``. For every pT range an 11-point
    sfBDT sequence is derived; the sequences and the scanned proxy fractions are stored into
    ``df1`` and dumped to a pickle file under ``plots/prep_pd``. Nothing is returned.

    Arguments:
        df1: dict of DataFrames as input; the results are added under the keys "bdt_seq_<pt_range name>" and "rat_pxy_<pt_range name>"
        sl_pxy: proxy sample list
        wgtstr: the weight string applied to proxy samples
        pxy_base_sel: base selection applied to the proxy samples before the scan. The 'c' category requirement is imposed here using fj_x_nb(c)hadrons
    """
    import pickle
    from scipy.interpolate import interp1d

    def _score_fractions(frame, score_var, weight_expr):
        # Weighted fractions per tagger-score bin, ordered as [below-LP, LP, MP, TP, LP+MP+TP]
        contents = get_hist(frame[score_var].values, bins=edges, weights=frame.eval(weight_expr).values).view().value
        return np.array([contents[0], contents[1], contents[2], contents[3], sum(contents[1:])]) / sum(contents)

    ## Bin edges follow the lower bounds of the tagger WPs
    wp_ranges = config['tagger']['working_points']['range']
    edges = [0.] + sorted([rg[0] for rg in wp_ranges.values()]) + [1.]
    rat_pxy, bdt_seq = {}, {}

    ## Proxy jets passing the base selection (fj_x_nb(c)hadrons define the 'c' category)
    pxy_all = pd.concat([df1[sam].query(pxy_base_sel) for sam in sl_pxy])

    for ptmin, ptmax in config['pt_range']['range']:
        print('pt range: ', ptmin, ptmax)

        ## Fractions for the "signal jets" in this pT range
        sig = df_comp['vhcc-2L'].query(f'ak15_pt>{ptmin} & ak15_pt<{ptmax}')
        rat_hcc = _score_fractions(sig, 'ak15_ParticleNetMD_HccVsQCD', 'genWeight*xsecWeight*puWeight')

        ## Fractions for the "proxy jets" while the sfBDT threshold is scanned upwards
        pxy = pxy_all.query(f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax}')
        bdt_range = np.arange(0.7, 0.996, 0.002)
        scanned = []
        for bdt in bdt_range:
            # thresholds only increase, so the previous result can be filtered further
            pxy = pxy.query(f'fj_x_sfBDT>{bdt}')
            rat = _score_fractions(pxy, 'fj_x_ParticleNetMD_XccVsQCD', wgtstr)
            rat_pxy[((ptmin,ptmax), np.round(bdt,3))] = rat
            scanned.append(rat)
        ratios = [[rat[j] for rat in scanned] for j in range(5)]

        ## sfBDT working points
        # Central WP: proxy LP+MP+TP fraction equals the signal one. The value itself is not used
        # below, but this call raises if the signal fraction lies outside the scanned proxy range.
        bdt_wp = interp1d(ratios[4], bdt_range)(rat_hcc[4])
        # Upper WP (4/5 of the way to the upper end of the sequence): proxy TP fraction equals the signal one
        bdt_wp_hi = interp1d(ratios[3], bdt_range)(rat_hcc[3])
        # Corresponding LP+MP+TP fractions at the two WPs
        rat_wp = rat_hcc[4]
        rat_wp_hi = interp1d(bdt_range, ratios[4])(bdt_wp_hi)
        # Arithmetic sequence of 11 fractions centred on rat_wp, mapped back to sfBDT values
        step = (rat_wp_hi - rat_wp) / 4
        rat_seq = np.linspace(rat_wp-step*5, rat_wp+step*5, 11)
        bdt_seq[(ptmin,ptmax)] = interp1d(ratios[4], bdt_range, fill_value="extrapolate")(rat_seq)
        print('BDT seq: ', bdt_seq[(ptmin,ptmax)])

    df1[f"bdt_seq_{config['pt_range']['name']}"] = bdt_seq
    df1[f"rat_pxy_{config['pt_range']['name']}"] = rat_pxy

    ## Store every bdt_seq*/rat_pxy* entry currently held in df1
    if not os.path.exists('plots/prep_pd'):
        os.makedirs('plots/prep_pd')
    stored = {k:df1[k] for k in df1 if k.startswith('bdt_seq') or k.startswith('rat_pxy')}
    with open(f'plots/prep_pd/bdt_seq_{year}.pickle', 'wb') as fw:
        pickle.dump(stored, fw)

## Run the sfBDT sequence extraction with the MG proxy sample list. The base selection keeps
## jets with no b hadron and at least one c hadron, and requires sfBDT>0.7.
extract_bdt_sequence(df1, sl_pxy = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht'],
                     wgtstr='genWeight*xsecWeight*puWeight*htwgt',
                     pxy_base_sel='(fj_x_nbhadrons==0) & (fj_x_nchadrons>=1) & (fj_x_sfBDT>0.7)')

In [None]:
### ================ 3-4. [additional] Reweight MC substitute to data on pT: stored as variable "ad_ptwgt", "ad_ptwgt_herwig" ===================

def extract_mc_to_data_pt_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname, sl_ext_rwgt=None):
    r"""Extract the "MC substitute to data" reweight factor on pT as an optional choice.

    The data/MC ratio of the ``fj_x_pt`` spectrum is computed separately for the 1st and the
    2nd jet (``fj_idx``) and written in place into the column ``wgtname`` of each listed MC
    sample. Nothing is returned.

    Arguments:
        df1: dict of DataFrames as input (the sample frames are modified in place)
        sl_rwgt: sample list for MC substitute used to derive the factors in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column
        sl_ext_rwgt: extra sample list that also receives the factors after they are extracted
            (these samples do not enter the derivation); None or omitted means no extra samples
    """
    # None instead of a shared mutable default list
    sl_ext_rwgt = [] if sl_ext_rwgt is None else sl_ext_rwgt

    # Apply simple 1D reweight to pT
    rwgt_var, nbin, xmin, xmax  = 'fj_x_pt', 20, 200., 1200.
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    def _flow_bin_index(val):
        # Index into the flow-inclusive histogram: 0 = underflow, 1..nbin = regular bins, nbin+1 = overflow
        return int(max(0, min(nbin+1, np.floor((val-1.*xmin)/(1.*xmax-xmin)*nbin) +1 )))

    ## Reweight separately on 1st/2nd jet
    for sel in ['fj_idx==1', 'fj_idx==2']:
        # Previously this extra factor is extracted with a presel of sfBDT>0.9. Now given that the sfBDT is optimized by pT range thus not fixed, we relax this cut
        _dffdata = df1['jetht-noht'].query(sel)
        _dffmc =  pd.concat([df1[sam].query(sel) for sam in sl_rwgt])

        ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
        ent_data = get_hist(_dffdata[rwgt_var].values, bins=rwgt_edge, weights=np.ones(_dffdata.shape[0]), underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ent_mc  = get_hist(_dffmc[rwgt_var].values, bins=rwgt_edge, weights=_dffmc.eval(wgtstr_rwgt).values, underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
        ## Calculate the reweight factor (bins without MC entries give inf/nan here)
        rwgt = ent_data / ent_mc # len=nbin+2

        ## Assign the reweight factor to the new column, only for the rows of this jet index
        for sam in sl_rwgt + sl_ext_rwgt:
            df1sel = df1[sam].eval(sel)
            df1[sam].loc[df1sel, wgtname] = df1[sam].loc[df1sel, rwgt_var].map(lambda val: rwgt[_flow_bin_index(val)])
        print (ent_data, rwgt)

## Derive two sets of pT reweight factors: one with the MG QCD sample list, one with the Herwig one
for _qcd_sam, _wgtname in [
    ('subst_qcd-mg-noht', 'ad_ptwgt'),
    ('subst_qcd-herwig-noht', 'ad_ptwgt_herwig'),
]:
    extract_mc_to_data_pt_weight(
        df1,
        sl_rwgt=[_qcd_sam, 'subst_top-noht', 'subst_v-qq-noht'],
        sl_ext_rwgt=['subst_qcd-mg-bflav-noht'],
        wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight",
        wgtname=_wgtname,
    )

## Show the weight columns stored so far (notebook cell output)
df1['subst_qcd-mg-noht'][['ht', 'fj_x_pt', 'fj_idx', 'htwgt', 'sfbdtwgt_g50', 'ad_ptwgt']]

In [None]:
### ================ 3-5. [additional] Reweight MC (proxy jet) to H->cc signal jet on either mass/pT/tau21: stored as variable "(mass|pt|tau21)datamcwgt"; similar for herwig  ===================

# First load the h->cc signal ntuple, adopting the selection used in the analysis.
# This repeats the loading step of section 3-3; the `globals()` guard skips it when `df_comp` already exists.
if 'df_comp' not in globals():  
    import re
    # Keep the part of `sample_prefix` up to its last "/trees" and read the 2L VH ntuple located below it
    _df0['vhcc-2L'] = uproot.open(f"{re.search('^.+/trees', sample_prefix)[0]}/20210117_VH_extjetvar_{year}_2L/mc/vhcc_tree.root")['Events'].pandas.df()

    # Boosted-topology requirements, embedded in the full 2L selection string below
    boosted = "v_pt>200 & ak15_pt>200 & dphi_V_ak15>2.5 & ak15_sdmass>50 & ak15_sdmass<200"
    basecut_vhcc_2L = "v_mass>75 & v_mass<105 & ((abs(lep1_pdgId)==11 & passTrigEl) | (abs(lep1_pdgId)==13 & passTrigMu)) & " + boosted + " & n_ak4<3"
    # `df_comp` holds the selected signal jets used as the comparison/reweighting target
    df_comp = {}
    df_comp['vhcc-2L'] = _df0['vhcc-2L'].query(basecut_vhcc_2L)

def extract_mc_to_signal_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname, rwgt_info, sl_ext_rwgt=None):
    r"""Extract the "MC substitute (proxy) to H->cc signal jet" reweight factor on a given variable.

    The unit-normalized spectrum of the signal jets (module-level ``df_comp['vhcc-2L']``) is
    divided by that of the MC proxy jets passing sfBDT>0.9. The per-bin ratio, clipped to
    [0, 50], is written in place into the column ``wgtname`` of every listed MC sample and of
    the data sample 'jetht-noht'. Nothing is returned.

    Arguments:
        df1: dict of DataFrames as input (the sample frames are modified in place)
        sl_rwgt: sample list for MC substitute used to derive the factors in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column
        rwgt_info: variable and binning info, as (variable name in the proxy samples, nbin, xmin, xmax, variable name in the signal sample)
        sl_ext_rwgt: extra sample list that also receives the factors after they are extracted
            (these samples do not enter the derivation); None or omitted means no extra samples
    """
    # None instead of a shared mutable default list
    sl_ext_rwgt = [] if sl_ext_rwgt is None else sl_ext_rwgt

    # Reweight info extracted from the function argument
    rwgt_var, nbin, xmin, xmax, rwgt_var_nom  = rwgt_info
    print('rwgt info: ', rwgt_var, nbin, xmin, xmax)
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    ## Requires the selection sfBDT>0.9, which is (on average) the one used in the fit region
    rwgt_sel = 'fj_x_sfBDT>0.9'

    ## Get MC and h->cc signal histogram. Note: consider underflow & overflow bins, hence len = nbins+2
    _dffmc =  pd.concat([df1[sam].query(rwgt_sel) for sam in sl_rwgt])
    _dffmc_wgt = _dffmc.eval(wgtstr_rwgt)
    ent_mc  = get_hist(_dffmc[rwgt_var].values, bins=rwgt_edge, weights=_dffmc_wgt.values, underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
    yield_mc = _dffmc_wgt.sum()
    _dffhcc_wgt = df_comp['vhcc-2L'].eval('genWeight*xsecWeight*puWeight')
    ent_hcc  = get_hist(df_comp['vhcc-2L'][rwgt_var_nom].values, bins=rwgt_edge, weights=_dffhcc_wgt.values, underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
    yield_hcc = _dffhcc_wgt.sum()

    ## Calculate the reweight factor from the unit-normalized shapes, and clip to (0, 50)
    rwgt = (ent_hcc/yield_hcc) / (ent_mc/yield_mc) # len=nbin+2
    rwgt = np.clip(rwgt, 0, 50)

    def _flow_bin_index(val):
        # Index into the flow-inclusive histogram: 0 = underflow, 1..nbin = regular bins, nbin+1 = overflow
        return int(max(0, min(nbin+1, np.floor((val-1.*xmin)/(1.*xmax-xmin)*nbin) +1 )))

    ## Assign the reweight factor to the new column (to both MC and data); all rows get a value, not only those passing rwgt_sel
    for sam in sl_rwgt + sl_ext_rwgt + ['jetht-noht']:
        df1[sam][wgtname] = df1[sam][rwgt_var].map(lambda val: rwgt[_flow_bin_index(val)])
    print (ent_hcc, rwgt)

## For each reweight variable (mass, pT, tau21), derive two sets of reweight factors:
## one with the MG QCD sample list (no suffix) and one with the Herwig one ("_herwig" suffix)
for _var_tag, _rwgt_info in [
    ('mass', ('fj_x_sdmass', 15, 50, 200, 'ak15_sdmass')),
    ('pt', ('fj_x_pt', 20, 200, 1200, 'ak15_pt')),
    ('tau21', ('fj_x_tau21', 20, 0, 1, 'ak15_tau21')),
]:
    for _qcd_sam, _suffix in [('subst_qcd-mg-noht', ''), ('subst_qcd-herwig-noht', '_herwig')]:
        extract_mc_to_signal_weight(
            df1,
            sl_rwgt=[_qcd_sam, 'subst_top-noht', 'subst_v-qq-noht'],
            sl_ext_rwgt=['subst_qcd-mg-bflav-noht'],
            wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*htwgt{_suffix}",
            wgtname=f"{_var_tag}datamcwgt{_suffix}",
            rwgt_info=_rwgt_info,
        )

## Show the reweight variables next to their new weight columns for data (notebook cell output)
df1['jetht-noht'][['fj_x_sdmass', 'massdatamcwgt', 'fj_x_pt', 'ptdatamcwgt', 'fj_x_tau21', 'tau21datamcwgt']]

In [None]:
# ## Read from pre-stored dataframe
# year=2018
# lumi = {2016: 35.92, 2017: 41.53, 2018: 59.74}

# import pickle
# with open(f'bak-pnV02-bdtoptimV3-{year}.pickle', 'rb') as f:
#     df1 = pickle.load(f)
# sample_prefix = f'/home/pku/licq/hcc/samples/trees_sf/hqu/20210102_pnV02_ak15_qcd_{year}'
# _df0 = {}

## 4. Make ROOT templates

We produce the ROOT templates from the DataFrame in this step. The outputs are ROOT files arranged in a well-defined directory structure. After further reorganization, they can be used as the input to Higgs Combine to perform the fit.

As a reference, we provide an example of the output files and their structure. 
E.g., for a **given fit variable**, **given tagger WP** and a **certain jet-pT bin** for **a single fit**, the output ROOT templates should include the pass and fail MC template in the B/C/L flavors, the data template, and the MC systematics for all specified shape uncertainties. The files are organized in the following structure:
```
─── 20210315_SF2018_AK15_qcd_ak_pnV02_HP_msv12_dxysig_log_var22binsv2  [use variable: msv12_dxysig_log, Tight WP]
    └── Cards
        └── pt250to350   [given pT bin]
            ├── bdt719   [the sfBDT cut points]
            │   ├── nominal                    [the nominal histograms]
            │   │   ├── inputs_fail.root           [include four TH1D: flvC, flvB, flvL, data_obs]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBDown                 [shape uncertainty plots]
            │   │   ├── inputs_fail.root           [include three TH1D: flvC_fracBBDown, flvB_fracBBDown, flvL_fracBBDown]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fitVarRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fitVarRwgtUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   └── sfBDTRwgtUp
            │       ├── inputs_fail.root
            │       └── inputs_pass.root
            └── bdt752
                ├── nominal
                │   ├── ...
```

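The template making is organized in nested functions: a launcher (`launch_maker`, depth 0) that drives three nested steps (`wrapperPt`, `makeTemplatesWrapper`, and `makeTemplates`; depths 1–3).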

In [None]:
#### =========================================================================== Global parameters =========================================================================== ####
# All `g_*` names in this cell are module-level settings read by the template-making functions.
# The bare r"""...""" strings following the assignments are plain string expressions kept as inline notes.

# Run mode of the template making; validated by check_consistency().
# NOTE: check_consistency() also accepts 'val_vary_sfbdt', which is not listed in the options below.
g_make_template_mode = 'main'
r"""Options:
        main           : the main fit
        val_pt         : the validation fit -- to use an optional MC subsitute-to-data strategy, i.e. on pT variable only
        val_tosig_mass : the validation fit -- additionally reweight MC & data to h->cc signal jet on mass
        val_tosig_pt   : the validation fit -- additionally reweight MC & data to h->cc signal jet on pt  
        val_tosig_tau21: the validation fit -- additionally reweight MC & data to h->cc signal jet on tau21
        val_crop_bin   : the validation fit -- cropping the marginal bins for fit
"""

# check_consistency() derives g_outdir_prefix_used from this prefix by appending the
# working-point set name and, for validation modes, the mode name.
g_outdir_prefix = f'20210315_SF{year}_AK15_qcd'
r"""Prefix for the output dir name """

# makeTemplatesWrapper() produces the templates of a type only if its key is present here with a truthy value.
g_make_unce_types = {'nominal':True, 'pu':True, 'fracBB':True, 'fracCC':True, 'fracLight':True, 'psWeightIsr':True, 'psWeightFsr':True, 'sfBDTRwgt':True, 'fitVarRwgt':True}
r"""The uncertainty types used in the fit. Use False or remove the key to disable an certain unce type
    Note: "qcdSyst" and "qcdKdeSyst" is not used in this verision. "psWeightIsr" and "psWeightFsr" works fine in 2018 while in 2016/17 one need to first garantee the 2018 histograms exist
          so the unce can be transferred.
"""

# Keys of g_fitinfo to run over; check_consistency() may override this list in validation modes.
g_do_fit_for_var = [1, 2, 3]
r""" Do fit for which variable"""

# check_consistency() forces 'central' for every 'val_*' mode.
g_mode_bdt_runlist = 'all'
r"""Mode of BDT list for the run. Set 'all' for all 11 BDT values, or 'central' for the central BDT value only"""

# List of pT ranges; each entry is converted to a tuple and used as a key of g_do_sfBDT_points in wrapperPt().
g_pt_range = config['pt_range']['range']
r"""pT range for define separate fit points"""

# Working-point ranges (dict: WP name -> range, whose elements [0] and [1] bound the "pass" region) and the tagger score variable.
# NOTE: the note below reads "Trigger info", but both values come from config['tagger'], i.e. they are tagger settings.
g_tagger_range = config['tagger']['working_points']['range']
g_tagger_var = config['tagger']['var']
r"""Trigger info"""

# When True, the sample 'subst_qcd-mg-bflav-noht' is processed in addition to the default sample list.
g_use_bflav = True
r"""Use additional B flavor MC samples to improve the statistics for the 'b' catogory"""

# check_consistency() requires year==2018 whenever this is not None, and then reads the sfBDT points of the given year from a pickle file.
g_mode_psWeight_run_templ = None
r"""Set None for the normal run. If set to 2016 or 2017, produce the 2018 templates for psWeightIsr/Fsr unce that can be migarated to 2016/2017 conditions. sfBDT cut value set under the 2016/2017 condition."""

# makeTemplates() skips creating the output directory when this is True.
g_dryrun = False
r"""Launch a test process only without writing the ROOT template files"""

#### ===================================================================================================================================================================================== ####

## Fit info: in the format of [ (fit var, nbins/edges, xmin/None, xmax/None, (underflow, overflow), label), outputdir lambda func ]
## - The second tuple element is either an int (number of uniform bins between xmin and xmax) or an explicit list of bin edges
##   (then xmin/xmax are None), see the unpacking in makeTemplates().
## - The output-dir lambda labels the sfBDT cut with int(bdt*1000); int() truncates, so floating-point round-off
##   in bdt*1000 can lower the label by one.
g_fitinfo = {
    1: [ ##  main fit var
        ('mSV12_dxysig_log', [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, (True, True), 'mSV12_dxysig_log'), 
        lambda prefix, wp, bdt, pt_range, sys_name: f'results/{prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/pt{pt_range[0]}to{pt_range[1]}/bdt{int(bdt*1000)}/{sys_name}/'
    ],
    2: [ ## the other var for validation
        ('mSV12_ptmax_log', [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, (True, True), 'mSV12_ptmax_log'), 
        lambda prefix, wp, bdt, pt_range, sys_name: f'results/{prefix}_{wp}_msv12_ptmax_log_var22binsv2/Cards/pt{pt_range[0]}to{pt_range[1]}/bdt{int(bdt*1000)}/{sys_name}/'
    ],
    3: [ ## the other var for validation
        ('fj_x_btagcsvv2', [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, (True, True), 'CSVv2'), 
        lambda prefix, wp, bdt, pt_range, sys_name: f'results/{prefix}_{wp}_csvv2_var22binsv2/Cards/pt{pt_range[0]}to{pt_range[1]}/bdt{int(bdt*1000)}/{sys_name}/'
    ],
    901: [ ## crop the marginal bins for the main var as a validation
        ## Same output path pattern as entry 1 and no underflow/overflow; in 'val_crop_bin' mode the
        ## prefix passed in already carries the mode name (see check_consistency), which keeps the outputs apart
        ('mSV12_dxysig_log', [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8], None, None, (False, False), 'mSV12_dxysig_log'), 
        lambda prefix, wp, bdt, pt_range, sys_name: f'results/{prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/pt{pt_range[0]}to{pt_range[1]}/bdt{int(bdt*1000)}/{sys_name}/'
    ],
}

## Necessary KDE parameters used in qcdKdeSyst unce (keyed by fit variable name)
g_custom_kde_bw = {'fj_x_btagcsvv2':15, 'mSV12_ptmax_log':4, 'mSV12_dxysig_log':4}
g_custom_kde_binmask = {'fj_x_btagcsvv2':[0], 'mSV12_ptmax_log':[-0.4,1.8,2.5,3.2], 'mSV12_dxysig_log':[-0.8,-0.4,1.8,2.5]}

## Some other global vars
## g_do_sfBDT_points and g_outdir_prefix_used are filled by check_consistency();
## the three dicts below are reset at the start of every makeTemplatesWrapper() call
g_do_sfBDT_points = None
g_outdir_prefix_used = None
g_hist_qcdsyst = {}
g_wgtstr_dm_sys_fac = {}
g_hist_fitvar_rwgt = {}

def check_consistency(): ## Consistency check for global params
    r"""Validate the global parameters and derive the run-time globals that depend on them.

    Side effects on module-level names:
        g_do_fit_for_var: forced to [1] for the reweighting validation modes, to [901] for 'val_crop_bin'
        g_mode_bdt_runlist: forced to 'central' for every 'val_*' mode
        g_do_sfBDT_points: dict of pT range -> sfBDT cut values to run over
        g_outdir_prefix_used: output dir prefix including the WP set name (and mode / psWeight tags)
        g_make_unce_types: restricted to nominal + psWeightIsr/Fsr when g_mode_psWeight_run_templ is set

    Raises:
        AssertionError: for an unknown template mode or an inconsistent psWeight setting
        RuntimeError: for an unknown g_mode_bdt_runlist
    """
    # Every global that is (re)assigned below must be declared, otherwise the assignment only creates a local
    global g_do_fit_for_var, g_mode_bdt_runlist, g_do_sfBDT_points, g_outdir_prefix_used, g_make_unce_types

    assert g_make_template_mode in ['main', 'val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_vary_sfbdt', 'val_crop_bin'], \
        'Specified mode cannot be recognized.'

    if g_make_template_mode in ['val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21'] and g_do_fit_for_var != [1]:
        print('Warning: for validation fit, set the fit information to the main variable (1) only')
        g_do_fit_for_var = [1]
    # g_do_fit_for_var is a list, so it is compared directly (it has no .keys())
    if g_make_template_mode == 'val_crop_bin' and g_do_fit_for_var != [901]:
        print('Warning: for validation fit on cropping the marginal bins, set the fit information to the cropped main variable (901) only')
        g_do_fit_for_var = [901]

    if g_make_template_mode.startswith('val_') and g_mode_bdt_runlist != 'central':
        print('Warning: for validation fit, set the BDT run list to central')
        g_mode_bdt_runlist = 'central'

    if g_mode_bdt_runlist == 'all':
        g_do_sfBDT_points = df1[f"bdt_seq_{config['pt_range']['name']}"]
    elif g_mode_bdt_runlist == 'central':
        _points = df1[f"bdt_seq_{config['pt_range']['name']}"]
        # keep only the middle element of each sequence, wrapped in a one-element list
        g_do_sfBDT_points = {k:[_points[k][int((len(_points[k])-1)/2)]] for k in _points}
    else:
        raise RuntimeError('Specified mode for BDT runlist cannot be recognized.')

    g_outdir_prefix_used = g_outdir_prefix + '_' + config['tagger']['working_points']['name']
    if g_make_template_mode.startswith('val_'):
        g_outdir_prefix_used += '_-' + g_make_template_mode + '-'

    if g_mode_psWeight_run_templ is not None:
        # The notebook itself must run on 2018; g_mode_psWeight_run_templ names the year (2016/2017)
        # whose sfBDT cut values are adopted
        assert year==2018, 'g_mode_psWeight_run_templ only set for year 2016/2017'
        assert int(g_mode_psWeight_run_templ) in [2016, 2017], 'g_mode_psWeight_run_templ can only be 2016 or 2017'
        import pickle
        # NOTE(review): extract_bdt_sequence in this notebook writes to 'plots/prep_pd/', while this
        # reads from 'plots/prep/' -- confirm which location is intended
        with open(f'plots/prep/bdt_seq_{g_mode_psWeight_run_templ}.pickle', 'rb') as f:
            g_do_sfBDT_points = pickle.load(f)[f"bdt_seq_{config['pt_range']['name']}"]
        g_outdir_prefix_used += f"_psWeight{g_mode_psWeight_run_templ}"
        g_make_unce_types = {'nominal':True, 'psWeightIsr':True, 'psWeightFsr':True}

def launch_maker():
    r"""Depth 0: Entry point that launches the template making from the global parameters.

    Runs check_consistency() first. Then, for every requested fit variable and every tagger
    working point, assembles the argument dict and passes it on to wrapperPt (depth 1).
    """
    check_consistency()

    ## Mode-specific weights: (term(s) appended to the common MC weight string, weight string for data).
    ## Any mode not listed here uses the main-fit setting.
    wgt_by_mode = {
        'val_pt':          ('*ad_ptwgt', None),
        'val_tosig_mass':  ('*htwgt*massdatamcwgt', 'massdatamcwgt'),
        'val_tosig_pt':    ('*htwgt*ptdatamcwgt', 'ptdatamcwgt'),
        'val_tosig_tau21': ('*htwgt*tau21datamcwgt', 'tau21datamcwgt'),
    }
    wgt_main = ('*htwgt', None)

    print('Launch variablel list:', g_do_fit_for_var)
    for _ifit in g_do_fit_for_var:
        for _wp in g_tagger_range:

            ## Fit info and the output-dir function of this fit variable
            fitinfo, outdir_func = g_fitinfo[_ifit]

            mc_wgt_tail, data_wgt = wgt_by_mode.get(g_make_template_mode, wgt_main)
            wp_lo, wp_hi = g_tagger_range[_wp][0], g_tagger_range[_wp][1]

            args = {
                ## weight strings for MC and for data (None: no weight string for data)
                'wgtstr_dm': f'genWeight*xsecWeight*puWeight*{lumi[year]}' + mc_wgt_tail,
                'wgtstr_dm_data': data_wgt,
                ## sample lists (MG and Herwig flavours); the data sample comes last
                'sl_dm': ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                'sl_dm_herwig': ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                ## selection string of each flavour category
                'config_dm': {
                    'data':  '',
                    'flvB':  'fj_x_nbhadrons>=1',
                    'flvC':  'fj_x_nbhadrons==0 & fj_x_nchadrons>=1',
                    'flvL':  'fj_x_nbhadrons==0 & fj_x_nchadrons==0',
                },
                'categories_dm': ['flvL', 'flvB', 'flvC', 'data'],
                ## pass: tagger score inside (lo, hi] of this working point; fail: everything else
                'catMap': {
                    'pass': f'{g_tagger_var}>{wp_lo:.3f} & {g_tagger_var}<={wp_hi:.3f}',
                    'fail': f'{g_tagger_var}<={wp_lo:.3f} | {g_tagger_var}>{wp_hi:.3f}',
                },
                'use_bflav': g_use_bflav,
                'args_bflav': {
                    'sl_dm_bflav': ['subst_qcd-mg-bflav-noht'], 'sl_dm_bflav_orig': ['subst_qcd-mg-noht'],
                    'wgtstropt_bflav': lambda s: s.replace('fj_x_htwgt', '(fj_x_htwgt*fj_x_bflav_htwgt)'),
                },
            }

            def _outdir_for(bdt, pt_range, sys_name):
                # bind the prefix and the working point; the remaining arguments are supplied downstream
                return outdir_func(g_outdir_prefix_used, _wp, bdt, pt_range, sys_name)

            wrapperPt(df1, fitinfo, _outdir_for, args)

In [None]:
def wrapperPt(df2, fitinfo, outdir_func, args):
    r"""Depth 1: Apply the pT and sfBDT cuts, then hand over to makeTemplatesWrapper (depth 2).

    Arguments:
        df2: dict of DataFrames keyed by sample name
        fitinfo: fit-variable info, passed through unchanged
        outdir_func: callable (sfBDT value, pT range, systematic name) -> output dir
        args: argument dict built in launch_maker; only 'use_bflav' is read here, the rest is passed through
    """
    print('Launch pT range:', g_pt_range)
    for pt_range in g_pt_range:
        pt_range = tuple(pt_range)
        print ('pt range:', pt_range)

        ## Samples to process; the b-flavour sample only when requested
        samples = ['subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
        if args['use_bflav']:
            samples.append('subst_qcd-mg-bflav-noht')

        ## df2->df2a: apply the pT cut once per pT range (to speed up)
        pt_cut = f'fj_x_pt>={pt_range[0]} & fj_x_pt<{pt_range[1]}'
        df2a = {sam: df2[sam].query(pt_cut) for sam in samples}

        ## sfBDT cut values for this pT range; a dict is reduced to its values
        sfBDT_list = g_do_sfBDT_points[pt_range]
        if isinstance(sfBDT_list, dict):
            sfBDT_list = sfBDT_list.values()

        for sfBDT_val in sfBDT_list:
            print(' sfBDT cut at:', sfBDT_val)

            ## df2a->df3: apply the corresponding bdt cut (the pT cut is repeated in the query string)
            bdt_cut = pt_cut + f' & fj_x_sfBDT>{sfBDT_val}'
            df3 = {sam: df2a[sam].query(bdt_cut) for sam in samples}

            def _outdir_for(sys_name):
                # bind the current sfBDT value and pT range
                return outdir_func(sfBDT_val, pt_range, sys_name)

            makeTemplatesWrapper(df3, fitinfo, _outdir_for, sfBDT_val, args)

In [None]:
def makeTemplatesWrapper(df3, fitinfo, outdir_func, sfBDT_val, args):
    r"""Depth 2: Decide which templates (nominal or any shape uncertainty) to make, and make them.

    For every uncertainty type enabled in g_make_unce_types, calls makeTemplates (depth 3)
    once for the nominal case, or once each for the Up and Down variations. The module-level
    dicts g_wgtstr_dm_sys_fac, g_hist_qcdsyst and g_hist_fitvar_rwgt are reset first.

    Arguments:
        df3: dict of DataFrames after the pT and sfBDT cuts
        fitinfo: fit-variable info, passed through unchanged
        outdir_func: callable (systematic name) -> output dir
        sfBDT_val: the sfBDT cut value (not used in this function)
        args: argument dict; 'wgtstr_dm' is the nominal MC weight string
    """
    global g_wgtstr_dm_sys_fac, g_hist_qcdsyst, g_hist_fitvar_rwgt
    g_wgtstr_dm_sys_fac, g_hist_qcdsyst = {}, {} ## clear
    g_hist_fitvar_rwgt = {}

    wgtstr_dm = args['wgtstr_dm']
    directions = ('Up', 'Down')

    def _enabled(unce):
        # a type is made only when its key exists and its value is truthy
        return unce in g_make_unce_types.keys() and g_make_unce_types[unce]

    def _make(sys_name, wgtstr_dm_sys, **extra):
        makeTemplates(df3, fitinfo, outdir_func(sys_name), sys_name, wgtstr_dm_sys, args, **extra)

    if _enabled('nominal'):
        _make('nominal', wgtstr_dm)

    ## Below we extract hists for all unce type
    ## Pileup: swap the pileup weight for its Up/Down version
    if _enabled('pu'):
        for direction in directions:
            _make('pu' + direction, wgtstr_dm.replace('puWeight', 'puWeight' + direction))

    ## Flavour fractions: scale the selected jets by 1.2 (Up) / 0.8 (Down), all others by 1.0
    frac_wgt_templates = {
        'fracBB': '*({fac}*(fj_x_nbhadrons>1) + 1.0*(fj_x_nbhadrons<=1))',
        'fracCC': '*({fac}*(fj_x_nbhadrons==0 & fj_x_nchadrons>1) + 1.0*(not(fj_x_nbhadrons==0 & fj_x_nchadrons>1)))',
        'fracLight': '*({fac}*(fj_x_nbhadrons==0 & fj_x_nchadrons==0) + 1.0*(not(fj_x_nbhadrons==0 & fj_x_nchadrons==0)))',
    }
    for unce, template in frac_wgt_templates.items():
        if _enabled(unce):
            for direction, fac in zip(directions, ('1.2', '0.8')):
                _make(unce + direction, wgtstr_dm + template.format(fac=fac))

    ## The types below cannot be obtained by just changing the weight string: the nominal one is passed
    ## and any *special treatment* is left to the depth-3 function
    for unce in ('qcdSyst', 'qcdKdeSyst', 'psWeightIsr', 'psWeightFsr'):
        if _enabled(unce):
            for direction in directions:
                _make(unce + direction, wgtstr_dm)

    ## sfBDT reweighting: the factors are decided by the special_wgtstr argument
    if _enabled('sfBDTRwgt'):
        for direction in directions:
            _make('sfBDTRwgt' + direction, wgtstr_dm, special_wgtstr='sfbdtwgt_g50')

    if _enabled('fitVarRwgt'):
        for direction in directions:
            _make('fitVarRwgt' + direction, wgtstr_dm)
    

In [None]:
def makeTemplates(df3, fitinfo, outputdir, sys_name, wgtstr_dm_sys, args, special_wgtstr=None):
    r"""Depth 3: the base implementation that applies the final pass/fail cut and writes the fit templates.

    For each region in ('pass', 'fail') the file ``outputdir + 'inputs_<region>.root'`` is recreated.
    It holds one TH1D per MC category, named ``<cat>`` for the nominal case and ``<cat>_<sys_name>``
    otherwise, plus the data histogram ``data_obs`` in the nominal case only.

    Arguments:
        df3 (dict): sample name -> pandas DataFrame.
        fitinfo (tuple): (vname, nbin, xmin, xmax, (underflow, overflow), vlabel). ``nbin`` is either an
            int (uniform binning on [xmin, xmax]) or a sequence of bin edges.
        outputdir (str): output directory. It is concatenated directly with the file name, so it is
            expected to end with a path separator.
        sys_name (str): 'nominal' or the name of the systematic variation ('qcdSystUp', 'psWeightIsrDown', ...).
        wgtstr_dm_sys (str): MC weight expression (evaluated with ``DataFrame.eval``) for this variation.
        args (dict): must provide 'wgtstr_dm', 'wgtstr_dm_data', 'sl_dm', 'sl_dm_herwig', 'config_dm',
            'categories_dm' and 'catMap'; may provide 'use_bflav' and 'args_bflav'.
            The last entry of 'sl_dm' / 'sl_dm_herwig' is taken as the data sample, all others as MC.
            'config_dm' maps a category name to its selection string ('data' marks the data category),
            'catMap' maps the region name ('pass'/'fail') to its selection string.
        special_wgtstr (str, optional): extra weight factor defining the variation (used by sfBDTRwgt).
            The "Up" template uses ``wgtstr_dm * special_wgtstr`` with its own overall factor; the "Down"
            template is the mirror ``2*nominal - Up``.

    Module-level names read: g_dryrun, g_custom_kde_bw, g_custom_kde_binmask, g_sfBDT_val_list, year.
    Module-level dicts read and updated: g_wgtstr_dm_sys_fac, g_hist_fitvar_rwgt, g_hist_qcdsyst.
    """
    
    wgtstr_dm, wgtstr_dm_data, sl_dm, sl_dm_herwig, config_dm, categories_dm, catMap = args['wgtstr_dm'], args['wgtstr_dm_data'], args['sl_dm'], args['sl_dm_herwig'], args['config_dm'], args['categories_dm'], args['catMap']
    
    ## Create the output directory
    if not os.path.exists(outputdir) and not g_dryrun:
        os.makedirs(outputdir)

    import ROOT, array  ## use ROOT to write file...
    vname, nbin, xmin, xmax, (underflow, overflow), vlabel = fitinfo
    ## Transfer the {nbin, xmin, xmax} set to the real bin edges if necessary
    if not isinstance(nbin, int):
        edges = nbin
        nbin = len(edges)-1 # reset nbin to "real" nbin
        edges_inroot = (len(edges)-1, array.array('f', edges))
    else:
        edges = np.linspace(xmin, xmax, nbin+1)
        edges_inroot = (nbin, xmin, xmax)

    ## Impose the overall factor between MC and data
    def extract_factor_overal(_sl, _wgtstr):
        # number of data events divided by the summed MC weights, rounded to 4 digits
        return np.round(df3[_sl[-1]].shape[0] * 1. / sum([df3[sam].eval(_wgtstr).sum() for sam in _sl[:-1]]), 4)
    
    if special_wgtstr is None: ## no special weight string provided -> use the nominal one
        if any([_sys in sys_name for _sys in ['qcdSyst','qcdKdeSyst']]): # note that qcd syst uses the setting of the herwig sample
            fac_overal = g_wgtstr_dm_sys_fac['qcdSystUp'] if 'qcdSystUp' in g_wgtstr_dm_sys_fac else \
                         g_wgtstr_dm_sys_fac['qcdKdeSystUp'] if 'qcdKdeSystUp' in g_wgtstr_dm_sys_fac else None
            if fac_overal is None:
                fac_overal = extract_factor_overal(sl_dm_herwig, wgtstr_dm.replace('htwgt','htwgt_herwig'))
        else:  # nominal case
            fac_overal = g_wgtstr_dm_sys_fac['nominal'] if 'nominal' in g_wgtstr_dm_sys_fac else None
            if fac_overal is None:
                fac_overal = extract_factor_overal(sl_dm, wgtstr_dm)
        # equip the weight factor (and cache it so that later variations reuse the same value)
        g_wgtstr_dm_sys_fac[sys_name] = fac_overal
        wgtstr_dm_sys = wgtstr_dm_sys+f'*{fac_overal}'

    else: ## special weight string specified
        if sys_name.endswith('Up'):
            fac_overal = extract_factor_overal(sl_dm, wgtstr_dm+f'*{special_wgtstr}')
            # equip the weight factor
            g_wgtstr_dm_sys_fac[sys_name] = fac_overal
            wgtstr_dm_sys = wgtstr_dm+f'*{special_wgtstr}*{fac_overal}'
        else:
            # "Down" is the mirror of "Up" around the nominal: w * (2*f_nom - special*f_up)
            wgtstr_dm_sys = wgtstr_dm+f"*(2*{g_wgtstr_dm_sys_fac['nominal']}-{special_wgtstr}*{g_wgtstr_dm_sys_fac[sys_name.replace('Down','Up')]})"

    print (fitinfo, outputdir, sys_name, wgtstr_dm_sys)
    
    ## Preprocess for fitVarRwgt: per-bin data/MC ratio before the pass/fail split
    if sys_name == 'fitVarRwgtUp':
        _df_mc = pd.concat([df3[sam] for sam in sl_dm[:-1]])
        _df_data = df3[sl_dm[-1]]
        _h_data = get_hist(_df_data[vname].values, bins=edges, weights=np.ones(_df_data.shape[0]) if wgtstr_dm_data is None else _df_data.eval(wgtstr_dm_data).values, underflow=underflow, overflow=overflow).view(flow=True)
        _h_mc = get_hist(_df_mc[vname].values, bins=edges, weights=_df_mc.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
        g_hist_fitvar_rwgt[sys_name] = _h_data.value / _h_mc.value
    
    ## Loop over pass and fail region
    for b in ['pass', 'fail']:
        ## Bind fw before entering the try block, so that the cleanup below never touches an
        ## unbound (or stale, from the previous region) file handle if the file creation raises
        fw = None
        try:
            if not g_dryrun:
                fw = ROOT.TFile(outputdir+f'inputs_{b}.root', 'recreate')
            
            hv, hist = {}, {}
            hname_suf = '_'+sys_name if sys_name!='nominal' else ''  ## suffix to the hist name (the Higgs Combine syntax)
            print (' -- ', catMap[b])
            
            ## MC and data dataframe after applying the final selection
            df_mc = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm[:-1]])
            df_data = df3[sl_dm[-1]].query(catMap[b])
            
            ## Preprocessing for herwig related dataframe if we mean to calculate qcdSyst / qcdKdeSyst unce in this iteration
            if 'qcdSyst' in sys_name or 'qcdKdeSyst' in sys_name:
                df_mc_herwig = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm_herwig[:-1]])

            # Loop over categories: flvC/flvB/flvL/data
            for cat in config_dm:
                ## hv[] holds the boosted-histogram type derived from the dataframe, hist[] holds the TH1D type to be stored in ROOT
                if cat=='data' and sys_name == 'nominal':
                    ## Get the data hist
                    hv['data'] = get_hist(df_data[vname].values, bins=edges, weights=np.ones(df_data.shape[0]) if wgtstr_dm_data is None else df_data.eval(wgtstr_dm_data).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist['data'] = ROOT.TH1D('data_obs', 'data_obs;'+vname, *edges_inroot) 
                if cat!='data':
                    df_mc_tmp = df_mc.query(config_dm[cat]) ## category selection based on flavor
                    ## Get the MC hist for certain flavor
                    hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist[cat] = ROOT.TH1D(cat+hname_suf, cat+hname_suf+';'+vname, *edges_inroot) # init TH1 hist
                    hist[cat].Sumw2()
            
                    ## For qcdSyst / qcdKdeSyst unce that is actually related to Herwig, hv[cat] is dummy here, 
                    ## and we mean to obtain hv[cat+'_herwig.value'] that will be later filled into hist[cat]
                    if sys_name=='qcdSystUp':
                        ## Get the Herwig fit for certain flavor
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat]) ## cat selection
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g50','sfbdtwgt_g50_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv[cat+'_herwig.value'] = get_hist(df_mc_herwig_tmp[vname].values, bins=edges, 
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 
                                                     underflow=underflow, overflow=overflow).view(flow=True).value
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
                    
                    ## Extract the KDE shape directly from herwig shape
                    if sys_name=='qcdKdeSystUp':
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat])
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g50','sfbdtwgt_g50_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv_herwig_orig_value = get_hist(df_mc_herwig_tmp[vname].values, bins=edges, 
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 
                                                     underflow=underflow, overflow=overflow).view(flow=True).value
                        
                        ## Calculate KDE shape, apply two times so that we specify a finer KDE bandwidth based on the first result
                        ## (negative weights are clipped to zero before being passed to the KDE)
                        from scipy.stats import gaussian_kde
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf))
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf), bw_method=kde.factor/g_custom_kde_bw[vname])
                        kde_int = np.zeros([nbin, 2])  # column 0: KDE integral, column 1: original herwig yield
                        
                        ## Integrate the KDE function to obtain KDE histogram
                        for i, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
                            if low in g_custom_kde_binmask[vname]:
                                continue  # masked bin: both columns stay 0
                            kde_int[i] = [kde.integrate_box_1d(low, high), hv_herwig_orig_value[i]]
                        # print('rescale kde sum to original herwig sum: ', kde_int[:,1].sum() / kde_int[:,0].sum())
                        ## Rescale the KDE yield to the original herwig yield (summed over the unmasked bins)
                        kde_int[:,0] *= kde_int[:,1].sum() / kde_int[:,0].sum()
                        
                        ## Fill with original madgraph hist if we plan to mask the bin for KDE. 
                        ## This is based on the fact that KDE cannot model the hist well in the marginal bins
                        hv[cat+'_herwig.value'] = np.array([kde_int[i][0] if kde_int[i][0]!=0 else hv[cat].value[i] for i in range(nbin)])
                        
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
            
                    ## Extract the PSWeight histogram
                    if 'psWeight' in sys_name:
                        if year==2018:  ## for 2018, calculate the hist by PSWeight vars 
                            ps_idx = {'psWeightIsrUp':2, 'psWeightIsrDown':0, 'psWeightFsrUp':3, 'psWeightFsrDown':1}
                            hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys+f'*PSWeight{ps_idx[sys_name]+1}').values, underflow=underflow, overflow=overflow).view(flow=True)
                        else:  ## for 2016/17 extract the PSWeight hist based on 2018 result (transfer the ratio for PSWeight/nominal)
                            import re
                            ## Raw f-string: '\g<N>' must reach re.sub as a literal group reference
                            outputdir_ps_18 = re.sub('^(.+)_SF201[6-8]_(.*)_([A-Z]P_.*)$', rf'\g<1>_SF2018_\g<2>_psWeight{year}_\g<3>', outputdir)
                            hv_nom_18 = uproot.open(outputdir_ps_18.replace(sys_name, 'nominal')+f'inputs_{b}.root')[cat]
                            hv_ps_18 = uproot.open(outputdir_ps_18+f'inputs_{b}.root')[cat+'_'+sys_name]
                            hv[cat].value *= hv_ps_18.values / hv_nom_18.values
                        # print (hv[cat].value)
                    
                    ## Extract the sfBDTFloAround histogram.
                    ## Method: to utilize the nominal hist for sfbdt>0.95 or 0.85 and migrate the MC-to-data confidence level in the 0.90 case
                    if 'sfBDTFloAround' in sys_name:
                        from scipy.stats import chi2
                        hv_data = uproot.open(outputdir.replace(sys_name, 'nominal')+f'inputs_{b}.root')['data_obs'].values  ## nominal data hist for 0.90
                        _bdtname = '95' if 'Up' in sys_name else '85'
                        fr = uproot.open(outputdir.replace(sys_name, 'nominal').replace(f'/bdt{int(g_sfBDT_val_list[-1]*1000)}/',f'/bdt{_bdtname}0/')+f'inputs_{b}.root')
                        fr_data, fr_mc = fr['data_obs'].values, fr['flvC'].values+fr['flvB'].values+fr['flvL'].values  ## nominal data & MC hist for 0.95 or 0.85 (depends on Up or Down)
                        
                        ## For each bins, migrate the confidence level of MC yield F0 given data yield D0 to the target data yield D => F
                        hv_mc = []
                        for D, D0, F0 in zip(hv_data, fr_data, fr_mc):
                            ## The precise calculation
                            F = 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0+2), 2*D+2) if F0>D0 else 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0), 2*D)
                            if F == np.inf: ## in case the formula results in inf (may occur if F0 >> D0)
                                assert F0 > D0
                                ## Fall back to a linear scaling of the deviation by the ratio of upper errors
                                sigD0 = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D0+2) - D0
                                sigD = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D+2) - D
                                F = D + sigD/sigD0*(F0-D0)
                            hv_mc.append(F)
                        
                        ## Obtain flavor template based on the flavor proportion in 0.95 or 0.85 region
                        hv[cat].value = np.nan_to_num(hv_mc * fr[cat].values / fr_mc, nan=0)
                    
                    ## Modify hv[cat] based on extracted pass+fail histogram
                    if 'fitVarRwgt' in sys_name:
                        if sys_name == 'fitVarRwgtUp':
                            hv[cat].value = hv[cat].value * g_hist_fitvar_rwgt['fitVarRwgtUp']
                        else:
                            ## "Down" is the mirror of "Up" around the un-reweighted histogram
                            hv[cat].value = 2 * hv[cat].value - hv[cat].value * g_hist_fitvar_rwgt['fitVarRwgtUp']
                    
                    ## Use bflav qcd samples to stitch the final bflav template
                    if 'use_bflav' in args and args['use_bflav'] and cat == 'flvB' and not all([s in sys_name for s in ['qcd','Syst']]):
                        # print('---', hv[cat])
                        ## Get the MC hist from the new b-enriched sample
                        df_mc_bflav = pd.concat([df3[sam].query(f'({catMap[b]}) & ({config_dm[cat]})') for sam in args['args_bflav']['sl_dm_bflav']])
                        hv_bflav = get_hist(df_mc_bflav[vname].values, bins=edges, weights=df_mc_bflav.eval(args['args_bflav']['wgtstropt_bflav'](wgtstr_dm_sys)).values, underflow=underflow, overflow=overflow).view(flow=True)
                        df_mc_bflav_og = pd.concat([df3[sam].query(f'({catMap[b]}) & ({config_dm[cat]})') for sam in args['args_bflav']['sl_dm_bflav_orig']])
                        hv_bflav_og = get_hist(df_mc_bflav_og[vname].values, bins=edges, weights=df_mc_bflav_og.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
                        ## Combine histogram (inverse-variance weighted average; empty bins get a huge
                        ## variance so that they carry no weight and the division below stays finite)
                        hv_bflav_og.variance[hv_bflav_og.variance==0] = 1e20
                        hv_bflav.variance[hv_bflav.variance==0] = 1e20
                        hv_bflav_comb = hv[cat].copy()
                        hv_bflav_comb.value = (hv_bflav_og.value*(1/hv_bflav_og.variance) + hv_bflav.value*(1/hv_bflav.variance)) / (1/hv_bflav_og.variance + 1/hv_bflav.variance)
                        hv_bflav_comb.variance = 1 / (1/hv_bflav_og.variance + 1/hv_bflav.variance)
                        ## Further combine with the non-QCD contribution
                        hv_bflav_nonsubst = hv[cat].copy() # histogram constitution not to be combined (i.e. non-QCD contribution)
                        hv_bflav_nonsubst.value -= hv_bflav_og.value
                        hv_bflav_nonsubst.variance -= hv_bflav_og.variance
                        hv[cat] = hv_bflav_comb + hv_bflav_nonsubst
                        # print('+++', hv_bflav_og, hv_bflav, hv_bflav_comb, hv_bflav_nonsubst, hv[cat])
                    
            ## Fill the hv[cat] (for qcd*, fill hv[cat+'_herwig.value']) into TH1D and save into ROOT
            for cat in hist.keys():
                ## Special handling for qcdSyst / qcdKdeSyst
                if 'qcd' in sys_name and 'SystUp' in sys_name:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat+'_herwig.value'][i])
                elif 'qcd' in sys_name and 'SystDown' in sys_name:
                    ## "Down" = 2*nominal - "Up", with the "Up" hist recycled from the global cache
                    hv[cat+'_herwig.value'] = g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)]
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, 2 * hv[cat].value[i] - hv[cat+'_herwig.value'][i])
                    g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)] = None  # drop the cached hist once consumed

                ## Normal routine
                else:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat].value[i])
                        hist[cat].SetBinError(i+1, np.sqrt(hv[cat].variance[i]))
                
                ## Fix some buggy points: floor the MC bin content at 1e-3 and cap the error at the bin content
                if cat!='data':
                    for i in range(nbin):
                        if hist[cat].GetBinContent(i+1) <= 1e-3:
                            hist[cat].SetBinContent(i+1, 1e-3)
                            hist[cat].SetBinError(i+1, 1e-3)
                        elif hist[cat].GetBinError(i+1) > hist[cat].GetBinContent(i+1):
                            hist[cat].SetBinError(i+1, hist[cat].GetBinContent(i+1))

                if not g_dryrun:
                    hist[cat].Write()
        ## Close the ROOT file even if an error occurs (otherwise the notebook is easily corrupted)
        finally:
            if fw is not None:
                fw.Close()

Now we launch the template maker

In [None]:
## ====================================================================================================
## Nominal template production: scan all sfBDT values, using the first fit variable only
g_dryrun = False  # False: the ROOT template files are really written
g_make_template_mode = 'main'
g_mode_bdt_runlist = 'all'
g_mode_psWeight_run_templ = None
g_do_fit_for_var = [1]  # fit variable 1 only (variables 2 and 3 serve the validation fits)
launch_maker()

**For year 2018**: you need to run the following block to provide psWeight templates for year 2016 and 2017 (otherwise 2016 and 2017 will report errors)

However, you must first run the same pre-processing (util step 3-2) under the 2016 and 2017 conditions, so that the sfBDT sequence of each of these years is extracted. The sequence is stored in a file such as `plots/prep-pd/bdt_seq_2016.pickle`.

In [None]:
## ====================================================================================================
## 2018 only: produce the psWeight templates that the 2016 and 2017 routines read back
if year == 2018:
    for ext_year in (2016, 2017):
        g_make_template_mode = 'main'
        g_mode_bdt_runlist = 'all'
        g_mode_psWeight_run_templ = ext_year
        g_do_fit_for_var = [1]  # fit variable 1 only (variables 2 and 3 serve the validation fits)
        launch_maker()

Below are optional routines for the validation fit. No need to launch during the first run.

In [None]:
# ## ====================================================================================================
# ## Validation on other variables
# g_make_template_mode = 'main'; g_mode_bdt_runlist = 'all'
# g_mode_psWeight_run_templ = None
# g_do_fit_for_var = [2, 3]
# launch_maker()

# ## ====================================================================================================
# ## Multiple validation modes: running only the central sfBDT cut point is sufficient
# for mode in ['val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_crop_bin']:
#     g_make_template_mode = mode; g_mode_bdt_runlist = 'central'
#     g_mode_psWeight_run_templ = None
#     g_do_fit_for_var = [1]
#     launch_maker()

# Data/MC comparison plots

Based on the DataFrame `df1`, this section makes data and MC plots, in which the MC is categorized into three flavors: C/B/L.
With the universal `make_data_mc_plots` function, one can specify any final selection and any sample list to produce the standard hist+ratio plot.

The below recipe can make a default set of plots.

In [None]:
### ================ configuration  ===================

def make_config_dm(sl_dm, wgtstr_dm):
    """Build the per-category plot configuration.

    Arguments:
        sl_dm: sample list; every entry but the last one is used for the MC categories.
        wgtstr_dm: weight string attached to each MC category.

    Returns:
        dict: category name -> (label, sample or sample list, weight string, category selection),
        with the keys ordered as 'data', 'flvB', 'flvC', 'flvL'.
    """
    # (category, legend label, flavour selection) for the three MC flavours
    mc_flavours = (
        ('flvB', 'MC (flvB)', 'fj_x_nbhadrons>=1'),
        ('flvC', 'MC (flvC)', 'fj_x_nbhadrons==0 & fj_x_nchadrons>=1'),
        ('flvL', 'MC (flvL)', 'fj_x_nbhadrons==0 & fj_x_nchadrons==0'),
    )
    config = {'data': ('Data', 'jetht-noht', '1.0', '')}
    for flv_name, flv_label, flv_sel in mc_flavours:
        # a fresh slice per category, so that the entries never share one list object
        config[flv_name] = (flv_label, sl_dm[:-1], wgtstr_dm, flv_sel)
    return config

## Categories handled by make_data_mc_plots, in loop (and MC stacking) order; 'data' is treated separately
categories_dm = ['flvL', 'flvB', 'flvC', 'data']

## Variables to plot. Each entry is (vname, nbin, xmin, xmax, label), where
##  - vname is the DataFrame column, or a (savename, vname) pair if the saved file name should differ from the column name
##  - nbin is the number of uniform bins in [xmin, xmax], or a list of bin edges (xmin / xmax are then None)
##  - a label ending with '-u' / '-o' / '-a' switches off the underflow / overflow / both (the suffix is stripped before drawing)
bininfo_dm = [ #(vname or (savename, vname), nbin or bin edges, xmin, xmax, label)
    ('ht', 50, 0, 2000, r'$H_{T}$ [GeV]'),
#     ('fj_x_pt', 20, 200, 800, r'$p_{T}(AK15)$ [GeV]'),
#     ('fj_x_eta', 20, -2.5, 2.5, r'$\eta(AK15)$'),
#     ('fj_x_sdmass', 15, 50, 200, r'$m_{SD}(AK15)$ [GeV]'),
#     ('fj_x_sfBDT', 50, 0.5, 1, r'$sfBDT(AK15)$'),

#     ('fj_x_ParticleNetMD_XccVsQCD', 100, 0, 1, r'ParticleNetMD_XccVsQCD(AK15)'),
#     ('fj_x_origParticleNetMD_XccVsQCD', 50, 0, 1, r'ParticleNetMD_XccVsQCD(AK15) orig'),
#     (('fj_x_ParticleNetMD_XccVsQCD_08', 'fj_x_ParticleNetMD_XccVsQCD'), 40, 0.8, 1, r'ParticleNetMD_XccVsQCD(AK15)-u'),
    
#     ('fj_x_btagcsvv2', [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, r'$CSVv2$'),
#     ('mSV12_ptmax_log', [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, r'$log(m_{SV1,p_{T}\,max}\; /GeV)$'),
#     ('mSV12_dxysig_log', [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, r'$log(m_{SV1,d_{xy}sig\,max}\; /GeV)$'),
]

In [None]:
### ================ slim on cc-tagger, sfBDT, then make data/MC plots ===================

import seaborn as sns
def set_sns_color(*args):
    """Preview a seaborn palette and make it the active one.

    The positional arguments are forwarded unchanged to ``sns.color_palette``
    (for the preview drawn by ``sns.palplot``) and to ``sns.set_palette``.
    """
    preview_colors = sns.color_palette(*args)
    sns.palplot(preview_colors)
    sns.set_palette(*args)
    
def make_data_mc_plots(sl_dm, config_dm, finsel, prefix, **kwargs):
    r"""Make the standard hist+ratio plots based on the sample list and the final selection.

    For every variable in the module-level ``bininfo_dm`` the function draws the stacked MC flavour
    histograms with the data on top, plus a data/MC ratio panel, and saves the figure as .png and .pdf
    under ``plots/{g_dirname}_{year}_pd/``. The data histogram is also pickled next to the plots.
    If KDE is requested for a variable, the same plot is made again from the KDE-integrated MC
    histograms (suffix ``_kde``), together with a KDE-vs-histogram shape comparison plot.

    Arguments:
        sl_dm: sample list; each sample is taken from the module-level ``df1`` and filtered by ``finsel``
        config_dm: configuration set for each category in the plots, in the dict format.
            name: (label, sample/sample list, weight string, cat selection)
        finsel: final selection (``DataFrame.query`` string) applied to produce the plots; also part of the output file name
        prefix: prefix string used in the output file name
        kwargs: optional settings
            plot_vars: container of savenames; if given, only these variables are plotted
            g_do_kde_vars: dict savename -> bool, switches on the KDE plots for a variable
            g_custom_kde_bw: dict savename -> divisor applied to the default KDE bandwidth factor
            custom_kde: dict savename -> {category: (callable density, normalisation)}; used instead of fitting a KDE

    Module-level names read: df1, bininfo_dm, categories_dm, lumi, year, g_dirname.
    """
    import pickle  ## local import, same convention as the scipy imports below
    
    df2 = {}
    for sam in sl_dm:
        df2[sam] = df1[sam].query(finsel)

    for vname, nbin, xmin, xmax, vlabel in bininfo_dm:
        if not isinstance(vname, str): ## savename is specified other than the variable name
            savename, vname = vname
        else:
            savename = vname
        if 'plot_vars' in kwargs and savename not in kwargs['plot_vars']:
            continue
        if not isinstance(nbin, int):
            ## custom bin edges: the number of bins is one less than the number of edges
            edges, xmin, xmax, nbin = nbin, min(nbin), max(nbin), len(nbin)-1
        else:
            edges = np.linspace(xmin, xmax, nbin+1)

        label, hdm = {}, {}
        ## The label suffix steers the flow bins: '-u' no underflow, '-o' no overflow, '-a' neither
        underflow = False if vlabel[-2:] in ['-u','-a'] else True
        overflow  = False if vlabel[-2:] in ['-o','-a'] else True
        if vlabel[-2:] in ['-u','-o','-a']:
            vlabel = vlabel[:-2]
        
        do_kde = 'g_do_kde_vars' in kwargs and savename in kwargs['g_do_kde_vars'] and kwargs['g_do_kde_vars'][savename]==True
        if do_kde:
            kde = {}
            ## Integration range of each bin: the first (last) bin is extended to -inf (+inf)
            ## when the underflow (overflow) is requested
            n_edge_bins = len(edges)-1
            kde_bounds = [(-np.inf if (i==0 and underflow) else edges[i],
                           +np.inf if (i==n_edge_bins-1 and overflow) else edges[i+1]) for i in range(n_edge_bins)]
        
        ## Loop over categories to extract the hist for each flavor and data
        for cat in categories_dm:
            lab, sam, wgt, sel = config_dm[cat]
            label[cat] = lab
            if cat != 'data':
                if not isinstance(sam, list):
                    df2tmp = df2[sam].query(sel) if sel not in ['','1==1'] else df2[sam]
                else:
                    df2tmp = []
                    for s in sam:
                        df2tmp.append(df2[s].query(sel) if sel not in ['','1==1'] else df2[s])
                    df2tmp = pd.concat(df2tmp, ignore_index=True)
                hdm[cat] = get_hist(df2tmp[vname].values, bins=edges, weights=df2tmp.eval(wgt).values, underflow=underflow, overflow=overflow)
                if do_kde:
                    from scipy.stats import gaussian_kde
                    from scipy import integrate
                    if 'custom_kde' in kwargs.keys() and savename in kwargs['custom_kde']:
                        ## user-provided (density function, normalisation) pair
                        kde[cat] = kwargs['custom_kde'][savename][cat]
                        kde_int_res = [integrate.quad(kde[cat][0], low, high) for low, high in kde_bounds]
                    else:
                        ## negative weights are clipped to zero before being passed to the KDE
                        kdetmp = gaussian_kde(df2tmp[vname].values, weights=np.clip(df2tmp.eval(wgt).values, 0, np.inf))
                        if 'g_custom_kde_bw' in kwargs.keys() and savename in kwargs['g_custom_kde_bw']:
                            ## redo the KDE with a bandwidth factor reduced w.r.t. the first result
                            kdetmp = gaussian_kde(df2tmp[vname].values, weights=np.clip(df2tmp.eval(wgt).values, 0, np.inf), bw_method=kdetmp.factor/kwargs['g_custom_kde_bw'][savename])
                        kde[cat] = (kdetmp, df2tmp.eval(wgt).sum())
                        kde_int_res = [(kde[cat][0].integrate_box_1d(low, high), 0.) for low, high in kde_bounds]
                    ## KDE histogram: per-bin integral times the normalisation, with no stat. uncertainty
                    hdm[cat+'_kde'] = hdm[cat].copy()
                    hdm[cat+'_kde'].view(flow=True).value = np.array([res[0] for res in kde_int_res]) * kde[cat][1]
                    hdm[cat+'_kde'].view(flow=True).variance = np.zeros(len(edges)-1)
                        
            else: ## is data: no sel, weight=1
                hdm[cat] = get_hist(df2[sam][vname].values, bins=edges, weights=np.ones(df2[sam].shape[0]), underflow=underflow, overflow=overflow)
        
        cat_sufs = ['']
        if do_kde:
            cat_sufs += ['_kde']
        for cat_suf in cat_sufs:
            ## Draw the standard hist_ratio plot
            set_sns_color('cubehelix_r', 3) ## set the color palette
            f = plt.figure(figsize=(12,12))
            gs = mpl.gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0.05) 
            
            ## Upper histogram panel
            ax = f.add_subplot(gs[0])
            hep.cms.label(data=True, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            ax.set_xlim(xmin, xmax); ax.set_xticklabels([]); ax.set_ylabel('Events / bin', ha='right', y=1.0)

            plot_hist([hdm[cat+cat_suf] for cat in categories_dm if cat!='data'], bins=edges, label=[label[cat] for cat in categories_dm if cat!='data'], histtype='fill', edgecolor='k', linewidth=1, stack=True) ## draw stacked bkg
            cats_mc = [cat for cat in categories_dm if cat!='data']
            hdm_add = hdm[cats_mc[0]+cat_suf].copy()
            for cat in cats_mc[1:]:
                hdm_add += hdm[cat+cat_suf]
            bkgtot, bkgtot_err = hdm_add.view(flow=True).value, np.sqrt(hdm_add.view(flow=True).variance)
            ax.fill_between(edges, (bkgtot-bkgtot_err).tolist()+[0], (bkgtot+bkgtot_err).tolist()+[0], label='BKG unce.', step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            plot_hist(hdm['data'], bins=edges, label='Data', histtype='errorbar', color='k', markersize=15, elinewidth=1.5) ## draw data
#             ax.set_yscale('log')
            
            ax.legend()
            # ax.legend(loc='upper left'); ax.set_ylim(0, 1.4*ax.get_ylim()[1])
            
            ## Ratio panel
            ax1 = f.add_subplot(gs[1]); ax1.set_xlim(xmin, xmax); ax1.set_ylim(0.001, 1.999)
            ax1.set_xlabel(vlabel, ha='right', x=1.0); ax1.set_ylabel('Data / MC', ha='center')
            ax1.plot([xmin,xmax], [1,1], 'k'); ax1.plot([xmin,xmax], [0.5,0.5], 'k:'); ax1.plot([xmin,xmax], [1.5,1.5], 'k:')

            hr = hdm['data'].view(flow=True).value / hdm_add.view(flow=True).value
            # hr_err = hr * np.sqrt(hdm['data'].view(flow=True).variance/(hdm['data'].view(flow=True).value**2) + hdm_add.view(flow=True).variance/(hdm_add.view(flow=True).value**2))
            ## Only the data uncertainty enters the ratio points; the MC uncertainty is drawn as the hatched band
            hr_dataerr = hr * np.sqrt(hdm['data'].view(flow=True).variance/(hdm['data'].view(flow=True).value**2))
            ax1.fill_between(edges, ((bkgtot-bkgtot_err)/bkgtot).tolist()+[0], ((bkgtot+bkgtot_err)/bkgtot).tolist()+[0], step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            hep.histplot(np.nan_to_num(hr, nan=-1), bins=edges, yerr=np.nan_to_num(hr_dataerr), histtype='errorbar', color='k', markersize=15, elinewidth=1) ## draw data in ratio plot

            plt.savefig(f'plots/{g_dirname}_{year}_pd/{prefix}__{finsel}__{savename}{cat_suf}.png')
            plt.savefig(f'plots/{g_dirname}_{year}_pd/{prefix}__{finsel}__{savename}{cat_suf}.pdf')
            with open(f'plots/{g_dirname}_{year}_pd/{prefix}__{finsel}__{savename}{cat_suf}.pickle', 'wb') as fpkl:
                pickle.dump(hdm['data'], fpkl)
            plt.close(f)  ## release the figure, otherwise every plot stays in memory

        ## kde/orig comparison plots
        if do_kde:
            mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green'])
            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            x_contin = np.linspace(xmin, xmax, 201)
            ## Width of the central bin, used to scale the continuous KDE curve to "events per bin"
            bin_width = edges[int(nbin/2)+1] - edges[int(nbin/2)]
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab, sam, wgt, sel = config_dm[cat]
                ax.plot(x_contin, kde[cat][0](x_contin) * kde[cat][1] * bin_width, label=lab+' KDE', linestyle=':', color=color)
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab, sam, wgt, sel = config_dm[cat]
                hep.histplot(hdm[cat+'_kde'].view(flow=True).value, bins=edges, label=lab+' KDE integral', linestyle='--', color=color)
                plot_hist(hdm[cat], bins=edges, label=lab, normed=False, color=color)
            ax.set_xlim(xmin, xmax); ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); ax.legend()
            
            plt.savefig(f'plots/{g_dirname}_{year}_pd/{prefix}:kde_shape__{finsel}__{savename}.png')
            plt.savefig(f'plots/{g_dirname}_{year}_pd/{prefix}:kde_shape__{finsel}__{savename}.pdf')
            plt.close(f)  ## release the figure
            

## Variables for which the KDE plots are produced (handed to make_data_mc_plots via kwargs)
g_do_kde_vars = {
    'fj_x_btagcsvv2': True,
    'mSV12_ptmax_log': True,
    'mSV12_dxysig_log': True,
}
## Per-variable divisor applied to the default KDE bandwidth factor
g_custom_kde_bw = {
    'fj_x_btagcsvv2': 15,
    'mSV12_ptmax_log': 4,
    'mSV12_dxysig_log': 4,
}

g_dirname = 'test_datamc' ## config me
## Make sure the plot output directory exists
if not os.path.exists(f'plots/{g_dirname}_{year}_pd'):
    os.makedirs(f'plots/{g_dirname}_{year}_pd')

for ptrange in config['pt_range']['range']:
    ptmin, ptmax = ptrange
    ## sfBDT cut sequence of this pT range, and its central element
    bdt_seq = df1[f"bdt_seq_{config['pt_range']['name']}"][(ptmin, ptmax)]
    bdt_cent = bdt_seq[int((len(bdt_seq)-1)/2)]
    ## lower edges of the tagger working points, in ascending order
    tagger_wp = sorted(rg[0] for rg in config['tagger']['working_points']['range'].values())
    
    ## 1. With MadGraph sample list
#     wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*htwgt'
#     sl_dm = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>0.5', prefix='mg')
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>{bdt_cent:.3f}', prefix='mg')
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f"fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>{bdt_cent:.3f} & {config['tagger']['var']}>{tagger_wp[-1]}", prefix='mg')

#     ## 2. With MadGraph sample list, while using the optional MC-to-data reweight scheme (on pT)
#     wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*ad_ptwgt'
#     sl_dm = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>{bdt_cent:.3f}', prefix='mg_ptwgt')
    
#     ## 3. With Herwig sample list
#     wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*htwgt_herwig'
#     sl_dm = ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>0.5', prefix='herwig')
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>{bdt_cent:.3f}', prefix='herwig')
#     make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f"fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>{bdt_cent:.3f} & {config['tagger']['var']}>{tagger_wp[-1]}", prefix='herwig', 
#                        g_do_kde_vars=g_do_kde_vars, g_custom_kde_bw=g_custom_kde_bw) ## also make the KDE plots

# Signal/proxy comparison plots

Based on the DataFrame `df1`, the recipe below creates comparison plots of the proxy jets (from MC) and the H->cc signal jets on various jet observables.

In [None]:
## Load the hcc signal tree and build the proxy / signal dataframes.
## This is done only once per session: the block is skipped as soon as `df_comp` exists.
if 'df_comp' not in globals():  
    import re
    ## The signal ntuple sits next to the SF trees: keep the part of sample_prefix up to ".../trees"
    _df0['vhcc-2L'] = uproot.open(f"{re.search('^.+/trees', sample_prefix)[0]}/20210117_VH_extjetvar_{year}_2L/mc/vhcc_tree.root")['Events'].pandas.df()

    boosted = "v_pt>200 & ak15_pt>200 & dphi_V_ak15>2.5 & ak15_sdmass>50 & ak15_sdmass<200"
    ## Proxy jets: baseline selection restricted to the c-flavour category (no b hadron, at least one c hadron)
    basecut = "fj_x_pt>200 & fj_x_sdmass>50 & fj_x_sdmass<200 & passmetfilters & fj_x_nbhadrons==0 & fj_x_nchadrons>=1"
    basecut_vhcc_2L = "v_mass>75 & v_mass<105 & ((abs(lep1_pdgId)==11 & passTrigEl) | (abs(lep1_pdgId)==13 & passTrigMu)) & " + boosted + " & n_ak4<3"
    ## Bind `df_comp` only after both entries are built: if one of the queries fails, no empty or
    ## partial dict is left behind, so the guard above does not skip the loading on the next run
    df_comp = {
        'proxy': pd.concat([df1[sam].query(basecut) for sam in ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht']]),
        'vhcc-2L': _df0['vhcc-2L'].query(basecut_vhcc_2L),
    }

## Weight strings for the proxy jets and for the VH(cc) 2L signal jets
wgtstr = 'genWeight*xsecWeight*puWeight*htwgt'
wgtstr_vhcc_2L = 'genWeight*xsecWeight*puWeight'

## Named baseline selections. name: (cut, label)
basesel = {
    'sv': ("fj_x_sj1_nsv>=1 & fj_x_sj2_nsv>=1", r'$N_{SV}^{match}\geq 1$'),
    'tightsv': ("(fj_x_sj1_sv1_ntracks>2 & abs(fj_x_sj1_sv1_dxy)<3 & fj_x_sj1_sv1_dlensig>4 & fj_x_sj2_sv1_ntracks>2 & abs(fj_x_sj2_sv1_dxy)<3 & fj_x_sj2_sv1_dlensig>4)", r'$N_{SV,tight}^{match}\geq 1$'),
}


def func_basesel(name):
    """Translate a baseline selection name into its (cut, label) pair.

    A name listed in ``basesel`` returns the stored pair. A name of the form
    ``sfbdtNNN`` is read as an sfBDT threshold of NNN/1000 (e.g. 'sfbdt900' -> sfBDT > 0.900).
    Any other name raises RuntimeError.
    """
    try:
        return basesel[name]
    except KeyError:
        pass
    if name[:5] != 'sfbdt':
        raise RuntimeError('Baseline cut name not recognized.')
    threshold = float(name[5:]) / 1000.
    cut = 'fj_x_sfBDT>%.3f' % threshold
    cut_label = r'$sfBDT>%.3f$' % threshold
    return (cut, cut_label)

In [None]:
# Variables to plot in the signal/proxy comparison.
# Each entry: (vname, nbin, xmin, xmax, label, vname1, xlim)
#   vname  : column plotted for the proxy sample, or a (savename, column) pair when the
#            output file name should differ from the column name
#   nbin   : number of bins (int), or an explicit list of bin edges (xmin/xmax are then
#            taken from the edges)
#   label  : x-axis title
#   vname1 : column plotted for the signal (vhcc-2L) sample
#   xlim   : x-axis range override, or None to use (xmin, xmax)
bininfo = [
#     ('fj_x_ParticleNetMD_XccVsQCD', 20, 0, 1, 'ParticleNetMD_XccVsQCD (AK15)', 'ak15_ParticleNetMD_HccVsQCD', None),
    (
        ('fj_x_ParticleNetMD_XccVsQCD_3WP', 'fj_x_ParticleNetMD_XccVsQCD'),  # (savename, vname)
        [0, 0.5, 0.8, 0.9, 0.96, 0.99, 1],  # explicit bin edges
        None, None,  # xmin, xmax: taken from the edges
        'ParticleNetMD_XccVsQCD (AK15)',  # label
        'ak15_ParticleNetMD_HccVsQCD',  # vname1
        (0.9, 1),  # xlim
    ),
#     ('fj_x_sdmass', 15, 50, 200, r'$m_{SD}$ (AK15)', 'ak15_sdmass', None),
#     ('fj_x_tau21', 20, 0, 1, r'$\tau_{21}$ (AK15)', 'ak15_tau21', None), ##available

#     ('fj_x_deltaR_sj12', 40, 0, 1.5, r'$\Delta R_{sj_{1},sj_{2}}$ (AK15)', 'ak15_deltaR_sj12', None),
#     ('fj_x_pt', 40, 0, 1000, r'$p_{T}$ (AK15)', 'ak15_pt', None),
#     ('fj_x_sj1_pt', 40, 0, 1000, r'$p_{T,sj_{1}}$ (AK15)', 'ak15_sj1_pt', None),
#     ('fj_x_sj1_rawmass', 40, 0, 200, r'$m_{sj_{1},raw}$ (AK15)', 'ak15_sj1_rawmass', None), ##available
#     ('fj_x_sj2_pt', 40, 0, 1000, r'$p_{T,sj_{2}}$ (AK15)', 'ak15_sj2_pt', None),
#     ('fj_x_sj2_rawmass', 40, 0, 200, r'$m_{sj_{2},raw}$ (AK15)', 'ak15_sj2_rawmass', None), ##available

#     ('fj_x_nsv', 10, 0, 10, r'$N_{SV}$ (AK15)', 'ak15_nlooseSV', None), ##available
#     ('fj_x_nsv_ptgt25', 8, 0, 8, r'$N_{SV,p_{T}\geq 25}$ (AK15)', 'ak15_nlooseSV_ptgt25', None), ##available
#     ('fj_x_nsv_ptgt50', 8, 0, 8, r'$N_{SV,p_{T}\geq 50}$ (AK15)', 'ak15_nlooseSV_ptgt50', None), ##available
#     ('fj_x_ntracks', 20, 0, 20, r'$N_{tracks}$ (AK15)', 'ak15_nlooseSV_ntracks', None), ##available
#     ('fj_x_ntracks_sv12', 20, 0, 20, r'$N_{tracks\;for\;SV_{1,2}}$ (AK15)', 'ak15_nlooseSV_ntracks_sv12', None), ##available
#     ('fj_x_sj1_nsv', 20, 0, 20, r'$N_{SV\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV', None), ##available
#     ('fj_x_sj1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV_ntracks', None), ##available
#     ('fj_x_sj1_sv1_pt', 20, 0, 200, r'$p_{T,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pt', None),
#     ('fj_x_sj1_sv1_mass', 20, 0, 50, r'$m_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_mass', None), ##available
#     ('fj_x_sj1_sv1_masscor', 20, 0, 50, r'$m_{cor\;for\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_masscor', None),
#     ('fj_x_sj1_sv1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_ntracks', None),
#     ('fj_x_sj1_sv1_dxy', 20, 0, 5, r'$d_{xy,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxy', None),
#     ('fj_x_sj1_sv1_dxysig', 20, 0, 20, r'$\sigma_{d_{xy},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxysig', None),
#     ('fj_x_sj1_sv1_dlen', 20, 0, 5, r'$d_{z,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlen', None),
#     ('fj_x_sj1_sv1_dlensig', 20, 0, 20, r'$\sigma_{d_{z},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlensig', None),
#     ('fj_x_sj1_sv1_chi2ndof', 20, 0, 5, r'$\chi^2 / Ndof_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_chi2ndof', None),
#     ('fj_x_sj1_sv1_pangle', 40, 0, 5, r'$pAngle_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pangle', None),
]

In [None]:
g_dirname = 'test_sigpxy' ## config me
## Create the output directory (no error if it already exists)
os.makedirs(f'plots/{g_dirname}_{year}_pd', exist_ok=True)

## Make comparison plots for normal weight (MC adopt the same weight as in the fit), or for additional mass / pT / tau21 weight
# for wgtfac, pfwgt in zip(['1','massdatamcwgt','ptdatamcwgt'], ['nom', 'massdatamcwgt', 'ptdatamcwgt']):
for wgtfac, pfwgt in zip(['1'], ['nom']):

    ## Weight expressions: the proxy weight carries the extra factor `wgtfac`, the signal weight does not
    wgtstr = f'genWeight*xsecWeight*puWeight*htwgt*{wgtfac}'
    wgtstr_vhcc_2L = 'genWeight*xsecWeight*puWeight'

    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    do_rwgt = 0 ## when true, `rwgt_ext_label` (not defined in this cell) is inserted into the proxy legend labels
    for ptmin, ptmax in config['pt_range']['range']:
        ## The same pT window, written on the proxy (fj_x_*) and on the signal (ak15_*) columns
        presel, presel1 = f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax}', f'ak15_pt>{ptmin} & ak15_pt<{ptmax}'
        label = {'proxy': r'g(cc)', 'vhcc-2L':r'$Z(\ell\ell)H(cc)$'}

        for vname, nbin, xmin, xmax, vlabel, vname1, xlim in bininfo:
            if not isinstance(vname, str): ## savename is specified other than the variable name
                savename, vname = vname
            else:
                savename = vname
            if not isinstance(nbin, int):
                ## explicit bin edges given: derive the range (and the bin count) from them
                edges, xmin, xmax, nbin = nbin, min(nbin), max(nbin), len(nbin)-1
            else:
                edges = np.linspace(xmin, xmax, nbin+1)

            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')

            ## Signal curve: plotted from the `vname1` column within the pT window
            for sam in ['vhcc-2L']:
                dftmp = df_comp[sam] if presel1=='' else df_comp[sam].query(presel1)
                h = get_hist(dftmp[vname1].values, bins=edges, weights=dftmp.eval(wgtstr_vhcc_2L).values)
                ## raw string: '\g' is not a valid escape sequence in a normal string literal
                plot_hist(h, bins=edges, label=label[sam]+r' $N_{SV}^{match}\geq 1$' if sam=='qcd-mg' else label[sam], normed=True)

            ## Proxy curves: one per baseline selection in `selclist`
            for sam in ['proxy']:
                if (ptmin,ptmax) == (200,100000):
                    selclist, suf_label = ['sv+sfbdt500', 'sv+sfbdt800', 'sv+sfbdt900', 'sv+sfbdt950'], ['','','','']
                else:
                    ## first / middle / last sfBDT thresholds of the sequence stored for this pT bin
                    bdt_seq = df1[f"bdt_seq_{config['pt_range']['name']}"][(ptmin,ptmax)]
                    selclist = ['sv+sfbdt500'] + [f'sv+sfbdt{int(b*1000)}' for b in [bdt_seq[0], bdt_seq[int((len(bdt_seq)-1)/2)], bdt_seq[-1]]]
                    suf_label = ['', ' (lower)', ' (central)', ' (upper)']
                for ext, slb in zip(selclist, suf_label):
                    cutstr = ' & '.join(list(filter(None, [presel]+[func_basesel(cname)[0] for cname in ext.split('+')]))) ## join the cut string
                    if 'qcd-mg' in sam:  print (cutstr)
                    dftmp = df_comp[sam].query(cutstr)
                    ## pass a plain array as weights, as done for the signal curve above
                    h = get_hist(dftmp[vname].values, bins=edges, weights=dftmp.eval(wgtstr).values)
                    plot_hist(h, bins=edges, label=label[sam]+' '+(rwgt_ext_label if do_rwgt else '')+' & '.join([func_basesel(cname)[1] for cname in ext.split('+')])+slb, normed=True)

            ax.legend()
            ax.set_xlim((xmin, xmax) if xlim is None else xlim)
            ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); 
            plt.savefig(f'plots/{g_dirname}_{year}_pd/{pfwgt}_{presel}__{savename}.png')
            plt.savefig(f'plots/{g_dirname}_{year}_pd/{pfwgt}_{presel}__{savename}.pdf')
            ## pyplot keeps every figure alive until it is closed; release it once saved
            plt.close(f)
    ## NOTE(review): this `break` ends the weight loop after its first entry. It is harmless with the
    ## single 'nom' weight, but must be removed if the multi-weight zip above is re-enabled.
    break

# Other comparisons

The function below makes a simple comparison plot from the given sample lists, weight strings, pre-selection strings, and labels.

## Standard vs. extra b-enriched sample

In [None]:
def simple_comp_plot(df, bininfo, sam_list, wgtstr, presel, label, isnormed=True):
    r"""Overlay histograms of several sample groups, one figure per variable.

    Arguments:
        df: mapping from sample name to a pandas DataFrame.
        bininfo: list of ``(vname, nbin, xmin, xmax, vlabel)`` tuples. ``vname`` is a column
            name or a ``(savename, column)`` pair; ``nbin`` is an int bin count or an explicit
            list of bin edges (``xmin``/``xmax`` are then taken from the edges).
        sam_list: one entry per curve, each a sample name or a list of sample names whose
            selected frames are concatenated. The caller's list is not modified.
        wgtstr: one weight expression per curve, evaluated with ``DataFrame.eval``;
            the string ``'1'`` means unit weights.
        presel: one ``DataFrame.query`` selection string per curve.
        label: one legend label per curve.
        isnormed: forwarded to ``plot_hist`` as ``normed``; also selects the y-axis title.

    The figures are created through ``plt.subplots``; they are neither saved nor returned.
    """
    # Normalize into a new list so the caller's `sam_list` is left untouched
    sam_groups = [[sam] if isinstance(sam, str) else sam for sam in sam_list]

    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    for vname, nbin, xmin, xmax, vlabel in bininfo:
        if not isinstance(vname, str): ## savename is specified other than the variable name
            _savename, vname = vname
        if isinstance(nbin, int):
            edges = np.linspace(xmin, xmax, nbin+1)
        else:
            # explicit bin edges given: derive the plotted range from them
            edges, xmin, xmax = nbin, min(nbin), max(nbin)

        fig, ax = plt.subplots(figsize=(12,12))
        hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')

        for sl, wgt, sel, lab in zip(sam_groups, wgtstr, presel, label):
            print(sl, wgt, sel, lab)
            _df = df[sl[0]].query(sel) if len(sl)==1 else pd.concat([df[sam].query(sel) for sam in sl])
            h = get_hist(_df[vname].values, bins=edges, weights=_df.eval(wgt).values if wgt!='1' else np.ones(_df.shape[0]))
            plot_hist(h, bins=edges, label=lab, normed=isnormed)

        ax.legend()
        ax.set_xlim(xmin, xmax)
        ax.set_xlabel(vlabel, ha='right', x=1.0)
        ax.set_ylabel('A.U.' if isnormed else 'Events / bin', ha='right', y=1.0)

# Variables for the standard vs. b-enriched comparison.
# Each entry: (vname, bin edges, xmin, xmax, label); xmin/xmax are None because the range
# is taken from the explicit edges.
bininfo = [
    (
        'fj_x_btagcsvv2',
        [0, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6,
         0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.98, 0.99, 0.995, 1],
        None, None,
        r'$CSVv2$',
    ),
    (
        'mSV12_ptmax_log',
        [-0.4, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1,
         1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2.5, 3.2, 3.9],
        None, None,
        r'$log(m_{SV1,p_{T}\,max}\; /GeV)$',
    ),
    (
        'mSV12_dxysig_log',
        [-0.8, -0.4, 0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
         1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2.5, 3.2],
        None, None,
        r'$log(m_{SV1,d_{xy}sig\,max}\; /GeV)$',
    ),
]
# Compare the standard and the b-flavor QCD samples in a single pT window;
# both curves use the same pre-selection and are drawn unnormalized.
ptmin, ptmax = 250, 350
_presel_bjets = f'fj_x_nbhadrons>=1 & fj_x_pt>{ptmin} & fj_x_pt<{ptmax} & fj_x_sfBDT>0.9'
simple_comp_plot(
    df=df1,
    bininfo=bininfo,
    sam_list=[
        ['subst_qcd-mg-noht'],
        ['subst_qcd-mg-bflav-noht'],
    ],
    wgtstr=[
        'genWeight*xsecWeight*puWeight*htwgt',
        'genWeight*xsecWeight*puWeight*htwgt*bflav_htwgt',
    ],
    presel=[_presel_bjets, _presel_bjets],
    label=['standard', 'b-flavor'],
    isnormed=False,
)