# Main notebook for ParticleNet AK15 cc-tagger SF derivation

The notebook aims to
 - Make the ROOT-format **templates** for fit
 - Produce **data/MC comparison plots** under some given event selection
 - Produce **H->cc signal and g->cc proxy jets comparison plots** on various jet observables
 
We adopt the `uproot`+`pandas`* workflow in this notebook, illustrated as follows:

    Input files (flat ROOT-tuples derived from analysis NanoAOD)
    -> load as `pandas` DataFrame (by `uproot`)
    -> manipulate the dataframe
    -> produce histograms (`boost_histogram`)
    -> (1) convert to TH1D for ROOT template; or (2) plot with `mplhep` using `matplotlib` as backend
    
 
(*) Note: this workflow has a large RAM footprint at runtime. It may consume 10-30 GB of RAM when dealing with large datasets, which sets a requirement on the machine. 
A smarter workflow would be `coffea` (with the `uproot` lazy dataframe as backend), to which the framework is planned to be migrated in the future.

# Make templates for fit

In [None]:
import numpy as np
import boost_histogram as bh
import matplotlib.pyplot as plt
import mplhep as hep
import pandas as pd
use_helvet = True  ## True: draw plots with Helvetica; make sure the font is installed on the system
if not use_helvet:
    plt.style.use(hep.style.CMS)
else:
    ## NOTE: CMShelvet is an alias of hep.style.CMS (not a copy), so the font override is written into that object as well
    CMShelvet = hep.style.CMS
    CMShelvet['font.sans-serif'] = ['Helvetica', 'Arial']
    plt.style.use(CMShelvet)

import matplotlib as mpl
from cycler import cycler

def get_hist(array, bins=10, xmin=None, xmax=None, underflow=False, overflow=False, mergeflowbin=True, normed=False,
            weights=None, **kwargs):
    r"""Fill a weighted 1D boost_histogram from an input array.

    Arguments:
        array (np.ndarray): input values.
        bins (int, list or tuple of numbers, np.ndarray, bh.axis): number of regular bins, explicit bin
            edges, or a ready-made axis (used as is).
        xmin, xmax (None or number): axis range when `bins` is an int; default to the min / max of `array`.
        underflow, overflow (bool): whether the axis carries an underflow / overflow bin.
        mergeflowbin (bool): if True, each enabled flow bin replaces the outermost regular bin on its side
            (the first bin for underflow, the last bin for overflow), i.e. that bin is removed from the
            axis and its range is covered by the flow bin.
        weights (None, or np.ndarray): per-entry weights; unit weights if None.
        normed (bool): deprecated, unused.
        **kwargs: accepted for call compatibility, unused.

    Returns:
        hist (boost_histogram.Histogram) with Weight storage.
    """
    if isinstance(bins, int):
        if xmin is None:
            xmin = array.min()
        if xmax is None:
            xmax = array.max()
        width = 1.*(xmax-xmin)/bins
        if mergeflowbin and underflow:
            ## the first regular bin is absorbed by the underflow bin
            xmin += width
            bins -= 1
        if mergeflowbin and overflow:
            ## the last regular bin is absorbed by the overflow bin
            ## (this condition used to test `underflow` a second time, which shrank the
            ## axis on the wrong flag)
            xmax -= width
            bins -= 1
        bins = bh.axis.Regular(bins, xmin, xmax, underflow=underflow, overflow=overflow)
    elif isinstance(bins, (list, tuple, np.ndarray)):
        if mergeflowbin and underflow:
            bins = bins[1:]
        if mergeflowbin and overflow:
            bins = bins[:-1]
        bins = bh.axis.Variable(bins, underflow=underflow, overflow=overflow)

    hist = bh.Histogram(bins, storage=bh.storage.Weight())
    if weights is None:
        weights = np.ones_like(array)
    hist.fill(array, weight=weights)
    return hist


def plot_hist(hists, normed=False, **kwargs):
    r"""Plot one or several boost_histogram histograms with `hep.histplot`.

    Arguments:
        hists (bh.Histogram, or list/tuple of them): histogram(s) to draw; values and variances are
            read including flow bins.
        normed (bool): if True, scale each histogram (and its errors) by the sum of its contents.
        **kwargs: forwarded to `hep.histplot`. `bins` overrides the edges taken from the first
            histogram's axis; `yerr` overrides the default sqrt(variance) errors (one entry per histogram).

    The input histograms (and a caller-supplied `yerr`) are not modified.
    """

    if not isinstance(hists, (list, tuple)):
        hists = [hists]
    content = [h.view(flow=True).value for h in hists]
    bins = hists[0].axes[0].edges
    if 'bins' in kwargs:
        bins = kwargs.pop('bins')
    if 'yerr' in kwargs:
        yerr = list(kwargs.pop('yerr'))  ## shallow copy: do not write into the caller's container
    else:
        yerr = [np.sqrt(h.view(flow=True).variance) for h in hists]
    if normed:
        for i, cont in enumerate(content):
            contsum = np.sum(cont)
            ## Build new arrays instead of dividing in place: `cont` comes from the histogram's view,
            ## so an in-place division would rescale the histogram itself
            content[i] = cont / contsum
            yerr[i] = yerr[i] / contsum
    if len(hists) == 1:
        content, yerr = content[0], yerr[0]
    hep.histplot(content, bins=bins, yerr=yerr, **kwargs)

In [None]:
import uproot
from uproot_methods import TLorentzVectorArray, TLorentzVector
import ROOT
import array
import os

## 1. Load files

Load the ROOT files into pandas DataFrame

In [None]:
year = 2016  ## config me! options: 2016, 2017, 2018

## Per-year luminosity value; it enters the MC weight strings and is printed with the fb^{-1} unit on plot labels
lumi = {2016: 35.92, 2017: 41.53, 2018: 59.74}

minimal_branches = [  ## minimal set of branches read into the notebook
    "run", "luminosityBlock", "event", "genWeight", "jetR", "passmetfilters", "n_fatjet", "fj_1_ParticleNetMD_XbbVsQCD", "fj_1_ParticleNetMD_XccVsQCD", "fj_1_dr_H", "fj_1_dr_Z", "fj_1_pt", "fj_1_eta", "fj_1_phi", "fj_1_energy", "fj_1_rawmass", "fj_1_sdmass", "fj_1_tau21", "fj_1_btagcsvv2", "fj_1_btagjp", "fj_1_nsv", "fj_1_nsv_ptgt25", "fj_1_nsv_ptgt50", "fj_1_ntracks", "fj_1_ntracks_sv12", "fj_1_deltaR_sj12", "fj_1_sj1_pt", "fj_1_sj1_eta", "fj_1_sj1_phi", "fj_1_sj1_rawmass", "fj_1_sj1_energy", "fj_1_sj1_btagdeepcsv", "fj_1_sj1_btagcsvv2", "fj_1_sj1_btagjp", "fj_1_sj1_ntracks", "fj_1_sj1_nsv", "fj_1_sj1_sv1_pt", "fj_1_sj1_sv1_mass", "fj_1_sj1_sv1_masscor", "fj_1_sj1_sv1_ntracks", "fj_1_sj1_sv1_dxy", "fj_1_sj1_sv1_dxysig", "fj_1_sj1_sv1_dlen", "fj_1_sj1_sv1_dlensig", "fj_1_sj1_sv1_chi2ndof", "fj_1_sj1_sv1_pangle", "fj_1_sj2_pt", "fj_1_sj2_eta", "fj_1_sj2_phi", "fj_1_sj2_rawmass", "fj_1_sj2_energy", "fj_1_sj2_btagdeepcsv", "fj_1_sj2_btagcsvv2", "fj_1_sj2_btagjp", "fj_1_sj2_ntracks", "fj_1_sj2_nsv", "fj_1_sj2_sv1_pt", "fj_1_sj2_sv1_mass", "fj_1_sj2_sv1_masscor", "fj_1_sj2_sv1_ntracks", "fj_1_sj2_sv1_dxy", "fj_1_sj2_sv1_dxysig", "fj_1_sj2_sv1_dlen", "fj_1_sj2_sv1_dlensig", "fj_1_sj2_sv1_chi2ndof", "fj_1_sj2_sv1_pangle", "fj_1_sj12_masscor_dxysig", "fj_1_sfBDT", "fj_1_nbhadrons", "fj_1_nchadrons", "fj_1_sj1_nbhadrons", "fj_1_sj1_nchadrons", "fj_1_sj2_nbhadrons", "fj_1_sj2_nchadrons", "fj_2_ParticleNetMD_XbbVsQCD", "fj_2_ParticleNetMD_XccVsQCD", "fj_2_dr_H", "fj_2_dr_Z", "fj_2_pt", "fj_2_eta", "fj_2_phi", "fj_2_energy", "fj_2_rawmass", "fj_2_sdmass", "fj_2_tau21", "fj_2_btagcsvv2", "fj_2_btagjp", "fj_2_nsv", "fj_2_nsv_ptgt25", "fj_2_nsv_ptgt50", "fj_2_ntracks", "fj_2_ntracks_sv12", "fj_2_deltaR_sj12", "fj_2_sj1_pt", "fj_2_sj1_eta", "fj_2_sj1_phi", "fj_2_sj1_rawmass", "fj_2_sj1_energy", "fj_2_sj1_btagdeepcsv", "fj_2_sj1_btagcsvv2", "fj_2_sj1_btagjp", "fj_2_sj1_ntracks", "fj_2_sj1_nsv", "fj_2_sj1_sv1_pt", "fj_2_sj1_sv1_mass", "fj_2_sj1_sv1_masscor", "fj_2_sj1_sv1_ntracks", 
"fj_2_sj1_sv1_dxy", "fj_2_sj1_sv1_dxysig", "fj_2_sj1_sv1_dlen", "fj_2_sj1_sv1_dlensig", "fj_2_sj1_sv1_chi2ndof", "fj_2_sj1_sv1_pangle", "fj_2_sj2_pt", "fj_2_sj2_eta", "fj_2_sj2_phi", "fj_2_sj2_rawmass", "fj_2_sj2_energy", "fj_2_sj2_btagdeepcsv", "fj_2_sj2_btagcsvv2", "fj_2_sj2_btagjp", "fj_2_sj2_ntracks", "fj_2_sj2_nsv", "fj_2_sj2_sv1_pt", "fj_2_sj2_sv1_mass", "fj_2_sj2_sv1_masscor", "fj_2_sj2_sv1_ntracks", "fj_2_sj2_sv1_dxy", "fj_2_sj2_sv1_dxysig", "fj_2_sj2_sv1_dlen", "fj_2_sj2_sv1_dlensig", "fj_2_sj2_sv1_chi2ndof", "fj_2_sj2_sv1_pangle", "fj_2_sj12_masscor_dxysig", "fj_2_sfBDT", "fj_2_nbhadrons", "fj_2_nchadrons", "fj_2_sj1_nbhadrons", "fj_2_sj1_nchadrons", "fj_2_sj2_nbhadrons", "fj_2_sj2_nchadrons", "passHTTrig", "ht", "nlep", "fj_1_is_qualified", "fj_2_is_qualified", "puWeight", "puWeightUp", "puWeightDown", "xsecWeight"
]
## Year-dependent HT trigger branches (2017 and 2018 share the same list of paths)
ext_hlt_branches = {
    2016: [
        'HLT_PFHT125', 'HLT_PFHT200', 'HLT_PFHT250', 'HLT_PFHT300', 'HLT_PFHT350', 'HLT_PFHT400',
        'HLT_PFHT475', 'HLT_PFHT600', 'HLT_PFHT650', 'HLT_PFHT800', 'HLT_PFHT900',
    ],
    2017: [
        'HLT_PFHT180', 'HLT_PFHT250', 'HLT_PFHT370', 'HLT_PFHT430', 'HLT_PFHT510', 'HLT_PFHT590',
        'HLT_PFHT680', 'HLT_PFHT780', 'HLT_PFHT890', 'HLT_PFHT1050', 'HLT_PFHT350',
    ],
    2018: [
        'HLT_PFHT180', 'HLT_PFHT250', 'HLT_PFHT370', 'HLT_PFHT430', 'HLT_PFHT510', 'HLT_PFHT590',
        'HLT_PFHT680', 'HLT_PFHT780', 'HLT_PFHT890', 'HLT_PFHT1050', 'HLT_PFHT350',
    ],
}

## Append the trigger branches of the chosen year, plus the PSWeight branches that are read only for 2018
minimal_branches.extend(ext_hlt_branches[year])
if year == 2018:
    minimal_branches.extend(['nPSWeight', 'PSWeight'])

## Branches that are not read from the data tree: event weights and hadron-counting branches
_mc_only_branches = {
    'genWeight', "puWeight", "puWeightUp", "puWeightDown", "xsecWeight", 'nPSWeight', 'PSWeight',
    'fj_1_nbhadrons', 'fj_1_sj1_nbhadrons', 'fj_1_sj2_nbhadrons',
    'fj_2_nbhadrons', 'fj_2_sj1_nbhadrons', 'fj_2_sj2_nbhadrons',
    'fj_1_nchadrons', 'fj_1_sj1_nchadrons', 'fj_1_sj2_nchadrons',
    'fj_2_nchadrons', 'fj_2_sj1_nchadrons', 'fj_2_sj2_nchadrons',
}
minimal_branches_for_data = set(minimal_branches).difference(_mc_only_branches)

## Read each input tree into a pandas DataFrame, keyed by sample name.
## MC samples read `minimal_branches`; the data sample reads `minimal_branches_for_data`.
_df0 = {}
for _key, _path, _branches in [
    ('qcd-mg-noht', f"samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}/mc/qcd-mg_tree.root", minimal_branches),
    ('qcd-herwig-noht', f"samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}/mc/qcd-herwig_tree.root", minimal_branches),
    ('top-noht', f"samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}/mc/top_tree.root", minimal_branches),
    ('v-qq-noht', f"samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}/mc/v-qq_tree.root", minimal_branches),
    ('jetht-noht', f"samples/trees_sf/20201028_nohtwbdt_v2_ak15_qcd_{year}/data/jetht_tree.root", minimal_branches_for_data),
]:
    _df0[_key] = uproot.open(_path)['Events'].pandas.df(_branches, flatten=False)

## 2. Pre-processing

For data: apply OR of all HT trigger to enhance statistics.

For MC: apply no HT trigger, based on the strategy we name it "MC substitute".

The initial dataframe (`_df0`) is event-based, but for the purpose of fit we transform the dataframe to be jet-based. 
The new dataframe `df1` contains branches `fj_x_` that come from either `fj_1_` or `fj_2_`, whichever passes the corresponding jet-based criteria (pT>200, each subjet matched to >=1 SV, sfBDT>0.5) carried by `fj_?_is_qualified` (?=1,2).

In [None]:
### ================ Pre-processing for data  ===================

## Baseline selection for data: MET filters, the OR of all HT triggers of the year
## (some of them are pre-scaled), and a qualified jet with pT>200
htcut_incl = '('+' | '.join(ext_hlt_branches[year])+')'
basesel_noht_prep = f"passmetfilters & {htcut_incl} & fj_x_pt>200 & fj_x_is_qualified"
sl_prep = ['jetht-noht']
df1 = {}
for sam in sl_prep:
    assert 'noht' in sam
    ## Unified (fj_x_*) names of all per-jet branches, derived from the fj_2_* columns;
    ## the fj_?_is_qualified flags are kept under their original names
    fj_branches = [col.replace('fj_2', 'fj_x') for col in _df0[sam].keys()
                   if col.startswith('fj_2') and col != 'fj_2_is_qualified']
    ## Build one jet-based frame per jet index, then stack them
    _per_jet = []
    for _this, _other in (('1', '2'), ('2', '1')):
        _jets = _df0[sam].query(basesel_noht_prep.replace('fj_x', f'fj_{_this}'))  ## events where this jet passes the selection
        _jets = _jets.drop(columns=[col.replace('fj_x', f'fj_{_other}') for col in fj_branches])  ## remove the other jet's branches
        _jets = _jets.rename(columns={col.replace('fj_x', f'fj_{_this}'): col for col in fj_branches})  ## fj_1_*/fj_2_* -> fj_x_*
        _jets.loc[:, 'fj_idx'] = int(_this)  ## remember which jet the row came from
        _jets.loc[:, 'is_qcd'] = 'qcd' in sam
        _per_jet.append(_jets)
    df1[sam] = pd.concat(_per_jet)
    del _per_jet, _jets
    del _df0[sam]  # release the event-based frame to save memory

## Derived variables used in the fit: corrected SV mass of the subjet whose leading SV has
## the larger pT (resp. larger dxy significance), and the log of each
for sam in sl_prep:
    _dfsam = df1[sam]
    _dfsam['mSV12_ptmax'] = _dfsam.eval('(fj_x_sj1_sv1_pt>fj_x_sj2_sv1_pt)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_pt<=fj_x_sj2_sv1_pt)*fj_x_sj2_sv1_masscor')
    _dfsam['mSV12_ptmax_log'] = _dfsam.eval('log(mSV12_ptmax)')
    _dfsam['mSV12_dxysig'] = _dfsam.eval('(fj_x_sj1_sv1_dxysig>fj_x_sj2_sv1_dxysig)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_dxysig<=fj_x_sj2_sv1_dxysig)*fj_x_sj2_sv1_masscor')
    _dfsam['mSV12_dxysig_log'] = _dfsam.eval('log(mSV12_dxysig)')

In [None]:
# ## FOR TEST: to see data HT distributions passing different HT pre-scaled trigger
# for hlt in ext_hlt_branches[year]:
#     dftmp = _df0['jetht-noht'].query(hlt)
#     h = get_hist(dftmp['ht'].values, bins=np.linspace(0, 2000, 201), weights=np.ones(dftmp.shape[0]))
#     plot_hist(h, label=hlt)

In [None]:
## FOR TEST: inspect the xsecWeight values of the MG sample and the upper genWeight quantiles
## of the Herwig sample, to spot extremely large weights
from collections import Counter
_xsec_counts = Counter(_df0['qcd-mg-noht']['xsecWeight'])
print(_xsec_counts, '\n')
_herwig_genwgt = _df0['qcd-herwig-noht']['genWeight']
for _q in (0.96, 0.98, 0.99):
    print(_herwig_genwgt.quantile(q=_q))

In [None]:
### ================ Pre-processing for MC substitute  ===================

## Baseline selection for MC: same as data but without any HT trigger ("MC substitute" strategy)
basesel_noht_prep_subst = "passmetfilters & fj_x_pt>200 & fj_x_is_qualified"
sl_prep_subst = ['subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht']  ## the "subst_" prefix marks MC substitute samples
for sam in sl_prep_subst:
    assert 'noht' in sam
    _src = sam.replace('subst_','')  ## key of the event-based input frame
    ## Unified (fj_x_*) names of all per-jet branches; the fj_?_is_qualified flags keep their names
    fj_branches = [col.replace('fj_2', 'fj_x') for col in _df0[_src].keys()
                   if col.startswith('fj_2') and col != 'fj_2_is_qualified']
    ## Same event-based -> jet-based transformation as for data
    _per_jet = []
    for _this, _other in (('1', '2'), ('2', '1')):
        _jets = _df0[_src].query(basesel_noht_prep_subst.replace('fj_x', f'fj_{_this}'))
        _jets = _jets.drop(columns=[col.replace('fj_x', f'fj_{_other}') for col in fj_branches])
        _jets = _jets.rename(columns={col.replace('fj_x', f'fj_{_this}'): col for col in fj_branches})
        _jets.loc[:, 'fj_idx'] = int(_this)
        _jets.loc[:, 'is_qcd'] = 'qcd' in sam
        if sam == 'subst_qcd-mg-noht':
            ## remove MG events with extremely large xsecWeight (from the low-HT samples of the HT-binned MG list)
            _jets = _jets.query('xsecWeight<5.')
        if sam == 'subst_qcd-herwig-noht':
            ## remove Herwig events with extremely large genWeight (above the 96% quantile of the full sample)
            _jets = _jets.query('genWeight<{}'.format(_df0['qcd-herwig-noht']['genWeight'].quantile(q=0.96)))
            if year == 2016:
                ## work around a 2016 bug: the Herwig sample cross section is wrong
                _jets.loc[:, 'xsecWeight'] = _jets['xsecWeight'] * 2400.
        _per_jet.append(_jets)
    df1[sam] = pd.concat(_per_jet)
    del _per_jet, _jets
    del _df0[_src]  # release the event-based frame to save memory

## Derived variables used in the fit (same definitions as for data)
for sam in sl_prep_subst:
    _dfsam = df1[sam]
    _dfsam['mSV12_ptmax'] = _dfsam.eval('(fj_x_sj1_sv1_pt>fj_x_sj2_sv1_pt)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_pt<=fj_x_sj2_sv1_pt)*fj_x_sj2_sv1_masscor')
    _dfsam['mSV12_ptmax_log'] = _dfsam.eval('log(mSV12_ptmax)')
    _dfsam['mSV12_dxysig'] = _dfsam.eval('(fj_x_sj1_sv1_dxysig>fj_x_sj2_sv1_dxysig)*fj_x_sj1_sv1_masscor + (fj_x_sj1_sv1_dxysig<=fj_x_sj2_sv1_dxysig)*fj_x_sj2_sv1_masscor')
    _dfsam['mSV12_dxysig_log'] = _dfsam.eval('log(mSV12_dxysig)')

    ## PSWeight columns, only available in the 2018 datasets
    if year==2018:
        if _dfsam['nPSWeight'].iloc[0] == 1:
            ## a single PSWeight per event: replicate it into the four columns
            for _k in range(1, 5):
                _dfsam[f'PSWeight{_k}'] = _dfsam['PSWeight']
        else:
            ## four PSWeights per event: split them into one column each
            assert all(_dfsam['nPSWeight'] == 4)
            for _k in range(4):
                _dfsam[f'PSWeight{_k+1}'] = _dfsam['PSWeight'].map(lambda arr, _k=_k: arr[_k])

## 3. Obtain reweight factors

We extract the following reweight factors. The first two sets are used in the nominal fit. The other two are for validation.
 1. **MC substitute-to-data reweight factor**: on the HT variable based on (pT, jet index) bins. The goal is to bring the shape of MC substitute back to the data shape in the inclusive region. Remember that the raw MC substitute yield is always much larger than data, because most HT triggers applied to data are pre-scaled triggers. New variables have the name `htwgt_(|herwig)`.
 2. **sfBDT reweight factor**: based on (pT, jet index) bins, to further reweight MC substitute back to data shape on the sfBDT variable. Since sfBDT>0.9 is imposed in the final fit region, the sfBDT shape discrepancy between the "reweighted MC substitute" and data may again cause $N_{total}$ difference for MC and data, after setting sfBDT>0.9 in the fit region. Therefore, we calculate the overall factor `sfbdtwgt_g90_(|herwig)_incl` in each (pT, jet index) bin, used in the nominal shape template; and the binned factor `sfbdtwgt_g90_(|herwig)_binned` used in the shape uncertainty extraction brought by the sfBDT shape mismodeling
 3. **Additional MC substitute-to-data reweight factor on $p_{T}$ only**: A possible replacement of the first two factors combined. This factor is only used in the validation fit. The goal for this validation is to check if different reweighting schemes may affect the SF fit results. New variables have the name `ad_ptwgt_(|herwig)`.
 4. **Proxy-to-signal reweight factor on $m_{SD}$ / $p_{T}$ / $\tau_{21}$**: based on the shape of "reweighted MC substitute (after the first two steps)" and the H->cc signal jet shape in the inclusive region. The factor is only used in the validation fit, in which we apply such reweight factor to both MC substitute and data to check if the SF results are affected. New variables have the name `(mass|pt|tau21)datamcwgt_(|herwig)`

In [None]:
### ================ 1. Reweight MC subsitute to data: stored as variable "htwgt", "htwgt_herwig") ===================

## True: if the block has run before, we can obtain the reweight factor from the previously stored pickle output
is_read_from_pickel = False

def extract_mc_to_data_ht_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname):
    r"""Extract the "MC substitute to data" reweight factor on HT based on (pT, jet index) bins

    For each (pT bin, jet index) category, the data and the weighted MC HT histograms are built
    (underflow and overflow included), their bin-by-bin ratio is taken as the reweight factor, and the
    factor is stored in the column `wgtname` of every MC frame. A data / MC HT plot is saved for each
    pT bin under `plots/wgtv5`.

    The function reads the notebook-level names `year`, `lumi` and `is_read_from_pickel`; when the latter
    is True, the histograms and factors are restored from `plots/wgtv5/htwgt_{year}.pickle` instead of
    being recomputed.

    Arguments:
        df1: dict of DataFrames as input (sample name -> DataFrame); modified in place
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column

    Returns:
        dict with keys 'ent_data', 'ent_mc', 'rwgt'; each maps '<pT label><jet label>' to an array of
        length nbins+2 (flow bins included)
    """

    rwgt_var = 'ht'
    ## The binning info for (pT, HT) grid. Note that 2016 is different from 2017/18. The adopted HT grid is based on MC shape in each pT bin
    rwgt_edge_dic = {}
    rwgt_edge_dic[2016] = {
        'pt200to250': [300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100],
        'pt250to300': [350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1300],
        'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1350],
        'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt400to500': [600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt500toInf': [800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
    }
    rwgt_edge_dic[2017] = rwgt_edge_dic[2018] = {
        'pt200to250': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 900, 1000],
        'pt250to300': [250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1100, 1200],
        'pt300to350': [450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1500],
        'pt350to400': [550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1600],
        'pt400to500': [700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1800],
        'pt500toInf': [900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 2000, 2200],
    }
    ## (selection string, label) of the jet pT bins and of the jet indices
    pt_bins = [
        ('fj_x_pt>=200 & fj_x_pt<250', 'pt200to250'),
        ('fj_x_pt>=250 & fj_x_pt<300', 'pt250to300'),
        ('fj_x_pt>=300 & fj_x_pt<350', 'pt300to350'),
        ('fj_x_pt>=350 & fj_x_pt<400', 'pt350to400'),
        ('fj_x_pt>=400 & fj_x_pt<500', 'pt400to500'),
        ('fj_x_pt>=500', 'pt500toInf'),
    ]
    jet_bins = [('fj_idx==1', 'jet1'), ('fj_idx==2', 'jet2')]

    for sam in sl_rwgt:
        df1[sam][wgtname] = np.nan  ## initially fill the output column with NaN

    if is_read_from_pickel: ## restore info from a previously stored pickle
        import pickle
        ## NOTE: only load a pickle written by this notebook; unpickling untrusted files is unsafe
        with open(f'plots/wgtv5/htwgt_{year}.pickle', 'rb') as f:
            res = pickle.load(f)
            res = res[0] if 'herwig' not in wgtname else res[1]
            ent_data, ent_mc, rwgt = res['ent_data'], res['ent_mc'], res['rwgt']
    else:
        ent_data, ent_mc, rwgt = {}, {}, {}

    ## Reweight separately on jet pT bins
    for ptsel, ptlab in pt_bins:
        ## Reweight separately for 1st or 2nd jet
        for sel, lab in jet_bins:
            print (' -- ', ptsel, sel)
            rwgt_edge = rwgt_edge_dic[year][ptlab]
            if not is_read_from_pickel:
                ## Calculate the rwgt for the first time
                _dffdata = df1['jetht-noht'].query(f'{ptsel} & {sel}')
                _dffmc =  pd.concat([df1[sam].query(f'{ptsel} & {sel}') for sam in sl_rwgt])  ## concat all MC substitute sample

                ## Get data and MC histogram. Note: consider underflow & overflow bins, hence len = nbins+2
                ent_data[ptlab+lab] = get_hist(_dffdata[rwgt_var].values, bins=rwgt_edge, weights=np.ones(_dffdata.shape[0]), underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
                ent_mc[ptlab+lab]  = get_hist(_dffmc[rwgt_var].values, bins=rwgt_edge, weights=_dffmc.eval(wgtstr_rwgt).values, underflow=True, overflow=True, mergeflowbin=False).view(flow=True).value
                ## Calculate the reweight factor
                rwgt[ptlab+lab] = ent_data[ptlab+lab] / ent_mc[ptlab+lab] # len=nbin+2
            print(ent_data[ptlab+lab], '\n', rwgt[ptlab+lab])

            ## assign the reweight factor to the new column
            edges = np.asarray(rwgt_edge)
            factors = np.asarray(rwgt[ptlab+lab])
            for sam in sl_rwgt:
                df1sel = df1[sam].eval(f'{ptsel} & {sel}')
                ht_sel = df1[sam].loc[df1sel, rwgt_var]
                ## index into the flow-inclusive factor array = number of edges <= HT
                ## (0 is the underflow bin, len(edges) the overflow bin); done for all rows at once
                bin_idx = np.searchsorted(edges, ht_sel.values, side='right')
                df1[sam].loc[df1sel, wgtname] = pd.Series(factors[bin_idx], index=ht_sel.index)

    ## check all entries are filled with valid factors
    ## (NaN never compares equal to anything, itself included, so use isna() rather than `== np.nan`)
    assert not any(df1[sam][wgtname].isna().any() for sam in sl_rwgt), f'NaN reweight factor found in column {wgtname}'

    # =========== plot ===========
    os.makedirs('plots/wgtv5', exist_ok=True)
    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    for _, ptlab in pt_bins:
        f, ax = plt.subplots(figsize=(11,11))
        hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
        ## plot edges: the flow bins are drawn as [0, first edge] and [last edge, 2500]
        plot_edges = [0]+list(rwgt_edge_dic[year][ptlab])+[2500]
        for lab in ['jet1', 'jet2']:
            hep.histplot(ent_data[ptlab+lab], bins=plot_edges, label=f'Data ({lab})')
            hep.histplot(ent_mc[ptlab+lab], bins=plot_edges, label=f'MC subst. ({lab})')
        ax.set_xlim(0, 2500); ax.set_xlabel('$H_{T}$ [GeV]', ha='right', x=1.0); ax.set_ylabel('Events / bin', ha='right', y=1.0); ax.legend()
        plt.savefig(f'plots/wgtv5/{year}_{ptlab}__{wgtname}.pdf')
        plt.savefig(f'plots/wgtv5/{year}_{ptlab}__{wgtname}.png')
    # ============================

    return {'ent_data':ent_data, 'ent_mc':ent_mc, 'rwgt':rwgt}

## Two sets of HT reweight factors: one with the MG QCD sample, one with the Herwig QCD sample
## (top and V->qq samples are part of both lists)
htwgt = extract_mc_to_data_ht_weight(
    df1,
    sl_rwgt=['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight",
    wgtname='htwgt',
)
htwgt_herwig = extract_mc_to_data_ht_weight(
    df1,
    sl_rwgt=['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight",
    wgtname='htwgt_herwig',
)

## On a fresh computation, store both results so that a later run can restore them from the pickle
if not is_read_from_pickel:
    import pickle
    with open(f'plots/wgtv5/htwgt_{year}.pickle', 'wb') as fw:
        pickle.dump([htwgt, htwgt_herwig], fw)

## Display the inputs and the resulting factor for the MG QCD sample
df1['subst_qcd-mg-noht'][['ht', 'fj_x_pt', 'fj_idx', 'htwgt']]

In [None]:
### ================ 2. Extract the sfBDT>0.9 overall factor and binned fractor: stored as variable "sfbdtwgt_g90_incl", "sfbdtwgt_g90_binned"; similar for herwig ===================

def extract_further_sfbdt_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname_binned, wgtname_incl):
    r"""Derive the additional data / MC-substitute factors on sfBDT for jets with sfBDT>0.9, per jet pT bin

    Two columns are written to every MC frame in `sl_rwgt`:
      - `wgtname_binned`: data / MC ratio in the sfBDT bin of the jet (rows with sfBDT>0.9 only)
      - `wgtname_incl`: ratio of the total data and MC yields with sfBDT>0.9 (all rows of the pT bin)

    Arguments:
        df1: dict of DataFrames as input (sample name -> DataFrame); modified in place
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname_binned: column name for the binned factors
        wgtname_incl: column name for the overall factor
    """

    ## Start from NaN in both output columns
    for sam in sl_rwgt:
        for colname in (wgtname_binned, wgtname_incl):
            df1[sam][colname] = np.nan

    ## sfBDT binning used for the reweighting
    rwgt_var, nbin, xmin, xmax  = 'fj_x_sfBDT', 5, 0.9, 1.
    print('rwgt sfBDT bins: ', rwgt_var, nbin, xmin, xmax)
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    ## One set of factors per jet pT bin
    for pt_range in [(200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 100000)]:
        ## Selection with the sfBDT>0.9 requirement of the fit region, and the same pT bin without it
        rwgt_sel = f'fj_x_sfBDT>0.9 & fj_x_pt>={pt_range[0]} & fj_x_pt<{pt_range[1]}'
        rwgt_sel_nobdt = f'fj_x_pt>={pt_range[0]} & fj_x_pt<{pt_range[1]}'
        print(rwgt_sel)
        _dffdata = df1['jetht-noht'].query(rwgt_sel)
        _dffmc =  pd.concat([df1[sam].query(rwgt_sel) for sam in sl_rwgt])

        ## Data and MC histograms with underflow & overflow bins, hence len = nbins+2
        hist_opts = dict(bins=rwgt_edge, underflow=True, overflow=True, mergeflowbin=False)
        ent_data = get_hist(_dffdata[rwgt_var].values, weights=np.ones(_dffdata.shape[0]), **hist_opts).view(flow=True).value
        ent_mc  = get_hist(_dffmc[rwgt_var].values, weights=_dffmc.eval(wgtstr_rwgt).values, **hist_opts).view(flow=True).value
        ## Binned factor (len=nbin+2) and overall factor
        rwgt = ent_data / ent_mc
        incl_factor = sum(ent_data) / sum(ent_mc)

        def _binned_factor(val):
            ## index into the flow-inclusive array = number of edges <= val
            return rwgt[sum(np.array(rwgt_edge)<=val)]

        ## Write the factors into the MC frames
        for sam in sl_rwgt:
            frame = df1[sam]
            in_fit_region = frame.eval(rwgt_sel)
            frame.loc[in_fit_region, wgtname_binned] = frame.loc[in_fit_region, rwgt_var].map(_binned_factor)
            frame.loc[frame.eval(rwgt_sel_nobdt), wgtname_incl] = incl_factor
        print (ent_data, rwgt, 'incl:', incl_factor)

## Two sets of sfBDT factors: one for the MG sample list and one for the Herwig sample list
_sl_sfbdt_mg = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht']
_sl_sfbdt_herwig = ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht']
extract_further_sfbdt_weight(
    df1, sl_rwgt=_sl_sfbdt_mg,
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*htwgt",
    wgtname_binned='sfbdtwgt_g90_binned', wgtname_incl='sfbdtwgt_g90_incl',
)
extract_further_sfbdt_weight(
    df1, sl_rwgt=_sl_sfbdt_herwig,
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*htwgt_herwig",
    wgtname_binned='sfbdtwgt_g90_herwig_binned', wgtname_incl='sfbdtwgt_g90_herwig_incl',
)

## Every jet with sfBDT>0.9 must carry valid (non-NaN) factors in both columns
for _samples, _cols in (
    (_sl_sfbdt_mg, ['sfbdtwgt_g90_binned', 'sfbdtwgt_g90_incl']),
    (_sl_sfbdt_herwig, ['sfbdtwgt_g90_herwig_binned', 'sfbdtwgt_g90_herwig_incl']),
):
    for sam in _samples:
        assert not np.isnan(df1[sam].query('fj_x_sfBDT>0.9')[_cols].values).any()

## Display the inputs and the overall factor for the MG QCD sample
df1['subst_qcd-mg-noht'][['fj_x_pt', 'fj_idx', 'fj_x_sfBDT', 'sfbdtwgt_g90_incl']]

In [None]:
### ================ 3. [additional] Reweight MC subsitute to data on pT: stored as variable "ad_ptwgt", "ad_ptwgt_herwig" ===================

def extract_mc_to_data_pt_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname):
    r"""Derive an optional 1D data / MC-substitute reweight factor on jet pT, per jet index

    The factor is the ratio of the data and weighted MC pT histograms of jets with sfBDT>0.9; it is
    written to the column `wgtname` of every MC frame in `sl_rwgt`, for all rows of the jet index.

    Arguments:
        df1: dict of DataFrames as input (sample name -> DataFrame); modified in place
        sl_rwgt: sample list for MC substitute in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: the reweight name stored as a new column
    """

    ## Regular pT binning of the 1D reweighting
    rwgt_var, nbin, xmin, xmax  = 'fj_x_pt', 20, 200., 1200.
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    def _flow_bin_index(val):
        ## position in the flow-inclusive array: 0 = underflow, 1..nbin = regular bins, nbin+1 = overflow
        return int(max(0, min(nbin+1, np.floor((val-1.*xmin)/(1.*xmax-xmin)*nbin) +1 )))

    ## The 1st and the 2nd jet are reweighted separately
    for sel in ('fj_idx==1', 'fj_idx==2'):
        fit_sel = f'fj_x_sfBDT>0.9 & {sel}'
        _dffdata = df1['jetht-noht'].query(fit_sel)
        _dffmc =  pd.concat([df1[sam].query(fit_sel) for sam in sl_rwgt])

        ## Data and MC histograms with underflow & overflow bins, hence len = nbins+2
        hist_opts = dict(bins=rwgt_edge, underflow=True, overflow=True, mergeflowbin=False)
        ent_data = get_hist(_dffdata[rwgt_var].values, weights=np.ones(_dffdata.shape[0]), **hist_opts).view(flow=True).value
        ent_mc  = get_hist(_dffmc[rwgt_var].values, weights=_dffmc.eval(wgtstr_rwgt).values, **hist_opts).view(flow=True).value
        ## Bin-by-bin factor, len=nbin+2
        rwgt = ent_data / ent_mc

        ## Write the factor into the MC frames
        for sam in sl_rwgt:
            frame = df1[sam]
            df1sel = frame.eval(sel)
            frame.loc[df1sel, wgtname] = frame.loc[df1sel, rwgt_var].map(lambda val: rwgt[_flow_bin_index(val)])
        print (ent_data, rwgt)

## Two sets of pT reweight factors: one for the MG sample list and one for the Herwig sample list
extract_mc_to_data_pt_weight(
    df1,
    sl_rwgt=['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight",
    wgtname='ad_ptwgt',
)
extract_mc_to_data_pt_weight(
    df1,
    sl_rwgt=['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht'],
    wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight",
    wgtname='ad_ptwgt_herwig',
)

## Display the reweight factors derived so far for the MG QCD sample
df1['subst_qcd-mg-noht'][['ht', 'fj_x_pt', 'fj_idx', 'htwgt', 'sfbdtwgt_g90_incl', 'ad_ptwgt']]

In [None]:
### ================ 4. [additional] Reweight MC (proxy jet) to H->cc signal jet on either mass/pT/tau21: stored as variable "(mass|pt|tau21)datamcwgt"; similar for herwig  ===================

# Load the H->cc signal ntuple (all branches) and apply the event selection used in the analysis
_df0['vhcc-2L'] = uproot.open("samples/trees/20200906_VH_extfillsv_2016_2L/mc/vhcc_tree.root")['Events'].pandas.df()

# Boosted-topology requirements on the vector boson and the AK15 jet
boosted = "v_pt>200 & ak15_pt>200 & dphi_V_ak15>2.5 & ak15_sdmass>50 & ak15_sdmass<200"
# Full 2L selection: V mass window, lepton-flavour dependent trigger, boosted topology, AK4 jet multiplicity
basecut_vhcc_2L = (
    "v_mass>75 & v_mass<105 & ((abs(lep1_pdgId)==11 & passTrigEl) | (abs(lep1_pdgId)==13 & passTrigMu)) & "
    + boosted
    + " & n_ak4<3"
)
df_comp = {'vhcc-2L': _df0['vhcc-2L'].query(basecut_vhcc_2L)}

def extract_mc_to_signal_weight(df1, sl_rwgt, wgtstr_rwgt, wgtname, rwgt_info):
    r"""Derive the "MC substitute (proxy) jet -> H->cc signal jet" reweight factor on a given variable
    and store it as a new column `wgtname` in every sample of `sl_rwgt` and in the data sample 'jetht-noht'.

    The factor is the bin-by-bin ratio of the unit-normalized signal histogram to the unit-normalized
    MC histogram, clipped to [0, 50]. The signal events are read from the global `df_comp['vhcc-2L']`.

    Arguments:
        df1: dict of DataFrames (sample name -> DataFrame); modified in place
        sl_rwgt: list of MC substitute samples used in this reweighting routine
        wgtstr_rwgt: the weight string applied to MC to produce the histogram in this reweighting routine
        wgtname: name of the new column holding the reweight factor
        rwgt_info: tuple (MC variable, nbin, xmin, xmax, signal variable) defining the variable and binning

    Note: a bin that is empty in both signal and MC gives 0/0 = NaN, which np.clip leaves as NaN.
    """

    # Variable and (uniform) binning used in this routine
    rwgt_var, nbin, xmin, xmax, rwgt_var_nom = rwgt_info
    print('rwgt info: ', rwgt_var, nbin, xmin, xmax)
    rwgt_edge = np.linspace(xmin, xmax, nbin+1)

    def _flow_hist_values(values, weights):
        ## Bin contents including the underflow & overflow bins, hence len = nbin+2
        h = get_hist(values, bins=rwgt_edge, weights=weights, underflow=True, overflow=True, mergeflowbin=False)
        return h.view(flow=True).value

    ## MC histogram: requires the selection sfBDT>0.9 which is used in the fit region
    rwgt_sel = 'fj_x_sfBDT>0.9'
    mc_selected = pd.concat([df1[sam].query(rwgt_sel) for sam in sl_rwgt])
    mc_weights = mc_selected.eval(wgtstr_rwgt)
    ent_mc = _flow_hist_values(mc_selected[rwgt_var].values, mc_weights.values)
    yield_mc = mc_weights.sum()

    ## h->cc signal histogram
    df_signal = df_comp['vhcc-2L']
    signal_weights = df_signal.eval('genWeight*xsecWeight*puWeight')
    ent_hcc = _flow_hist_values(df_signal[rwgt_var_nom].values, signal_weights.values)
    yield_hcc = signal_weights.sum()

    ## Reweight factor = ratio of the normalized shapes, clipped to (0, 50); len = nbin+2
    rwgt = np.clip((ent_hcc/yield_hcc) / (ent_mc/yield_mc), 0, 50)

    def _factor_for(val):
        ## Index in the flow-inclusive array: 0 = underflow, 1..nbin = regular bins, nbin+1 = overflow
        ibin = np.floor((val-1.*xmin)/(1.*xmax-xmin)*nbin) + 1
        return rwgt[int(max(0, min(nbin+1, ibin)))]

    ## Assign the reweight factor to the new column (to both MC and data)
    for sam in sl_rwgt + ['jetht-noht']:
        df1[sam][wgtname] = df1[sam][rwgt_var].map(_factor_for)
    print (ent_hcc, rwgt)

## For each reweight variable (mass / pT / tau21), derive two sets of reweight factors:
## one with the MG sample list and another with the Herwig sample list
_tosig_variables = (
    # (wgtname prefix, rwgt_info = (MC variable, nbin, xmin, xmax, signal variable))
    ('mass',  ('fj_x_sdmass', 15, 50, 200, 'ak15_sdmass')),
    ('pt',    ('fj_x_pt', 20, 200, 1200, 'ak15_pt')),
    ('tau21', ('fj_x_tau21', 20, 0, 1, 'ak15_tau21')),
)
_tosig_generators = (
    # (wgtname suffix, QCD sample, generator-specific part of the MC weight string)
    ('',        'subst_qcd-mg-noht',     'htwgt*sfbdtwgt_g90_incl'),
    ('_herwig', 'subst_qcd-herwig-noht', 'htwgt_herwig*sfbdtwgt_g90_herwig_incl'),
)
for _tosig_tag, _tosig_info in _tosig_variables:
    for _tosig_suffix, _tosig_qcd, _tosig_genwgt in _tosig_generators:
        extract_mc_to_signal_weight(
            df1,
            sl_rwgt=[_tosig_qcd, 'subst_top-noht', 'subst_v-qq-noht'],
            wgtstr_rwgt=f"{lumi[year]}*genWeight*xsecWeight*puWeight*{_tosig_genwgt}",
            wgtname=f'{_tosig_tag}datamcwgt{_tosig_suffix}',
            rwgt_info=_tosig_info,
        )

## Display the reweight variables and the new (MG-based) reweight columns in data as a quick check
df1['jetht-noht'][['fj_x_sdmass', 'massdatamcwgt', 'fj_x_pt', 'ptdatamcwgt', 'fj_x_tau21', 'tau21datamcwgt']]

## 4. Make ROOT templates

In this step we produce the ROOT templates from the DataFrame. The outputs are ROOT files arranged in a fixed directory structure. After some further reorganization, they can be used as the input to Higgs Combine to perform the fit.

As a reference, we provide an example of the output files and their structure. 
E.g., for a **given fit variable**, **given tagger WP** and a **certain jet-pT bin** for **a single fit**, the output ROOT templates should include the pass and fail MC template in the B/C/L flavors, the data template, and the MC systematics for all specified shape uncertainties. The files are organized in the following structure:
```
─── 20201115_SF2017_AK15_qcd_subst_pst_ptw50_TP_msv12_dxysig_log_var22binsv2  [use variable: msv12_dxysig_log, Tight WP]
    └── Cards
        └── bdt900
            ├── pt200to250                 [given pT bin]
            │   ├── nominal                    [the nominal histograms]
            │   │   ├── inputs_fail.root           [include four TH1D: flvC, flvB, flvL, data_obs]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBDown                 [shape uncertainty plots]
            │   │   ├── inputs_fail.root           [include three TH1D: flvC_fracBBDown, flvB_fracBBDown, flvL_fracBBDown]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBUp                   [..]
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdKdeSystDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdKdeSystUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdSystDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── qcdSystUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTFloAroundDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTFloAroundUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   └── sfBDTRwgtUp
            │       ├── inputs_fail.root
            │       └── inputs_pass.root
            ├── pt250to300
            │   ├── ...
```

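The template making is organized as a chain of nested functions: `launch_maker` (depth 0) calls `wrapperPt` (depth 1), which calls `makeTemplatesWrapper` (depth 2), which calls `makeTemplates` (depth 3).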

In [None]:
#### ================================ Global parameters: config me! ================================ ####
# Selects which set of templates to make. The accepted values are listed in the string below
# and are validated by the consistency check that follows this block.
g_make_template_mode = 'main'
r"""Options:
        main           : the main fit
        val_pt         : the validation fit -- to use an optional MC subsitute-to-data strategy, i.e. on pT variable only
        val_tosig_mass : the validation fit -- additionally reweight MC & data to h->cc signal jet on mass
        val_tosig_pt   : the validation fit -- additionally reweight MC & data to h->cc signal jet on pt  
        val_tosig_tau21: the validation fit -- additionally reweight MC & data to h->cc signal jet on tau21
        val_vary_sfbdt : the validation fit -- varying sfBDT cut value and drop sfBDT* uncertaint
        val_crop_bin   : the validation fit -- cropping the marginal bins for fit
"""

# Common prefix of the output directory; used by the path-building lambdas in g_fitinfo
g_outdir_prefix = f'20201115_SF{year}_AK15_qcd_subst_pst_ptw50'
r"""Prefix for the output dir name """

# Maps uncertainty type -> bool. makeTemplatesWrapper only makes the templates of a type
# whose key is present with a true value.
g_make_unce_types = {'nominal':True, 'pu':True, 'fracBB':True, 'fracCC':True, 'fracLight':True, 'psWeightIsr':False, 'psWeightFsr':False, 'sfBDTRwgt':True, 'sfBDTFloAround':True}
r"""The uncertainty types used in the fit. Use False or remove the key to disable an certain unce type
    Note: "qcdSyst" and "qcdKdeSyst" is not used in this verision. "psWeightIsr" and "psWeightFsr" works fine in 2018 while in 2016/17 one need to first garantee the 2018 histograms exist
          so the unce can be transferred.
""" # for test, we disable psWeightIsr/Fsr

# Maps a key of g_fitinfo (i.e. a fit variable) -> list of tagger working points to process
g_do_fit_for = { # for test, we launch the main fit var (1) only
    1: ['TP', 'MP', 'LP'],
#     2: ['TP', 'MP', 'LP'],
#     3: ['TP', 'MP', 'LP'],
}
r""" Do fit for which variable and which WPs"""
#### =============================================================================================== ####

## Consistency check for global params
if g_make_template_mode not in ['main', 'val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_vary_sfbdt', 'val_crop_bin']:
    raise RuntimeError('Specified mode cannot be recognized.')
## These validation modes are run on the main fit variable (1) only: any other choice is overridden
if g_make_template_mode in ['val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_vary_sfbdt'] and list(g_do_fit_for.keys()) != [1]:
    print('Warning: for validation fit, set the fit information to the main variable (1) only')
    g_do_fit_for = {1: ['TP', 'MP', 'LP']}
## The bin-cropping validation is run on the cropped main variable (901) only: any other choice is overridden
if g_make_template_mode == 'val_crop_bin' and list(g_do_fit_for.keys()) != [901]:
    print('Warning: for validation fit on cropping the marginal bins, set the fit information to the cropped main variable (901) only')
    g_do_fit_for = {901: ['TP', 'MP', 'LP']}
## When varying the sfBDT cut, the two sfBDT-related uncertainty types are removed
if g_make_template_mode == 'val_vary_sfbdt':
    g_make_unce_types.pop('sfBDTRwgt', None)
    g_make_unce_types.pop('sfBDTFloAround', None)
    
## The sfBDT varying list. 
## Note: to implement sfBDTFloAround unce, one must first obtain the nominal hist for the cut value 0.85, 0.95.
##       Hence 0.9 comes last: makeTemplatesWrapper makes the uncertainty templates only for the last
##       value of the list (except in the 'val_vary_sfbdt' mode, where it does so for every value).
if g_make_template_mode != 'val_vary_sfbdt':
    g_sfBDT_val_list = [0.85, 0.95, 0.9]
else:
    g_sfBDT_val_list = [0.84, 0.86, 0.88, 0.90, 0.92, 0.94] ## for validation: varying sfBDT
    

## Fit info: in the format of [ (fit var, nbins/edges, xmin/None, xmax/None, (underflow, overflow), label), outputdir lambda func ]
## The lambda builds the output directory from the tagger WP, the sfBDT cut value, the pT range and the systematic name.
## NOTE(review): entries 1 and 901 build exactly the same output path, so running 'val_crop_bin' with an
##               unchanged g_outdir_prefix would overwrite the main-fit outputs -- confirm this is intended.
g_fitinfo = {
    1: [ ##  main fit var
        ('mSV12_dxysig_log', [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, (True, True), 'mSV12_dxysig_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    2: [ ## the other var for validation
        ('mSV12_ptmax_log', [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, (True, True), 'mSV12_ptmax_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_ptmax_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    3: [ ## the other var for validation
        ('fj_x_btagcsvv2', [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, (True, True), 'CSVv2'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_csvv2_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
    901: [ ## crop the marginal bins for the main var as a validation
        ('mSV12_dxysig_log', [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8], None, None, (False, False), 'mSV12_dxysig_log'), 
        lambda wp, bdt, pt_range, sys_name: f'results/{g_outdir_prefix}_{wp}_msv12_dxysig_log_var22binsv2/Cards/bdt{int(bdt*1000)}/pt{pt_range[0]}to{pt_range[1]}/{sys_name}/'
    ],
}
## Cache filled by makeTemplates in the qcdSyst/qcdKdeSyst "Up" pass, keyed by (sys_name, pass/fail, category),
## so that the matching "Down" pass can reuse the same histogram
g_hist_qcdsyst = {}


## Tagger values in use: lower edge of the tagger discriminant for each working point
g_map_tagger_val = {'TP':0.95, 'MP':0.90, 'LP':0.80}

    
## Necessary KDE parameters used in qcdKdeSyst unce
## g_custom_kde_bw: per fit variable, the divisor applied to the default KDE bandwidth factor
## g_custom_kde_binmask: per fit variable, the low edges of the bins that are skipped in the KDE integration
g_custom_kde_bw = {'fj_x_btagcsvv2':15, 'mSV12_ptmax_log':4, 'mSV12_dxysig_log':4}
g_custom_kde_binmask = {'fj_x_btagcsvv2':[0], 'mSV12_ptmax_log':[-0.4,1.8,2.5,3.2], 'mSV12_dxysig_log':[-0.8,-0.4,1.8,2.5]}

def launch_maker():
    r"""Depth 0: Entry point that launches the template making according to the global parameters

    Loops over the fit variables and tagger WPs in `g_do_fit_for` and over the sfBDT cut values in
    `g_sfBDT_val_list`. For each combination it assembles the option dict `args`, applies the sfBDT
    cut to every sample of `df1` and hands the result over to `wrapperPt`.
    """
    tagger = 'fj_x_ParticleNetMD_XccVsQCD'
    all_samples = ('subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht')
    ## Extra reweight column applied to both MC and data in each "to-signal" validation mode
    tosig_wgtname = {
        'val_tosig_mass': 'massdatamcwgt',
        'val_tosig_pt': 'ptdatamcwgt',
        'val_tosig_tau21': 'tau21datamcwgt',
    }

    for _ifit, wp_list in g_do_fit_for.items():
        for _wp in wp_list:

            ## Real tagger range (lower, upper) for each WP: a WP extends up to the next tighter one
            tagger_range = {
                'TP': (g_map_tagger_val['TP'], 1.0),
                'MP': (g_map_tagger_val['MP'], g_map_tagger_val['TP']),
                'LP': (g_map_tagger_val['LP'], g_map_tagger_val['MP']),
            }

            ## Fit info and output-dir lambda func of this fit variable
            fitinfo, outdir_func = g_fitinfo[_ifit]

            for sfBDT_val in g_sfBDT_val_list:
                ## MC / data weight strings: default of the main fit, modified by the validation modes
                wgt_common = f'genWeight*xsecWeight*puWeight*{lumi[year]}'
                if g_make_template_mode == 'val_pt':
                    wgtstr_mc, wgtstr_data = f'{wgt_common}*ad_ptwgt', None
                elif g_make_template_mode in tosig_wgtname:
                    wgtstr_data = tosig_wgtname[g_make_template_mode]
                    wgtstr_mc = f'{wgt_common}*htwgt*sfbdtwgt_g90_incl*{wgtstr_data}'
                else:
                    wgtstr_mc, wgtstr_data = f'{wgt_common}*htwgt*sfbdtwgt_g90_incl', None

                tag_lo, tag_hi = tagger_range[_wp]
                args = {
                    'wgtstr_dm': wgtstr_mc,
                    'wgtstr_dm_data': wgtstr_data,
                    ## Sample lists: MC samples first, the data sample last
                    'sl_dm': ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                    'sl_dm_herwig': ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht'],
                    ## Flavor categories, defined by the number of b/c hadrons in the jet
                    'config_dm': {
                        'data':  '',
                        'flvB':  'fj_x_nbhadrons>=1',
                        'flvC':  'fj_x_nbhadrons==0 & fj_x_nchadrons>=1',
                        'flvL':  'fj_x_nbhadrons==0 & fj_x_nchadrons==0',
                    },
                    'categories_dm': ['flvL', 'flvB', 'flvC', 'data'],
                    ## pass: tagger inside the WP range; fail: everything else
                    'catMap': {
                        'pass': f'{tagger}>{tag_lo:.3f} & {tagger}<={tag_hi:.3f}',
                        'fail': f'{tagger}<={tag_lo:.3f} | {tagger}>{tag_hi:.3f}',
                    },
                }

                ## df1->df2: apply sfBDT cut first
                df2 = {sam: df1[sam].query(f'fj_x_sfBDT>{sfBDT_val}') for sam in all_samples}

                wrapperPt(df2, fitinfo, lambda pt_range, sys_name: outdir_func(_wp, sfBDT_val, pt_range, sys_name), sfBDT_val, args)

In [None]:
def wrapperPt(df2, fitinfo, outdir_func, sfBDT_val, args):
    r"""Depth 1: Split the samples into jet-pT bins and run all following steps for each bin

    Arguments:
        df2: dict of DataFrames (sample name -> DataFrame) after the sfBDT cut
        fitinfo: fit variable and binning info, passed through unchanged
        outdir_func: function (pt_range, sys_name) -> output dir
        sfBDT_val: the sfBDT cut value in use, passed through unchanged
        args: dict of options, passed through unchanged
    """
    samples = ('subst_qcd-mg-noht', 'subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht')
    ## pT bins as [low, high); the last bin is effectively open-ended
    pt_bins = ((200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 100000))

    for pt_range in pt_bins:
        print ('pt range:', pt_range)

        ## df2->df3: apply the pT cut
        pt_low, pt_high = pt_range
        pt_cut = f'fj_x_pt>={pt_low} & fj_x_pt<{pt_high}'
        df3 = {sam: df2[sam].query(pt_cut) for sam in samples}

        makeTemplatesWrapper(df3, fitinfo, lambda sys_name: outdir_func(pt_range, sys_name), sfBDT_val, args)

In [None]:
def makeTemplatesWrapper(df3, fitinfo, outdir_func, sfBDT_val, args):
    r"""Depth 2: Decide which templates (nominal or any shape uncertainty) to make, and make them

    The nominal template is made for every sfBDT cut value. The uncertainty templates are only made for
    the last value of `g_sfBDT_val_list`, or for every value in the 'val_vary_sfbdt' mode.

    Arguments:
        df3: dict of DataFrames (sample name -> DataFrame) after the sfBDT and pT cuts
        fitinfo: fit variable and binning info, passed through unchanged
        outdir_func: function (sys_name) -> output dir
        sfBDT_val: the sfBDT cut value in use
        args: dict of options; 'wgtstr_dm' is the nominal MC weight string
    """
    nominal_wgt = args['wgtstr_dm']

    def is_enabled(unce_type):
        ## Enabled only if the key is present and its value is true
        return g_make_unce_types.get(unce_type)

    def produce(sys_name, wgtstr_dm_sys):
        makeTemplates(df3, fitinfo, outdir_func(sys_name), sys_name, wgtstr_dm_sys, args)

    if is_enabled('nominal'):
        produce('nominal', nominal_wgt)

    ## The uncertainty templates below are only needed in the sfBDT>0.9 case (except for the validation when varying the sfBDT)
    if sfBDT_val != g_sfBDT_val_list[-1] and g_make_template_mode != 'val_vary_sfbdt':
        return

    ## Pileup: switch to the varied pileup weight
    if is_enabled('pu'):
        for direction in ('Up', 'Down'):
            produce('pu' + direction, nominal_wgt.replace('puWeight', 'puWeight' + direction))

    ## Flavor fractions: scale the selected jets by 1.2 (Up) / 0.8 (Down), keep the others at 1.0
    flavor_selections = (
        ## (unce type, selection to be scaled, complementary selection)
        ('fracBB', 'fj_x_nbhadrons>1', 'fj_x_nbhadrons<=1'),
        ('fracCC', 'fj_x_nbhadrons==0 & fj_x_nchadrons>1', 'not(fj_x_nbhadrons==0 & fj_x_nchadrons>1)'),
        ('fracLight', 'fj_x_nbhadrons==0 & fj_x_nchadrons==0', 'not(fj_x_nbhadrons==0 & fj_x_nchadrons==0)'),
    )
    for unce_type, sel, antisel in flavor_selections:
        if not is_enabled(unce_type):
            continue
        for direction, scale in (('Up', '1.2'), ('Down', '0.8')):
            produce(unce_type + direction, nominal_wgt + f'*({scale}*({sel}) + 1.0*({antisel}))')

    ## These types cannot be obtained by merely changing the weight string: they are passed the nominal
    ## weight and get their *special treatment* in the depth-3 function, selected by sys_name
    for unce_type in ('qcdSyst', 'qcdKdeSyst', 'psWeightIsr', 'psWeightFsr'):
        if is_enabled(unce_type):
            produce(unce_type + 'Up', nominal_wgt)
            produce(unce_type + 'Down', nominal_wgt)

    ## sfBDT reweighting: Up uses the binned weight, Down mirrors it around the inclusive weight
    if is_enabled('sfBDTRwgt'):
        produce('sfBDTRwgtUp', nominal_wgt.replace('sfbdtwgt_g90_incl', 'sfbdtwgt_g90_binned'))
        produce('sfBDTRwgtDown', nominal_wgt.replace('sfbdtwgt_g90_incl', '(2*sfbdtwgt_g90_incl-sfbdtwgt_g90_binned)'))

    ## sfBDT cut variation: nominal weight, special treatment in the depth-3 function
    if is_enabled('sfBDTFloAround'):
        produce('sfBDTFloAroundUp', nominal_wgt)
        produce('sfBDTFloAroundDown', nominal_wgt)

In [None]:
def makeTemplates(df3, fitinfo, outputdir, sys_name, wgtstr_dm_sys, args):
    r"""Depth 3: The very base implementation that applies the final pass/fail cut and makes the template

    For each of the 'pass' and 'fail' regions, the ROOT file `inputs_{pass,fail}.root` is (re)created under
    `outputdir`. It holds one TH1D per MC flavor category, named `<cat>` for the nominal template and
    `<cat>_<sys_name>` otherwise, and, for the nominal template only, the data histogram `data_obs`.

    Arguments:
        df3: dict of DataFrames (sample name -> DataFrame) after the sfBDT and pT cuts
        fitinfo: tuple (fit var, nbins/edges, xmin/None, xmax/None, (underflow, overflow), label)
        outputdir: output directory, created if missing; must end with '/' since file names are appended to it
        sys_name: 'nominal' or the name of the shape uncertainty (e.g. 'puUp')
        wgtstr_dm_sys: the weight string applied to MC for this template
        args: dict of options; the keys read here are 'wgtstr_dm_data', 'sl_dm', 'sl_dm_herwig', 'config_dm' and 'catMap'

    Note: the 'psWeight*' (for year != 2018) and 'sfBDTFloAround*' templates read ROOT files written by
    earlier calls (see the corresponding branches), so those must have been produced beforehand.
    """
    
    ## Only the entries of `args` that are actually used in this function are unpacked
    wgtstr_dm_data, sl_dm, sl_dm_herwig, config_dm, catMap = args['wgtstr_dm_data'], args['sl_dm'], args['sl_dm_herwig'], args['config_dm'], args['catMap']
    
    ## exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(outputdir, exist_ok=True)

    print (fitinfo, outputdir, sys_name, wgtstr_dm_sys)
    
    ## Loop over pass and fail region
    for b in ['pass', 'fail']:
        ## Create the output root file. This is done before entering `try` so that `fw` is always bound
        ## when the `finally` clause runs.
        fw = ROOT.TFile(outputdir+f'inputs_{b}.root', 'recreate')
        try:
            vname, nbin, xmin, xmax, (underflow, overflow), vlabel = fitinfo
            
            ## Transfer the {nbin, xmin, xmax} set to the real bin edge if necessary
            if not isinstance(nbin, int):
                edges = nbin
                nbin = len(edges)-1 # reset nbin to "real" nbin
                edges_inroot = (len(edges)-1, array.array('f', edges))
            else:
                edges = np.linspace(xmin, xmax, nbin+1)
                edges_inroot = (nbin, xmin, xmax)

            hv, hist = {}, {}
            hname_suf = '_'+sys_name if sys_name!='nominal' else ''  ## suffix to the hist name (the Higgs Combine syntax)
            print (' -- ', catMap[b])
            
            ## MC and data dataframe after applying the final selection (the data sample is the last in sl_dm)
            df_mc = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm[:-1]])
            df_data = df3[sl_dm[-1]].query(catMap[b])
            
            ## Preprocessing for herwig related dataframe if we mean to calculate qcdSyst / qcdKdeSyst unce in this iteration
            if 'qcdSyst' in sys_name or 'qcdKdeSyst' in sys_name:
                df_mc_herwig = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm_herwig[:-1]])

            # Loop over categories: flvC/flvB/flvL/data
            ## NOTE(review): hv[] are views taken with flow=True but are indexed 0..nbin-1 below; this presumably
            ## relies on get_hist merging the flow bins into the edge bins (mergeflowbin default) -- confirm.
            for cat in config_dm:
                ## hv[] holds the boosted-histogram type derived from the dataframe, hist[] holds the TH1D type to be stored in ROOT
                if cat=='data' and sys_name == 'nominal':
                    ## Get the data hist (unit weights unless a data weight string is given)
                    hv['data'] = get_hist(df_data[vname].values, bins=edges, weights=np.ones(df_data.shape[0]) if wgtstr_dm_data is None else df_data.eval(wgtstr_dm_data).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist['data'] = ROOT.TH1D('data_obs', 'data_obs;'+vname, *edges_inroot) 
                if cat!='data':
                    df_mc_tmp = df_mc.query(config_dm[cat]) ## category selection based on flavor
                    ## Get the MC hist for certain flavor
                    hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist[cat] = ROOT.TH1D(cat+hname_suf, cat+hname_suf+';'+vname, *edges_inroot) # init TH1 hist
                    hist[cat].Sumw2()
            
                    ## For qcdSyst / qcdKdeSyst unce that is actually related to Herwig, hv[cat] is dummy here, 
                    ## and we mean to obtain hv[cat+'_herwig.value'] that will be later filled into hist[cat]
                    if sys_name=='qcdSystUp':
                        ## Get the Herwig fit for certain flavor
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat]) ## cat selection
                        ## Switch every MG-specific weight in the weight string to its Herwig counterpart
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g90','sfbdtwgt_g90_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv[cat+'_herwig.value'] = get_hist(df_mc_herwig_tmp[vname].values, bins=edges, 
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 
                                                     underflow=underflow, overflow=overflow).view(flow=True).value
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
                    
                    ## Extract the KDE shape directly from herwig shape
                    if sys_name=='qcdKdeSystUp':
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat])
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g90','sfbdtwgt_g90_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv_herwig_orig_value = get_hist(df_mc_herwig_tmp[vname].values, bins=edges, 
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 
                                                     underflow=underflow, overflow=overflow).view(flow=True).value
                        
                        ## Calculate KDE shape, apply two times so that we specify a finer KDE bandwidth based on the first result
                        ## (negative weights are clipped to 0 before being passed to the KDE)
                        from scipy.stats import gaussian_kde
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf))
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf), bw_method=kde.factor/g_custom_kde_bw[vname])
                        kde_int = np.zeros([nbin, 2])
                        
                        ## Integrate the KDE function to obtain KDE histogram; column 0 = KDE integral, column 1 = original herwig yield.
                        ## Masked bins are skipped and hence stay at 0 in both columns.
                        for i, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
                            if low in g_custom_kde_binmask[vname]:
                                continue
                            kde_int[i] = [kde.integrate_box_1d(low, high), hv_herwig_orig_value[i]]
                        # print('rescale kde sum to original herwig sum: ', kde_int[:,1].sum() / kde_int[:,0].sum())
                        ## Rescale the KDE histogram so that its sum matches the original herwig sum over the unmasked bins
                        kde_int[:,0] *= kde_int[:,1].sum() / kde_int[:,0].sum()
                        
                        ## Fill with original madgraph hist if we plan to mask the bin for KDE. 
                        ## This is based on the fact that KDE cannot model the hist well in the marginal bins
                        hv[cat+'_herwig.value'] = np.array([kde_int[i][0] if kde_int[i][0]!=0 else hv[cat].value[i] for i in range(nbin)])
                        
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']
            
                    ## Extract the PSWeight histogram
                    if 'psWeight' in sys_name:
                        if year==2018:  ## for 2018, calculate the hist by PSWeight vars 
                            ps_idx = {'psWeightIsrUp':2, 'psWeightIsrDown':0, 'psWeightFsrUp':3, 'psWeightFsrDown':1}
                            hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys+f'*PSWeight{ps_idx[sys_name]+1}').values, underflow=underflow, overflow=overflow).view(flow=True)
                        else:  ## for 2016/17 extract the PSWeight hist based on 2018 result (transfer the ratio for PSWeight/nominal)
                            ## The 2018 templates are read from the same output path with the year replaced
                            outputdir_ps_18 = outputdir.replace(f'_SF{year}_', '_SF2018_')
                            hv_nom_18 = uproot.open(outputdir_ps_18.replace(sys_name, 'nominal')+f'inputs_{b}.root')[cat]
                            hv_ps_18 = uproot.open(outputdir_ps_18+f'inputs_{b}.root')[cat+'_'+sys_name]
                            hv[cat].value *= hv_ps_18.values / hv_nom_18.values
                        # print (hv[cat].value)
                    
                    ## Extract the sfBDTFloAround histogram.
                    ## Method: to utilize the nominal hist for sfbdt>0.95 or 0.85 and migrate the MC-to-data confidence level in the 0.90 case
                    if 'sfBDTFloAround' in sys_name:
                        from scipy.stats import chi2
                        hv_data = uproot.open(outputdir.replace(sys_name, 'nominal')+f'inputs_{b}.root')['data_obs'].values  ## nominal data hist for 0.90
                        _bdtname = '95' if 'Up' in sys_name else '85'
                        fr = uproot.open(outputdir.replace(sys_name, 'nominal').replace(f'/bdt{int(g_sfBDT_val_list[-1]*1000)}/',f'/bdt{_bdtname}0/')+f'inputs_{b}.root')
                        fr_data, fr_mc = fr['data_obs'].values, fr['flvC'].values+fr['flvB'].values+fr['flvL'].values  ## nominal data & MC hist for 0.95 or 0.85 (depends on Up or Down)
                        
                        ## For each bins, migrate the confidence level of MC yield F0 given data yield D0 to the target data yield D => F
                        hv_mc = []
                        for D, D0, F0 in zip(hv_data, fr_data, fr_mc):
                            ## The precise calculation
                            F = 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0+2), 2*D+2) if F0>D0 else 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0), 2*D)
                            if F == np.inf: ## in case the formula results in inf (may occur if F0 >> D0)
                                assert F0 > D0
                                ## Fall back to scaling the deviation F0-D0 by the ratio of the upper 68.27% interval widths
                                sigD0 = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D0+2) - D0
                                sigD = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D+2) - D
                                F = D + sigD/sigD0*(F0-D0)
                            hv_mc.append(F)
                        
                        ## Obtain flavor template based on the flavor proportion in 0.95 or 0.85 region
                        hv[cat].value = np.nan_to_num(hv_mc * fr[cat].values / fr_mc, nan=0)
                        
            ## Fill the hv[cat] (for qcd*, fill hv[cat+'_herwig.value']) into TH1D and save into ROOT
            for cat in hist:
                ## Special handling for qcdSyst / qcdKdeSyst
                if 'qcd' in sys_name and 'SystUp' in sys_name:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat+'_herwig.value'][i])
                elif 'qcd' in sys_name and 'SystDown' in sys_name:
                    ## "Down" is the "Up" hist mirrored around the nominal one: 2*nominal - up
                    hv[cat+'_herwig.value'] = g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)]
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, 2 * hv[cat].value[i] - hv[cat+'_herwig.value'][i])
                    ## Drop the cached "Up" hist once it has been used
                    g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)] = None

                ## Normal routine
                else:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat].value[i])
                        hist[cat].SetBinError(i+1, np.sqrt(hv[cat].variance[i]))
                
                ## Fix some buggy points: floor the MC bin content at 1e-3, and cap the bin error at the bin content
                if cat!='data':
                    for i in range(nbin):
                        if hist[cat].GetBinContent(i+1) <= 1e-3:
                            hist[cat].SetBinContent(i+1, 1e-3)
                            hist[cat].SetBinError(i+1, 1e-3)
                        elif hist[cat].GetBinError(i+1) > hist[cat].GetBinContent(i+1):
                            hist[cat].SetBinError(i+1, hist[cat].GetBinContent(i+1))

                hist[cat].Write()
        ## Always close the ROOT file, also if an error occurs (otherwise the notebook is easily corrupted)
        finally:
            fw.Close()

Now we launch the template maker

In [None]:
## Run the whole chain (launch_maker -> wrapperPt -> makeTemplatesWrapper -> makeTemplates) with the global parameters set above
launch_maker()

# Data/MC comparison plots

Based on the DataFrame `df1`, this section makes data and MC plots, with the MC categorized into three flavors: C/B/L.
With the universal `make_data_mc_plots` function, one can specify any final selection and any sample list to produce the standard hist+ratio plot.

The below recipe can make a default set of plots.

In [None]:
### ================ configuration  ===================

def make_config_dm(sl_dm, wgtstr_dm):
    r"""Build the per-category configuration used by the data/MC plots.

    Arguments:
        sl_dm: sample list; all entries but the last one are used as the sample list of the MC flavour categories
        wgtstr_dm: weight expression string attached to the MC flavour categories

    Returns:
        dict in the format name: (label, sample/sample list, weight string, cat selection)
    """
    ## (category name, label, flavour selection) of the three MC categories
    mc_flavours = (
        ('flvB', 'QCD (flvB)', 'fj_x_nbhadrons>=1'),
        ('flvC', 'QCD (flvC)', 'fj_x_nbhadrons==0 & fj_x_nchadrons>=1'),
        ('flvL', 'QCD (flvL)', 'fj_x_nbhadrons==0 & fj_x_nchadrons==0'),
    )
    config = {'data': ('Data', 'jetht-noht', '1.0', '')}
    for cat_name, cat_label, cat_sel in mc_flavours:
        ## a fresh slice per category, so that the entries do not share one list object
        config[cat_name] = (cat_label, sl_dm[:-1], wgtstr_dm, cat_sel)
    return config

## Categories drawn in the data/MC plots; each name is a key of the dict returned by make_config_dm.
## make_data_mc_plots loops over this list, and passes the non-'data' entries to the stacked plot in this order.
categories_dm = ['flvL', 'flvB', 'flvC', 'data']

## Variables to plot in make_data_mc_plots.
##  - if `nbin` is not an int, it is taken as the list of bin edges (xmin / xmax are then derived from the edges)
##  - a two-character suffix on the label is stripped before drawing and switches off the flow bins:
##    '-u': no underflow, '-o': no overflow, '-a': neither of them
bininfo_dm = [ #(savename, vname, nbin, xmin, xmax, label)
    ('ht', 'ht', 50, 0, 2000, r'$H_{T}$ [GeV]'),
    ('fj_x_pt', 'fj_x_pt', 20, 200, 800, r'$p_{T}(AK15)$ [GeV]'),
    ('fj_x_eta', 'fj_x_eta', 20, -2.5, 2.5, r'$\eta(AK15)$'),
    ('fj_x_sdmass', 'fj_x_sdmass', 15, 50, 200, r'$m_{SD}(AK15)$ [GeV]'),
    ('fj_x_sfBDT', 'fj_x_sfBDT', 50, 0.5, 1, r'$sfBDT(AK15)$'),

    ('fj_x_ParticleNetMD_XccVsQCD', 'fj_x_ParticleNetMD_XccVsQCD', 40, 0, 1, r'ParticleNetMD_XccVsQCD(AK15)'),
    ('fj_x_ParticleNetMD_XccVsQCD_08', 'fj_x_ParticleNetMD_XccVsQCD', 40, 0.8, 1, r'ParticleNetMD_XccVsQCD(AK15)-u'),
    
    ("fj_x_btagcsvv2", "fj_x_btagcsvv2", [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, r'$CSVv2$'),
    ("mSV12_ptmax_log", "mSV12_ptmax_log", [-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2,3.9], None, None, r'$log(m_{SV1,p_{T}\,max}\; /GeV)$'),
    ("mSV12_dxysig_log", "mSV12_dxysig_log", [-0.8,-0.4,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.5,3.2], None, None, r'$log(m_{SV1,d_{xy}sig\,max}\; /GeV)$'),
]

In [None]:
### ================ slim on cc-tagger, sfBDT, then make data/MC plots ===================

import seaborn as sns
def set_sns_color(*args):
    r"""Plot a seaborn color palette and set it as the current palette.

    Arguments:
        *args: forwarded unchanged to `sns.color_palette` and `sns.set_palette`
    """
    palette = sns.color_palette(*args)
    sns.palplot(palette)
    sns.set_palette(*args)
    
def make_data_mc_plots(sl_dm, config_dm, finsel, prefix, **kwargs):
    r"""To make standard hist+ratio plots based on the sample list and the final selection.

    For every variable of the global `bininfo_dm`, one histogram per category of the global `categories_dm`
    is filled from the global DataFrame dict `df1`. The stacked MC + data panel with a data/MC ratio panel
    is saved as png and pdf under `plots/{g_dirname}_{year}/`. Nothing is returned.

    Arguments:
        sl_dm: sample list (keys of `df1`) on which the final selection is applied
        config_dm: configuration set for each categories in the plots, in the dict format. name: (label, sample/sample list, weight string, cat selection)
        finsel: final selections made to produce the plots (a `DataFrame.query` string, also used in the output file name)
        prefix: prefix string used in the output file name
        kwargs: optional settings
            plot_vars: if given, only the variables whose savename is contained in it are plotted
            g_do_kde_vars: dict savename -> bool; if True, also make the KDE-smoothed version of the plots
            g_custom_kde_bw: dict savename -> number; the default KDE bandwidth factor is divided by this number
            custom_kde: dict savename -> {cat: (pdf callable, normalisation)}; used instead of deriving the KDE from the sample
    """

    ## Apply the final selection once per sample
    df2 = {sam: df1[sam].query(finsel) for sam in sl_dm}

    ## MC categories in the (deterministic) order of categories_dm
    cats_mc = [cat for cat in categories_dm if cat!='data']

    for savename, vname, nbin, xmin, xmax, vlabel in bininfo_dm:
        if 'plot_vars' in kwargs and savename not in kwargs['plot_vars']:
            continue
        if not isinstance(nbin, int):
            ## `nbin` holds the bin edges in this case; note that afterwards `nbin` is the number of edges
            edges, xmin, xmax, nbin = nbin, min(nbin), max(nbin), len(nbin)
        else:
            edges = np.linspace(xmin, xmax, nbin+1)
        n_bins = len(edges) - 1

        ## The label suffix steers the flow bins and is removed from the label to be drawn
        flow_tag = vlabel[-2:]
        underflow = flow_tag not in ['-u','-a']
        overflow  = flow_tag not in ['-o','-a']
        if flow_tag in ['-u','-o','-a']:
            vlabel = vlabel[:-2]

        do_kde = 'g_do_kde_vars' in kwargs and savename in kwargs['g_do_kde_vars'] and kwargs['g_do_kde_vars'][savename]==True
        if do_kde:
            from scipy.stats import gaussian_kde
            from scipy import integrate
            kde = {}
            ## Integration range of the KDE for each bin: the first / last bin is extended to -inf / +inf
            ## when the underflow / overflow is kept. The last bin has the index n_bins-1.
            kde_lo = [-np.inf if (i==0 and underflow) else edges[i] for i in range(n_bins)]
            kde_hi = [+np.inf if (i==n_bins-1 and overflow) else edges[i+1] for i in range(n_bins)]

        ## Loop over categories to extract the hist for each flavor and data
        label, hdm = {}, {}
        for cat in categories_dm:
            lab, sam, wgt, sel = config_dm[cat]
            label[cat] = lab
            if cat == 'data': ## is data: no sel, weight=1
                hdm[cat] = get_hist(df2[sam][vname].values, bins=edges, weights=np.ones(df2[sam].shape[0]), underflow=underflow, overflow=overflow)
                continue

            apply_sel = sel not in ['','1==1']
            if not isinstance(sam, list):
                df2tmp = df2[sam].query(sel) if apply_sel else df2[sam]
            else:
                df2tmp = pd.concat([df2[s].query(sel) if apply_sel else df2[s] for s in sam], ignore_index=True)
            wgt_series = df2tmp.eval(wgt) ## evaluate the weight string only once
            hdm[cat] = get_hist(df2tmp[vname].values, bins=edges, weights=wgt_series.values, underflow=underflow, overflow=overflow)

            if do_kde:
                if 'custom_kde' in kwargs.keys() and savename in kwargs['custom_kde']:
                    kde[cat] = kwargs['custom_kde'][savename][cat]
                    kde_frac = [integrate.quad(kde[cat][0], lo, hi)[0] for lo, hi in zip(kde_lo, kde_hi)]
                else:
                    ## negative weights are clipped to 0 for the KDE shape, while the normalisation uses the unclipped sum
                    kde_wgt = np.clip(wgt_series.values, 0, np.inf)
                    kdetmp = gaussian_kde(df2tmp[vname].values, weights=kde_wgt)
                    if 'g_custom_kde_bw' in kwargs.keys() and savename in kwargs['g_custom_kde_bw']:
                        kdetmp = gaussian_kde(df2tmp[vname].values, weights=kde_wgt, bw_method=kdetmp.factor/kwargs['g_custom_kde_bw'][savename])
                    kde[cat] = (kdetmp, wgt_series.sum())
                    kde_frac = [kde[cat][0].integrate_box_1d(lo, hi) for lo, hi in zip(kde_lo, kde_hi)]
                hdm[cat+'_kde'] = hdm[cat].copy()
                hdm[cat+'_kde'].view(flow=True).value = np.array(kde_frac) * kde[cat][1]
                hdm[cat+'_kde'].view(flow=True).variance = np.zeros(n_bins)

        cat_sufs = ['', '_kde'] if do_kde else ['']
        for cat_suf in cat_sufs:
            ## Draw the standard hist_ratio plot
            set_sns_color('cubehelix_r', 3) ## set the color palette
            f = plt.figure(figsize=(12,12))
            gs = mpl.gridspec.GridSpec(2, 1, height_ratios=[3, 1], hspace=0.05)

            ## Upper histogram panel
            ax = f.add_subplot(gs[0])
            ## use the global `year` (same as for the lumi text) instead of a hard-coded one
            hep.cms.label(data=True, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            ax.set_xlim(xmin, xmax); ax.set_xticklabels([]); ax.set_ylabel('Events / bin', ha='right', y=1.0)

            plot_hist([hdm[cat+cat_suf] for cat in cats_mc], bins=edges, label=[label[cat] for cat in cats_mc], histtype='fill', edgecolor='k', linewidth=1, stack=True) ## draw stacked bkg
            hdm_add = hdm[cats_mc[0]+cat_suf].copy()
            for cat in cats_mc[1:]:
                hdm_add += hdm[cat+cat_suf]
            bkgtot, bkgtot_err = hdm_add.view(flow=True).value, np.sqrt(hdm_add.view(flow=True).variance)
            ax.fill_between(edges, (bkgtot-bkgtot_err).tolist()+[0], (bkgtot+bkgtot_err).tolist()+[0], label='BKG unce.', step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            plot_hist(hdm['data'], bins=edges, label='Data', histtype='errorbar', color='k', markersize=15, elinewidth=1.5) ## draw data

            ax.legend()

            ## Ratio panel
            ax1 = f.add_subplot(gs[1]); ax1.set_xlim(xmin, xmax); ax1.set_ylim(0.001, 1.999)
            ax1.set_xlabel(vlabel, ha='right', x=1.0); ax1.set_ylabel('Data / MC', ha='center')
            ax1.plot([xmin,xmax], [1,1], 'k'); ax1.plot([xmin,xmax], [0.5,0.5], 'k:'); ax1.plot([xmin,xmax], [1.5,1.5], 'k:')

            data_val, data_var = hdm['data'].view(flow=True).value, hdm['data'].view(flow=True).variance
            ## empty bins give 0/0 or x/0 here: silence the numpy warnings, the NaNs are replaced below
            with np.errstate(divide='ignore', invalid='ignore'):
                hr = data_val / hdm_add.view(flow=True).value
                hr_dataerr = hr * np.sqrt(data_var/(data_val**2))
                band_lo, band_hi = (bkgtot-bkgtot_err)/bkgtot, (bkgtot+bkgtot_err)/bkgtot
            ax1.fill_between(edges, band_lo.tolist()+[0], band_hi.tolist()+[0], step='post', hatch='///', edgecolor='darkblue', facecolor='none', linewidth=0) ## draw bkg unce.
            hep.histplot(np.nan_to_num(hr, nan=-1), bins=edges, yerr=np.nan_to_num(hr_dataerr), histtype='errorbar', color='k', markersize=15, elinewidth=1) ## draw data in ratio plot

            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}__{finsel}__{savename}{cat_suf}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}__{finsel}__{savename}{cat_suf}.pdf')

        ## kde/orig comparison plots
        if do_kde:
            mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green'])
            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            x_contin = np.linspace(xmin, xmax, 201)
            ## width of a bin in the middle of the range, used to scale the continuous KDE curve
            bin_width = edges[int(nbin/2)+1] - edges[int(nbin/2)]
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab, sam, wgt, sel = config_dm[cat]
                ax.plot(x_contin, kde[cat][0](x_contin) * kde[cat][1] * bin_width, label=lab+' KDE', linestyle=':', color=color)
            for cat, color in zip(['flvC', 'flvB', 'flvL'], ['blue', 'red', 'green']):
                lab, sam, wgt, sel = config_dm[cat]
                hep.histplot(hdm[cat+'_kde'].view(flow=True).value, bins=edges, label=lab+' KDE integral', linestyle='--', color=color)
                plot_hist(hdm[cat], bins=edges, label=lab, normed=False, color=color)
            ax.set_xlim(xmin, xmax); ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); ax.legend()

            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}:kde_shape__{finsel}__{savename}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{prefix}:kde_shape__{finsel}__{savename}.pdf')
            

## Variables for which the KDE-smoothed plots are also made, and the number dividing their default KDE bandwidth factor
g_do_kde_vars = dict.fromkeys(['fj_x_btagcsvv2', 'mSV12_ptmax_log', 'mSV12_dxysig_log'], True)
g_custom_kde_bw = dict(fj_x_btagcsvv2=15, mSV12_ptmax_log=4, mSV12_dxysig_log=4)

g_dirname = 'test_datamc' ## config me
_plotdir = f'plots/{g_dirname}_{year}'
if not os.path.exists(_plotdir):
    os.makedirs(_plotdir)

## Tagger selections appended to the pT selection
_sel_bdt50, _sel_bdt90 = 'fj_x_sfBDT>0.5', 'fj_x_sfBDT>0.9'
_sel_bdt90_cc95 = 'fj_x_sfBDT>0.9 & fj_x_ParticleNetMD_XccVsQCD>0.95'

for ptmin, ptmax in [(200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 100000), (200, 100000)]:
    _ptsel = f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax}'

    ## 1. With MadGraph sample list
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*htwgt*sfbdtwgt_g90_incl'
    sl_dm = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
    for _sel in (_sel_bdt50, _sel_bdt90, _sel_bdt90_cc95):
        make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'{_ptsel} & {_sel}', prefix='mg')

    ## 2. With MadGraph sample list, while using the optional MC-to-data reweight scheme (on pT)
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*ad_ptwgt'
    sl_dm = ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
    make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'{_ptsel} & {_sel_bdt90}', prefix='mg_ptwgt')

    ## 3. With Herwig sample list
    wgtstr_dm = f'genWeight*xsecWeight*puWeight*{lumi[year]}*htwgt_herwig*sfbdtwgt_g90_herwig_incl'
    sl_dm = ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
    for _sel in (_sel_bdt50, _sel_bdt90):
        make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'{_ptsel} & {_sel}', prefix='herwig')
    ## the tightest selection also makes the KDE plots
    make_data_mc_plots(sl_dm, make_config_dm(sl_dm, wgtstr_dm), finsel=f'{_ptsel} & {_sel_bdt90_cc95}', prefix='herwig',
                       g_do_kde_vars=g_do_kde_vars, g_custom_kde_bw=g_custom_kde_bw)

# Signal/proxy comparison plots

Based on the DataFrame `df1`, the recipe below creates the comparison plots of the proxy jet (from MC) and the h->cc signal jet on various jet observables.

In [None]:
## Load the hcc signal tree: the 'Events' tree of the file is read as a DataFrame and stored under the key 'vhcc-2L'
_df0['vhcc-2L'] = uproot.open(f"samples/trees/20200906_VH_extfillsv_2016_2L/mc/vhcc_tree.root")['Events'].pandas.df()

## Selection strings (DataFrame.query syntax)
##  - boosted: only used as a part of basecut_vhcc_2L
##  - basecut: applied on the proxy sample; its flavour part is the same expression as the 'flvC' category in make_config_dm
##  - basecut_vhcc_2L: applied on the signal tree
boosted = "v_pt>200 & ak15_pt>200 & dphi_V_ak15>2.5 & ak15_sdmass>50 & ak15_sdmass<200"
basecut = f"fj_x_pt>200 & fj_x_sdmass>50 & fj_x_sdmass<200 & passmetfilters & fj_x_nbhadrons==0 & fj_x_nchadrons>=1"
basecut_vhcc_2L = "v_mass>75 & v_mass<105 & ((abs(lep1_pdgId)==11 & passTrigEl) | (abs(lep1_pdgId)==13 & passTrigMu)) & " + boosted + " & n_ak4<3"
## Pre-selected DataFrames to be compared: proxy jets and signal jets
df_comp = {}
df_comp['subst_qcd-mg-noht'] = df1['subst_qcd-mg-noht'].query(basecut)
df_comp['vhcc-2L'] = _df0['vhcc-2L'].query(basecut_vhcc_2L)

## Weight expressions for the proxy and the signal sample (both are assigned again in the plotting loop below)
wgtstr = 'genWeight*xsecWeight*puWeight*htwgt'
wgtstr_vhcc_2L = 'genWeight*xsecWeight*puWeight'
## Named baseline cuts, looked up by func_basesel
basesel = { # name: cut, label
    'sv': ("fj_x_sj1_nsv>=1 & fj_x_sj2_nsv>=1", r'$N_{SV}^{match}\geq 1$'),
    'tightsv': ("(fj_x_sj1_sv1_ntracks>2 & abs(fj_x_sj1_sv1_dxy)<3 & fj_x_sj1_sv1_dlensig>4 & fj_x_sj2_sv1_ntracks>2 & abs(fj_x_sj2_sv1_dxy)<3 & fj_x_sj2_sv1_dlensig>4)", r'$N_{SV,tight}^{match}\geq 1$'),
}
def func_basesel(name):
    r"""Translate a baseline cut name into its (cut string, label) pair.

    Arguments:
        name (str): either a key of the global `basesel`, or 'sfbdt' followed by the sfBDT threshold
            in units of 1/1000 (e.g. 'sfbdt900' stands for sfBDT>0.9)

    Returns:
        (cut, label): the cut string and the label of the cut

    Raises:
        RuntimeError: if the name is neither a key of `basesel` nor a well-formed 'sfbdt<number>' name
    """
    if name in basesel:
        return basesel[name]
    if name.startswith('sfbdt'):
        try:
            x = float(name[5:])/1000.
        except ValueError as err:
            ## e.g. 'sfbdt' or 'sfbdtabc': report it as an unrecognized name rather than a bare ValueError
            raise RuntimeError('Baseline cut name not recognized.') from err
        return ('fj_x_sfBDT>%.3f'%x, r'$sfBDT>%.2f$'%x)
    raise RuntimeError('Baseline cut name not recognized.')

In [None]:
## Variables for the signal/proxy comparison plots.
## In the plotting loop, `vname` is read from the proxy sample ('subst_qcd-mg-noht'), while the last entry is the
## name of the corresponding variable read from the signal sample ('vhcc-2L').
bininfo = [ #(vname, nbin, xmin, xmax, label, *vname in the signal sample*)
    ('fj_x_ParticleNetMD_XccVsQCD', 20, 0, 1, 'ParticleNetMD_XccVsQCD (AK15)', 'ak15_ParticleNetMD_HccVsQCD'),
    ('fj_x_sdmass', 15, 50, 200, r'$m_{SD}$ (AK15)', 'ak15_sdmass'),
    ('fj_x_tau21', 20, 0, 1, r'$\tau_{21}$ (AK15)', 'ak15_tau21'), ##available
    
    ('fj_x_deltaR_sj12', 40, 0, 1.5, r'$\Delta R_{sj_{1},sj_{2}}$ (AK15)', 'ak15_deltaR_sj12'),
    ('fj_x_pt', 40, 0, 1000, r'$p_{T}$ (AK15)', 'ak15_pt'),
    ('fj_x_sj1_pt', 40, 0, 1000, r'$p_{T,sj_{1}}$ (AK15)', 'ak15_sj1_pt'),
    ('fj_x_sj1_rawmass', 40, 0, 200, r'$m_{sj_{1},raw}$ (AK15)', 'ak15_sj1_rawmass'), ##available
    ('fj_x_sj2_pt', 40, 0, 1000, r'$p_{T,sj_{2}}$ (AK15)', 'ak15_sj2_pt'),
    ('fj_x_sj2_rawmass', 40, 0, 200, r'$m_{sj_{2},raw}$ (AK15)', 'ak15_sj2_rawmass'), ##available
    
    ('fj_x_nsv', 10, 0, 10, r'$N_{SV}$ (AK15)', 'ak15_nlooseSV'), ##available
    ('fj_x_nsv_ptgt25', 8, 0, 8, r'$N_{SV,p_{T}\geq 25}$ (AK15)', 'ak15_nlooseSV_ptgt25'), ##available
    ('fj_x_nsv_ptgt50', 8, 0, 8, r'$N_{SV,p_{T}\geq 50}$ (AK15)', 'ak15_nlooseSV_ptgt50'), ##available
    ('fj_x_ntracks', 20, 0, 20, r'$N_{tracks}$ (AK15)', 'ak15_nlooseSV_ntracks'), ##available
    ('fj_x_ntracks_sv12', 20, 0, 20, r'$N_{tracks\;for\;SV_{1,2}}$ (AK15)', 'ak15_nlooseSV_ntracks_sv12'), ##available
    ('fj_x_sj1_nsv', 20, 0, 20, r'$N_{SV\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV'), ##available
    ('fj_x_sj1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;sj_{1}}$ (AK15)', 'ak15_sj1_nlooseSV_ntracks'), ##available
    ('fj_x_sj1_sv1_pt', 20, 0, 200, r'$p_{T,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pt'),
    ('fj_x_sj1_sv1_mass', 20, 0, 50, r'$m_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_mass'), ##available
    ('fj_x_sj1_sv1_masscor', 20, 0, 50, r'$m_{cor\;for\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_masscor'),
    ('fj_x_sj1_sv1_ntracks', 20, 0, 20, r'$N_{tracks\;from\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_ntracks'),
    ('fj_x_sj1_sv1_dxy', 20, 0, 5, r'$d_{xy,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxy'),
    ('fj_x_sj1_sv1_dxysig', 20, 0, 20, r'$\sigma_{d_{xy},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dxysig'),
    ('fj_x_sj1_sv1_dlen', 20, 0, 5, r'$d_{z,\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlen'),
    ('fj_x_sj1_sv1_dlensig', 20, 0, 20, r'$\sigma_{d_{z},\;SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_dlensig'),
    ('fj_x_sj1_sv1_chi2ndof', 20, 0, 5, r'$\chi^2 / Ndof_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_chi2ndof'),
    ('fj_x_sj1_sv1_pangle', 40, 0, 5, r'$pAngle_{SV_{1}\;in\;sj_{1}}$ (AK15)', 'ak15_sj1_looseSV_pangle'),
]

In [None]:
g_dirname = 'test_sigpxy' ## config me
os.makedirs(f'plots/{g_dirname}_{year}', exist_ok=True) ## no error if the directory already exists

## Make comparison plots for normal weight (MC adopt the same weight as in the fit), or for additional mass / pT weight
## wgtfac: extra factor multiplied to the proxy weight string; pfwgt: prefix of the output file name
for wgtfac, pfwgt in zip(['1','massdatamcwgt','ptdatamcwgt'], ['nom', 'massdatamcwgt', 'ptdatamcwgt']):

    wgtstr = f'genWeight*xsecWeight*puWeight*htwgt*sfbdtwgt_g90_incl*{wgtfac}'
    wgtstr_vhcc_2L = 'genWeight*xsecWeight*puWeight'

    mpl.rcParams['axes.prop_cycle'] = cycler(color=['blue', 'red', 'green', 'violet', 'darkorange', 'black', 'cyan', 'yellow'])
    ## NOTE: `rwgt_ext_label` is only looked up when do_rwgt is true; it is not defined in this cell
    do_rwgt = 0
    for ptmin, ptmax in [(200, 250), (250, 300), (300, 350), (350, 400), (400, 500), (500, 100000), (200, 100000)]:
        ## pT selection for the proxy sample (presel) and for the signal sample (presel1)
        presel, presel1 = f'fj_x_pt>{ptmin} & fj_x_pt<{ptmax}', f'ak15_pt>{ptmin} & ak15_pt<{ptmax}'
        label = {'subst_qcd-mg-noht': r'g(cc) (subst.)', 'vhcc-2L':r'$Z(\ell\ell)H(cc)$'}

        for vname, nbin, xmin, xmax, vlabel, vname1 in bininfo:
            f, ax = plt.subplots(figsize=(12,12))
            hep.cms.label(data=False, paper=False, year=year, ax=ax, rlabel=r'%s $fb^{-1}$ (13 TeV)'%lumi[year], fontname='sans-serif')
            edges = np.linspace(xmin, xmax, nbin+1) ## same binning for the signal and the proxy

            ## Signal: read the variable under its name in the signal tree (vname1)
            for sam in ['vhcc-2L']:
                dftmp = df_comp[sam] if presel1=='' else df_comp[sam].query(presel1)
                h = get_hist(dftmp[vname1].values, bins=edges, weights=dftmp.eval(wgtstr_vhcc_2L).values)
                ## raw string: '\g' is not a valid escape sequence in a normal string literal
                plot_hist(h, label=label[sam]+r' $N_{SV}^{match}\geq 1$' if sam=='qcd-mg' else label[sam], normed=True)

            ## Proxy: one curve per set of baseline cuts
            for sam in ['subst_qcd-mg-noht']:
                for ext in ['sv+sfbdt500', 'sv+sfbdt850', 'sv+sfbdt900', 'sv+sfbdt950']:
                    cutstr = ' & '.join(list(filter(None, [presel]+[func_basesel(cname)[0] for cname in ext.split('+')]))) ## join the cut string
                    if 'qcd-mg' in sam:  print (cutstr)
                    dftmp = df_comp[sam].query(cutstr)
                    h = get_hist(dftmp[vname].values, bins=edges, weights=dftmp.eval(wgtstr).values)
                    plot_hist(h, label=label[sam]+' '+(rwgt_ext_label if do_rwgt else '')+' & '.join([func_basesel(cname)[1] for cname in ext.split('+')]), normed=True)

            ax.legend()
            ax.set_xlim(xmin, xmax)
            ax.set_xlabel(vlabel, ha='right', x=1.0); ax.set_ylabel('A.U.', ha='right', y=1.0); 
            plt.savefig(f'plots/{g_dirname}_{year}/{pfwgt}_{presel}__{vname}.png')
            plt.savefig(f'plots/{g_dirname}_{year}/{pfwgt}_{presel}__{vname}.pdf')