# Make templates for fit (`pandas` workflow)

This notebook makes the ROOT-format templates for the fit. It reads the YAML config file and the backed-up files produced by the previous notebook `preprocess.ipynb` to make the templates needed for the fit.

We use `pandas` DataFrames for event processing. An alternative notebook, `make_template_ak.ipynb`, is based on the `ak-array` data structure. Using `pandas` is generally faster for producing the many similar templates we need for each shape systematic, because it shrinks the original dataset to a rather small DataFrame by applying all pre-selections and then makes the various templates from that DataFrame. However, reading the original large dataset into `pandas` sometimes consumes a large amount of RAM (10-30 GB).

In [None]:
from data_utils import get_hist, plot_hist

In [None]:
import uproot3
from uproot3_methods import TLorentzVectorArray, TLorentzVector
import pandas as pd
import numpy as np
import os

In [None]:
## Read the analysis configuration (config.yml) into the `config` dict
import yaml

_config_path = 'cards/config_bb_std.yml'
with open(_config_path) as config_file:
    config = yaml.safe_load(config_file)

## Load files

Load the ROOT files into pandas DataFrame

In [None]:
year = config['year']

## Per-year luminosity value; lumi[year] enters the MC weight strings as a plain multiplicative factor
lumi = {2016: 35.92, 2017: 41.53, 2018: 59.74}

## Sample name -> ROOT file (relative to the sample directory) read for making the templates
read_sample_list_map = {
    'qcd-mg-noht': 'mc/qcd-mg_tree.root',
    'qcd-herwig-noht': 'mc/qcd-herwig_tree.root',
    'top-noht': 'mc/top_tree.root',
    'v-qq-noht': 'mc/v-qq_tree.root',
    'jetht-noht': 'data/jetht_tree.root',
}

## Optional sub-section of the sample config; treated as empty when it is absent
_optional_cfg = config['samples'].get('optional') or {}

if config['samples']['use_bflav']:
    ## The value must be a plain string: a trailing comma here would store a 1-tuple, which breaks
    ## both the file path built from it and the `'mc/' in ...` checks applied to these values
    read_sample_list_map['qcd-mg-bflav-noht'] = 'mc/qcd-mg-bflav_tree.root'

## Drop the Herwig QCD sample when requested in the config
omit_herwig = _optional_cfg.get('omit_herwig', False)
if omit_herwig:
    read_sample_list_map.pop('qcd-herwig-noht', None)

## Drop any further samples explicitly excluded in the config (unknown names are ignored)
for ex_sam in _optional_cfg.get('exclude_mc_sample_in_making_template') or []:
    read_sample_list_map.pop(ex_sam, None)
print('Read samples for making templates:', read_sample_list_map.keys())

## Minimal set of branches read into the notebook: event-level branches first ...
minimal_branches = {
    "run", "luminosityBlock", "event", "genWeight", "jetR", "passmetfilters",
    "passHTTrig", "ht", "puWeight", "puWeightUp", "puWeightDown", "xsecWeight",
}
## ... then the per-jet branches, identical for fj_1 and fj_2
minimal_branches |= {
    f"fj_{ifj}_{suffix}"
    for ifj in (1, 2)
    for suffix in (
        "pt", "eta", "sdmass", "tau21", "btagcsvv2", "btagjp", "sfBDT", "is_qualified",
        "nbhadrons", "nchadrons", "sj1_nbhadrons", "sj1_nchadrons", "sj2_nbhadrons", "sj2_nchadrons",
    )
}
## ... the sv1 branches of both subjets of both jets
minimal_branches |= {
    f"fj_{ifj}_sj{isj}_sv1_{svvar}"
    for ifj in (1, 2)
    for isj in (1, 2)
    for svvar in ("ntracks", "dxy", "dlensig")
}
## ... and the tagger variable from the config, resolved for fj_1 and fj_2
minimal_branches |= {config['tagger']['var'].replace('fj_x', f'fj_{ifj}') for ifj in (1, 2)}

## Extra branches for test only! Uncomment to read them as well
# minimal_branches |= {
#     "fj_1_sj1_matchallmu", "fj_1_sj2_matchallmu", "fj_2_sj1_matchallmu", "fj_2_sj2_matchallmu",
#     'fj_1_btagHbb', 'fj_1_btagDeepB', 'fj_1_btagDDBvLV2', 'fj_1_ParticleNetMD_XbbVsQCD', 'fj_1_btagDDCvLV2', 'fj_1_ParticleNetMD_XccVsQCD', 'fj_1_btagDDCvBV2', 'fj_1_ParticleNetMD_Xcc', 'fj_1_ParticleNetMD_Xbb',
#     'fj_2_btagHbb', 'fj_2_btagDeepB', 'fj_2_btagDDBvLV2', 'fj_2_ParticleNetMD_XbbVsQCD', 'fj_2_btagDDCvLV2', 'fj_2_ParticleNetMD_XccVsQCD', 'fj_2_btagDDCvBV2', 'fj_2_ParticleNetMD_Xcc', 'fj_2_ParticleNetMD_Xbb',
# }

## Extra HLT branches, depending on the year
ext_hlt_branches = {
    2016: [f'HLT_PFHT{thr}' for thr in (125, 200, 250, 300, 350, 400, 475, 600, 650, 800, 900)],
    2017: [f'HLT_PFHT{thr}' for thr in (180, 250, 370, 430, 510, 590, 680, 780, 890, 1050, 350)],
    2018: [f'HLT_PFHT{thr}' for thr in (180, 250, 370, 430, 510, 590, 680, 780, 890, 1050, 350)],
}
minimal_branches.update(ext_hlt_branches[year])

## Extra PSWeight branches: always for 2018, otherwise only when 'use_own_psweight' is set in the config
if year == 2018 or ('optional' in config['samples'] and config['samples']['optional'].get('use_own_psweight', None)):
    minimal_branches.update(['nPSWeight', 'PSWeight'])

## Branch set used for the data sample: the set above minus the branches listed here
minimal_branches_for_data = minimal_branches - (
    {'genWeight', "puWeight", "puWeightUp", "puWeightDown", "xsecWeight", 'nPSWeight', 'PSWeight'}
    | {f"fj_{ifj}_dr_{boson}" for ifj in (1, 2) for boson in ("H", "Z")}
    | {
        f"fj_{ifj}_{subjet}n{flv}hadrons"
        for ifj in (1, 2)
        for subjet in ("", "sj1_", "sj2_")
        for flv in ("b", "c")
    }
)

## Read the 'Events' tree of every sample into a pandas DataFrame.
## Files under 'mc/' use the full branch set, everything else uses the reduced set for data.
sample_prefix = f"{config['samples']['sample_prefix']}_{year}"
_df0 = {}
for sam, rel_path in read_sample_list_map.items():
    branches_to_read = minimal_branches if 'mc/' in rel_path else minimal_branches_for_data
    _df0[sam] = uproot3.open(f"{sample_prefix}/{rel_path}")['Events'].pandas.df(branches_to_read, flatten=False)

## Load backup pickles

In [None]:
## Attach the extra variables stored during pre-processing to the DataFrames
backup_name = f"{config['samples']['name']}_SF{config['year']}"

import pickle
import awkward1 as ak

## NOTE: pickle.load can execute arbitrary code from the file -- only load backups you produced yourself
for sam in os.listdir(f'prep/{backup_name}'):
    if sam not in read_sample_list_map:
        ## Not a sample directory: a plain, non-hidden file is loaded as one object under its own name
        if not sam.startswith('.') and os.path.isfile(f'prep/{backup_name}/{sam}'):
            with open(f'prep/{backup_name}/{sam}', 'rb') as f:
                _df0[sam] = pickle.load(f)
            print('loading...', sam)
        continue

    ## Sample directory: every non-hidden file in it becomes one or more new columns of that sample
    for var in os.listdir(f'prep/{backup_name}/{sam}'):
        if var.startswith('.'):
            continue
        if var == 'maskdict':
            ## A dict of masks: each entry is stored as its own 'mask_<key>' column
            with open(f'prep/{backup_name}/{sam}/maskdict', 'rb') as f:
                maskdict = pickle.load(f)
            for key, mask in maskdict.items():
                _df0[sam]['mask_'+key] = ak.fill_none(mask, 0)
            print('storing...', sam, 'maskdict', maskdict.keys())
            continue
        ## Any other file: one column named after the file, None entries replaced by 0
        with open(f'prep/{backup_name}/{sam}/{var}', 'rb') as f:
            _df0[sam][var] = ak.fill_none(pickle.load(f), 0)
        print('loading...', sam, var)

In [None]:
## Combine branches fj_1/2 to fj_x in pandas dataframe
df1 = {}
updated_key_list = list(_df0.keys())
for sam in updated_key_list:
    if sam not in read_sample_list_map:
        ## Entries that are not samples (loaded as-is from the backup directory) are passed through
        df1[sam] = _df0[sam]
        continue

    events = _df0[sam]
    ## Unified fj_x names of ALL fj_2_* branches (this includes fj_2_is_qualified)
    fj_branches = [key.replace('fj_2', 'fj_x') for key in events.keys() if key.startswith('fj_2')]
    ## Mask columns are only needed for the selection below. Match the 'mask_' prefix explicitly:
    ## the regex 'mask_*' used before matched any column merely containing "mask"
    mask_columns = [key for key in events.keys() if key.startswith('mask_')]

    ## Build one frame per jet index and stack them, i.e. keep events where fj_1 OR fj_2 is qualified.
    ## Each frame is built as a chain of non-inplace operations, so the frame returned by query() is never
    ## mutated in place (the pattern pandas flags with SettingWithCopyWarning).
    per_jet_frames = []
    for i, i_inv in zip(['1','2'], ['2','1']):
        other_jet_columns = [key.replace('fj_x', f'fj_{i_inv}') for key in fj_branches]
        per_jet_frames.append(
            events
            .query(f'mask_fj_{i}_base')  ## select events where fj_1/fj_2 is qualified
            .drop(columns=list(set(other_jet_columns) | set(mask_columns)))  ## drop fj branches of the other index
            .rename(columns={key.replace('fj_x', f'fj_{i}'): key for key in fj_branches})  ## fj_1/fj_2 -> fj_x
            .assign(fj_idx=int(i), is_qcd='qcd' in sam)  ## label the jet index and the QCD samples
        )
    df1[sam] = pd.concat(per_jet_frames)
    if 'mc/' in read_sample_list_map[sam]:
        ## MC samples are also reachable under the 'subst_' name (same object, no copy)
        df1['subst_'+sam] = df1[sam]
#     del _df0[sam]  # to release memory usage if necessary

## Make ROOT templates

We produce the ROOT templates from the DataFrames in this step. The outputs are ROOT files with a neat structure. After further reorganization, they can be used as the Higgs Combine input to run the fit.

As a reference, we provide an example of the output files and their structure. 
E.g., for a **given fit variable**, **given tagger WP** and a **certain jet-pT bin** for **a single fit**, the output ROOT templates should include the pass and fail MC template in the B/C/L flavors, the data template, and the MC systematics for all specified shape uncertainties. The files are organized in the following structure:
```
─── 20210315_SF2018_AK15_qcd_ak_pnV02_HP_msv12_dxysig_log_var22binsv2  [use variable: msv12_dxysig_log, Tight WP]
    └── Cards
        └── pt250to350   [given pT bin]
            ├── bdt719   [the sfBDT cut points]
            │   ├── nominal                    [the nominal histograms]
            │   │   ├── inputs_fail.root           [include four TH1D: flvC, flvB, flvL, data_obs]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBDown                 [shape uncertainty plots]
            │   │   ├── inputs_fail.root           [include three TH1D: flvC_fracBBDown, flvB_fracBBDown, flvL_fracBBDown]
            │   │   └── inputs_pass.root           [..]
            │   ├── fracBBUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracCCUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fracLightUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fitVarRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── fitVarRwgtUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightFsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── psWeightIsrUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── puUp
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   ├── sfBDTRwgtDown
            │   │   ├── inputs_fail.root
            │   │   └── inputs_pass.root
            │   └── sfBDTRwgtUp
            │       ├── inputs_fail.root
            │       └── inputs_pass.root
            └── bdt752
                ├── nominal
                │   ├── ...
```

The template making is organized in four nested functions (depth 0 to 3): `launch_maker` → `wrapperPt` → `makeTemplatesWrapper` → `makeTemplates`.

In [None]:
#### =========================================================================== Global parameters =========================================================================== ####

## Template-making mode. Options:
##     main           : the main fit
##     val_pt         : the validation fit -- to use an optional MC substitute-to-data strategy, i.e. on pT variable only
##     val_tosig_mass : the validation fit -- additionally reweight MC & data to h->cc signal jet on mass
##     val_tosig_pt   : the validation fit -- additionally reweight MC & data to h->cc signal jet on pt
##     val_tosig_tau21: the validation fit -- additionally reweight MC & data to h->cc signal jet on tau21
##     val_crop_bin   : the validation fit -- cropping the marginal bins for fit
## (check_consistency() additionally accepts 'val_vary_sfbdt')
g_make_template_mode = 'main'

## Prefix for the output dir name
g_outdir_prefix = f"{config['routine_name']}_SF{config['year']}"

## The uncertainty types used in the fit. Use False or remove the key to disable a certain uncertainty type.
## Note: "qcdSyst" and "qcdKdeSyst" are not used in this version. "psWeightIsr" and "psWeightFsr" work fine in 2018,
##       while in 2016/17 one needs to first guarantee that the 2018 histograms exist so the uncertainty can be transferred.
g_make_unce_types = {'nominal':True, 'pu':True, 'fracBB':True, 'fracCC':True, 'fracLight':True, 'psWeightIsr':True, 'psWeightFsr':True, 'sfBDTRwgt':True, 'fitVarRwgt':True}

## Do the fit for which variable(s); the numbers are keys of g_fitinfo
g_do_fit_for_var = [1, 2, 3]

## Mode of BDT list for the run. Set 'all' for all 11 BDT values, or 'central' for the central BDT value only
g_mode_bdt_runlist = 'all'

## pT ranges that define the separate fit points
g_pt_range = config['pt_range']['range']

## Tagger info: the working-point ranges and the tagger variable (written with the 'fj_x' placeholder)
g_tagger_range = config['tagger']['working_points']['range']
g_tagger_var = config['tagger']['var']

## Use additional B flavor MC samples to improve the statistics for the 'b' category
g_use_bflav = config['samples']['use_bflav']

## If not None, the sfBDT selection expr is set to sfBDT + 0.5*exp(g_bdt_mod_factor*(tagger-1)) in the template extraction
g_bdt_mod_factor = None

## Set None for the normal run. If set to 2016 or 2017, produce the 2018 templates for the psWeightIsr/Fsr uncertainty that
## can be migrated to 2016/2017 conditions; the sfBDT cut values are then taken from the 2016/2017 condition.
g_mode_psWeight_run_templ = None

## Launch a test process only, without writing the ROOT template files
g_dryrun = False

#### ===================================================================================================================================================================================== ####

## Bin edges shared by the mSV12 fit variables (the cropped fit 901 uses exactly these)
_msv12_core_edges = [0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8]

def _make_outdir_func(var_tag):
    r"""Return the output-dir lambda of one fit variable; ``var_tag`` is the variable part of the dir name."""
    return lambda prefix, wp, bdt, pt_range, sys_name: \
        f'results/{prefix}_{wp}_{var_tag}/Cards/pt{pt_range[0]}to{pt_range[1]}/bdt{int(bdt*1000)}/{sys_name}/'

## Fit info: in the format of [ (fit var, nbins/edges, xmin/None, xmax/None, (underflow, overflow), label), outputdir lambda func ]
g_fitinfo = {
    ## main fit var
    1: [
        ('fj_x_mSV12_dxysig_log', [-0.8,-0.4] + _msv12_core_edges + [2.5,3.2], None, None, (True, True), 'mSV12_dxysig_log'),
        _make_outdir_func('msv12_dxysig_log_var22binsv2'),
    ],
    ## the other var for validation
    2: [
        ('fj_x_mSV12_ptmax_log', [-0.4] + _msv12_core_edges + [2.5,3.2,3.9], None, None, (True, True), 'mSV12_ptmax_log'),
        _make_outdir_func('msv12_ptmax_log_var22binsv2'),
    ],
    ## the other var for validation
    3: [
        ('fj_x_btagcsvv2', [0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,0.98,0.99,0.995,1], None, None, (True, True), 'CSVv2'),
        _make_outdir_func('csvv2_var22binsv2'),
    ],
    ## crop the marginal bins for the main var as a validation
    901: [
        ('fj_x_mSV12_dxysig_log', list(_msv12_core_edges), None, None, (False, False), 'mSV12_dxysig_log'),
        _make_outdir_func('msv12_dxysig_log_var22binscrop'),
    ],
}

## Necessary KDE parameters used in qcdKdeSyst unce
g_custom_kde_bw = {'fj_x_btagcsvv2':15, 'mSV12_ptmax_log':4, 'mSV12_dxysig_log':4}
g_custom_kde_binmask = {'fj_x_btagcsvv2':[0], 'mSV12_ptmax_log':[-0.4,1.8,2.5,3.2], 'mSV12_dxysig_log':[-0.8,-0.4,1.8,2.5]}

## Some other global vars, filled in while the templates are being made
g_do_sfBDT_points = None
g_outdir_prefix_used = None
g_hist_qcdsyst = {}
g_wgtstr_dm_sys_fac = {}
g_hist_fitvar_rwgt = {}

def check_consistency(): ## Consistency check for global params
    r"""Validate the global parameters and derive the globals the template makers depend on.

    Side effects (all on module-level globals):
      * ``g_do_fit_for_var`` and ``g_mode_bdt_runlist`` are overridden, with a printed warning, when they
        conflict with a validation mode;
      * ``g_do_sfBDT_points`` is filled from ``df1`` for the 'all' / 'central' run lists; in 'manual' mode it
        is left as set by the user;
      * ``g_outdir_prefix_used`` is rebuilt from ``g_outdir_prefix``, the working-point name and the mode;
      * when ``g_mode_psWeight_run_templ`` is set, the sfBDT points are read from the backup of that year
        (unless the run list is 'manual') and ``g_make_unce_types`` is restricted to the nominal and
        psWeight templates.

    Raises:
        AssertionError: unknown template mode, or invalid use of ``g_mode_psWeight_run_templ``.
        RuntimeError: unknown ``g_mode_bdt_runlist``.
    """
    ## Every global assigned below must be declared here, otherwise the assignment only creates a local
    ## (this was the case for g_make_unce_types, so the psWeight-only restriction was silently lost)
    global g_do_fit_for_var, g_mode_bdt_runlist, g_do_sfBDT_points, g_outdir_prefix_used, g_make_unce_types

    assert g_make_template_mode in ['main', 'val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_vary_sfbdt', 'val_crop_bin'], \
        'Specified mode cannot be recognized.'

    ## Validation fits are restricted to a single fit variable ...
    if g_make_template_mode in ['val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21'] and g_do_fit_for_var != [1]:
        print('Warning: for validation fit, set the fit information to the main variable (1) only')
        g_do_fit_for_var = [1]
    if g_make_template_mode == 'val_crop_bin' and g_do_fit_for_var != [901]:
        print('Warning: for validation fit on cropping the marginal bins, set the fit information to the cropped main variable (901) only')
        g_do_fit_for_var = [901]

    ## ... and to the central BDT point
    if g_make_template_mode.startswith('val_') and g_mode_bdt_runlist != 'central':
        print('Warning: for validation fit, set the BDT run list to central')
        g_mode_bdt_runlist = 'central'

    ## sfBDT cut points per pT range, taken from the backup loaded into df1
    if g_mode_bdt_runlist in ('all', 'central'):
        _points = df1[f"bdt_seq_{config['pt_range']['name']}__{config['main_analysis_tree']['name']}"]
        if g_mode_bdt_runlist == 'all':
            g_do_sfBDT_points = _points
        else:
            ## keep only the middle element of each list
            g_do_sfBDT_points = {k:[_points[k][int((len(_points[k])-1)/2)]] for k in _points}
    elif g_mode_bdt_runlist != 'manual':
        raise RuntimeError('Specified mode for BDT runlist cannot be recognized.')

    ## Output dir prefix actually used in this run
    g_outdir_prefix_used = g_outdir_prefix + '_' + config['tagger']['working_points']['name']
    if g_make_template_mode.startswith('val_'):
        g_outdir_prefix_used += '_-' + g_make_template_mode + '-'
    if g_bdt_mod_factor is not None:
        g_outdir_prefix_used = 'bdtmod/' + g_outdir_prefix_used

    if g_mode_psWeight_run_templ is not None:
        assert year==2018, 'g_mode_psWeight_run_templ can only be set when running on the 2018 samples (year==2018)'
        assert int(g_mode_psWeight_run_templ) in [2016, 2017], 'g_mode_psWeight_run_templ can only be 2016 or 2017'
        import pickle
        if g_mode_bdt_runlist != 'manual':
            ## Use the sfBDT points stored for the target year.
            ## NOTE: pickle.load can execute arbitrary code; only read backups produced by the pre-processing step
            with open(f"prep/{config['samples']['name']}_SF{g_mode_psWeight_run_templ}/bdt_seq_{config['pt_range']['name']}__{config['main_analysis_tree']['name']}", 'rb') as f:
                g_do_sfBDT_points = pickle.load(f)
        g_outdir_prefix_used += f"_psWeight{g_mode_psWeight_run_templ}"
        ## Only the nominal and psWeight templates are produced in this mode
        g_make_unce_types = {'nominal':True, 'psWeightIsr':True, 'psWeightFsr':True}

def launch_maker():
    r"""Depth 0: Main function to launch the fit given the global parameters

    Runs ``check_consistency()`` first. Then, for every requested fit variable and every tagger working
    point, it assembles the argument dict consumed by the deeper levels and passes it, together with
    ``df1``, the fit info and an output-dir function, to ``wrapperPt``.
    """
    check_consistency()

    print('Launch variablel list:', g_do_fit_for_var)

    ## Extra reweighting branch of each 'val_tosig_*' mode; it multiplies the MC weight and is also the data weight
    tosig_wgt_branch = {
        'val_tosig_mass': 'fj_x_massdatamcwgt',
        'val_tosig_pt': 'fj_x_ptdatamcwgt',
        'val_tosig_tau21': 'fj_x_tau21datamcwgt',
    }

    def _sample_list(skipped):
        ## all read samples except the skipped ones; every sample but 'jetht-noht' gets the 'subst_' prefix
        return [s if s == 'jetht-noht' else 'subst_'+s for s in read_sample_list_map if s not in skipped]

    for _ifit in g_do_fit_for_var:
        for _wp in g_tagger_range:

            ## Get fit info and output lambda func
            fitinfo, outdir_func = g_fitinfo[_ifit]

            ## MC / data weight strings: the main-fit default, modified by the validation modes
            wgt_common = f'genWeight*xsecWeight*puWeight*{lumi[year]}'
            if g_make_template_mode == 'val_pt':
                wgtstr_dm, wgtstr_dm_data = wgt_common + '*fj_x_ad_ptwgt', None
            elif g_make_template_mode in tosig_wgt_branch:
                extra_wgt = tosig_wgt_branch[g_make_template_mode]
                wgtstr_dm, wgtstr_dm_data = f'{wgt_common}*fj_x_htwgt*{extra_wgt}', extra_wgt
            else:
                wgtstr_dm, wgtstr_dm_data = wgt_common + '*fj_x_htwgt', None

            ## Tagger window of this working point: inside -> 'pass', outside -> 'fail'
            wp_lo, wp_hi = g_tagger_range[_wp][0], g_tagger_range[_wp][1]

            args = {
                'wgtstr_dm': wgtstr_dm,
                'wgtstr_dm_data': wgtstr_dm_data,
                'sl_dm': _sample_list(['qcd-herwig-noht', 'qcd-mg-bflav-noht']), # default is ['subst_qcd-mg-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
                'sl_dm_herwig': _sample_list(['qcd-mg-noht', 'qcd-mg-bflav-noht']), # default is ['subst_qcd-herwig-noht', 'subst_top-noht', 'subst_v-qq-noht', 'jetht-noht']
                'config_dm': {
                    'data':  '',
                    'flvB':  'fj_x_nbhadrons>=1',
                    'flvC':  'fj_x_nbhadrons==0 & fj_x_nchadrons>=1',
                    'flvL':  'fj_x_nbhadrons==0 & fj_x_nchadrons==0',
                },
                'categories_dm': ['flvL', 'flvB', 'flvC', 'data'],
                'catMap': {
                    'pass': f'{g_tagger_var}>{wp_lo:.3f} & {g_tagger_var}<={wp_hi:.3f}',
                    'fail': f'{g_tagger_var}<={wp_lo:.3f} | {g_tagger_var}>{wp_hi:.3f}',
                },
                'use_bflav': g_use_bflav,
                'args_bflav': {
                    'sl_dm_bflav': ['subst_qcd-mg-bflav-noht'],
                    'sl_dm_bflav_orig': ['subst_qcd-mg-noht'],
                    'wgtstropt_bflav': lambda s: s.replace('fj_x_htwgt', '(fj_x_htwgt*fj_x_bflav_htwgt)'),
                },
            }

            def _outdir_for(bdt, pt_range, sys_name):
                ## bind the prefix and the working point, leave the rest to the deeper levels
                return outdir_func(g_outdir_prefix_used, _wp, bdt, pt_range, sys_name)

            wrapperPt(df1, fitinfo, _outdir_for, args)

In [None]:
def wrapperPt(df2, fitinfo, outdir_func, args):
    r"""Depth 1: Process the pT cut and wrap all other following steps

    For every pT range in ``g_pt_range`` the samples are reduced to that range (plus the optional
    additional selection from the config). Then, for every sfBDT cut value of the range, the sfBDT cut is
    applied and ``makeTemplatesWrapper`` is called with the reduced frames.

    Arguments:
        df2: dict of DataFrames, keyed by the 'subst_'-prefixed MC sample names and 'jetht-noht'
        fitinfo: fit-variable info, passed through unchanged
        outdir_func: callable (bdt, pt_range, sys_name) -> output dir
        args: argument dict, passed through unchanged
    """
    print('Launch pT range:', g_pt_range)

    ## sfBDT expression used in the cut, optionally shifted by a tagger-dependent term
    if g_bdt_mod_factor is None:
        bdt_expr = 'fj_x_sfBDT'
    else:
        bdt_expr = f'fj_x_sfBDT + 0.5*exp({g_bdt_mod_factor}*({g_tagger_var}-1))'

    for pt_range in map(tuple, g_pt_range):
        print ('pt range:', pt_range)

        ## Keys of the frames to process: every sample but 'jetht-noht' carries the 'subst_' prefix
        sample_keys = [s if s == 'jetht-noht' else 'subst_'+s for s in read_sample_list_map]

        ## df2->df2a: apply the pT cut (to speed up) (plus additional selection, if applied)
        pt_cut = f'fj_x_pt>={pt_range[0]} & fj_x_pt<{pt_range[1]}'
        extra_cut = None
        if 'optional' in config['samples'] and 'additional_selection_in_making_template' in config['samples']['optional']:
            ## the configured expression is stripped of 'np.' / 'ak.' prefixes before being used in query()
            extra_cut = config['samples']['optional']['additional_selection_in_making_template'].replace('np.','').replace('ak.','')
        df2a = {}
        for sam in sample_keys:
            in_pt_range = df2[sam].query(pt_cut)
            df2a[sam] = in_pt_range if extra_cut is None else in_pt_range.query(extra_cut)

        ## sfBDT cut values of this pT range; a dict contributes its values
        sfBDT_list = g_do_sfBDT_points[pt_range]
        if isinstance(sfBDT_list, dict):
            sfBDT_list = sfBDT_list.values()

        for sfBDT_val in sfBDT_list:
            print(' sfBDT cut at:', sfBDT_val)

            ## df2a->df3: apply the corresponding bdt cut
            bdt_cut = f'{bdt_expr}>{sfBDT_val}'
            df3 = {sam: df2a[sam].query(bdt_cut) for sam in sample_keys}

            makeTemplatesWrapper(df3, fitinfo, lambda sys_name: outdir_func(sfBDT_val, pt_range, sys_name), sfBDT_val, args)

In [None]:
def makeTemplatesWrapper(df3, fitinfo, outdir_func, sfBDT_val, args):
    r"""Depth 2: Specify which template (nominal or any shape uncertainty) to make in this step

    Resets the per-cut-point global caches, then calls ``makeTemplates`` once for the nominal template and
    once for the Up and the Down variation of every uncertainty type enabled in ``g_make_unce_types``.

    Arguments:
        df3: dict of DataFrames, passed through to ``makeTemplates``
        fitinfo: fit-variable info, passed through
        outdir_func: callable sys_name -> output dir
        sfBDT_val: the sfBDT cut value of this step (not used inside this function)
        args: argument dict; ``args['wgtstr_dm']`` is the nominal MC weight string
    """
    global g_wgtstr_dm_sys_fac, g_hist_qcdsyst, g_hist_fitvar_rwgt
    g_wgtstr_dm_sys_fac, g_hist_qcdsyst, g_hist_fitvar_rwgt = {}, {}, {} ## clear

    wgtstr_dm = args['wgtstr_dm']

    def _enabled(unce_type):
        ## an uncertainty type is made only if its key exists and its value is truthy
        return unce_type in g_make_unce_types.keys() and g_make_unce_types[unce_type]

    def _make(sys_name, wgtstr_dm_sys, **extra):
        makeTemplates(df3, fitinfo, outdir_func(sys_name), sys_name, wgtstr_dm_sys, args, **extra)

    if _enabled('nominal'):
        _make('nominal', wgtstr_dm)

    ## Below we extract hists for all unce type
    ## -- pile-up: swap the pile-up weight for its Up/Down version
    if _enabled('pu'):
        for direction in ('Up', 'Down'):
            _make('pu' + direction, wgtstr_dm.replace('puWeight', 'puWeight' + direction))

    ## -- flavour fractions: scale the selected jets by 1.2 (Up) / 0.8 (Down), all others by 1.0
    flavour_fractions = (
        ('fracBB', 'fj_x_nbhadrons>1', 'fj_x_nbhadrons<=1'),
        ('fracCC', 'fj_x_nbhadrons==0 & fj_x_nchadrons>1', 'not(fj_x_nbhadrons==0 & fj_x_nchadrons>1)'),
        ('fracLight', 'fj_x_nbhadrons==0 & fj_x_nchadrons==0', 'not(fj_x_nbhadrons==0 & fj_x_nchadrons==0)'),
    )
    for unce_type, selected, others in flavour_fractions:
        if _enabled(unce_type):
            for direction, scale in (('Up', '1.2'), ('Down', '0.8')):
                _make(unce_type + direction, wgtstr_dm + f'*({scale}*({selected}) + 1.0*({others}))')

    ## Below unce is not as easily extracted as above by specifying a different weight string. They may need *special treatment* implemented in the depth-3 function
    for unce_type in ('qcdSyst', 'qcdKdeSyst', 'psWeightIsr', 'psWeightFsr'):
        if _enabled(unce_type):
            for direction in ('Up', 'Down'):
                _make(unce_type + direction, wgtstr_dm)

    ## -- sfBDT reweighting: factors decided by special_wgtstr argument
    if _enabled('sfBDTRwgt'):
        for direction in ('Up', 'Down'):
            _make('sfBDTRwgt' + direction, wgtstr_dm, special_wgtstr='fj_x_sfbdtwgt_g50')

    if _enabled('fitVarRwgt'):
        for direction in ('Up', 'Down'):
            _make('fitVarRwgt' + direction, wgtstr_dm)
    

In [None]:
def makeTemplates(df3, fitinfo, outputdir, sys_name, wgtstr_dm_sys, args, special_wgtstr=None):
    r"""Depth 3: the base implementation that applies the final pass/fail cut and makes the templates.

    For each of the ``pass`` and ``fail`` regions, one histogram of the fit variable is built per
    category of ``args['config_dm']`` and stored as a TH1D in ``outputdir + 'inputs_<region>.root'``.
    When the global ``g_dryrun`` is true, no directory or ROOT file is created.

    Arguments:
        df3: mapping from sample name to the ``pandas`` DataFrame of that sample.
        fitinfo: tuple ``(vname, nbin, xmin, xmax, (underflow, overflow), vlabel)``. ``nbin`` is either
            an int (uniform binning between ``xmin`` and ``xmax``) or a sequence of explicit bin edges.
        outputdir: output directory. File names are appended directly, so it must end with a path separator.
        sys_name: ``'nominal'`` or the name of a variation such as ``'psWeightIsrUp'`` or ``'qcdSystDown'``.
        wgtstr_dm_sys: weight expression evaluated on the MC DataFrames for this variation. It is rebuilt
            from ``args['wgtstr_dm']`` (i.e. the passed value is ignored) when ``special_wgtstr`` is given.
        args: dict providing ``wgtstr_dm``, ``wgtstr_dm_data``, ``sl_dm``, ``sl_dm_herwig``, ``config_dm``,
            ``categories_dm`` and ``catMap``; optionally ``use_bflav`` and ``args_bflav``. In the sample
            lists ``sl_dm`` / ``sl_dm_herwig`` the last entry is the data sample, all others are MC.
        special_wgtstr: optional extra weight expression. For an ``...Up`` variation it multiplies the
            nominal weight; for the matching ``...Down`` variation the weight is mirrored around the nominal
            one, which requires the nominal and ``...Up`` templates to have been made beforehand.

    Globals read: ``g_dryrun``, ``g_custom_kde_bw``, ``g_custom_kde_binmask``, ``g_sfBDT_val_list``,
    ``year``, ``config``. Globals read and updated: ``g_wgtstr_dm_sys_fac`` (overall MC-to-data factors),
    ``g_hist_fitvar_rwgt`` (data/MC ratio of the fit variable), ``g_hist_qcdsyst`` (cached Herwig shapes).
    """

    wgtstr_dm, wgtstr_dm_data, sl_dm, sl_dm_herwig, config_dm, categories_dm, catMap = args['wgtstr_dm'], args['wgtstr_dm_data'], args['sl_dm'], args['sl_dm_herwig'], args['config_dm'], args['categories_dm'], args['catMap']

    ## Create the output directory
    if not os.path.exists(outputdir) and not g_dryrun:
        os.makedirs(outputdir)

    import ROOT, array  ## use ROOT to write file...
    vname, nbin, xmin, xmax, (underflow, overflow), vlabel = fitinfo
    ## Transfer the {nbin, xmin, xmax} set to the real bin edges if necessary
    if not isinstance(nbin, int):
        edges = nbin
        nbin = len(edges)-1 # reset nbin to "real" nbin
        edges_inroot = (len(edges)-1, array.array('f', edges))
    else:
        edges = np.linspace(xmin, xmax, nbin+1)
        edges_inroot = (nbin, xmin, xmax)

    ## Impose the overall factor between MC and data
    def extract_factor_overal(_sl, _wgtstr):
        # (number of data events) / (sum of MC weights), rounded to 4 digits
        return np.round(df3[_sl[-1]].shape[0] * 1. / sum([df3[sam].eval(_wgtstr).sum() for sam in _sl[:-1]]), 4)

    if special_wgtstr is None: ## no special weight string provided -> use the nominal one
        if any([_sys in sys_name for _sys in ['qcdSyst','qcdKdeSyst']]): # note that qcd syst uses the setting of the herwig sample
            # Reuse the factor cached by an earlier qcd*Up call if there is one
            fac_overal = g_wgtstr_dm_sys_fac['qcdSystUp'] if 'qcdSystUp' in g_wgtstr_dm_sys_fac else \
                         g_wgtstr_dm_sys_fac['qcdKdeSystUp'] if 'qcdKdeSystUp' in g_wgtstr_dm_sys_fac else None
            if fac_overal is None:
                fac_overal = extract_factor_overal(sl_dm_herwig, wgtstr_dm.replace('htwgt','htwgt_herwig'))
        else:  # nominal case
            fac_overal = g_wgtstr_dm_sys_fac['nominal'] if 'nominal' in g_wgtstr_dm_sys_fac else None
            if fac_overal is None:
                fac_overal = extract_factor_overal(sl_dm, wgtstr_dm)
        # equip the weight factor
        g_wgtstr_dm_sys_fac[sys_name] = fac_overal
        wgtstr_dm_sys = wgtstr_dm_sys+f'*{fac_overal}'

    else: ## special weight string specified
        if sys_name.endswith('Up'):
            fac_overal = extract_factor_overal(sl_dm, wgtstr_dm+f'*{special_wgtstr}')
            # equip the weight factor
            g_wgtstr_dm_sys_fac[sys_name] = fac_overal
            wgtstr_dm_sys = wgtstr_dm+f'*{special_wgtstr}*{fac_overal}'
        else:
            # "Down" is the mirror of "Up" around the nominal weight: 2*nominal - up
            wgtstr_dm_sys = wgtstr_dm+f"*(2*{g_wgtstr_dm_sys_fac['nominal']}-{special_wgtstr}*{g_wgtstr_dm_sys_fac[sys_name.replace('Down','Up')]})"

    print (fitinfo, outputdir, sys_name, wgtstr_dm_sys)

    ## Preprocess for fitVarRwgt: data/MC ratio of the fit variable before the pass/fail split
    if sys_name == 'fitVarRwgtUp':
        _df_mc = pd.concat([df3[sam] for sam in sl_dm[:-1]])
        _df_data = df3[sl_dm[-1]]
        _h_data = get_hist(_df_data[vname].values, bins=edges, weights=np.ones(_df_data.shape[0]) if wgtstr_dm_data is None else _df_data.eval(wgtstr_dm_data).values, underflow=underflow, overflow=overflow).view(flow=True)
        _h_mc = get_hist(_df_mc[vname].values, bins=edges, weights=_df_mc.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
        g_hist_fitvar_rwgt[sys_name] = _h_data.value / _h_mc.value

    ## Loop over pass and fail region
    for b in ['pass', 'fail']:
        ## Bind the file handle before the try block so that the cleanup below never refers to an
        ## unbound name (or to the file of the previous region) if opening the file raises
        fw = None
        try:
            if not g_dryrun:
                fw = ROOT.TFile(outputdir+f'inputs_{b}.root', 'recreate')

            hv, hist = {}, {}
            hname_suf = '_'+sys_name if sys_name!='nominal' else ''  ## suffix to the hist name (the Higgs Combine syntax)
            print (' -- ', catMap[b])

            ## MC and data dataframe after applying the final selection
            df_mc = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm[:-1]])
            df_data = df3[sl_dm[-1]].query(catMap[b])

            ## Preprocessing for herwig related dataframe if we mean to calculate qcdSyst / qcdKdeSyst unce in this iteration
            if 'qcdSyst' in sys_name or 'qcdKdeSyst' in sys_name:
                df_mc_herwig = pd.concat([df3[sam].query(catMap[b]) for sam in sl_dm_herwig[:-1]])

            # Loop over categories: flvC/flvB/flvL/data
            for cat in config_dm:
                ## hv[] holds the boosted-histogram type derived from the dataframe, hist[] holds the TH1D type to be stored in ROOT
                if cat=='data' and sys_name == 'nominal':
                    ## Get the data hist (only stored together with the nominal templates)
                    hv['data'] = get_hist(df_data[vname].values, bins=edges, weights=np.ones(df_data.shape[0]) if wgtstr_dm_data is None else df_data.eval(wgtstr_dm_data).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist['data'] = ROOT.TH1D('data_obs', 'data_obs;'+vname, *edges_inroot)
                if cat!='data':
                    df_mc_tmp = df_mc.query(config_dm[cat]) ## category selection based on flavor
                    ## Get the MC hist for certain flavor
                    hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
                    # Initialize the TH1D hist
                    hist[cat] = ROOT.TH1D(cat+hname_suf, cat+hname_suf+';'+vname, *edges_inroot) # init TH1 hist
                    hist[cat].Sumw2()

                    ## For qcdSyst / qcdKdeSyst unce that is actually related to Herwig, hv[cat] is dummy here,
                    ## and we mean to obtain hv[cat+'_herwig.value'] that will be later filled into hist[cat]
                    if sys_name=='qcdSystUp':
                        ## Get the Herwig fit for certain flavor
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat]) ## cat selection
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g50','sfbdtwgt_g50_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv[cat+'_herwig.value'] = get_hist(df_mc_herwig_tmp[vname].values, bins=edges,
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values,
                                                     underflow=underflow, overflow=overflow).view(flow=True).value
                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']

                    ## Extract the KDE shape directly from herwig shape
                    if sys_name=='qcdKdeSystUp':
                        df_mc_herwig_tmp = df_mc_herwig.query(config_dm[cat])
                        wgtstr_dm_sys_herwig = wgtstr_dm_sys.replace('htwgt','htwgt_herwig').replace('sfbdtwgt_g50','sfbdtwgt_g50_herwig').replace('ad_ptwgt','ad_ptwgt_herwig').replace('datamcwgt','datamcwgt_herwig')
                        hv_herwig_orig_value = get_hist(df_mc_herwig_tmp[vname].values, bins=edges,
                                                     weights=df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values,
                                                     underflow=underflow, overflow=overflow).view(flow=True).value

                        ## Calculate KDE shape, apply two times so that we specify a finer KDE bandwidth based on the first result
                        from scipy.stats import gaussian_kde
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf))
                        kde = gaussian_kde(df_mc_herwig_tmp[vname].values, weights=np.clip(df_mc_herwig_tmp.eval(wgtstr_dm_sys_herwig).values, 0, +np.inf), bw_method=kde.factor/g_custom_kde_bw[vname])
                        kde_int = np.zeros([nbin, 2])  # per bin: [KDE integral, original Herwig content]

                        ## Integrate the KDE function to obtain KDE histogram
                        for i, (low, high) in enumerate(zip(edges[:-1], edges[1:])):
                            if low in g_custom_kde_binmask[vname]:
                                continue  # masked bin: both columns stay 0
                            kde_int[i] = [kde.integrate_box_1d(low, high), hv_herwig_orig_value[i]]
                        # print('rescale kde sum to original herwig sum: ', kde_int[:,1].sum() / kde_int[:,0].sum())
                        ## Rescale the KDE histogram so that its sum matches the original Herwig sum over the unmasked bins
                        kde_int[:,0] *= kde_int[:,1].sum() / kde_int[:,0].sum()

                        ## Fill with original madgraph hist if we plan to mask the bin for KDE.
                        ## This is based on the fact that KDE cannot model the hist well in the marginal bins
                        hv[cat+'_herwig.value'] = np.array([kde_int[i][0] if kde_int[i][0]!=0 else hv[cat].value[i] for i in range(nbin)])

                        ## Store the histogram into global var so we can recycle the same hist in the "Down" routine
                        g_hist_qcdsyst[(sys_name, b, cat)] = hv[cat+'_herwig.value']

                    ## Extract the PSWeight histogram
                    if 'psWeight' in sys_name:
                        if year==2018:  ## for 2018, calculate the hist by PSWeight vars
                            ps_idx = {'psWeightIsrUp':2, 'psWeightIsrDown':0, 'psWeightFsrUp':3, 'psWeightFsrDown':1}
                            hv[cat] = get_hist(df_mc_tmp[vname].values, bins=edges, weights=df_mc_tmp.eval(wgtstr_dm_sys).values*df_mc_tmp['PSWeight'].map(lambda v: v[ps_idx[sys_name]]).values, underflow=underflow, overflow=overflow).view(flow=True)
                        else:  ## for 2016/17 extract the PSWeight hist based on 2018 result (transfer the ratio for PSWeight/nominal)
                            import re
                            wp_name = config['tagger']['working_points']['name']
                            ## The replacement template is a raw f-string so that '\g<1>' and '\g<2>' reach
                            ## re.sub as group references instead of being invalid string escapes
                            outputdir_ps_18 = re.sub('^(.+)_SF201[6-8]_%s_(.*)$' % wp_name, rf'\g<1>_SF2018_%s_psWeight{year}_\g<2>' % wp_name, outputdir)
                            hv_nom_18 = uproot3.open(outputdir_ps_18.replace(sys_name, 'nominal')+f'inputs_{b}.root')[cat]
                            hv_ps_18 = uproot3.open(outputdir_ps_18+f'inputs_{b}.root')[cat+'_'+sys_name]
                            hv[cat].value *= hv_ps_18.values / hv_nom_18.values
                        # print (hv[cat].value)

                    ## Extract the sfBDTFloAround histogram.
                    ## Method: to utilize the nominal hist for sfbdt>0.95 or 0.85 and migrate the MC-to-data confidence level in the 0.90 case
                    if 'sfBDTFloAround' in sys_name:
                        from scipy.stats import chi2
                        hv_data = uproot3.open(outputdir.replace(sys_name, 'nominal')+f'inputs_{b}.root')['data_obs'].values  ## nominal data hist for 0.90
                        _bdtname = '95' if 'Up' in sys_name else '85'
                        fr = uproot3.open(outputdir.replace(sys_name, 'nominal').replace(f'/bdt{int(g_sfBDT_val_list[-1]*1000)}/',f'/bdt{_bdtname}0/')+f'inputs_{b}.root')
                        fr_data, fr_mc = fr['data_obs'].values, fr['flvC'].values+fr['flvB'].values+fr['flvL'].values  ## nominal data & MC hist for 0.95 or 0.85 (depends on Up or Down)

                        ## For each bin, migrate the confidence level of MC yield F0 given data yield D0 to the target data yield D => F
                        hv_mc = []
                        for D, D0, F0 in zip(hv_data, fr_data, fr_mc):
                            ## The precise calculation
                            F = 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0+2), 2*D+2) if F0>D0 else 0.5*chi2.ppf(chi2.cdf(2*F0, 2*D0), 2*D)
                            if F == np.inf: ## in case the formula results in inf (may occur if F0 >> D0)
                                assert F0 > D0
                                ## Fall back to scaling the MC-data difference by the ratio of the upper 68.27% interval widths
                                sigD0 = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D0+2) - D0
                                sigD = 0.5 * chi2.ppf(1-(1-0.682689492)/2, 2*D+2) - D
                                F = D + sigD/sigD0*(F0-D0)
                            hv_mc.append(F)

                        ## Obtain flavor template based on the flavor proportion in 0.95 or 0.85 region
                        hv[cat].value = np.nan_to_num(hv_mc * fr[cat].values / fr_mc, nan=0)

                    ## Modify hv[cat] based on extracted pass+fail histogram
                    if 'fitVarRwgt' in sys_name:
                        if sys_name == 'fitVarRwgtUp':
                            hv[cat].value = hv[cat].value * g_hist_fitvar_rwgt['fitVarRwgtUp']
                        else:
                            ## "Down" is the mirror of "Up" around the unmodified histogram
                            hv[cat].value = 2 * hv[cat].value - hv[cat].value * g_hist_fitvar_rwgt['fitVarRwgtUp']

                    ## Use bflav qcd samples to stitch the final bflav template
                    if 'use_bflav' in args and args['use_bflav'] and cat == 'flvB' and not all([s in sys_name for s in ['qcd','Syst']]):
                        # print('---', hv[cat])
                        ## Get the MC hist from the new b-enriched sample
                        df_mc_bflav = pd.concat([df3[sam].query(f'({catMap[b]}) & ({config_dm[cat]})') for sam in args['args_bflav']['sl_dm_bflav']])
                        hv_bflav = get_hist(df_mc_bflav[vname].values, bins=edges, weights=df_mc_bflav.eval(args['args_bflav']['wgtstropt_bflav'](wgtstr_dm_sys)).values, underflow=underflow, overflow=overflow).view(flow=True)
                        ## ... and from the original sample(s) it is combined with
                        df_mc_bflav_og = pd.concat([df3[sam].query(f'({catMap[b]}) & ({config_dm[cat]})') for sam in args['args_bflav']['sl_dm_bflav_orig']])
                        hv_bflav_og = get_hist(df_mc_bflav_og[vname].values, bins=edges, weights=df_mc_bflav_og.eval(wgtstr_dm_sys).values, underflow=underflow, overflow=overflow).view(flow=True)
                        ## Combine histogram: inverse-variance weighted average of the two
                        ## (a zero variance is replaced by a huge one so that the bin gets ~zero weight)
                        hv_bflav_og.variance[hv_bflav_og.variance==0] = 1e20
                        hv_bflav.variance[hv_bflav.variance==0] = 1e20
                        hv_bflav_comb = hv[cat].copy()
                        hv_bflav_comb.value = (hv_bflav_og.value*(1/hv_bflav_og.variance) + hv_bflav.value*(1/hv_bflav.variance)) / (1/hv_bflav_og.variance + 1/hv_bflav.variance)
                        hv_bflav_comb.variance = 1 / (1/hv_bflav_og.variance + 1/hv_bflav.variance)
                        ## Further combine with the non-QCD contribution
                        hv_bflav_nonsubst = hv[cat].copy() # histogram constitution not to be combined (i.e. non-QCD contribution)
                        hv_bflav_nonsubst.value -= hv_bflav_og.value
                        hv_bflav_nonsubst.variance -= hv_bflav_og.variance
                        hv[cat] = hv_bflav_comb + hv_bflav_nonsubst
                        # print('+++', hv_bflav_og, hv_bflav, hv_bflav_comb, hv_bflav_nonsubst, hv[cat])

            ## Fill the hv[cat] (for qcd*, fill hv[cat+'_herwig.value']) into TH1D and save into ROOT
            for cat in hist.keys():
                ## Special handling for qcdSyst / qcdKdeSyst
                if 'qcd' in sys_name and 'SystUp' in sys_name:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat+'_herwig.value'][i])
                elif 'qcd' in sys_name and 'SystDown' in sys_name:
                    ## Recycle the shape cached by the "Up" routine and mirror it around hv[cat]
                    hv[cat+'_herwig.value'] = g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)]
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, 2 * hv[cat].value[i] - hv[cat+'_herwig.value'][i])
                    g_hist_qcdsyst[(sys_name.replace('Down','Up'), b, cat)] = None  # drop the cached shape once used

                ## Normal routine
                else:
                    for i in range(nbin):
                        hist[cat].SetBinContent(i+1, hv[cat].value[i])
                        hist[cat].SetBinError(i+1, np.sqrt(hv[cat].variance[i]))

                ## Fix some buggy points: floor tiny/negative MC bins at 1e-3 and cap the error at the bin content
                if cat!='data':
                    for i in range(nbin):
                        if hist[cat].GetBinContent(i+1) <= 1e-3:
                            hist[cat].SetBinContent(i+1, 1e-3)
                            hist[cat].SetBinError(i+1, 1e-3)
                        elif hist[cat].GetBinError(i+1) > hist[cat].GetBinContent(i+1):
                            hist[cat].SetBinError(i+1, hist[cat].GetBinContent(i+1))

                if not g_dryrun:
                    hist[cat].Write()
        ## Always close the ROOT file, also if an error occurs (otherwise the notebook is easily corrupted)
        finally:
            if fw is not None:
                fw.Close()

Now we launch the template maker

In [None]:
def launch_std_routine():
    r"""Run ``launch_maker`` without the BDT modification factor, then once more with it if one is stored.

    The global ``g_bdt_mod_factor`` is first set to ``None`` for the initial pass. Afterwards the factor
    stored in ``df1`` under ``bdt_mod_factor_<pt range name>__<main analysis tree name>`` (both names are
    taken from ``config``) is looked up; when it is not ``None``, it is assigned to ``g_bdt_mod_factor``
    and ``launch_maker`` is run a second time.
    """
    global g_bdt_mod_factor

    ## First pass: no modification factor
    g_bdt_mod_factor = None
    launch_maker()

    ## Look up the stored factor for the configured pT range and analysis tree
    factor_key = 'bdt_mod_factor_{}__{}'.format(config['pt_range']['name'], config['main_analysis_tree']['name'])
    stored_factor = df1[factor_key]
    if stored_factor is None:
        return

    ## Second pass: launch again with g_bdt_mod_factor set
    g_bdt_mod_factor = stored_factor
    launch_maker()

In [None]:
## ====================================================================================================
## Main fit routine: launch all sfBDT values, using the 1st fit variable only
g_dryrun = False
g_make_template_mode = 'main'
g_mode_bdt_runlist = 'all'
## Switch on the template types to produce (the nominal one plus every listed uncertainty source)
g_make_unce_types = dict.fromkeys(
    ['nominal', 'pu', 'fracBB', 'fracCC', 'fracLight', 'psWeightIsr', 'psWeightFsr', 'sfBDTRwgt', 'fitVarRwgt'],
    True,
)
g_mode_psWeight_run_templ = None
g_do_fit_for_var = [1]  # only run the first fit variable (2, 3 are for validation fit)
launch_std_routine()
# g_mode_bdt_runlist = 'manual'; g_do_sfBDT_points = {tuple(k):[0.75, 0.80, 0.85, 0.88, 0.90, 0.92, 0.94, 0.96, 0.98] for k in g_pt_range}; launch_maker() # if chooses to use fixed sfBDT points

--------
**For year 2018**: you need to run the following block to provide the psWeight templates for the years 2016 and 2017 (otherwise the block above will report errors when it is run under the 2016 or 2017 year condition).

However, to accomplish the following block, you first need to run `preprocess.ipynb` for the corresponding 2016 and 2017 year conditions, so that the sfBDT sequence is extracted for each of those years.

In [None]:
## ====================================================================================================
## For year 2018, extract the psWeight templates that the 2016/2017 routines rely on
if year == 2018:
    for ext_year in (2016, 2017):
        g_make_template_mode = 'main'
        g_mode_bdt_runlist = 'all'
        ## Only the nominal and the psWeight templates are made here
        g_make_unce_types = dict.fromkeys(['nominal', 'psWeightIsr', 'psWeightFsr'], True)
        g_mode_psWeight_run_templ = ext_year
        g_do_fit_for_var = [1]  # only run the first fit variable (2, 3 are for validation fit)
        launch_std_routine()
        # g_mode_bdt_runlist = 'manual'; g_do_sfBDT_points = {tuple(k):[0.75, 0.80, 0.85, 0.88, 0.90, 0.92, 0.94, 0.96, 0.98] for k in g_pt_range}; launch_maker() # if chooses to use fixed sfBDT points

--------
Below are optional routines for the validation fit. No need to launch during the first run.

In [None]:
# ## ====================================================================================================
# ## Validation on other variables
# g_make_template_mode = 'main'; g_mode_bdt_runlist = 'all'
# g_make_unce_types = {'nominal':True, 'pu':True, 'fracBB':True, 'fracCC':True, 'fracLight':True, 'psWeightIsr':True, 'psWeightFsr':True, 'sfBDTRwgt':True, 'fitVarRwgt':True}
# g_mode_psWeight_run_templ = None
# g_do_fit_for_var = [2, 3]
# launch_maker()

# ## ====================================================================================================
# ## Multiple validations modes: only run the central sfBDT cut point is fine
# for mode in ['val_pt', 'val_tosig_mass', 'val_tosig_pt', 'val_tosig_tau21', 'val_crop_bin']:
#     g_make_template_mode = mode; g_mode_bdt_runlist = 'central'
#     g_mode_psWeight_run_templ = None
#     g_do_fit_for_var = [1]
#     launch_maker()