In [1]:
import uproot3 as uproot
from uproot3_methods import TLorentzVectorArray
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

In [2]:
YEAR = 2018
# SAMPLE = "vbshwwc2v"
SAMPLE = "bosons"

NOMINAL_SAMPLE = f"../outputs/minintuphadd/v2.6/miniNtupV3/{YEAR}/{SAMPLE}.root"
JEC_UP_SAMPLE = f"../outputs/minintuphadd/v2.6/miniNtupV3_jecUp/{YEAR}/{SAMPLE}.root"
JEC_DN_SAMPLE = f"../outputs/minintuphadd/v2.6/miniNtupV3_jecDn/{YEAR}/{SAMPLE}.root"

RUN2_NOMINAL_SAMPLE = f"../outputs/minintuphadd/v2.6/miniNtupV3/Run2/{SAMPLE}.root"

In [3]:
def get_systs(df, wgt_col_pattern, sr_col_pattern, nom_wgt):
    rows = []
    for SR in df.columns[df.columns.str.contains(sr_col_pattern)]:
        for wgt_col in df.columns[df.columns.str.contains(wgt_col_pattern)]:
            in_SR = df[SR]
            n_pass_wgt = np.sum(df[in_SR][nom_wgt]*df[in_SR][wgt_col])
            n_pass_nom = np.sum(df[in_SR][nom_wgt])
            rows.append({
                "region":SR.split("is_")[-1],
                "wgt_name": wgt_col,
                "n_pass_wgt": n_pass_wgt,
                "n_pass_nom": n_pass_nom,
                "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100
            })
            
    return pd.DataFrame(data=rows)

def make_syst_table(df, wgt_col_pattern, sr_col_pattern, nominal_wgt="", debug=False):
    # Compute all systematics
    systs = get_systs(df, wgt_col_pattern, sr_col_pattern, nominal_wgt)
    if debug:
        display(systs)
    # Designate how columns should be renamed
    renames = {
        "delta_percent": wgt_col_pattern.replace("_", ""), 
        "region": "systematics"
    }
    # Return only one row: max systematic for each SR
    return (systs.groupby("region", as_index=False)[["delta_percent"]]
                 .agg(func=lambda x: np.max(np.abs(x)))
                 .round(1)
                 .rename(columns=renames)
                 .set_index("systematics")
                 .transpose())

def append_systs(syst_table, new_syst):
    if np.any(np.isin(new_syst.index, syst_table.index)):
        print("WARNING: this systematic is already in the table. No action taken.")
        return syst_table
    else:
        return syst_table.append(new_syst)

In [4]:
with uproot.open(NOMINAL_SAMPLE) as f:
    df = f.get("variable").pandas.df(
        branches=["is_ps*", "is_bdt_*", "is_cut_*", "LHE_*", "pu_rewgt*",
                  "lepsf*", "btagsf*", "trigsf*", "wgt", "genrewgt"]
    )
# Compute nominal event weight
df["evt_wgt_nominal"] = df.wgt*df.genrewgt*df.lepsf*df.btagsf*df.trigsf*df.pu_rewgt
# Alter up/down variations of scale factors s.t. we can apply them as follows:
# (event weight up) = (nominal event weight)*(sf up/down)
for sf in ["lepsf", "btagsf", "trigsf", "pu_rewgt"]:
    df[f"{sf}_up"] /= df[sf]
    df[f"{sf}_dn"] /= df[sf]

df[df.columns[df.columns.str.contains("is_")]] = df[df.columns[df.columns.str.contains("is_")]].astype(bool)

df

Unnamed: 0_level_0,is_ps_el,is_ps_mu,is_ps_tau,is_ps_neg,is_ps_lgt,is_ps,is_bdt_sr_el,is_bdt_sr_mu,is_bdt_sr_tau,is_bdt_sr_neg,...,lepsf_dn,btagsf,btagsf_up,btagsf_dn,trigsf,trigsf_up,trigsf_dn,wgt,genrewgt,evt_wgt_nominal
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,True,False,True,False,False,False,False,...,0.996663,0.968175,1.017207,0.982789,0.99,1.01,0.99,0.000489,1.0,0.000435
1,False,True,False,False,True,True,False,False,False,False,...,0.984220,1.156882,1.303957,0.720493,1.00,1.01,0.99,0.000489,1.0,0.000472
2,True,False,False,False,True,True,False,False,False,False,...,0.972215,1.085466,1.066529,0.935146,1.00,1.02,0.98,0.000489,1.0,0.000412
3,False,True,False,False,True,True,False,False,False,False,...,0.989066,0.957826,1.024336,0.975903,0.99,1.01,0.99,0.000489,1.0,0.000503
4,False,False,False,True,False,True,False,False,False,False,...,0.997051,1.091043,1.112917,0.893015,0.99,1.01,0.99,0.000489,1.0,0.000640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,False,False,False,True,False,True,False,False,False,False,...,0.970560,0.855898,1.022367,0.977881,1.00,1.01,0.99,0.000765,1.0,0.000506
918,True,False,False,False,True,True,False,False,False,False,...,0.977218,1.294906,1.251570,0.773581,1.00,1.01,0.99,0.000765,1.0,0.000871
919,False,False,False,True,False,True,False,False,False,False,...,0.989808,1.287248,1.083123,0.918473,0.98,1.01,0.99,0.000765,1.0,0.000934
920,False,True,False,False,True,True,False,False,False,False,...,0.969267,1.869489,1.321055,0.713698,1.00,1.01,0.99,0.000765,1.0,0.001287


In [5]:
for SR in df.columns[df.columns.str.contains("is_cut_sr_")]:
    print(f"{SR}: {df[df[SR]].evt_wgt_nominal.sum()}")

print("")
    
for SR in df.columns[df.columns.str.contains("is_bdt_sr_")]:
    print(f"{SR}: {df[df[SR]].evt_wgt_nominal.sum()}")

is_cut_sr_el: 0.0035382816568017006
is_cut_sr_mu: 0.003278080141171813
is_cut_sr_tau: 0.0
is_cut_sr_neg: 0.0039031351916491985
is_cut_sr_lgt: 0.01221412606537342

is_bdt_sr_el: 0.00872168317437172
is_bdt_sr_mu: 0.014009497128427029
is_bdt_sr_tau: 0.0
is_bdt_sr_neg: 0.004376492463052273


In [6]:
cut_based_table = pd.DataFrame()
bdt_based_table = pd.DataFrame()

# b-tagging scale factor variations

In [7]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="btagsf_", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
btagsf,8.8,11.7,16.5,4.6,


In [8]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="btagsf_", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
btagsf,8.7,13.5,6.0,


# Jet energy scale variations

In [9]:
if JEC_UP_SAMPLE and JEC_DN_SAMPLE:
    with uproot.open(JEC_UP_SAMPLE) as f:
        df_up = f.get("variable").pandas.df(
            branches=["is_ps*", "is_bdt_*", "is_cut_*", "LHE_*", "pu_rewgt*",
                      "lepsf*", "btagsf*", "trigsf*", "wgt", "genrewgt"]
        )
    # Compute nominal event weight
    df_up["evt_wgt_nominal"] = df_up.wgt*df_up.genrewgt*df_up.lepsf*df_up.btagsf*df_up.trigsf*df_up.pu_rewgt
    df_up[df_up.columns[df_up.columns.str.contains("is_")]] = df_up[df_up.columns[df_up.columns.str.contains("is_")]].astype(bool)

    with uproot.open(JEC_DN_SAMPLE) as f:
        df_dn = f.get("variable").pandas.df(
            branches=["is_ps*", "is_bdt_*", "is_cut_*", "LHE_*", "pu_rewgt*",
                      "lepsf*", "btagsf*", "trigsf*", "wgt", "genrewgt"]
        )
    # Compute nominal event weight
    df_dn["evt_wgt_nominal"] = df_dn.wgt*df_dn.genrewgt*df_dn.lepsf*df_dn.btagsf*df_dn.trigsf*df_dn.pu_rewgt
    df_dn[df_dn.columns[df_dn.columns.str.contains("is_")]] = df_dn[df_dn.columns[df_dn.columns.str.contains("is_")]].astype(bool)

In [10]:
if JEC_UP_SAMPLE and JEC_DN_SAMPLE:
    # Compute all systematics
    rows = []
    for SR in df.columns[df.columns.str.contains("is_cut_sr_")]:
        n_pass_jec_up = df_up[df_up[SR]].evt_wgt_nominal.sum()
        n_pass_jec_dn = df_dn[df_dn[SR]].evt_wgt_nominal.sum()
        n_pass_nom = df[df[SR]].evt_wgt_nominal.sum()
        rows.append({
            "region":SR.split("is_")[-1],
            "wgt_name": "jec_up",
            "n_pass_wgt": n_pass_jec_up,
            "n_pass_nom": n_pass_nom,
            "delta_percent": (1 - n_pass_jec_up/(n_pass_nom))*100
        })
        rows.append({
            "region":SR.split("is_")[-1],
            "wgt_name": "jec_dn",
            "n_pass_wgt": n_pass_jec_dn,
            "n_pass_nom": n_pass_nom,
            "delta_percent": (1 - n_pass_jec_dn/(n_pass_nom))*100
        })

    systs = pd.DataFrame(data=rows)
    # Designate how columns should be renamed
    renames = {
        "delta_percent": "jec", 
        "region": "systematics"
    }
    # Return only one row: max systematic for each SR
    row = (systs.groupby("region", as_index=False)[["delta_percent"]]
                .agg(func=lambda x: np.max(np.abs(x)))
                .round(1)
                .rename(columns=renames)
                .set_index("systematics")
                .transpose())
    display(row)

    cut_based_table = append_systs(cut_based_table, row)

  "delta_percent": (1 - n_pass_jec_up/(n_pass_nom))*100
  "delta_percent": (1 - n_pass_jec_dn/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
jec,100.0,87.3,42.3,30.0,


In [11]:
if JEC_UP_SAMPLE and JEC_DN_SAMPLE:
    # Compute all systematics
    rows = []
    for SR in df.columns[df.columns.str.contains("is_bdt_sr_")]:
        n_pass_jec_up = df_up[df_up[SR]].evt_wgt_nominal.sum()
        n_pass_jec_dn = df_dn[df_dn[SR]].evt_wgt_nominal.sum()
        n_pass_nom = df[df[SR]].evt_wgt_nominal.sum()
        rows.append({
            "region":SR.split("is_")[-1],
            "wgt_name": "jec_up",
            "n_pass_wgt": n_pass_jec_up,
            "n_pass_nom": n_pass_nom,
            "delta_percent": (1 - n_pass_jec_up/(n_pass_nom))*100
        })
        rows.append({
            "region":SR.split("is_")[-1],
            "wgt_name": "jec_dn",
            "n_pass_wgt": n_pass_jec_dn,
            "n_pass_nom": n_pass_nom,
            "delta_percent": (1 - n_pass_jec_dn/(n_pass_nom))*100
        })

    systs = pd.DataFrame(data=rows)
    # Designate how columns should be renamed
    renames = {
        "delta_percent": "jec", 
        "region": "systematics"
    }
    # Return only one row: max systematic for each SR
    row = (systs.groupby("region", as_index=False)[["delta_percent"]]
                .agg(func=lambda x: np.max(np.abs(x)))
                .round(1)
                .rename(columns=renames)
                .set_index("systematics")
                .transpose())
    display(row)

    bdt_based_table = append_systs(bdt_based_table, row)

  "delta_percent": (1 - n_pass_jec_up/(n_pass_nom))*100
  "delta_percent": (1 - n_pass_jec_dn/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
jec,14.0,82.8,28.9,


# PU reweighting

In [12]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="pu_rewgt_", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
purewgt,7.5,0.9,8.7,4.6,


In [13]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="pu_rewgt_", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
purewgt,11.3,17.9,3.5,


# LHE scale weight variations

In [14]:
# LHE_scale_wgt_sums = {}
# with open("LHE_scale_wgt_sums.txt", "r") as f_in:
#     for line in f_in.readlines():
#         name, value = line.split()
#         LHE_scale_wgt_sums[name] = float(value)
        
# NOMINAL_N_TOTAL = LHE_scale_wgt_sums["LHE_muF1p0_muR0p5"] # mu_R does not effect our signal sample

# print(LHE_scale_wgt_sums)

In [15]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="LHE_mu", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
LHEmu,9.9,11.2,10.0,8.5,


In [16]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="LHE_mu", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
LHEmu,10.9,15.1,9.1,


# PDF variations

In [17]:
# LHE_pdf_wgt_sums = {}
# with open("LHE_pdf_wgt_sums.txt", "r") as f_in:
#     print(line)
#     for line in f_in.readlines():
#         name, value = line.split()
#         LHE_pdf_wgt_sums[name] = float(value)

# print(LHE_pdf_wgt_sums)

In [18]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="LHE_pdf_wgt", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
LHEpdfwgt,0.3,0.4,0.8,2.3,


In [19]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="LHE_pdf_wgt", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
LHEpdfwgt,0.3,0.6,2.3,


# Trigger scale factor variations

In [20]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="trigsf", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
trigsf,1.0,1.4,1.0,1.7,


In [21]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="trigsf", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
trigsf,1.0,2.3,1.5,


# Lepton ID scale factor variations

In [22]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="lepsf", sr_col_pattern="is_cut_sr_"
)
display(systs)
cut_based_table = append_systs(cut_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
lepsf,13.3,6.9,10.8,0.4,


In [23]:
systs = make_syst_table(
    df, nominal_wgt="evt_wgt_nominal", wgt_col_pattern="lepsf", sr_col_pattern="is_bdt_sr_"
)
display(systs)
bdt_based_table = append_systs(bdt_based_table, systs)

  "delta_percent": (1 - n_pass_wgt/(n_pass_nom))*100


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
lepsf,11.1,1.3,1.3,


# Statistical uncertainty

In [24]:
with uproot.open(RUN2_NOMINAL_SAMPLE) as f:
    run2_df = f.get("variable").pandas.df(
        branches=["is_bdt_*", "is_cut_*", "pu_rewgt", "lepsf", "btagsf", "trigsf", "wgt", "genrewgt"]
    )
# Compute nominal event weight
run2_df["evt_wgt_nominal"] = run2_df.wgt*run2_df.genrewgt*run2_df.lepsf*run2_df.btagsf*run2_df.trigsf*run2_df.pu_rewgt

run2_df[run2_df.columns[run2_df.columns.str.contains("is_")]] = run2_df[run2_df.columns[run2_df.columns.str.contains("is_")]].astype(bool)

run2_df

Unnamed: 0_level_0,is_bdt_sr_el,is_bdt_sr_mu,is_bdt_sr_tau,is_bdt_sr_neg,is_bdt_cr_el,is_bdt_cr_mu,is_bdt_cr_tau,is_bdt_cr_neg,is_bdt_sr2_el,is_bdt_sr2_mu,...,is_cut_cr2,is_cut_sm_sr2,is_cut_c3_sr2,pu_rewgt,lepsf,btagsf,trigsf,wgt,genrewgt,evt_wgt_nominal
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,False,False,False,False,False,False,False,False,False,...,True,True,True,0.921167,1.008093,0.968175,0.99,0.000489,1.0,0.000435
1,False,False,False,False,False,False,False,False,False,False,...,True,True,True,0.998717,0.834367,1.156882,1.00,0.000489,1.0,0.000472
2,False,False,False,False,True,False,False,False,False,False,...,True,True,True,1.051490,0.738605,1.085466,1.00,0.000489,1.0,0.000412
3,False,False,False,False,False,False,False,False,False,False,...,True,True,True,1.075390,1.008093,0.957826,0.99,0.000489,1.0,0.000503
4,False,False,False,False,False,False,False,False,False,False,...,True,True,True,1.217420,0.994935,1.091043,0.99,0.000489,1.0,0.000640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,False,False,False,False,False,False,False,False,False,False,...,True,True,True,1.081840,0.714478,0.855898,1.00,0.000765,1.0,0.000506
918,False,False,False,False,False,False,False,False,False,False,...,True,True,True,0.990842,0.887539,1.294906,1.00,0.000765,1.0,0.000871
919,False,False,False,False,False,False,False,True,False,False,...,True,True,True,0.980164,0.986817,1.287248,0.98,0.000765,1.0,0.000934
920,False,False,False,False,False,False,False,False,False,False,...,True,True,True,1.109820,0.810666,1.869489,1.00,0.000765,1.0,0.001287


In [25]:
for SR in run2_df.columns[run2_df.columns.str.contains("is_cut_sr_")]:
    count = run2_df[run2_df[SR]].evt_wgt_nominal.sum()
    error = np.sqrt(np.sum(run2_df[run2_df[SR]].evt_wgt_nominal**2))
    print(f"{SR}: {count} +- {error}")

print("")
    
for SR in run2_df.columns[run2_df.columns.str.contains("is_bdt_sr_")]:
    count = run2_df[run2_df[SR]].evt_wgt_nominal.sum()
    error = np.sqrt(np.sum(run2_df[run2_df[SR]].evt_wgt_nominal**2))
    print(f"{SR}: {count} +- {error}")

is_cut_sr_el: 0.0035382816568017006 +- 0.002620960818603635
is_cut_sr_mu: 0.003278080141171813 +- 0.0023902978282421827
is_cut_sr_tau: 0.0 +- 0.0
is_cut_sr_neg: 0.0039031351916491985 +- 0.0028557779733091593
is_cut_sr_lgt: 0.01221412606537342 +- 0.0051465705037117004

is_bdt_sr_el: 0.00872168317437172 +- 0.004363036248832941
is_bdt_sr_mu: 0.014009497128427029 +- 0.004927818197757006
is_bdt_sr_tau: 0.0 +- 0.0
is_bdt_sr_neg: 0.004376492463052273 +- 0.0028947428800165653


In [26]:
def get_stats(df, sr_col_pattern, nom_wgt):
    rows = []
    for SR in df.columns[df.columns.str.contains(sr_col_pattern)]:
        in_SR = df[SR]
        count = np.sum(df[in_SR][nom_wgt])
        error = np.sqrt(np.sum(df[in_SR][nom_wgt]**2))
        rows.append({
            "region":SR.split("is_")[-1],
            "count": count,
            "error": error,
            "delta_percent": (error/count)*100
        })
            
    return pd.DataFrame(data=rows)

def make_stat_table(df, sr_col_pattern, nominal_wgt="", debug=False):
    # Compute all systematics
    stats = get_stats(df, sr_col_pattern, nominal_wgt)
    if debug:
        display(stats)
    # Designate how columns should be renamed
    renames = {
        "delta_percent": "statunc", 
        "region": "systematics"
    }
    # Return only one row: max systematic for each SR
    return (stats.groupby("region", as_index=False)[["delta_percent"]]
                 .agg(func=lambda x: np.max(np.abs(x)))
                 .round(1)
                 .rename(columns=renames)
                 .set_index("systematics")
                 .transpose())

In [27]:
stats = make_stat_table(run2_df, "is_cut_sr_", nominal_wgt="evt_wgt_nominal", debug=True)
display(stats)
cut_based_table = append_systs(cut_based_table, stats)

  "delta_percent": (error/count)*100


Unnamed: 0,region,count,error,delta_percent
0,cut_sr_el,0.003538,0.002621,74.074399
1,cut_sr_mu,0.003278,0.00239,72.917616
2,cut_sr_tau,0.0,0.0,
3,cut_sr_neg,0.003903,0.002856,73.166257
4,cut_sr_lgt,0.012214,0.005147,42.136216


systematics,cut_sr_el,cut_sr_lgt,cut_sr_mu,cut_sr_neg,cut_sr_tau
statunc,74.1,42.1,72.9,73.2,


In [28]:
stats = make_stat_table(run2_df, "is_bdt_sr_", nominal_wgt="evt_wgt_nominal", debug=True)
display(stats)
bdt_based_table = append_systs(bdt_based_table, stats)

  "delta_percent": (error/count)*100


Unnamed: 0,region,count,error,delta_percent
0,bdt_sr_el,0.008722,0.004363,50.025165
1,bdt_sr_mu,0.014009,0.004928,35.174841
2,bdt_sr_tau,0.0,0.0,
3,bdt_sr_neg,0.004376,0.002895,66.142988


systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_neg,bdt_sr_tau
statunc,50.0,35.2,66.1,


# Final tables

In [29]:
cut_based_table = cut_based_table[["cut_sr_el", "cut_sr_mu", "cut_sr_tau", "cut_sr_neg", "cut_sr_lgt"]]
display(cut_based_table)
print(cut_based_table.to_latex())

systematics,cut_sr_el,cut_sr_mu,cut_sr_tau,cut_sr_neg,cut_sr_lgt
btagsf,8.8,16.5,,4.6,11.7
jec,100.0,42.3,,30.0,87.3
purewgt,7.5,8.7,,4.6,0.9
LHEmu,9.9,10.0,,8.5,11.2
LHEpdfwgt,0.3,0.8,,2.3,0.4
trigsf,1.0,1.0,,1.7,1.4
lepsf,13.3,10.8,,0.4,6.9
statunc,74.1,72.9,,73.2,42.1


\begin{tabular}{lrrrrr}
\toprule
systematics &  cut\_sr\_el &  cut\_sr\_mu &  cut\_sr\_tau &  cut\_sr\_neg &  cut\_sr\_lgt \\
\midrule
btagsf    &        8.8 &       16.5 &         NaN &         4.6 &        11.7 \\
jec       &      100.0 &       42.3 &         NaN &        30.0 &        87.3 \\
purewgt   &        7.5 &        8.7 &         NaN &         4.6 &         0.9 \\
LHEmu     &        9.9 &       10.0 &         NaN &         8.5 &        11.2 \\
LHEpdfwgt &        0.3 &        0.8 &         NaN &         2.3 &         0.4 \\
trigsf    &        1.0 &        1.0 &         NaN &         1.7 &         1.4 \\
lepsf     &       13.3 &       10.8 &         NaN &         0.4 &         6.9 \\
statunc   &       74.1 &       72.9 &         NaN &        73.2 &        42.1 \\
\bottomrule
\end{tabular}



In [30]:
aggs = cut_based_table.transpose().agg(["min", "max"]).transpose()
aggs["min_to_max"] = aggs["min"].astype(str) + "%-" + aggs["max"].astype(str) + "%"
print(aggs[["min_to_max"]].to_latex())

\begin{tabular}{ll}
\toprule
{} &    min\_to\_max \\
\midrule
btagsf    &    4.6\%-16.5\% \\
jec       &  30.0\%-100.0\% \\
purewgt   &     0.9\%-8.7\% \\
LHEmu     &    8.5\%-11.2\% \\
LHEpdfwgt &     0.3\%-2.3\% \\
trigsf    &     1.0\%-1.7\% \\
lepsf     &    0.4\%-13.3\% \\
statunc   &   42.1\%-74.1\% \\
\bottomrule
\end{tabular}



In [31]:
bdt_based_table = bdt_based_table[["bdt_sr_el", "bdt_sr_mu", "bdt_sr_tau", "bdt_sr_neg"]]
display(bdt_based_table)
print(bdt_based_table.to_latex())

systematics,bdt_sr_el,bdt_sr_mu,bdt_sr_tau,bdt_sr_neg
btagsf,8.7,13.5,,6.0
jec,14.0,82.8,,28.9
purewgt,11.3,17.9,,3.5
LHEmu,10.9,15.1,,9.1
LHEpdfwgt,0.3,0.6,,2.3
trigsf,1.0,2.3,,1.5
lepsf,11.1,1.3,,1.3
statunc,50.0,35.2,,66.1


\begin{tabular}{lrrrr}
\toprule
systematics &  bdt\_sr\_el &  bdt\_sr\_mu &  bdt\_sr\_tau &  bdt\_sr\_neg \\
\midrule
btagsf    &        8.7 &       13.5 &         NaN &         6.0 \\
jec       &       14.0 &       82.8 &         NaN &        28.9 \\
purewgt   &       11.3 &       17.9 &         NaN &         3.5 \\
LHEmu     &       10.9 &       15.1 &         NaN &         9.1 \\
LHEpdfwgt &        0.3 &        0.6 &         NaN &         2.3 \\
trigsf    &        1.0 &        2.3 &         NaN &         1.5 \\
lepsf     &       11.1 &        1.3 &         NaN &         1.3 \\
statunc   &       50.0 &       35.2 &         NaN &        66.1 \\
\bottomrule
\end{tabular}



In [32]:
aggs = bdt_based_table.transpose().agg(["min", "max"]).transpose()
aggs["min_to_max"] = aggs["min"].astype(str) + "%-" + aggs["max"].astype(str) + "%"
print(aggs[["min_to_max"]].to_latex())

\begin{tabular}{ll}
\toprule
{} &   min\_to\_max \\
\midrule
btagsf    &   6.0\%-13.5\% \\
jec       &  14.0\%-82.8\% \\
purewgt   &   3.5\%-17.9\% \\
LHEmu     &   9.1\%-15.1\% \\
LHEpdfwgt &    0.3\%-2.3\% \\
trigsf    &    1.0\%-2.3\% \\
lepsf     &   1.3\%-11.1\% \\
statunc   &  35.2\%-66.1\% \\
\bottomrule
\end{tabular}

