In [1]:
import uproot
from uproot_methods import TLorentzVectorArray
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob
import scipy.stats
import pyhf # limit computations
import pyhf.contrib.viz.brazil
pyhf.set_backend("numpy")
from scipy.optimize import toms748

In [2]:
def get_p4(tree, key):
    """
    Retrieve a 4-vector object from ROOT using the fCoordinates properties
    
    Necessary when reading ROOT files in via Uproot, since Uproot does not natively 
    recognize and convert LorentzVector objects
    """
    return TLorentzVectorArray.from_ptetaphim(
        tree[key]["fCoordinates.fPt"].array(),
        tree[key]["fCoordinates.fEta"].array(),
        tree[key]["fCoordinates.fPhi"].array(),
        tree[key]["fCoordinates.fM"].array(),
    )

def smart_clip(np_array, bins):
    clip_low = 0.5 * (bins[0] + bins[1])
    clip_high = 0.5 * (bins[-2] + bins[-1])
    return np.clip(np_array, clip_low, clip_high)

In [3]:
def get_centrality(p4, vbs0_p4, vbs1_p4):
    deta_jj = np.abs(vbs0_p4.eta - vbs1_p4.eta)
    return np.abs(p4.eta - 0.5*(vbs0_p4.eta + vbs1_p4.eta))/(0.5*(deta_jj))

def get_df(signal="vbshwwlvlvbb_c2v", lhe_rewgt_idx=-1):
    samples = [
        "tt1lpowheg", "tt2lpowheg", "ttw", "ttz", "raretop",
        "bosons", signal
    ]
    baby_dir = "/nfs-7/userdata/jguiang/VBSHWWBaby/v2.6_SS/v2/createMini_Run2"
    df = []
    for sample_path in glob.glob(f"{baby_dir}/*.root"):
        name = sample_path.split("/")[-1].split(".root")[0]
        if name not in samples:
            continue
        # Retrieve TTree
        with uproot.open(sample_path) as f:
            tree = f.get("variable")
            temp_df = tree.pandas.df(
                branches=["is_ps*", "wgt", "btagsf", "lepsf", 
                          "xsec_sf", "genrewgt", "trigsf", "pu_rewgt",
                          "drbb", "mbb", "mbbIn", "ptbb", "dphibb",
                          "mjj", "detajj", "lt", "st"]
            )
            # Read in useful 4-vectors
            ld_lep_p4 = get_p4(tree, "leadlep")
            tr_lep_p4 = get_p4(tree, "subllep")
            ld_bjet_p4 = get_p4(tree, "b0")
            tr_bjet_p4 = get_p4(tree, "b1")
            ld_vbs_p4 = get_p4(tree, "j0")
            tr_vbs_p4 = get_p4(tree, "j1")
            H_p4 = (ld_bjet_p4 + tr_bjet_p4)
            # Set simple columns from 4-vectors
            temp_df["ld_lep_pt"] = ld_lep_p4.pt
            temp_df["tr_lep_pt"] = tr_lep_p4.pt
            # Set complicated columns from 4-vectors
            temp_df["dphi_ll"] = np.arccos(np.cos(np.abs(ld_lep_p4.phi - tr_lep_p4.phi)))
            temp_df["dphi_H_ldlep"] = np.arccos(np.cos(np.abs(H_p4.phi - ld_lep_p4.phi)))
            temp_df["dphi_H_trlep"] = np.arccos(np.cos(np.abs(H_p4.phi - tr_lep_p4.phi)))
            temp_df["ld_lep_cent"] = get_centrality(ld_lep_p4, ld_vbs_p4, tr_vbs_p4)
            temp_df["tr_lep_cent"] = get_centrality(tr_lep_p4, ld_vbs_p4, tr_vbs_p4)
            temp_df["dilep_cent"] = get_centrality((ld_lep_p4 + tr_lep_p4), ld_vbs_p4, tr_vbs_p4)
            temp_df["ld_bjet_cent"] = get_centrality(ld_bjet_p4, ld_vbs_p4, tr_vbs_p4)
            temp_df["tr_bjet_cent"] = get_centrality(tr_bjet_p4, ld_vbs_p4, tr_vbs_p4)
            temp_df["H_cent"] = get_centrality((ld_bjet_p4 + tr_bjet_p4), ld_vbs_p4, tr_vbs_p4)
            temp_df["m_l0bb"] = (ld_lep_p4 + ld_bjet_p4 + tr_bjet_p4).mass
            temp_df["m_l1bb"] = (tr_lep_p4 + ld_bjet_p4 + tr_bjet_p4).mass
            # Compute additional columns
            temp_df["name"] = name
            temp_df["is_signal"] = (name == signal)
            temp_df["weight"] = (temp_df.wgt
                                 * temp_df.btagsf
                                 * temp_df.lepsf
                                 * temp_df.genrewgt
                                 * temp_df.trigsf
                                 * temp_df.pu_rewgt)
            # Apply sig-specific event weights
            if name == signal and lhe_rewgt_idx >= 0:
                temp_df["weight"] *= tree["lherewgts"].array()[:,lhe_rewgt_idx]
            # Apply bkg-specific event weights
            if name != signal:
                temp_df["weight"] *= temp_df.xsec_sf
            # Drop the columns we don't need
            temp_df.drop(
                columns=["wgt", "btagsf", "lepsf", "xsec_sf", "genrewgt", 
                         "trigsf", "pu_rewgt"], 
                inplace=True
            )
            df.append(temp_df)

    # Put dataframe together
    df = pd.concat(df)
    # Cast boolean-like columns to proper bools
    bool_like_cols = df.columns[df.columns.str.contains("is_")]
    df[bool_like_cols] = df[bool_like_cols].astype(bool)
    # Make sample col a categorical
    df["name"] = df.name.astype("category")
    return df

In [4]:
BASE_SEL = lambda df: (df.is_ps & df.mbbIn)
STRAWMAN_LEAD_EL = lambda df: (BASE_SEL(df) & df.is_ps_el)
STRAWMAN_LEAD_MU = lambda df: (BASE_SEL(df) & df.is_ps_mu)
STRAWMAN_LEP_TAU = lambda df: (BASE_SEL(df) & df.is_ps_tau)
STRAWMAN_NEG_NEG = lambda df: (BASE_SEL(df) & df.is_ps_neg)
STRAWMEN = lambda df: [
    ("mu+l+", STRAWMAN_LEAD_MU(df)),
    ("e+l+", STRAWMAN_LEAD_EL(df)),
    ("l+tau+", STRAWMAN_LEP_TAU(df)),
    ("lep-lep-", STRAWMAN_NEG_NEG(df))
]
STRAWMAN_LEP_LEP = lambda df: (BASE_SEL(df) & df.is_ps_lgt)

In [5]:
def get_limit_1bin(S, B, POI_val=1., all_results=True):
    if B <= 0:
        B = 1e-6
    model = pyhf.simplemodels.hepdata_like(
        signal_data=[S], bkg_data=[B], bkg_uncerts=[0.58*np.sqrt(B)]
    )
    data = [B] + model.config.auxdata
    # Get CL values (results) at mu = POI_val
    results = pyhf.infer.hypotest(POI_val, data, model, test_stat="qtilde", return_expected_set=True)
    if all_results:
        return results
    else:
        # Unpack results
        CL_obs, (CL_exp_m2sigma, CL_exp_m1sigma, CL_exp, CL_exp_p1sigma, CL_exp_p2sigma) = results
        return float(CL_exp)

def plot_limits_1bin(S, B, axes=None):
    if B <= 0:
        B = 1e-6
    model = pyhf.simplemodels.hepdata_like(
        signal_data=[S], bkg_data=[B], bkg_uncerts=[0.58*np.sqrt(B)]
    )
    data = [B] + model.config.auxdata

    POI_vals = np.linspace(0, 5, 41) # points of interest (mu)
    results = [
        pyhf.infer.hypotest(
            POI_val, data, model, test_stat="qtilde", return_expected_set=True
        )
        for POI_val in POI_vals
    ]

    if not axes:
        fig, axes = plt.subplots(figsize=(12, 9))
    return pyhf.contrib.viz.brazil.plot_results(axes, POI_vals, results)

def upperlimit_auto(data, model, low, high, level=0.05, atol=2e-12, rtol=1e-15,
                    obs_eq_exp=False):
    """
    Calculate an upper limit interval ``(0, poi_up)`` for a single
    Parameter of Interest (POI) using an automatic scan through
    POI-space, using the TOMS748 algorithm.

    ..., mostly copied from upperlimit docstring.
    """

    def f_all(mu):
        if high > 10:
            par_bounds=[(b[0], high) for b in model.config.suggested_bounds()]
        else:
            par_bounds=None
        return pyhf.infer.hypotest(
            mu, data, model, test_stat="qtilde", return_expected_set=True, par_bounds=par_bounds
        )

    def f(mu, limit=0):
        # Use integers for limit so we don't need a string comparison
        if limit == 0:
            # Obs
            return f_all(mu)[0] - level
        else:
            # Exp (These are in the order -2, -1, 0, 1, 2 sigma)
            return f_all(mu)[1][limit - 1] - level

    tb, _ = pyhf.get_backend()
    obs = tb.astensor(toms748(f, low, high, args=(0), k=2, xtol=atol, rtol=rtol))
    if obs_eq_exp:
        return float(obs)
    else:
        exp = [
            tb.astensor(toms748(f, low, high, args=(i), k=2, xtol=atol, rtol=rtol))
            for i in range(1, 6)
        ]
        return float(obs), exp

def figure_of_merit(sig_counts, bkg_counts, bkg_errors=[], obs_fluctuations=[], 
                    low=0.1, high=10000, tol=0.001):
    sig_counts = [S if S > 0 else 1e-6 for S in sig_counts]
    bkg_counts = [B if B > 0 else 1e-6 for B in bkg_counts]
    if len(bkg_errors) == 0:
        bkg_errors = [0.6*B for B in bkg_counts]
    model = pyhf.simplemodels.hepdata_like(
        signal_data=sig_counts,
        bkg_data=bkg_counts,
        bkg_uncerts=bkg_errors
    )
    if len(obs_fluctuations) == 0:
        obs_fluctuations = [0 for _ in bkg_counts]
    data = list(np.array(bkg_counts) + np.array(obs_fluctuations)) + model.config.auxdata
    try:
        return upperlimit_auto(
            data, model,
            low, high,
            level=0.05, atol=tol, rtol=tol,
            obs_eq_exp=True
        )
    except ValueError as e:
        print(f"WARNING: {e}")
        print(f"Defaulting to error-handling behaviour: returning {high}")
        return high

In [6]:
df = get_df(signal="vbshwwlvlvbb_c3")
df

Unnamed: 0_level_0,is_ps_el,is_ps_mu,is_ps_tau,is_ps_neg,is_ps_lgt,is_ps,drbb,mbb,mbbIn,ptbb,...,tr_lep_cent,dilep_cent,ld_bjet_cent,tr_bjet_cent,H_cent,m_l0bb,m_l1bb,name,is_signal,weight
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,False,True,False,False,True,True,2.633019,160.106079,0,100.903931,...,0.752227,0.668484,0.720484,1.245227,1.034878,336.586578,190.603271,vbshwwlvlvbb_c3,True,0.000346
1,False,False,False,True,False,True,1.511868,179.537537,0,205.446213,...,0.395541,0.137209,0.097592,0.751077,0.464564,334.771118,214.284042,vbshwwlvlvbb_c3,True,0.001729
2,False,True,False,False,True,True,1.566205,125.546730,1,169.493591,...,0.435795,0.584926,0.150139,0.524538,0.281554,314.028961,173.651428,vbshwwlvlvbb_c3,True,0.002007
3,True,False,False,False,True,True,0.589889,124.385864,1,410.051636,...,0.517018,0.006332,0.118691,0.051637,0.045191,733.555847,430.192505,vbshwwlvlvbb_c3,True,0.001773
4,False,True,False,False,True,True,2.321324,108.144234,1,61.758877,...,0.202318,0.230248,0.711445,0.225783,0.686864,239.142349,170.546555,vbshwwlvlvbb_c3,True,0.001955
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9166,False,True,False,False,True,True,2.173155,176.771103,0,107.177139,...,0.262567,1.093112,0.251284,0.628896,0.679815,489.268982,210.979233,ttz,False,0.001524
9167,False,False,True,False,False,True,3.277597,399.772156,0,18.825386,...,0.434180,0.752075,0.178681,0.529697,0.492569,731.045044,495.866882,ttz,False,0.001239
9168,False,True,False,False,True,True,3.269205,246.192139,0,84.842918,...,0.480486,0.846757,1.391145,0.134421,0.564388,873.469971,373.930756,ttz,False,0.001301
9169,False,False,False,True,False,True,2.359409,236.620605,0,135.982590,...,0.199740,0.849951,0.056898,0.805816,0.372169,963.794495,299.590302,ttz,False,-0.001245


In [7]:
# Algorithmically chosen and rounded
c3_sr = "lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5.5" # (AND Mbb WINDOW IMPLICITLY!!)

rows = []
for name, channel in STRAWMEN(df):
    in_c3_sr = df.eval(c3_sr)
    sig_count = df[channel & in_c3_sr &  df.is_signal].weight.sum()
    bkg_count = df[channel & in_c3_sr & ~df.is_signal].weight.sum()
    rows.append({
        "channel": name,
        "region": c3_sr,
        "sig": sig_count,
        "bkg": bkg_count,
        "limit": figure_of_merit([sig_count], [bkg_count], low=1, high=3500.)
    })
    
table = pd.DataFrame(data=rows)
display(table)
print(table.to_csv(index=False))
print(figure_of_merit(table.sig.values, table.bkg.values, low=1, high=1000.))
print(figure_of_merit([table.sig.sum()], [table.bkg.sum()], low=1, high=1000.))

Unnamed: 0,channel,region,sig,bkg,limit
0,mu+l+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 a...,0.782884,0.536999,3.781392
1,e+l+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 a...,0.488734,0.361716,5.548416
2,l+tau+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 a...,0.181553,1.39444,21.716473
3,lep-lep-,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 a...,0.291952,1.805307,14.915262


channel,region,sig,bkg,limit
mu+l+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5.5,0.7828844785690308,0.5369994640350342,3.781392366074849
e+l+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5.5,0.4887343645095825,0.361716091632843,5.548416250524063
l+tau+,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5.5,0.18155300617218018,1.3944400548934937,21.716472766135368
lep-lep-,lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5.5,0.29195207357406616,1.8053066730499268,14.915261793836727

2.512606816439076
3.687205647297052


In [8]:
# Original BSM signal regions
base_c2v_sr = "detajj >= 5"
tight_c2v_sr = "lt >= 500 and st >= 900"
loose_c2v_sr = f"lt >= 300 and st >= 700 and not ({tight_c2v_sr})"
# 4 tight bins
rows = []
for name, channel in STRAWMEN(df):
    in_c2v_sr = df.eval(f"{base_c2v_sr} and {tight_c2v_sr}")
    sig_counts = df[channel & in_c2v_sr &  df.is_signal].weight.values
    bkg_counts = df[channel & in_c2v_sr & ~df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": f"{base_c2v_sr} and {tight_c2v_sr}",
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=10, high=10000.)
    })
# 1 loose bin
in_c2v_sr = df.eval(f"{base_c2v_sr} and {loose_c2v_sr}")
sig_counts = df[STRAWMAN_LEP_LEP(df) & in_c2v_sr &  df.is_signal].weight.values
bkg_counts = df[STRAWMAN_LEP_LEP(df) & in_c2v_sr & ~df.is_signal].weight.values
rows.append({
    "channel": "l+l+",
    "region": f"{base_c2v_sr} and {loose_c2v_sr}",
    "sig": sig_counts.sum(),
    "sig_err": np.sqrt(np.sum(sig_counts**2)),
    "bkg": bkg_counts.sum(),
    "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
    "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=1000.)
})

table = pd.DataFrame(data=rows)
display(table)
print(table.to_csv(index=False))

bkg_stat_err = table.bkg_err.values
bkg_syst_err = 0.6*np.array(table.bkg.values)
bkg_errors = list(np.sqrt(bkg_syst_err**2 + bkg_stat_err**2))
print(figure_of_merit(table.sig.values, table.bkg.values, bkg_errors=bkg_errors, low=1, high=1000.))
print(figure_of_merit([table.sig.sum()], [table.bkg.sum()], low=1, high=2000.))

Unnamed: 0,channel,region,sig,sig_err,bkg,bkg_err,limit
0,mu+l+,detajj >= 5 and lt >= 500 and st >= 900,0.205124,0.018615,0.053186,0.03234,10.327914
1,e+l+,detajj >= 5 and lt >= 500 and st >= 900,0.114149,0.013108,0.00335,0.009005,17.013305
2,l+tau+,detajj >= 5 and lt >= 500 and st >= 900,0.05712,0.009092,0.007102,0.003204,34.325587
3,lep-lep-,detajj >= 5 and lt >= 500 and st >= 900,0.051162,0.009017,0.072579,0.050715,42.426717
4,l+l+,detajj >= 5 and lt >= 300 and st >= 700 and no...,0.387715,0.024729,0.097092,0.022169,5.753501


channel,region,sig,sig_err,bkg,bkg_err,limit
mu+l+,detajj >= 5 and lt >= 500 and st >= 900,0.20512357354164124,0.018614841625094414,0.053185705095529556,0.03233965113759041,10.327914352394082
e+l+,detajj >= 5 and lt >= 500 and st >= 900,0.11414901912212372,0.013107752427458763,0.003349980339407921,0.009004796855151653,17.01330472280167
l+tau+,detajj >= 5 and lt >= 500 and st >= 900,0.057120125740766525,0.00909178052097559,0.007101947441697121,0.0032036947086453438,34.32558707165312
lep-lep-,detajj >= 5 and lt >= 500 and st >= 900,0.05116163194179535,0.009017334319651127,0.07257908582687378,0.05071507394313812,42.426717343853284
l+l+,detajj >= 5 and lt >= 300 and st >= 700 and not (lt >= 500 and st >= 900),0.38771456480026245,0.024729494005441666,0.09709172695875168,0.022168656811118126,5.7535011074535625

2.977094645020018
3.0700621748834975


In [9]:
# Algorithmically chosen and rounded and outside of other SRs
c2v_sr = "detajj > 5 and lt >= 300 and st >= 700"
# c3_sr = f"lt >= 200.0 and st >= 400.0 and mjj >= 950.0 and detajj >= 5 and not ({c2v_sr})" # (AND Mbb WINDOW IMPLICITLY!!)
c3_sr = f"lt >= 200.0 and st >= 500.0 and ptbb > 100 and not ({c2v_sr})"

rows = []
for name, channel in STRAWMEN(df):
    in_c3_sr = df.eval(c3_sr)
    sig_counts = df[channel & in_c3_sr &  df.is_signal].weight.values
    bkg_counts = df[channel & in_c3_sr & ~df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": c3_sr,
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=10000.)
    })
    
table = pd.DataFrame(data=rows)
display(table)
print(table.to_csv(index=False))
print(figure_of_merit(table.sig.values, table.bkg.values, low=1, high=1000.))
print(figure_of_merit([table.sig.sum()], [table.bkg.sum()], low=1, high=1000.))

Unnamed: 0,channel,region,sig,sig_err,bkg,bkg_err,limit
0,mu+l+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and...,0.501651,0.028951,1.212314,0.10481,7.479043
1,e+l+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and...,0.253045,0.019405,0.715389,0.100175,12.599585
2,l+tau+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and...,0.103855,0.012474,2.028871,0.270042,44.009678
3,lep-lep-,lt >= 200.0 and st >= 500.0 and ptbb > 100 and...,0.197416,0.01751,3.104055,0.282855,28.1703


channel,region,sig,sig_err,bkg,bkg_err,limit
mu+l+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and not (detajj > 5 and lt >= 300 and st >= 700),0.501650869846344,0.02895050123333931,1.2123140096664429,0.10481015592813492,7.479043317545965
e+l+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and not (detajj > 5 and lt >= 300 and st >= 700),0.25304484367370605,0.019405202940106392,0.7153888940811157,0.10017503052949905,12.599584571565405
l+tau+,lt >= 200.0 and st >= 500.0 and ptbb > 100 and not (detajj > 5 and lt >= 300 and st >= 700),0.10385473817586899,0.012474065646529198,2.0288712978363037,0.2700420618057251,44.009677602693365
lep-lep-,lt >= 200.0 and st >= 500.0 and ptbb > 100 and not (detajj > 5 and lt >= 300 and st >= 700),0.19741596281528473,0.0175103098154068,3.1040549278259277,0.28285452723503113,28.170300074013248

5.473926715958591
8.452189050694322


In [10]:
# Algorithmically chosen and rounded and outside of other SRs plus original BSM SRs
c2v_sr = "detajj > 5 and lt >= 300 and st >= 700"
# c3_sr = f"lt >= 200.0 and st >= 400.0 and mjj >= 1000.0 and detajj > 5 and not ({c2v_sr})" # (AND Mbb WINDOW IMPLICITLY!!)
c3_sr = f"lt >= 200.0 and st >= 400.0 and detajj > 5 and not ({c2v_sr})" # (AND Mbb WINDOW IMPLICITLY!!)

rows = []
for name, channel in STRAWMEN(df):
    in_c3_sr = df.eval(c3_sr)
    sig_counts = df[channel & in_c3_sr &  df.is_signal].weight.values
    bkg_counts = df[channel & in_c3_sr & ~df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": c3_sr,
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=1000.),
        "is_C3_sr": True
    })

base_c2v_sr = "detajj >= 5"
tight_c2v_sr = "lt >= 500 and st >= 900"
loose_c2v_sr = f"lt >= 300 and st >= 700 and not ({tight_c2v_sr})"
for name, channel in STRAWMEN(df):
    in_c2v_sr = df.eval(f"{base_c2v_sr} and {tight_c2v_sr}")
    sig_counts = df[channel & in_c2v_sr &  df.is_signal].weight.values
    bkg_counts = df[channel & in_c2v_sr & ~df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": f"{base_c2v_sr} and {tight_c2v_sr}",
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=1000.),
        "is_C3_sr": False
    })
in_c2v_sr = df.eval(f"{base_c2v_sr} and {loose_c2v_sr}")
sig_counts = df[STRAWMAN_LEP_LEP(df) & in_c2v_sr &  df.is_signal].weight.values
bkg_counts = df[STRAWMAN_LEP_LEP(df) & in_c2v_sr & ~df.is_signal].weight.values
rows.append({
    "channel": name,
    "region": f"{base_c2v_sr} and {loose_c2v_sr}",
    "sig": sig_counts.sum(),
    "sig_err": np.sqrt(np.sum(sig_counts**2)),
    "bkg": bkg_counts.sum(),
    "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
    "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=1000.),
    "is_C3_sr": False
})

table = pd.DataFrame(data=rows)
display(table)
print(table.to_csv(index=False))
print(figure_of_merit(table.sig.values, table.bkg.values, low=1, high=1000.))

Unnamed: 0,channel,region,sig,sig_err,bkg,bkg_err,limit,is_C3_sr
0,mu+l+,lt >= 200.0 and st >= 400.0 and detajj > 5 and...,0.577872,0.030891,0.966229,0.137156,6.026403,True
1,e+l+,lt >= 200.0 and st >= 400.0 and detajj > 5 and...,0.345024,0.022667,0.644597,0.158601,8.98533,True
2,l+tau+,lt >= 200.0 and st >= 400.0 and detajj > 5 and...,0.141013,0.014288,2.780226,0.328015,37.368484,True
3,lep-lep-,lt >= 200.0 and st >= 400.0 and detajj > 5 and...,0.255212,0.019894,3.776705,0.349602,24.117841,True
4,mu+l+,detajj >= 5 and lt >= 500 and st >= 900,0.205124,0.018615,0.053186,0.03234,10.327915,False
5,e+l+,detajj >= 5 and lt >= 500 and st >= 900,0.114149,0.013108,0.00335,0.009005,17.013303,False
6,l+tau+,detajj >= 5 and lt >= 500 and st >= 900,0.05712,0.009092,0.007102,0.003204,34.325587,False
7,lep-lep-,detajj >= 5 and lt >= 500 and st >= 900,0.051162,0.009017,0.072579,0.050715,42.426717,False
8,lep-lep-,detajj >= 5 and lt >= 300 and st >= 700 and no...,0.387715,0.024729,0.097092,0.022169,5.753501,False


channel,region,sig,sig_err,bkg,bkg_err,limit,is_C3_sr
mu+l+,lt >= 200.0 and st >= 400.0 and detajj > 5 and not (detajj > 5 and lt >= 300 and st >= 700),0.5778721570968628,0.030891060829162598,0.966229259967804,0.13715556263923645,6.026402603276512,True
e+l+,lt >= 200.0 and st >= 400.0 and detajj > 5 and not (detajj > 5 and lt >= 300 and st >= 700),0.34502357244491577,0.022667158395051956,0.6445968747138977,0.15860146284103394,8.985329863429937,True
l+tau+,lt >= 200.0 and st >= 400.0 and detajj > 5 and not (detajj > 5 and lt >= 300 and st >= 700),0.14101330935955048,0.01428823359310627,2.780226230621338,0.3280152380466461,37.368483946295505,True
lep-lep-,lt >= 200.0 and st >= 400.0 and detajj > 5 and not (detajj > 5 and lt >= 300 and st >= 700),0.25521188974380493,0.019893553107976913,3.776704788208008,0.34960225224494934,24.117840646101474,True
mu+l+,detajj >= 5 and lt >= 500 and st >= 900,0.20512357354164124,0.018614841625094414,0.053185705095529556,0.03233965113759041,10.327914511565

In [11]:
textable = table.round(3).copy()
textable["Signal Region"] = [
    "$\muchan$", "$\elchan$", "$\tauchan$", "$\negchan$",
    "$\muchan$", "$\elchan$", "$\tauchan$", "$\negchan$", "$\lgtchan$"
]
textable["$N_{sig}$"] = textable.sig.astype(str)+" $\pm$ "+textable.sig_err.astype(str)
textable["$N_{bkg}$"] = textable.bkg.astype(str)+" $\pm$ "+textable.bkg_err.astype(str)
textable["$\mu_{0.05}$"] = textable.limit
textable.drop(columns=table.columns, inplace=True)
textable = textable.reindex([1,0,2,3,5,4,6,7,8])
print(textable.to_latex(index=False, escape=False))

\begin{tabular}{lllr}
\toprule
Signal Region &          $N_{sig}$ &          $N_{bkg}$ &  $\mu_{0.05}$ \\
\midrule
    $\elchan$ &  0.345 $\pm$ 0.023 &  0.645 $\pm$ 0.159 &         8.985 \\
    $\muchan$ &  0.578 $\pm$ 0.031 &  0.966 $\pm$ 0.137 &         6.026 \\
   $\tauchan$ &  0.141 $\pm$ 0.014 &   2.78 $\pm$ 0.328 &        37.368 \\
   $\negchan$ &   0.255 $\pm$ 0.02 &   3.777 $\pm$ 0.35 &        24.118 \\
    $\elchan$ &  0.114 $\pm$ 0.013 &  0.003 $\pm$ 0.009 &        17.013 \\
    $\muchan$ &  0.205 $\pm$ 0.019 &  0.053 $\pm$ 0.032 &        10.328 \\
   $\tauchan$ &  0.057 $\pm$ 0.009 &  0.007 $\pm$ 0.003 &        34.326 \\
   $\negchan$ &  0.051 $\pm$ 0.009 &  0.073 $\pm$ 0.051 &        42.427 \\
   $\lgtchan$ &  0.388 $\pm$ 0.025 &  0.097 $\pm$ 0.022 &         5.754 \\
\bottomrule
\end{tabular}



In [12]:
bsm_df = get_df()
bsm_df

Unnamed: 0_level_0,is_ps_el,is_ps_mu,is_ps_tau,is_ps_neg,is_ps_lgt,is_ps,drbb,mbb,mbbIn,ptbb,...,tr_lep_cent,dilep_cent,ld_bjet_cent,tr_bjet_cent,H_cent,m_l0bb,m_l1bb,name,is_signal,weight
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,True,False,False,False,True,True,0.497081,25.816963,1,96.786095,...,0.308719,0.226823,1.121493,1.175936,1.148793,473.280151,242.922455,bosons,False,0.000000
1,False,False,False,True,False,True,1.646614,61.345638,1,65.223434,...,0.279254,0.089646,0.145967,0.600519,0.399470,173.505966,181.829041,bosons,False,0.005594
2,False,False,False,True,False,True,3.262026,166.077347,0,213.228149,...,0.428899,0.180906,0.004749,0.497544,0.051179,487.613953,258.246368,bosons,False,0.000000
3,False,True,False,False,True,True,0.523911,26.258171,1,80.480110,...,0.404866,0.638758,0.478509,0.280626,0.419247,170.880524,318.585052,bosons,False,0.005536
4,False,True,False,False,True,True,2.596516,94.711746,1,51.146160,...,0.810273,0.924421,0.336551,0.147055,0.562886,311.905731,245.789078,bosons,False,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9166,False,True,False,False,True,True,2.173155,176.771103,0,107.177139,...,0.262567,1.093112,0.251284,0.628896,0.679815,489.268982,210.979233,ttz,False,0.001524
9167,False,False,True,False,False,True,3.277597,399.772156,0,18.825386,...,0.434180,0.752075,0.178681,0.529697,0.492569,731.045044,495.866882,ttz,False,0.001239
9168,False,True,False,False,True,True,3.269205,246.192139,0,84.842918,...,0.480486,0.846757,1.391145,0.134421,0.564388,873.469971,373.930756,ttz,False,0.001301
9169,False,False,False,True,False,True,2.359409,236.620605,0,135.982590,...,0.199740,0.849951,0.056898,0.805816,0.372169,963.794495,299.590302,ttz,False,-0.001245


In [13]:
c2v_sr = "detajj > 5 and lt >= 300 and st >= 700"
c3_sr = f"st >= 300 and mjj >= 1700 and detajj >= 4 and not ({c2v_sr})" # (AND Mbb WINDOW IMPLICITLY!!)

rows = []
for name, channel in STRAWMEN(bsm_df):
    in_c3_sr = bsm_df.eval(c3_sr)
    sig_counts = bsm_df[channel & in_c3_sr &  bsm_df.is_signal].weight.values
    bkg_counts = bsm_df[channel & in_c3_sr & ~bsm_df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": c3_sr,
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=1, high=10000.),
        "is_C3_sr": True
    })
    
base_c2v_sr = "detajj >= 5"
tight_c2v_sr = "lt >= 500 and st >= 900"
loose_c2v_sr = f"lt >= 300 and st >= 700 and not ({tight_c2v_sr})"
# 4 tight bins
for name, channel in STRAWMEN(bsm_df):
    in_c2v_sr = bsm_df.eval(f"{base_c2v_sr} and {tight_c2v_sr}")
    sig_counts = bsm_df[channel & in_c2v_sr &  bsm_df.is_signal].weight.values
    bkg_counts = bsm_df[channel & in_c2v_sr & ~bsm_df.is_signal].weight.values
    rows.append({
        "channel": name,
        "region": f"{base_c2v_sr} and {tight_c2v_sr}",
        "sig": sig_counts.sum(),
        "sig_err": np.sqrt(np.sum(sig_counts**2)),
        "bkg": bkg_counts.sum(),
        "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
        "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=0.1, high=10.),
        "is_C3_sr": False
    })
# 1 loose bin
in_c2v_sr = bsm_df.eval(f"{base_c2v_sr} and {loose_c2v_sr}")
sig_counts = bsm_df[STRAWMAN_LEP_LEP(bsm_df) & in_c2v_sr &  bsm_df.is_signal].weight.values
bkg_counts = bsm_df[STRAWMAN_LEP_LEP(bsm_df) & in_c2v_sr & ~bsm_df.is_signal].weight.values
rows.append({
    "channel": "l+l+",
    "region": f"{base_c2v_sr} and {loose_c2v_sr}",
    "sig": sig_counts.sum(),
    "sig_err": np.sqrt(np.sum(sig_counts**2)),
    "bkg": bkg_counts.sum(),
    "bkg_err": np.sqrt(np.sum(bkg_counts**2)),
    "limit": figure_of_merit([sig_counts.sum()], [bkg_counts.sum()], low=0.1, high=10.),
    "is_C3_sr": False
})
    
table = pd.DataFrame(data=rows)
display(table)
print(table.to_csv(index=False))

# Compute errors
bkg_stat_err = table.bkg_err.values
bkg_syst_err = 0.6*np.array(table.bkg.values)
bkg_errors = list(np.sqrt(bkg_syst_err**2 + bkg_stat_err**2))

print("Using C2V+C3 SRs:")
print(figure_of_merit(table.sig.values, table.bkg.values, bkg_errors=bkg_errors, low=0.1, high=10.))
print("Variations:")
for i, name in enumerate(table[table.is_C3_sr].channel.values):
    obs_fluctuations = [0, 0, 0, 0, 0, 0, 0, 0, 0]
    obs_fluctuations[i] = 1
    fom = figure_of_merit(table.sig.values, table.bkg.values, bkg_errors=bkg_errors,
                          obs_fluctuations=obs_fluctuations, 
                          low=0.1, high=10.)
    print(f"Varying obs events in C3 {name} SR by +1 event: {fom}")
    obs_fluctuations[i] = 2
    fom = figure_of_merit(table.sig.values, table.bkg.values, bkg_errors=bkg_errors,
                          obs_fluctuations=obs_fluctuations, 
                          low=0.1, high=10.)
    print(f"Varying obs events in C3 {name} SR by +2 events: {fom}")

Unnamed: 0,channel,region,sig,sig_err,bkg,bkg_err,limit,is_C3_sr
0,mu+l+,st >= 300 and mjj >= 1700 and detajj >= 4 and ...,0.306287,0.017215,0.564734,0.076037,9.785526,True
1,e+l+,st >= 300 and mjj >= 1700 and detajj >= 4 and ...,0.148198,0.011227,0.627001,0.154663,20.768195,True
2,l+tau+,st >= 300 and mjj >= 1700 and detajj >= 4 and ...,0.084044,0.008135,1.734802,0.270836,50.989524,True
3,lep-lep-,st >= 300 and mjj >= 1700 and detajj >= 4 and ...,0.090218,0.009089,1.803916,0.233192,48.251872,True
4,mu+l+,detajj >= 5 and lt >= 500 and st >= 900,1.658314,0.040238,0.053186,0.03234,1.277502,False
5,e+l+,detajj >= 5 and lt >= 500 and st >= 900,0.863704,0.027407,0.00335,0.009005,2.248522,False
6,l+tau+,detajj >= 5 and lt >= 500 and st >= 900,0.437412,0.019123,0.007102,0.003204,4.482465,False
7,lep-lep-,detajj >= 5 and lt >= 500 and st >= 900,0.423834,0.019629,0.072579,0.050715,5.121393,False
8,l+l+,detajj >= 5 and lt >= 300 and st >= 700 and no...,0.658006,0.024757,0.097092,0.022169,3.390117,False


channel,region,sig,sig_err,bkg,bkg_err,limit,is_C3_sr
mu+l+,st >= 300 and mjj >= 1700 and detajj >= 4 and not (detajj > 5 and lt >= 300 and st >= 700),0.30628710985183716,0.01721525564789772,0.5647343993186951,0.07603704184293747,9.785525973669888,True
e+l+,st >= 300 and mjj >= 1700 and detajj >= 4 and not (detajj > 5 and lt >= 300 and st >= 700),0.14819835126399994,0.0112268952652812,0.6270008087158203,0.1546628326177597,20.76819459172367,True
l+tau+,st >= 300 and mjj >= 1700 and detajj >= 4 and not (detajj > 5 and lt >= 300 and st >= 700),0.08404399454593658,0.00813460722565651,1.73480224609375,0.2708361744880676,50.989523586671744,True
lep-lep-,st >= 300 and mjj >= 1700 and detajj >= 4 and not (detajj > 5 and lt >= 300 and st >= 700),0.09021788835525513,0.009089270606637001,1.8039155006408691,0.2331920564174652,48.2518720458487,True
mu+l+,detajj >= 5 and lt >= 500 and st >= 900,1.658313512802124,0.040238093584775925,0.053185705095529556,0.03233965113759041,1.2775019211319525,False
e

In [14]:
display(table[~table.is_C3_sr])

bkg_stat_err = table[~table.is_C3_sr].bkg_err.values
bkg_syst_err = 0.6*np.array(table[~table.is_C3_sr].bkg.values)
bkg_errors = list(np.sqrt(bkg_syst_err**2 + bkg_stat_err**2))

print("Using only C2V SRs:")
print(figure_of_merit(table[~table.is_C3_sr].sig.values, table[~table.is_C3_sr].bkg.values, 
                      bkg_errors=bkg_errors, low=0.1, high=10.))
print("Variations:")
for i, name in enumerate(table[~table.is_C3_sr].channel.values):
    obs_fluctuations = [0, 0, 0, 0, 0]
    obs_fluctuations[i] = 1
    fom = figure_of_merit(table[~table.is_C3_sr].sig.values, table[~table.is_C3_sr].bkg.values, 
                          bkg_errors=bkg_errors,
                          obs_fluctuations=obs_fluctuations, 
                          low=0.1, high=10.)
    print(f"Varying obs events in C2V {name} SR by +1 event: {fom}")
    obs_fluctuations[i] = 2
    fom = figure_of_merit(table[~table.is_C3_sr].sig.values, table[~table.is_C3_sr].bkg.values, 
                          bkg_errors=bkg_errors,
                          obs_fluctuations=obs_fluctuations, 
                          low=0.1, high=10.)
    print(f"Varying obs events in C2V {name} SR by +2 events: {fom}")

Unnamed: 0,channel,region,sig,sig_err,bkg,bkg_err,limit,is_C3_sr
4,mu+l+,detajj >= 5 and lt >= 500 and st >= 900,1.658314,0.040238,0.053186,0.03234,1.277502,False
5,e+l+,detajj >= 5 and lt >= 500 and st >= 900,0.863704,0.027407,0.00335,0.009005,2.248522,False
6,l+tau+,detajj >= 5 and lt >= 500 and st >= 900,0.437412,0.019123,0.007102,0.003204,4.482465,False
7,lep-lep-,detajj >= 5 and lt >= 500 and st >= 900,0.423834,0.019629,0.072579,0.050715,5.121393,False
8,l+l+,detajj >= 5 and lt >= 300 and st >= 700 and no...,0.658006,0.024757,0.097092,0.022169,3.390117,False


Using only C2V SRs:
0.5916254659330469
Variations:
Varying obs events in C2V mu+l+ SR by +1 event: 1.016559895320274
Varying obs events in C2V mu+l+ SR by +2 events: 1.387965554253165
Varying obs events in C2V e+l+ SR by +1 event: 1.0376223091326264
Varying obs events in C2V e+l+ SR by +2 events: 1.4114383643531454
Varying obs events in C2V l+tau+ SR by +1 event: 1.025953639457816
Varying obs events in C2V l+tau+ SR by +2 events: 1.3995121427565747
Varying obs events in C2V lep-lep- SR by +1 event: 0.8836985621351543
Varying obs events in C2V lep-lep- SR by +2 events: 1.2057919389647063
Varying obs events in C2V l+l+ SR by +1 event: 0.9234140931987093
Varying obs events in C2V l+l+ SR by +2 events: 1.273153089446546
