In [2]:
import dask_awkward as dak
import awkward as ak
from distributed import LocalCluster, Client, progress
import time
import numpy as np
import matplotlib.pyplot as plt
import json
import mplhep as hep
import glob
import pandas as pd

# Apply the CMS style sheet shipped with mplhep to all matplotlib figures.
plt.style.use(hep.style.CMS)

# Start a local Dask cluster (15 worker processes, 2 threads each, 8 GiB memory
# limit) and connect a client to it.
# NOTE(review): another cell of this notebook also calls Client(...); running both
# in one kernel starts a second cluster -- confirm only one is intended.
client =  Client(n_workers=15,  threads_per_worker=2, processes=True, memory_limit='8 GiB') 


Perhaps you already have a cluster running?
Hosting the HTTP server on port 44007 instead
Task exception was never retrieved
future: <Task finished name='Task-5055053' coro=<Client._gather.<locals>.wait() done, defined at /depot/cms/kernels/root632/lib/python3.12/site-packages/distributed/client.py:2197> exception=AllExit()>
Traceback (most recent call last):
  File "/depot/cms/kernels/root632/lib/python3.12/site-packages/distributed/client.py", line 2206, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task finished name='Task-5055031' coro=<Client._gather.<locals>.wait() done, defined at /depot/cms/kernels/root632/lib/python3.12/site-packages/distributed/client.py:2197> exception=AllExit()>
Traceback (most recent call last):
  File "/depot/cms/kernels/root632/lib/python3.12/site-packages/distributed/client.py", line 2206, in wait
    raise AllExit()
distributed.client.AllExit
Task exception was never retrieved
future: <Task finished 

In [2]:
# Sample names used for training, keyed by class.
training_samples = {
    # for some reason, having more than dy causes things to break
    "background": [
        "dy_M-100To200",
        "ttjets_dl",
        "ttjets_sl",
        "st_tw_top",
        "st_tw_antitop",
        "ww_2l2nu",
        "wz_1l1nu2q",
        "wz_2l2q",
        "wz_3lnu",
        "zz",
        "ewk_lljj_mll50_mjj120",
    ],
    "signal": ["ggh_powheg", "vbf_powheg"],
}

# Background sample names grouped under process labels.
MC_groups = dict(
    DY=["dy_M-100To200"],
    Top=["ttjets_dl", "ttjets_sl", "st_tw_top", "st_tw_antitop"],
    VV=["ww_2l2nu", "wz_1l1nu2q", "wz_2l2q", "wz_3lnu", "zz"],
    EWK=["ewk_lljj_mll50_mjj120"],
)

In [3]:
# Record the wall-clock time at which this cell ran.
start_time = time.time()

# Fields that are the primary target of the study.
cols_of_interest = [
    'dimuon_mass',
]

# Extra fields loaded alongside the columns of interest (weights, region flags,
# b-tag counts, muon / dimuon / jet kinematics, event number).
additional_fields = [
    "wgt_nominal_total",
    "h_sidebands",
    "h_peak",
    "vbf_cut",
    "nBtagLoose",
    "nBtagMedium",
    "mu1_pt",
    "mu2_pt",
    "mu1_pt_roch",
    "mu2_pt_roch",
    "mu1_pt_fsr",
    "mu2_pt_fsr",
    # "mu1_pt_gf",
    # "mu2_pt_gf",
    "mu1_pt_raw",
    "mu1_eta",
    "mu2_eta",
    "mu1_phi",
    "mu2_phi",
    "dimuon_pt",
    "dimuon_eta",
    "dimuon_phi",
    "dimuon_mass",
    "jet1_rapidity",
    "jet2_rapidity",
    "jet1_phi",
    "jet1_pt",
    "jet2_pt",
    "jet2_phi",
    "jet1_eta",
    "jet2_eta",
    "jj_mass",
    "jj_dEta",
    "event"
]

# De-duplicate while keeping first-seen order. list(set(...)) would shuffle the
# fields differently in every kernel session (string hashing is randomised),
# which makes the field order of the zipped arrays irreproducible.
fields2compute = list(dict.fromkeys(cols_of_interest + additional_fields))


In [2]:
def getHist(events, field2plot, binning):
    """
    Build a weighted histogram of one field of `events`, normalised to unit sum.

    None entries are replaced first (weights by 0, values by -999); entries
    whose weight or value is NaN are then dropped. Diagnostic sums and arrays
    are printed along the way.

    Returns
    -------
    (hist, edges) : the np.histogram output, with hist divided by its own sum.
    """
    filled_weight = ak.fill_none(events.wgt_nominal_total, value=0)
    filled_value = ak.fill_none(events[field2plot], value=-999)

    # fill_none does not touch NaN, so NaN entries are masked out separately
    is_finite_pair = ~(np.isnan(filled_weight) | np.isnan(filled_value))
    weight = ak.values_astype(filled_weight[is_finite_pair], np.float64)
    value = filled_value[is_finite_pair]

    print(f"getHist weight sum: {np.sum(weight)}")
    print(f"getHist value sum: {np.sum(value)}")
    print(f"getHist weight: {weight}")
    print(f"getHist value: {value}")

    counts, edges = np.histogram(value, bins=binning, weights=weight)
    print(f"getHist hist b4 normalization: {counts}")

    # scale so that the bin contents add up to one
    hist = counts / np.sum(counts)
    print(f"np.sum(hist): {np.sum(hist)}")
    return hist, edges

def applyGGH_cut(events):
    """
    Return the events passing the ggH selection.

    An event is kept when all of the following hold:
      * it is not b-tagged (fewer than 2 loose and no medium b-tags; a missing
        count is treated as not tagged),
      * it fails the dijet requirement jj_mass > 400 and jj_dEta > 2.5
        (a missing dijet counts as failing it),
      * 110 <= dimuon_mass <= 150.
    """
    two_loose_btags = ak.fill_none((events.nBtagLoose >= 2), value=False)
    one_medium_btag = ak.fill_none((events.nBtagMedium >= 1), value=False)
    btag_cut = two_loose_btags | one_medium_btag

    dijet_selection = (events.jj_mass > 400) & (events.jj_dEta > 2.5)
    vbf_cut = ak.fill_none(dijet_selection, value=False)

    mass = events.dimuon_mass
    in_mass_window = (mass >= 110) & (mass <= 150.0)

    # btag veto: b-tagged events belong to the VH and ttH categories
    ggH_filter = ~vbf_cut & in_mass_window & ~btag_cut
    return events[ggH_filter]


def applyVBF_cut(events):
    """
    Return the events passing the VBF selection.

    An event is kept when all of the following hold:
      * jj_mass > 400, jj_dEta > 2.5 and jet1_pt > 35 (a missing value fails),
      * it is not b-tagged (fewer than 2 loose and no medium b-tags),
      * 115.03 < dimuon_mass < 135.03.

    Side effect: the boolean selection is also written onto `events` as the
    "vbf_filter" field before the selected subset is returned.
    """
    two_loose_btags = ak.fill_none((events.nBtagLoose >= 2), value=False)
    one_medium_btag = ak.fill_none((events.nBtagMedium >= 1), value=False)
    btag_cut = two_loose_btags | one_medium_btag

    dijet_selection = (events.jj_mass > 400) & (events.jj_dEta > 2.5) & (events.jet1_pt > 35)
    vbf_cut = ak.fill_none(dijet_selection, value=False)

    mass = events.dimuon_mass
    in_mass_window = (mass > 115.03) & (mass < 135.03)

    # btag veto: b-tagged events belong to the VH and ttH categories
    VBF_filter = vbf_cut & in_mass_window & ~btag_cut

    # store the decision as an explicit boolean column on the input array
    all_true = ak.ones_like(mass, dtype="bool")
    all_false = ak.zeros_like(mass, dtype="bool")
    events["vbf_filter"] = ak.where(VBF_filter, all_true, all_false)
    return events[VBF_filter]
    # return events

def calculateEdgeCases(events):
    """
    Flag events inside 110 <= dimuon_mass <= 150 that are not marked by the
    stored h_sidebands / h_peak flags.

    Prints the lengths of the two masks and the number of flagged-region
    events, then returns, for the events inside the mass window only, a boolean
    array that is True where the stored region flags do not cover the event.
    """
    mass = events.dimuon_mass
    in_window = (mass >= 110) & (mass <= 150)
    print(f"ak.num(within_SR,axis=0): {ak.num(in_window,axis=0)}")

    flagged = events.h_sidebands | events.h_peak
    print(f"ak.num(region,axis=0): {ak.num(flagged,axis=0)}")
    print(f"ak.sum(region): {ak.sum(flagged)}")

    not_both = ~(in_window & flagged)
    return not_both[in_window]

def getDeltaPhi(phi1,phi2):
    """
    Absolute azimuthal separation between phi1 and phi2.

    Both inputs are cast to float64; the difference is wrapped into
    [-pi, pi) before the absolute value is taken.
    """
    first = ak.values_astype(phi1, np.float64)
    second = ak.values_astype(phi2, np.float64)
    wrapped = np.mod(first - second + np.pi, 2 * np.pi) - np.pi
    return abs(wrapped)

def computeBkgFromParquet(load_path, bkgSample_l, fields2compute):
    """
    Load the requested fields of several samples and concatenate them.

    For every sample name in `bkgSample_l` the parquet files matching
    `<load_path>/<sample>/*/*.parquet` are opened lazily, a "jj_dPhiV2" field
    (delta-phi between jet1 and jet2, -1 where missing) is attached, and the
    fields listed in `fields2compute` are zipped and computed eagerly.

    Returns the concatenation of the per-sample arrays.
    """
    computed_per_sample = []
    for sample in bkgSample_l:
        events = dak.from_parquet(load_path+f"/{sample}/*/*.parquet")
        jet_dphi = getDeltaPhi(events.jet1_phi, events.jet2_phi)
        events["jj_dPhiV2"] = ak.fill_none(jet_dphi, value=-1)

        # materialise only the requested columns for this sample
        selected = {field: events[field] for field in fields2compute}
        computed_per_sample.append(ak.zip(selected).compute())

    return ak.concatenate(computed_per_sample)



In [3]:
# """
# similar test, but with VBF
# """
# # normal from_parquet doesn't work, so using convoluted concatenating method
# # year = 2018
# # load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}*/f1_0"
# # load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}preVFP/f1_0"
# # load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}/f1_0"
# # load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}/f1_0"
# # load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/*/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_WgtON_original_AN_BDTV2_Oct18/*/f1_0"


# events_data = dak.from_parquet(f"{load_path}/data_*/*/*.parquet")
# events_data = ak.zip({field: events_data[field] for field in fields2compute}).compute()
# edge_cases = calculateEdgeCases(events_data)
# print(ak.sum(edge_cases))
# print(ak.num(edge_cases, axis=0))
# # events_data = applyVBF_cut(events_data)
# # data_yield = ak.num(events_data.dimuon_mass, axis=0)
# # data_yield

In [1]:
import dask_awkward as dak
import awkward as ak
from distributed import LocalCluster, Client, progress
import time
import numpy as np
import matplotlib.pyplot as plt
import json
import mplhep as hep
import glob
import pandas as pd

# Apply the CMS style sheet shipped with mplhep to all matplotlib figures.
plt.style.use(hep.style.CMS)

# Start a local Dask cluster (15 worker processes, 2 threads each, 10 GiB memory
# limit) and connect a client to it.
# NOTE(review): another cell of this notebook creates a Client with the same
# worker layout but memory_limit='8 GiB' -- confirm which one is intended.
client =  Client(n_workers=15,  threads_per_worker=2, processes=True, memory_limit='10 GiB') 

In [None]:
"""
copperheadV1 test
"""
import glob

def applyVBF_cutV1(events):
    """
    Return the events passing the copperheadV1 VBF selection.

    An event is kept when jj_mass_nominal > 400 and jj_dEta_nominal > 2.5
    (missing values fail) and it is not b-tagged (fewer than 2 loose and no
    medium nominal b-tags). No dimuon-mass window is applied here.

    Side effect: the boolean selection is also written onto `events` as the
    "vbf_filter" field before the selected subset is returned.
    """
    two_loose_btags = ak.fill_none((events.nBtagLoose_nominal >= 2), value=False)
    one_medium_btag = ak.fill_none((events.nBtagMedium_nominal >= 1), value=False)
    btag_cut = two_loose_btags | one_medium_btag

    high_dijet_mass = ak.fill_none((events.jj_mass_nominal > 400), value=False)
    wide_dijet_gap = ak.fill_none((events.jj_dEta_nominal > 2.5), value=False)
    vbf_cut = high_dijet_mass & wide_dijet_gap

    mass = events.dimuon_mass

    # btag veto: b-tagged events belong to the VH and ttH categories
    VBF_filter = vbf_cut & ~btag_cut

    # store the decision as an explicit boolean column on the input array
    all_true = ak.ones_like(mass, dtype="bool")
    all_false = ak.zeros_like(mass, dtype="bool")
    events["vbf_filter"] = ak.where(VBF_filter, all_true, all_false)
    return events[VBF_filter]
    # return events

def applyGGH_cutV1(events):
    """
    Return the events passing the copperheadV1 ggH selection.

    An event is kept when it fails the dijet requirement
    (jj_mass_nominal > 400 and jj_dEta_nominal > 2.5; missing values count as
    failing it) and is not b-tagged (fewer than 2 loose and no medium nominal
    b-tags). No dimuon-mass window is applied here.
    """
    btag_cut = ak.fill_none((events.nBtagLoose_nominal >= 2), value=False) | ak.fill_none((events.nBtagMedium_nominal >= 1), value=False)
    vbf_cut = ak.fill_none((events.jj_mass_nominal > 400), value=False) & ak.fill_none((events.jj_dEta_nominal > 2.5), value=False)
    # (the previously assigned local `dimuon_mass` was never used and is dropped)
    ggH_filter = (
        ~vbf_cut &
        ~btag_cut # btag cut is for VH and ttH categories
    )
    return events[ggH_filter]

def applyttH_hadronic_cut(events):
    """
    Return the b-tagged events: at least 2 loose or at least 1 medium nominal
    b-tag. A missing count is treated as not satisfying that requirement.
    """
    two_loose_btags = ak.fill_none((events.nBtagLoose_nominal >= 2), value=False)
    one_medium_btag = ak.fill_none((events.nBtagMedium_nominal >= 1), value=False)
    return events[two_loose_btags | one_medium_btag]

def filterRegion(events, region="h-peak"):
    """
    Keep only the events whose dimuon mass lies in the named region.

    Parameters
    ----------
    events : array exposing a `dimuon_mass` field and boolean-mask indexing.
    region : str
        "h-peak"      : 115.03 < m < 135.03
        "h-sidebands" : 110 < m < 115.03  or  135.03 < m < 150
        "signal"      : 110 <= m <= 150

    Returns
    -------
    The subset of `events` inside the requested mass region.

    Raises
    ------
    ValueError
        If `region` is not one of the names above. Previously an unrecognised
        name fell through and the string itself was used to index `events`.
    """
    dimuon_mass = events.dimuon_mass
    # use a separate name for the mask instead of rebinding the `region` argument
    if region == "h-peak":
        mask = (dimuon_mass > 115.03) & (dimuon_mass < 135.03)
    elif region == "h-sidebands":
        mask = ((dimuon_mass > 110) & (dimuon_mass < 115.03)) | ((dimuon_mass > 135.03) & (dimuon_mass < 150))
    elif region == "signal":
        mask = (dimuon_mass >= 110) & (dimuon_mass <= 150.0)
    else:
        raise ValueError(
            f"unknown region {region!r}; expected 'h-peak', 'h-sidebands' or 'signal'"
        )
    return events[mask]
    

# Columns read from the copperheadV1 stage1 output, in load order.
V1_fields_2compute = [
    # event weight and b-tag multiplicities
    "wgt_nominal", "nBtagLoose_nominal", "nBtagMedium_nominal",
    # single-muon kinematics
    "mu1_pt", "mu2_pt",
    "mu1_eta", "mu2_eta",
    "mu1_phi", "mu2_phi",
    # dimuon system
    "dimuon_pt", "dimuon_eta", "dimuon_phi", "dimuon_mass",
    # jets (nominal variation)
    "jet1_phi_nominal", "jet1_pt_nominal",
    "jet2_pt_nominal", "jet2_phi_nominal",
    "jet1_eta_nominal", "jet2_eta_nominal",
    # dijet system (nominal variation)
    "jj_mass_nominal", "jj_dEta_nominal",
    # "region",
    "event",
]
 
# year = "2018"
# year = "2016postVFP"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/rereco_yun_Dec05_btagSystFixed_JesJerUncOn/stage1_output/{year}/"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec10/stage1_output/{year}/"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_L1JecOff/stage1_output/{year}/"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff/stage1_output/{year}/"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/{year}/"
# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec15_JecOff_JesJerUncOn_2016LumiFix/stage1_output/{year}/"

# load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/V2_Dec20_RERECO_MuIdMuIsoRoccor/stage1_output/{year}/f1_0"

# label = "V2_Jan17_JecDefault_valerieZpt"
# Name of the copperheadV1 production directory to read.
label = "test_test"
# Running count of selected data events, summed over every year, region and data_* directory.
total_integral = 0
for year in ["2018", "2017", "2016postVFP", "2016preVFP"]:
# for year in ["2018", "2017", "2016"]:
    load_path =f"/depot/cms/users/yun79/hmm/copperheadV1clean/{label}/stage1_output/{year}/f1_0"
    # load_path =f"//depot/cms/users/yun79/hmm/copperheadV1clean/rereco_yun_Dec05_btagSystFixed_JesJerUncOn/stage1_output/{year}/"
    # One entry per data_* directory found for this year.
    filelist = glob.glob(f"{load_path}/data_*")
    print(filelist)
    
    for region in ["signal"]:
        for file in filelist:
            # events_data = dak.from_parquet(f"{file}/*.parquet")
            events_data = dak.from_parquet(f"{file}/*/*.parquet")
            # print(events_data.fields)
            # events_data.fields
            # Keep only the V1 columns and materialise them in memory.
            events_data = ak.zip({field: events_data[field] for field in V1_fields_2compute}).compute()
            
            # print(region)
            # raise ValueError
            # Mass-region selection first, then the category selection (ggH is the active one).
            events_data = filterRegion(events_data, region=region)
            events_data = applyGGH_cutV1(events_data)
            # events_data = applyVBF_cutV1(events_data)
            # events_data = applyttH_hadronic_cut(events_data)
            
            # Number of events that survived both selections.
            data_yield = ak.num(events_data.dimuon_mass, axis=0)
            # data_yield = ak.num(events_data.dimuon_mass, axis=0).compute()
            # ak.to_dataframe(events_data).to_csv("event_dataC_V1.csv")
            # df = pd.DataFrame({field: ak.fill_none(events_data[field], value=-999.9) for field in events_data.fields})
            # df.to_csv("event_dataC_V1.csv")
            print(f"data_yield for {file}: {data_yield}")
            total_integral += data_yield
# NOTE(review): `region` here is the loop variable left over from the last
# iteration, while total_integral is summed over all regions and years; the
# label is only accurate while the region list has a single entry.
print(f"total integral for {region} region : {total_integral}")


['/depot/cms/users/yun79/hmm/copperheadV1clean/test_test/stage1_output/2018/f1_0/data_C']


In [6]:
! ls /depot/cms/users/yun79/hmm/copperheadV1clean/

DmitryMaster_JECoff_GeofitFixed_Nov01
recreate_val_Nov02
rereco_yun_addJecUnc_Nov5
rereco_yun_Dec02_puidWgtOn_JecOff
rereco_yun_Dec02_puidWgtOn_OrigData
rereco_yun_Dec04NewBtag
rereco_yun_Dec04_trigmatchOn
rereco_yun_Dec05_btagSystFixed_JesJerUncOn
rereco_yun_JecDefaultZptOnJecUncOn_Nov20
ul_yun_Dec12_JecOff
ul_yun_Dec12_JecOff_JesJerUncOn
ul_yun_Dec15_JecOff_JesJerUncOn_2016LumiFix
V2_Dec16
V2_Dec18_RERECO_MuIdMuIso
V2_Dec18_RERECO_MuIdMuIso_puOn
V2_Dec20_RERECO_MuIdMuIsoRoccor
V2_Dec21
V2_Dec21_HEMVetoOnZptOn
V2_Dec22_HEMVetoOnZptOn_RerecoBtagSF_XS_Rereco
V2_Dec22_HEMVetoOnZptOn_RerecoBtagSF_XS_Rereco_BtagWPsFixed
V2_Dec22_HEMVetoOnZptOn_RerecoBtagSF_XS_Rereco_BtagWPsFixed_changedCleanJet
V2_Dec22_HEMVetoOnZptOn_ULBtagSF_XS_Rereco_BtagWPsFixed
V2_Jan09_ForZptReWgt
V2_Jan10_NewZptReWgt
V2_Jan10_NewZptReWgt_JecOff
V2_Jan11_ForZptReWgt_JecFixed
V2_Jan11_NewZptReWgt
V2_Jan14_JecOn_oldZptWgt
V2_Jan16_JecDefault_oldZptWgt
V2_Jan16_JecDefault_plotEveryonesZptWgt
V2_Jan17_JecDefault_plotEver

In [8]:
! ls /depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018/

data_A	       dy_m105_160_vbf_amc	      st_tw_top		 ww_2l2nu
data_B	       ewk_lljj_mll105_160_ptj0       ttjets_dl		 wz_1l1nu2q
data_C	       ewk_lljj_mll105_160_py_dipole  ttjets_sl		 wz_2l2q
data_D	       ggh_powhegPS		      vbf_powheg_dipole  wz_3lnu
dy_M-100To200  st_tw_antitop		      vbf_powheg_herwig  zz


In [12]:
# Do the same for the DY sample: weighted yield and raw event count after the
# V1 VBF selection, restricted to the h-sidebands mass region.
# NOTE(review): `load_path` is whatever an earlier-executed cell left behind;
# it is not assigned in this cell.

filelist = glob.glob(f"{load_path}/dy_M-100To200")
# print(filelist)

# Sum of the weighted yields over all matched directories.
total_integral = 0
for file in filelist:
    events = dak.from_parquet(f"{file}/*.parquet")
    # events = dak.from_parquet(f"{file}/*/*.parquet")
    # Keep only the V1 columns and materialise them in memory.
    events = ak.zip({field: events[field] for field in V1_fields_2compute}).compute()
    events = applyVBF_cutV1(events)
    events = filterRegion(events, region="h-sidebands")
    # Weighted yield (sum of weights) and unweighted number of selected events.
    dy_yield = ak.sum(events.wgt_nominal)
    dy_nevents = ak.num(events.wgt_nominal, axis=0)
    # dy_yield = ak.sum(events.wgt_nominal).compute()
    # dy_nevents = ak.num(events.wgt_nominal, axis=0).compute()

    print(f"dy_yield for {file}: {dy_yield}")
    print(f"dy_nevents for {file}: {dy_nevents}")
    total_integral += dy_yield
# Last expression of the cell: displayed as the cell output.
total_integral




dy_yield for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//dy_M-100To200: 3469.7648647866
dy_nevents for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//dy_M-100To200: 92788


3469.7648647866

In [14]:
# Do the same for the EWK sample: weighted yield after the V1 VBF selection,
# separately for the h-peak and h-sidebands mass regions.
# NOTE(review): `load_path` is whatever an earlier-executed cell left behind;
# it is not assigned in this cell.

filelist = glob.glob(f"{load_path}/ewk_lljj_mll105_160_ptj0")
# print(filelist)

# NOTE(review): total_integral is reset here but never updated or shown in this cell.
total_integral = 0
for region in ["h-peak","h-sidebands"]:
    for file in filelist:
        events = dak.from_parquet(f"{file}/*.parquet")
        # events = dak.from_parquet(f"{file}/*/*.parquet")
        # Keep only the V1 columns and materialise them in memory.
        events = ak.zip({field: events[field] for field in V1_fields_2compute}).compute()
        events = applyVBF_cutV1(events)
        events = filterRegion(events, region=region)
        # The dy_* names are reused from the DY cell; here they hold the EWK values.
        dy_yield = ak.sum(events.wgt_nominal)
        dy_nevents = ak.num(events.wgt_nominal, axis=0)
        # dy_yield = ak.sum(events.wgt_nominal).compute()
        # dy_nevents = ak.num(events.wgt_nominal, axis=0).compute()
    
        print(f"yield for {file} {region}: {dy_yield}")





yield for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//ewk_lljj_mll105_160_ptj0 h-peak: 125.8663830517629




yield for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//ewk_lljj_mll105_160_ptj0 h-sidebands: 107.88340534697751


In [15]:
# Do the same for the VBF signal sample (vbf_powheg_dipole): weighted yield
# after the V1 VBF selection, separately for the h-peak and h-sidebands regions.
# NOTE(review): `load_path` is whatever an earlier-executed cell left behind;
# it is not assigned in this cell.

filelist = glob.glob(f"{load_path}/vbf_powheg_dipole")
# print(filelist)

# NOTE(review): total_integral is reset here but never updated or shown in this cell.
total_integral = 0
for region in ["h-peak","h-sidebands"]:
    for file in filelist:
        events = dak.from_parquet(f"{file}/*.parquet")
        # events = dak.from_parquet(f"{file}/*/*.parquet")
        # Keep only the V1 columns and materialise them in memory.
        events = ak.zip({field: events[field] for field in V1_fields_2compute}).compute()
        events = applyVBF_cutV1(events)
        events = filterRegion(events, region=region)
        # The dy_* names are reused from the DY cell; here they hold the VBF-signal values.
        dy_yield = ak.sum(events.wgt_nominal)
        dy_nevents = ak.num(events.wgt_nominal, axis=0)
        # dy_yield = ak.sum(events.wgt_nominal).compute()
        # dy_nevents = ak.num(events.wgt_nominal, axis=0).compute()
    
        print(f"yield for {file} {region}: {dy_yield}")





yield for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//vbf_powheg_dipole h-peak: 11.85564329646733




yield for //depot/cms/users/yun79/hmm/copperheadV1clean/ul_yun_Dec12_JecOff_JesJerUncOn/stage1_output/2018//vbf_powheg_dipole h-sidebands: 0.18357360850705176


In [11]:
! ls /depot/cms/users/yun79/hmm/copperheadV1clean/DmitryMaster_JECoff_GeofitFixed_Oct29/stage1_output/2018/dy_m105_160_amc

001a0b68977f160cbdea9addb8b8d62c.parquet
001b9f33d62357037a3b1baff5135f49.parquet
0042b5b6cc2e80ae841fffe018f06781.parquet
00ea897bc8928d1f7a7df42f4207e033.parquet
0183f6d1f5c941bdbec2ce2c13dfd6a9.parquet
022d078f8178e1a7215d298862ec6d8f.parquet
02470fdac661d52b8116cb3c7db13138.parquet
0309bb4e56abdbad340083a422d50148.parquet
030c225fef8b7cfdb6d433fe691c44c6.parquet
03be8bed7e70bc7b0dd6c557f51d0341.parquet
05bccdd6c9004b818081f8e86cda7086.parquet
067e58cea84904cb8db4236bf83c4ee8.parquet
0724ba603a7271dbab354db8f4e40460.parquet
07d822d987b1bfaa0dee360e2c139da9.parquet
07ecd20fac9c9d74efa02a2ced6e6a3c.parquet
08a263b5aa7afd8168dabe5146ca8ca6.parquet
08a9b357ce6248b4f75edbb8fc42ea62.parquet
08b5544ac4ba4c6fcb061071b147dd91.parquet
0998d9bb3dfc14d21f6fbb52ff0733ee.parquet
09d34b596de17c7d60ca394cf304dd6a.parquet
09e0d7990f9e962e14df1a13c35a95cb.parquet
0a92a353e22de4f7aeb8fb16db1e6c60.parquet
0be379f3570fe35ed5cec324c3c3a71c.parquet
0c53f1f8ffcdf75e7a066dc1003659f2.parquet
0c8cdf233b391d5b

In [1]:
# import uproot
# import glob
# import numba
# import numpy as np
# import concurrent.futures

# @numba.jit(parallel=True, nopython=False)
# def getBadFileIdx(fnames):
#     bad_files = np.zeros(len(fnames))
#     for file_idx in numba.prange(len(fnames)):
#         up_file = uproot.open(fnames[file_idx]) 
#         if not ("HLT_IsoMu27" in up_file['Events'].keys()):
#             # print(fnames[file_idx])
#             bad_files[file_idx] = 1
#             # bad_files.append(file)
#     return bad_files

In [12]:
# NOTE(review): `results` is not defined in any cell visible in this notebook;
# this only works with state left over from an earlier session.
len(results)

2500

In [9]:
import ROOT as rt

# Open the first entry of `filelist` as a ROOT file and list the objects it contains.
# NOTE(review): `filelist` must already hold ROOT file paths from an earlier cell.
file = rt.TFile(filelist[0])
file.ls()

TFile**		/eos/purdue/store/group/local/hmm/FSRnano18D_NANOV10b/SingleMuon/RunIISummer16MiniAODv3_FSRnano18D_NANOV10b_un2018D-22Jan2019-v2/200408_223523/0000/nano18D_NANO_1.root	
 TFile*		/eos/purdue/store/group/local/hmm/FSRnano18D_NANOV10b/SingleMuon/RunIISummer16MiniAODv3_FSRnano18D_NANOV10b_un2018D-22Jan2019-v2/200408_223523/0000/nano18D_NANO_1.root	
  KEY: TObjString	tag;1	Collectable string class
  KEY: TTree	Events;1	Events
  KEY: TTree	LuminosityBlocks;1	LuminosityBlocks
  KEY: TTree	Runs;1	Runs
  KEY: TTree	MetaData;1	Job metadata
  KEY: TTree	ParameterSets;1	Parameter sets


In [80]:
! ls /depot/cms/users/yun79/hmm/copperheadV1clean/DmitryMaster_w_centralRERECO_Oct25/stage1_output/2017

data_B


In [35]:
! ls /depot/cms/users/yun79/hmm/copperheadV1clean/RERECO_Oct24/stage1_output/2018/

data_A	data_B	data_C	data_D


In [5]:

# events_bkg = dak.from_parquet(bkg_l) 
# events_bkg = ak.zip({field : events_bkg[field] for field in fields2compute}).compute()

# normal from_parquet doesn't work, so using convoluted concatenating method
# year = 2018
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}*/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}/f1_0"
# The "*" wildcard picks up every directory under BDT_inputValidation_JetIdUpdate.
load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/*/f1_0"
# Lazily open all data_* parquet files, keep only the fields in fields2compute,
# and materialise them in memory.
events_data = dak.from_parquet(f"{load_path}/data_*/*/*.parquet")
events_data = ak.zip({field: events_data[field] for field in fields2compute}).compute()
# Apply the ggH category selection and count the surviving events.
events_data = applyGGH_cut(events_data)
data_yield = ak.num(events_data.dimuon_mass, axis=0)



In [None]:
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2RERECO_data_OCt14_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2RERECO_data_w_LumiMask_JETID_JETPUID_BTag_DefaultJEC_Oct15_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2RERECO_data_w_LumiMask_JETID_JETPUID_BTag_Oct16_2014/{year}_RERECO/f1_0"
# The "*_RERECO" wildcard picks up every matching directory of this production.
load_path = f"/depot/cms/users/yun79/results/stage1/Run2RERECO_data_w_LumiMask_JETID_JETPUID_BTag_DefaultJEC_Oct15_2014/*_RERECO/f1_0"


# Same procedure as for data_yield: load the fields in fields2compute, apply the
# ggH category selection, and count the surviving events.
events_data_rereco = dak.from_parquet(f"{load_path}/data_*/*/*.parquet")
events_data_rereco = ak.zip({field: events_data_rereco[field] for field in fields2compute}).compute()
events_data_rereco = applyGGH_cut(events_data_rereco)
data_yield_rereco = ak.num(events_data_rereco.dimuon_mass, axis=0)

In [7]:
# Show the two ggH-selected data event counts one after the other for comparison.
print(data_yield)
print(data_yield_rereco)

1609961
1610974


In [10]:
"""
similar test, but with data_C 2018 UL only
"""
# normal from_parquet doesn't work, so using convoluted concatenating method
# NOTE(review): the cell's header string says "data_C 2018 UL only", but the
# active path below matches every "20*" directory and every data_* sample;
# `year` is only referenced by the commented-out alternatives.
year = 2018
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}*/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}preVFP/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/{year}/f1_0"
load_path = f"/depot/cms/users/yun79/results/stage1/BDT_inputValidation_JetIdUpdate/20*/f1_0"

# events_data = dak.from_parquet(f"{load_path}/data_C/*/*.parquet")
# Load the fields in fields2compute, apply the ggH category selection, and
# count the surviving events.
events_data = dak.from_parquet(f"{load_path}/data_*/*/*.parquet")
events_data = ak.zip({field: events_data[field] for field in fields2compute}).compute()
events_data = applyGGH_cut(events_data)
data_yield = ak.num(events_data.dimuon_mass, axis=0)

# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_DefaultJEC_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_DefaultJEC_RochOff_GeofitOff_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_DefaultJEC_RochOff_GeofitOff_FsrOff_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_DefaultJEC_RochOff_GeofitOff_ElectronEcalGapOn_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_DefaultJEC_ElectronEcalGapOn_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_ElectronEcalGapOn_TriggerMatchOff_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_LumiMask_ElectronEcalGapOn_TriggerMatchOff_MuLeadingPtCutOff_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_ElectronEcalGapOn_TriggerMatchOff_MuLeadingPtCutOff_DefaultJEC_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_ElectronEcalGapOnNCorrected_TriggerMatchOff_MuLeadingPtCutOff_DefaultJEC_Oct16_2014/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2UL_data_w_ElectronEcalGapOnNCorrected_MuLeadingPtCutOff_defaultJEC_BtagCorrected_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_newEventFilter_Oct16_2014/{year}_RERECO/f1_0"


# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_newEventFilter_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_TriggerMatchOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_Oct16_2014/*_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_TriggerMatchOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_Oct16_2014/*_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_TriggerMatchOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/Run2_data_w_MuLeadingPtCutOff_TriggerMatchOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_HltFillNoneTest_Oct16_2014/{year}_RERECO/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_WgtON_original_AN_BDTV2_Oct18/{year}/f1_0"
# load_path = f"/depot/cms/users/yun79/results/stage1/BDT_WgtON_original_AN_BDTV2_Oct18/*/f1_0"
# Active comparison production; the "*_RERECO" wildcard picks up every matching directory.
load_path = f"/depot/cms/users/yun79/results/stage1/RERECO_Run2_data_w_MuLeadingPtCutOff_defaultJEC_BtagCorrected_newEventFilter_HLTFixed_HltFillNoneTest_Oct18/*_RERECO/f1_0"



# events_data_rereco = dak.from_parquet(f"{load_path}/data_C/*/*.parquet")
# Load the fields in fields2compute, apply the ggH category selection, and
# count the surviving events.
events_data_rereco = dak.from_parquet(f"{load_path}/data_*/*/*.parquet")
events_data_rereco = ak.zip({field: events_data_rereco[field] for field in fields2compute}).compute()
events_data_rereco = applyGGH_cut(events_data_rereco)
data_yield_rereco = ak.num(events_data_rereco.dimuon_mass, axis=0)
# Show the count from `data_yield` (set earlier) next to the one just computed.
print(data_yield)
print(data_yield_rereco)

1606887
1620838


In [6]:
# isnan = np.isnan(ak.to_numpy(events_bkg.wgt_nominal_total))
# np.sum(isnan)
# # hist_bkg

In [7]:
# nbins = 60
# bin_map = {
#      "dimuon_pt": [0,200, nbins], 
#     "dimuon_rapidity" : [-2.5,2.5, nbins], 
#     "dimuon_eta" : [-8,8, nbins],
# }
# Load the per-field plot settings from a JSON file in the working directory.
# Later cells read the "binning_linspace", "xlabel" and "logscale" entries of each field.
with open("./plot_settings_gghCat_BDT_input.json", "r") as file:
    bin_map = json.load(file)
# bin_map

In [8]:
# Overlay the unit-normalised signal and background distributions of each
# requested field and save one figure per field under plots/.
# NOTE(review): `events_sig` and `events_bkg` are not assigned in any cell
# visible in this notebook; they must be defined before this cell runs.
year = "Run2"
# for field in cols_of_interest:
for field in (["jj_dPhiV2"]):
    # binning_linspace is unpacked straight into np.linspace; its first two
    # entries are also used as the x-axis limits.
    binning = np.linspace(*bin_map[field]["binning_linspace"])
    xmin = bin_map[field]["binning_linspace"][0]
    xmax = bin_map[field]["binning_linspace"][1]
    hist_sig, edges = getHist(events_sig, field, binning)
    hist_bkg, edges = getHist(events_bkg, field, binning)

    fig, ax_main = plt.subplots()
    hep.histplot(
        hist_sig,
        bins=binning,
        stack=False,
        histtype='step',
        color='blue',
        label='signal',
        ax=ax_main,
    )
    hep.histplot(
        hist_bkg,
        bins=binning,
        stack=False,
        histtype='step',
        color='red',
        label='background',
        ax=ax_main,
    )

    # Configure the axes through the Axes object rather than pyplot's implicit
    # "current axes" state.
    ax_main.set_xlabel(bin_map[field]["xlabel"])
    ax_main.set_ylabel("A.U.")
    if bin_map[field]["logscale"]:
        ax_main.set_yscale('log')  # Set y-axis to log scale
        ax_main.set_ylim(1e-3, 1)
    ax_main.set_xlim(xmin, xmax)
    ax_main.legend()

    CenterOfMass = 13
    # lumi = 59.97 # 2018 lumi value
    lumi = 137.9 # Run2 value
    hep.cms.label(data=True, loc=0, label="Private Work", com=CenterOfMass, ax=ax_main, lumi=lumi)
    fig.savefig(f"plots/BDT_input{year}_{field}")

    # Close the figure instead of only clearing it: plt.clf() leaves an empty
    # figure registered with pyplot (rendered as "Figure ... with 0 Axes" in
    # the cell output) and figures pile up when looping over many fields.
    plt.close(fig)

getHist weight sum: 13.84659559419017
getHist value sum: -13723.506476295119
getHist weight: [0.000352, 0.000295, 0.000369, 0.000393, ..., 2.4e-05, 2.13e-05, 2.67e-05]
getHist value: [-1, -1, -1, -1, 2.36, -1, -1, 1.37, ..., -1, -1, -1, -1, -1, 2.56, 1.86, -1]
getHist hist b4 normalization: [0.0459, 0.0445, 0.0458, 0.0512, 0.0449, ..., 0.0584, 0.069, 0.065, 0.0677]
np.sum(hist): 0.9999999999999999
getHist weight sum: 809642.0579307787
getHist value sum: -9613472.744227482
getHist weight: [0.0325, 0.0273, 0.0258, 0.0332, 0.0251, ..., 0.0132, 0.0178, 0.0119, 0.0143]
getHist value: [-1, -1, -1, -1, -1, -1, -1, 1.44, ..., -1, 2.78, 2.18, -1, 2.33, -1, 1.8, 2.28]
getHist hist b4 normalization: [1.18e+03, 1.2e+03, 1.2e+03, 1.24e+03, ..., 3.39e+03, 3.58e+03, 3.7e+03]
np.sum(hist): 0.9999999999999998


<Figure size 1000x1000 with 0 Axes>

In [9]:
# Count how many jj_dPhiV2 entries of events_bkg are None.
ak.sum(ak.is_none(events_bkg.jj_dPhiV2))

0

In [10]:
import awkward as ak
import numpy as np

# Scratch check of how None entries are seen by np.isnan and ak.is_none.
# NOTE(review): `A` is only used in the commented-out line below.
A = ak.Array([None,1])
# np.isnan(ak.to_numpy(A))
# This result is discarded: only the last expression of a cell is displayed.
np.isnan( ak.Array([None,1]))
# True as soon as at least one entry is None.
np.any(ak.is_none(ak.Array([None,1])))

True

In [11]:
# Ten logarithmically spaced values from 10**2 to 10**3 (endpoints included).
np.logspace(2, 3, num=9+1)

array([ 100.        ,  129.1549665 ,  166.81005372,  215.443469  ,
        278.25594022,  359.38136638,  464.15888336,  599.48425032,
        774.26368268, 1000.        ])

In [12]:
# Ten logarithmically spaced values from 10**-1 to 10**0 (endpoints included).
np.logspace(-1, 0, num=9+1)

array([0.1       , 0.12915497, 0.16681005, 0.21544347, 0.27825594,
       0.35938137, 0.46415888, 0.59948425, 0.77426368, 1.        ])

In [12]:
# Scratch check: sorted() returns a new list in ascending order.
sorted([0.5, 0.15, 0.83, 0.02, 0.28, 0.67])

[0.02, 0.15, 0.28, 0.5, 0.67, 0.83]

In [None]:
# import dask_awkward as dak
# import awkward as ak
# from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
# from dask_awkward.lib.core import map_partitions
# import numpy as np

# input_dict = {"test": {
#     "files": {
#         # "root://eos.cms.rcac.purdue.edu:1094//store/mc/RunIISummer20UL18NanoAODv9/DYJetsToLL_M-100to200_TuneCP5_13TeV-amcatnloFXFX-pythia8/NANOAODSIM/106X_upgrade2018_realistic_v16_L1v1-v1/40000/AA6F89B0-EDAA-3942-A3BB-BC3709722EB4.root": {
#         "root://eos.cms.rcac.purdue.edu//store/group/local/hmm/FSRnano18D_NANOV10b/SingleMuon/RunIISummer16MiniAODv3_FSRnano18D_NANOV10b_un2018D-22Jan2019-v2/200408_223523/0004/nano18D_NANO_4814.root" : {
#             "object_path": "Events", "steps": [[0, 10000],[10000, 20000]], "num_entries": 19, "uuid": "77528b92-6481-11ef-aeb3-83890d0abeef"}
#     }, 
#     "form": None, "metadata": {"sumGenWgts": 217532.83170000004, "nGenEvts": 1959, "data_entries": None, "fraction": 1.0, "original_fraction": 1.0, "is_mc": True, "dataset": "test"}}}

# events = NanoEventsFactory.from_root(
#     input_dict['test']['files'],
#     schemaclass=NanoAODSchema,
# ).events()

In [36]:
# Sum the boolean comparison, i.e. count the GenJet entries with pt below 10,
# and trigger the lazy computation.
# NOTE(review): the only visible cell that builds `events` with a GenJet
# collection is fully commented out -- confirm where `events` comes from.
ak.sum(events.GenJet.pt < 10).compute()

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


0

In [14]:
# import ROOT as rt

# # file = rt.TFile("/eos/purdue/store/group/local/hmm/FSRnano18D_NANOV10b/SingleMuon/RunIISummer16MiniAODv3_FSRnano18D_NANOV10b_un2018D-22Jan2019-v2/200408_223523/0004/nano18D_NANO_4814.root")
# file = rt.TFile("/eos/purdue/store/group/local/hmm/FSRnano18D_NANOV10b/SingleMuon/RunIISummer16MiniAODv3_FSRnano18D_NANOV10b_un2018D-22Jan2019-v2/200408_223523/0004/nano18D_NANO_4813.root")

