In [1]:
# Load the input ROOT file as NanoEvents and read the metaconditions JSON.
import json
import os.path as osp

from coffea import nanoevents

# The "dataset" metadata entry is later used by the processor to name the
# output sub-directory.
events = nanoevents.NanoEventsFactory.from_root(
    '../data/954C2B25-9CC5-004D-9CCE-FA674345E337.root',
    metadata={"dataset": "test"},
).events()

metaconditions_file = "../Era2017_legacy_v1.json"
with open(metaconditions_file) as f:
    metaconditions = json.load(f)



In [2]:
import coffea
from coffea import hist, processor
import numpy as np
import awkward as ak
import pandas as pd
import functools as ft
import operator as op
import os
import shutil as shu
import pathlib as pl

class DYStudiesProcessor(processor.ProcessorABC):
    """Coffea processor that builds diphoton candidates and optionally dumps them to parquet.

    ``process`` applies the MET filters (and, when requested, the trigger
    paths) listed in the metaconditions, preselects photons, pairs them up,
    keeps the highest-pt pair per event and, if an output location was
    configured, writes the result through ``dump_pandas``.
    """

    def __init__(self, metaconditions, do_systematics=False, apply_trigger=False, output_location=None):
        """Store the configuration and the preselection thresholds.

        Parameters
        ----------
        metaconditions : mapping
            Parsed metaconditions; ``process`` reads its ``"flashggMetFilters"``
            and ``"TriggerPaths"`` entries.
        do_systematics : bool
            Stored on the instance; not read anywhere else in this class.
        apply_trigger : bool
            When true, ``process`` keeps only events firing at least one of
            the configured trigger paths.
        output_location : str or None
            Local directory or ``root://`` URL for the parquet output.
            ``None`` disables writing.
        """
        self.meta = metaconditions
        self.do_systematics = do_systematics
        self.apply_trigger = apply_trigger
        self.output_location = output_location
        # keys used to look up the trigger list in metaconditions["TriggerPaths"]
        self.trigger_group = ".*DoubleEG.*"
        self.analysis = "mainAnalysis"

        # diphoton preselection cuts
        self.min_pt_photon = 25.0
        self.min_pt_lead_photon = 35.0
        self.min_mvaid = -0.9
        self.max_sc_eta = 2.5
        self.gap_barrel_eta = 1.4442
        self.gap_endcap_eta = 1.566
        self.max_hovere = 0.08
        self.min_full5x5_r9 = 0.8
        self.max_chad_iso = 20.0
        self.max_chad_rel_iso = 0.3

        # ak.combinations names the two slots of each pair "0" and "1";
        # these prefixes become the column-name prefixes in the flat output.
        self.prefixes = {"0": "lead", "1": "sublead"}

    def photon_preselection(self, photons):
        """Return the photons that pass the kinematic, ID and isolation preselection.

        A photon is kept when it is above the pt threshold, inside the eta
        acceptance but outside the barrel/endcap gap, passes the MVA ID and
        H/E cuts, and satisfies at least one of: high r9, low absolute
        charged isolation, low relative charged isolation.
        """
        photon_abs_eta = np.abs(photons.eta)
        # pfRelIso03_chg is already the isolation divided by pt, so the
        # absolute isolation is recovered by multiplying by pt and the
        # relative cut applies to the branch directly. (The previous code
        # compared the relative value to the absolute threshold and divided
        # by pt a second time for the relative one.)
        rel_chad_iso = photons.pfRelIso03_chg
        return photons[  (photons.pt > self.min_pt_photon)
                       & (photon_abs_eta < self.max_sc_eta)
                       & ((photon_abs_eta < self.gap_barrel_eta) | (photon_abs_eta > self.gap_endcap_eta))
                       & (photons.mvaID > self.min_mvaid)
                       & (photons.hoe < self.max_hovere)
                       & (  (photons.r9 > self.min_full5x5_r9)
                          | (rel_chad_iso * photons.pt < self.max_chad_iso)
                          | (rel_chad_iso < self.max_chad_rel_iso))]

    def diphoton_list_to_pandas(self, diphotons):
        """Flatten a record array of diphotons into a ``pandas.DataFrame``.

        Fields listed in ``self.prefixes`` ("0" and "1") are expanded into one
        column per sub-field, named ``<prefix>_<subfield>``; every other field
        becomes a column under its own name.
        """
        output = pd.DataFrame()
        for field in ak.fields(diphotons):
            prefix = self.prefixes.get(field, "")
            if prefix:
                for subfield in ak.fields(diphotons[field]):
                    output[f"{prefix}_{subfield}"] = ak.to_numpy(diphotons[field][subfield])
            else:
                output[field] = ak.to_numpy(diphotons[field])
        return output

    def dump_pandas(self, pddf, fname, location, subdirs=None):
        """Write ``pddf`` as parquet to ``location/<subdirs...>/fname``.

        The frame is first written to ``./fname`` and then copied to its
        destination, either through XRootD (when ``location`` contains
        ``root://``) or with a plain file copy. The staging file is removed
        once the copy has succeeded.

        Parameters
        ----------
        pddf : pandas.DataFrame
            Data to write.
        fname : str
            Output file name.
        location : str
            Local directory or ``root://`` URL.
        subdirs : sequence of str, optional
            Directory components appended to ``location``.

        Raises
        ------
        ImportError
            If an XRootD destination is requested but the bindings are missing.
        AssertionError
            If the copy did not succeed.
        """
        # None instead of a shared mutable [] default
        subdirs = [] if subdirs is None else list(subdirs)

        xrd_prefix = 'root://'
        xrootd = False
        if xrd_prefix in location:
            try:
                import XRootD
                import XRootD.client
                xrootd = True
            except ImportError:
                raise ImportError(
                    "Install XRootD python bindings with: conda install -c conda-forge xrootd"
                )

        local_file = os.path.join(".", fname)
        if xrootd:
            # URLs always use "/"; join the pieces explicitly so a location
            # without a trailing slash is not glued to the first sub-directory.
            destination = "/".join([location.rstrip("/"), *subdirs, fname])
        else:
            destination = os.path.join(location, *subdirs, fname)

        pddf.to_parquet(local_file)
        if xrootd:
            # The file-system client is addressed by the "root://host" part
            # only, i.e. everything before the first "/" after the prefix.
            host_end = location.find('/', len(xrd_prefix))
            server = location if host_end < 0 else location[:host_end]
            client = XRootD.client.FileSystem(server)
            status = client.copy(local_file, destination)
            assert status[0].ok
            os.remove(local_file)
        else:
            dirname = os.path.dirname(destination)
            if not os.path.exists(dirname):
                pl.Path(dirname).mkdir(parents=True, exist_ok=True)
            # shutil.copy refuses to copy a file onto itself, which happens
            # when the destination is the staging file in the working directory.
            in_place = os.path.abspath(local_file) == os.path.abspath(destination)
            if not in_place:
                shu.copy(local_file, destination)
            assert os.path.isfile(destination)
            if not in_place:
                os.remove(local_file)

    def process(self, events):
        """Select one diphoton candidate per event and optionally write them out.

        Parameters
        ----------
        events
            Event array exposing ``Flag``, ``HLT``, ``Photon``, ``event``,
            ``luminosityBlock`` and ``run`` (e.g. from ``NanoEventsFactory``).

        Returns
        -------
        dict
            Always empty; the only product is the parquet file written when
            ``self.output_location`` is set.
        """
        # data or monte carlo?
        data_kind = "mc" if "GenPart" in ak.fields(events) else "data"

        # met filters: the flag name is the text after the last "_" of each entry
        met_filters = self.meta["flashggMetFilters"][data_kind]
        filtered = ft.reduce(op.and_, (events.Flag[metfilter.split("_")[-1]] for metfilter in met_filters))

        triggered = ak.ones_like(filtered)
        if self.apply_trigger:
            triggers = self.meta["TriggerPaths"][self.trigger_group][self.analysis]
            # trigger[4:-1] drops the first four and the last character of each
            # entry -- presumably an "HLT_" prefix and a trailing wildcard;
            # TODO confirm against the metaconditions file.
            triggered = ft.reduce(op.or_, (events.HLT[trigger[4:-1]] for trigger in triggers))

        # apply met filters and triggers to data
        events = events[filtered & triggered]

        # photon preselection
        photons = self.photon_preselection(events.Photon)
        # sort photons in each event descending in pt
        # make descending-pt combinations of photons
        photons = photons[ak.argsort(photons.pt, ascending=False)]
        diphotons = ak.combinations(photons, 2)
        # the remaining cut is to select the leading photons
        # the previous sort assures the order
        diphotons = diphotons[diphotons["0"].pt > self.min_pt_lead_photon]

        # now turn the diphotons into candidates with four momenta and such
        diphoton_4mom = diphotons["0"] + diphotons["1"]
        diphotons["pt"] = diphoton_4mom.pt
        diphotons["eta"] = diphoton_4mom.eta
        diphotons["phi"] = diphoton_4mom.phi
        diphotons["mass"] = diphoton_4mom.mass
        diphotons = ak.with_name(diphotons, "PtEtaPhiMCandidate")

        # arbitrate diphotons: keep the highest-pt pair of each event
        # (ak.firsts yields None for events without any pair)
        diphotons = diphotons[ak.argsort(diphotons.pt, ascending=False)]
        diphotons = ak.firsts(diphotons)

        # annotate diphotons with event information
        diphotons["event"] = events.event
        diphotons["lumi"] = events.luminosityBlock
        diphotons["run"] = events.run

        # drop events without a preselected diphoton candidate
        diphotons = diphotons[~ak.is_none(diphotons)]

        if self.output_location is not None:
            df = self.diphoton_list_to_pandas(diphotons)
            # NOTE(review): relies on the private `_partition_key` attribute of
            # the events factory to get a unique per-chunk file name.
            fname = events.behavior["__events_factory__"]._partition_key.replace("/", "_") + ".parquet"
            subdirs = []
            if 'dataset' in events.metadata:
                subdirs.append(events.metadata["dataset"])
            self.dump_pandas(df, fname, self.output_location, subdirs)

        return {}

    def postprocess(self, accumulator=None):
        """Return the accumulator unchanged.

        The parameter is accepted (and optional) so the method can be called
        both with the merged accumulator, as the ``ProcessorABC`` interface
        does, and without arguments as before.
        """
        return accumulator

In [None]:
from workflows import DYStudiesProcessor

In [3]:
# Run the processor once over the loaded events, with the trigger requirement
# enabled and parquet output written below ./outputs/.
dystudies = DYStudiesProcessor(
    metaconditions,
    do_systematics=False,
    apply_trigger=True,
    output_location='./outputs/',
)

histos = dystudies.process(events)

In [12]:
# Scratch check: flatten the per-event photon pt lists and build an array of
# ones with the same layout.
x = ak.ones_like(ak.flatten(events.Photon.pt))

In [6]:
import pandas as pd
import pyarrow as pa
import awkward as ak

# Read the parquet output back into a DataFrame. './outputs/test' is
# presumably the per-dataset directory written by DYStudiesProcessor
# (output location './outputs/' plus dataset "test") -- confirm.
x = pd.read_parquet('./outputs/test')

# Convert the DataFrame to an Arrow table and from there to an awkward array.
y = ak.from_arrow(pa.Table.from_pandas(x))

In [None]:
# di-photon MVA
# uproot is imported here because this cell uses it and the notebook only
# imports it in a later cell.
import uproot
import xgboost as xg

min_diphoton_mass = 100
max_diphoton_mass = 180

model = xg.Booster()
model.load_model('aux-data/altDiphoModel_coffea.model')

# NOTE(review): `diphotons` is not defined at notebook scope -- it only exists
# as a local inside DYStudiesProcessor.process. The block below needs that
# array exposed (before the per-event arbitration) to run on a fresh kernel.

# get the number of diphotons per row
# and save for re-wrapping xgb outputs
counts = ak.num(diphotons, axis=1)

# extract diphoton vars into flat lists
dipho_leadIDMVA = ak.flatten(diphotons["0"].mvaID)
dipho_subleadIDMVA = ak.flatten(diphotons["1"].mvaID)
dipho_lead_ptoM = ak.flatten(diphotons["0"].pt / diphotons.mass)
dipho_sublead_ptoM = ak.flatten(diphotons["1"].pt / diphotons.mass)
dipho_lead_eta = ak.flatten(diphotons["0"].eta)
dipho_sublead_eta = ak.flatten(diphotons["1"].eta)

# input features of the model, in the order they are stacked into columns
diphoVars  = ['dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 
              'dipho_sublead_ptoM', 'dipho_leadEta', 'dipho_subleadEta', 
              'CosPhi', 'vtxprob', 'sigmarv', 'sigmawv']
allVars = diphoVars + ["dipho_mass"]

# same relative data directory as the other cells of this notebook
f = uproot.open('../data/ggH_powheg_UL_2017.root')
tree = f['vbfTagDumper/trees/ggh_125_13TeV_GeneralDipho']
arrays = tree.arrays(allVars, how=dict)

mask = (  (arrays["dipho_mass"]> min_diphoton_mass) & (arrays["dipho_mass"]< max_diphoton_mass) 
        & (arrays["dipho_leadIDMVA"]>-0.9) & (arrays["dipho_subleadIDMVA"]>-0.9) 
        & (arrays["dipho_lead_ptoM"]>0.333) & (arrays["dipho_sublead_ptoM"]>0.25))

# column_stack needs a real sequence; passing a generator is deprecated in NumPy.
# Only the first 100 selected rows are kept.
x = np.column_stack([ak.to_numpy(arrays[var][mask]) for var in diphoVars])[:100]

print(x.shape)

diphoMatrix = xg.DMatrix(x, feature_names=diphoVars)


y = model.predict(diphoMatrix)

In [None]:
import uproot
import xgboost
import numpy as np
import awkward as ak

# input features of the model, in the order they are stacked into columns
diphoVars  = ['dipho_leadIDMVA', 'dipho_subleadIDMVA', 'dipho_lead_ptoM', 
              'dipho_sublead_ptoM', 'dipho_leadEta', 'dipho_subleadEta', 
              'CosPhi', 'vtxprob', 'sigmarv', 'sigmawv']
allVars = diphoVars + ["dipho_mass"]

f = uproot.open('../data/ggH_powheg_UL_2017.root')
tree = f['vbfTagDumper/trees/ggh_125_13TeV_GeneralDipho']
arrays = tree.arrays(allVars, how=dict)

# mass window plus photon-ID and pt/mass requirements on both photons
mask = (  (arrays["dipho_mass"]>100.) & (arrays["dipho_mass"]<180.) 
        & (arrays["dipho_leadIDMVA"]>-0.9) & (arrays["dipho_subleadIDMVA"]>-0.9) 
        & (arrays["dipho_lead_ptoM"]>0.333) & (arrays["dipho_sublead_ptoM"]>0.25))

# column_stack needs a real sequence; passing a generator is deprecated in NumPy.
# Only the first 100 selected rows are kept.
x = np.column_stack([ak.to_numpy(arrays[var][mask]) for var in diphoVars])[:100]

print(x.shape)

diphoMatrix = xgboost.DMatrix(x, feature_names=diphoVars)

model = xgboost.Booster()
model.load_model('aux-data/altDiphoModel_coffea.model')
y = model.predict(diphoMatrix)

print(x)
print(y)

In [None]:
# Preview the parquet file name DYStudiesProcessor.process derives for this
# chunk of events (same expression as used there).
events.behavior["__events_factory__"]._partition_key.replace('/', '_') + '.parquet'

In [None]:
# Show the metadata attached to the events; the processor reads its "dataset" entry.
events.metadata

In [None]:
# Show the metaconditions dictionary loaded from the JSON file.
metaconditions

In [4]:
from workflows import taggers

In [5]:
# List the names exposed by the workflows.taggers package.
dir(taggers)

['DummyTagger1',
 'DummyTagger2',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__']