# Expression Parsing

Part of predicate push-down is parsing the expressions recorded in the dask task graph and re-building the map of which operation feeds which.

In [1]:
from pathlib import Path
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from dask.distributed import Client
import dask
import dask_awkward
import awkward as ak
import hist.dask as hda
from typing import Dict, List
from dataclasses import dataclass

# Turn on dask-awkward graph optimization and ask for failures to be raised
# (per the option names) so problems surface immediately in this notebook.
dask.config.set(
    {
        "awkward.optimization.enabled": True,
        "awkward.raise-failed-meta": True,
        "awkward.optimization.on-fail": "raise",
    }
)

<dask.config.set at 0x206ab322b90>

Information about each function that can appear in the dask task graph (for now, just the number of arguments it takes).

In [2]:
@dataclass
class func_info:
    """Information recorded about one function; used as the values of ``function_info``."""

    # Number of arguments the function takes (e.g. 1 for 'any', 2 for 'greater').
    n_args: int

# Lookup table keyed by function name; each value records how many arguments
# that function takes.
function_info = {
    fn_name: func_info(arg_count)
    for fn_name, arg_count in (
        ('any', 1),
        ('getitem', 2),
        ('bitwise_and', 2),
        ('greater', 2),
    )
}

Read through the layers of the collection's high-level graph and try to extract the directed graph formed by the layers annotated for push-down.

In [3]:
@dataclass
class node_info:
    """One node of the extracted expression graph, built by ``extract_pushdown``."""

    # Name (key) of the graph layer this node was built from.
    name: str
    # ``__name__`` of the function the layer applies (e.g. 'getitem', 'greater').
    function_name: str
    # Names of the layers feeding this node. NOTE(review): literal operands are
    # passed through unchanged by ``layer_names`` (the notebook output shows an
    # int ``30`` here), so entries are not always strings.
    input_nodes: List[str]

def layer_names(input_names) -> List[str]:
    """Reduce a sequence of task arguments to plain names.

    Args:
        input_names: Iterable of task arguments. Tuple entries are treated as
            keys whose first element is the name; anything else is kept as is.

    Returns:
        A new list, in the same order as the input, holding the first element
        of every tuple entry and every non-tuple entry unchanged.
    """
    return [
        entry[0] if isinstance(entry, tuple) else entry
        for entry in input_names
    ]

def extract_pushdown(coll) -> List[node_info]:
    """Describe the layers of ``coll``'s graph that carry a "pushdown" annotation.

    Args:
        coll: A dask collection exposing its high-level graph as ``coll.dask``.

    Returns:
        One ``node_info`` per annotated layer that is an ``AwkwardBlockwiseLayer``
        but not an ``AwkwardInputLayer``, listed in the reverse of the order
        produced by ``_toposort_layers()``. Any other annotated layer is only
        reported with a printed message and is left out of the result.
    """
    graph = coll.dask
    ordered_keys = graph._toposort_layers()

    # Keep only the layers tagged with a "pushdown" annotation ...
    annotated_keys = []
    for layer_key in ordered_keys:
        layer_annotations = graph.layers[layer_key].annotations
        if layer_annotations is None:
            continue
        if "pushdown" in layer_annotations:
            annotated_keys.append(layer_key)
    # ... and walk them in the opposite order to the sort.
    annotated_keys.reverse()

    nodes: List[node_info] = []
    for layer_key in annotated_keys:
        layer = graph.layers[layer_key]

        # First element of the first task stored in the layer; presumably the
        # callable that the layer applies -- TODO confirm for other layer types.
        first_task = list(layer.dsk.values())[0]
        fcn = first_task[0]

        is_blockwise = isinstance(layer, dask_awkward.layers.AwkwardBlockwiseLayer)
        is_input = isinstance(layer, dask_awkward.layers.AwkwardInputLayer)
        if not is_blockwise or is_input:
            print("Not a good instance", layer_key, fcn)
            continue

        # Everything after the first element of the (layer_key, 0) task is
        # treated as that task's arguments.
        task_args = layer[(layer_key, 0)][1:]
        nodes.append(node_info(layer_key, fcn.fn.__name__, layer_names(task_args)))

    # TODO: the next job is to create a new input layer in place of the first dep.

    return nodes

In [4]:

# Build the example analysis. The layers created inside this ``with`` block are
# annotated with pushdown="servicex"; extract_pushdown selects layers by the
# presence of that "pushdown" annotation.
with dask.annotate(pushdown="servicex"):
    events = NanoEventsFactory.from_root(
        # List the input file once: repeating the same key in a dict literal
        # does not add inputs, later duplicates just overwrite the earlier entry.
        {"nano_dy.root": "Events"},
        metadata={"dataset": "nano_dy"},
        schemaclass=NanoAODSchema,
    ).events()

    # Per-muon selection masks.
    mask_pt = events.Muon.pt > 30
    mask_eta = abs(events.Muon.eta) < 2.4
    # Keep events in which at least one muon passes both cuts.
    events = events[ak.any(mask_pt & mask_eta, axis=1)]

# The histogram is created and filled outside the annotated block.
myhist = hda.Hist.new.Regular(50, 0, 2.5, name="abseta").Double()

myhist.fill(abseta=abs(events.Muon.eta))

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


In [5]:
# Walk the filled histogram's task graph and list the pushdown-annotated nodes.
extract_pushdown(myhist)

Not a good instance from-uproot-1d33ce1279e7835ab27a0a7569d14435 <uproot._dask._UprootOpenAndRead object at 0x00000206D4952110>


[node_info(name='getitem-ee66d0bdfc11447330d3798fd686c954', function_name='getitem', input_nodes=['from-uproot-1d33ce1279e7835ab27a0a7569d14435', 'any-17977bd1efcaf1a970619dfda4e72ba7']),
 node_info(name='any-17977bd1efcaf1a970619dfda4e72ba7', function_name='any', input_nodes=['bitwise-and-20b0b3f5ef1db85cd961fedb5aa64e01']),
 node_info(name='bitwise-and-20b0b3f5ef1db85cd961fedb5aa64e01', function_name='bitwise_and', input_nodes=['greater-35ee09587b66b671244bac1ac757c4bf', 'less-de276097741012a179886604447699c7']),
 node_info(name='greater-35ee09587b66b671244bac1ac757c4bf', function_name='greater', input_nodes=['pt-4e299fb17ee6d067064e659957c85727', 30]),
 node_info(name='pt-4e299fb17ee6d067064e659957c85727', function_name='getitem', input_nodes=['Muon-0403d9d7ff884b0571955813c1db4f5d', 'pt']),
 node_info(name='Muon-0403d9d7ff884b0571955813c1db4f5d', function_name='getitem', input_nodes=['from-uproot-1d33ce1279e7835ab27a0a7569d14435', 'Muon']),
 node_info(name='less-de276097741012a1798