In [1]:
import traceback
import awkward as ak
import dask
import uproot
from dask.distributed import Client, performance_report
import utils  # worker count tracking

# Report the library versions seen by the notebook kernel itself.
print(f"awkward: {ak.__version__}")
print(f"uproot: {uproot.__version__}")

# Connect to the Dask scheduler (TLS endpoint on the local host):
scheduler_address = "tls://localhost:8786"
client = Client(scheduler_address)

awkward: 2.6.3
uproot: 5.3.11.dev3+g2a20562


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
# ROOT files to work with (remote XRootD URLs); extend this list to process more files:
all_files = [
    "root://eospublic.cern.ch//eos/root-eos/AGC/rntuple/nanoAOD/TT_TuneCUETP8M1_13TeV-amcatnlo-pythia8/cmsopendata2015_ttbar_19978_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1_60000_0004.root",
]


In [3]:
# define work to be done
def uproot_open_materialize(fname):
    """Open ``fname`` with uproot, look up its "Events" object, and report the uproot version.

    The function is meant to be run both directly and inside a Dask task so the
    uproot version seen by each execution environment can be compared.

    Parameters
    ----------
    fname : str
        Path or URL passed straight to ``uproot.open``.

    Returns
    -------
    dict
        ``{"uproot_ver": <version string>}`` where the version is
        ``uproot.__version__`` of the process that executed the call.

    Raises
    ------
    Exception
        Any error raised while opening the file or looking up "Events" is
        printed as a formatted traceback and then re-raised unchanged.
        ``KeyboardInterrupt`` / ``SystemExit`` are not intercepted.
    """
    try:
        with uproot.open(fname) as f:
            # Only the lookup is performed here; nothing is read from the object.
            events = f["Events"]
            # num_entries = events.num_entries  # This fails in dask tasks due to the different uproot version used
    except Exception:
        # Print the traceback where the call ran, then let the caller see the original error.
        print(traceback.format_exc())
        raise
    return {"uproot_ver": str(uproot.__version__)}

In [4]:
# Run the same function directly in the notebook kernel (no Dask involved):
output = list(map(uproot_open_materialize, all_files))
print(f"Uproot version in non-dask tasks: {output[0]['uproot_ver']}")

Uproot version in non-dask tasks: 5.3.11.dev3+g2a20562


In [5]:
# Run the same function as Dask delayed tasks, recording a performance report:
with performance_report(filename="dask-report-plain-uproot.html"):
    # One delayed call per input file; nothing executes until dask.compute().
    tasks = [dask.delayed(uproot_open_materialize)(path) for path in all_files]
    results = dask.compute(*tasks)
    out = ak.Array(results)
    first_record = ak.to_list(out)[0]
    print("Uproot version in dask tasks: ", first_record["uproot_ver"])

print("Finished notebook")

Uproot version in dask tasks:  5.3.2
Finished notebook
