This is a quick demo of observed memory leakage in coffea 

In [1]:
# Record the versions of the packages involved so the observation can be reproduced.
import coffea
import distributed
import uproot

print(f"coffea verion: {coffea.__version__}")
print(f"distributed verion: {distributed.__version__}")
print(f"uproot verion: {uproot.__version__}")

coffea verion: 2024.3.0
distributed verion: 2024.3.0
uproot verion: 5.3.1


In [2]:
import numpy as np
import awkward as ak
import dask_awkward as dak
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
from distributed import Client, performance_report
import json 
import glob
import os
import tqdm
import time
from itertools import islice
import copy
import dask
from coffea.dataset_tools import (
    max_chunks
)

def divide_chunks(data: dict, SIZE: int):
    """
    Lazily split a dictionary into consecutive sub-dictionaries of at most SIZE entries.

    Used here to break one dataset's mapping of ROOT files into smaller sets of
    files. Similar in spirit to the ``max_files`` helper in
    ``coffea.dataset_tools``, but not exactly the same.

    Parameters
    ----------
    data : dict
        Mapping to split; iteration order of the keys is preserved.
    SIZE : int
        Maximum number of entries per yielded chunk. Must be >= 1.

    Yields
    ------
    dict
        Sub-dictionaries holding up to SIZE items of ``data``; only the last
        chunk can be shorter.

    Raises
    ------
    ValueError
        If SIZE is smaller than 1. Raised on first iteration, because this is
        a generator.
    """
    # A negative SIZE would make range() empty and silently drop every entry,
    # so reject it explicitly (SIZE == 0 already raised ValueError via range()).
    if SIZE < 1:
        raise ValueError(f"SIZE must be a positive integer, got {SIZE}")
    it = iter(data)
    # One pass per chunk; islice pulls the next SIZE keys from the shared iterator.
    for _ in range(0, len(data), SIZE):
        yield {k: data[k] for k in islice(it, SIZE)}

If do_regular_restart == False, we observe a memory leak, as recorded in withOutRestart_output.log.
If do_regular_restart == True, the operation continues, as recorded in withRestart_output.log (the run was manually cancelled partway through, since it had already progressed far beyond the point reached with do_regular_restart == False).

In [3]:
# Switch between the two scenarios being compared: with True, the worker is
# restarted after every processed chunk of files.
do_regular_restart = False

# One worker process with a single thread and a fixed memory limit
# (the 0.7 GiB variant is kept for reference).
# client = Client(n_workers=1,  threads_per_worker=1, processes=True, memory_limit='0.7 GiB')
client = Client(n_workers=1, threads_per_worker=1, processes=True, memory_limit='1.0 GiB')

# sample_path = "./input_file.json"
sample_path = "./input_file_local.json"
with open(sample_path) as file:
    samples = json.load(file)
samples = max_chunks(samples, 20)

# Number of ROOT files handed to NanoEventsFactory per iteration.
max_file_len = 1

# Sadly, the dask performance report doesn't record memory usage beyond the
# first minute, so it is not very useful for recording memory leakage.
with performance_report(filename="dask-report.html"):
    for dataset, sample in tqdm.tqdm(samples.items()):
        smaller_files = list(divide_chunks(sample["files"], max_file_len))
        for file_chunk in tqdm.tqdm(smaller_files, leave=False):
            # Work on a copy so the original sample keeps its full file list.
            smaller_sample = copy.deepcopy(sample)
            smaller_sample["files"] = file_chunk
            events = NanoEventsFactory.from_root(
                smaller_sample["files"],
                schemaclass=NanoAODSchema,
                metadata=smaller_sample["metadata"],
                # uproot_options={"handler" : uproot.XRootDSource}
            ).events()
            nmuons = ak.num(events.Muon, axis=1)
            # `&` binds tighter than `>` and `==`, so each comparison must be
            # parenthesised; without the parentheses the expression parses as
            # the chained comparison `pt > (20 & nmuons) == 2`.
            muon_selection = (
                (events.Muon.pt > 20)
                & (nmuons == 2)
            )
            muons = events.Muon[muon_selection]
            dask.compute(muons.pt)
            if do_regular_restart:
                client.restart(timeout=10, wait_for_workers=False)

  0%|          | 0/1 [00:00<?, ?it/s]
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector
Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector




























































































100%|██████████| 92/92 [57:30<00:00, 22.75s/it][A
100%|██████████| 1/1 [57:30<00:00, 3450.76s/it][A


The script below runs essentially the same code as above, but with a larger list of ROOT files.

In [None]:
! nohup python memleakBig_test.py &> output.log