In [1]:
import json
import dask
import dask_awkward as dak
import awkward as ak
import matplotlib.pyplot
from coffea import dataset_tools
from coffea.nanoevents import NanoEventsFactory
import pickle
import pyarrow
import os

In [2]:
with open('../../filelists/hgg_files.txt', 'r') as f:
    hgg_files = [line.strip() for line in f]

In [38]:
hgg_files[0]
entry = '/project01/ndcms/cmoore24/signal/hgg/' + hgg_files[0]
entry

'/project01/ndcms/cmoore24/signal/hgg/HJ_MINLO_Pt-200ToInf_0.root'

In [39]:
samples = {}
samples['Hgg'] = {}
samples['Hgg']['files'] = {}
samples['Hgg']['files'][entry] = {'object_path': 'Events'}

In [5]:
samples

{'Hgg': {'files': {'/project01/ndcms/cmoore24/signal/hgg/HJ_MINLO_Pt-200ToInf_0.root': {'object_path': 'Events'}}}}

In [6]:
samples_ready, samples = dataset_tools.preprocess(
    samples,
    step_size=50_000,
    skip_bad_files=True,
    recalculate_steps=True,
    save_form=False,
)

In [7]:
def repartition(samples, factor):
    # FIXME: not actually copying
    out = {}
    for name, sample in samples.items():
        out[name] = dict(sample)
        out[name]["files"] = dict(out[name]["files"])
        for fname, file in out[name]["files"].items():
            steps = file["steps"]
            if not steps:
                continue
            offsets = [start for start, _ in steps]
            offsets = offsets[::factor] + steps[-1][1:]
            file["steps"] = [
                [start, stop]
                for start, stop in zip(offsets, offsets[1:])
            ]
    return out

In [8]:
def analysis(events):
    dataset = events.metadata["dataset"]
    photonSelect = (
        (events.FatJet.pt > 18)
        & (abs(events.FatJet.eta) < 1.5)
        #& (events.Photon.isScEtaEE | events.Photon.isScEtaEB)
        #& (events.Photon.cutBased >= 1)
    )
    events = events[
        ak.any(photonSelect, axis=1)
    ]
    skim = ak.zip(
        {
            "Jets": events.Jet,
            # "MET": events.MET,
            # "Photon": events.Photon,
        },
        depth_limit=1,
    )
    
    skim_task = dak.to_parquet(
        skim,
        f"./{dataset}",
        compute=False,
    )
    return skim_task

In [9]:
tasks = dataset_tools.apply_to_fileset(
    analysis,
    dataset_tools.slice_files(samples_ready, slice(None, 5)),
    uproot_options={"allow_read_errors_with_report": True},
)

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


In [10]:
out, report = dask.compute(*tasks)

In [11]:
out

{'Hgg': None}

In [12]:
report

{'Hgg': <Array [{call_time: None, duration: 1.46, ...}] type='1 * {call_time: ?unkn...'>}

In [13]:
with open('../../filelists/hgg_files.txt', 'r') as f:
    hgg_files = [line.strip() for line in f]

In [59]:
lists = os.listdir('../../filelists')
lists.remove('.ipynb_checkpoints')

In [73]:
dict = {}
for i in lists:
    with open('../../filelists/' + i, 'r') as f:
        files = [line.strip() for line in f]
    dict[str(i)[:-10]] = {}
    dict[str(i)[:-10]]['files'] = {}
    for j in files:
        if 'hgg' or 'hbb' in i:
            path = '/project01/ndcms/cmoore24/signal/'
        else:
            path = '/project01/ndcms/cmoore/qcd/'
        dict[str(i)[:-10]]['files'][path+i[:-10]+'/'+j] = {'object_path': 'Events'}

In [74]:
with open('output_datasets.json', 'w') as fin:
    json.dump(dict, fin)

In [75]:
with open('output_datasets.json', 'r') as f:
    s2 = json.load(f)

In [76]:
entry = '/project01/ndcms/cmoore24/signal/hgg/' + hgg_files[0]
samples = {}
samples['Hgg'] = {}
samples['Hgg']['files'] = {}
samples['Hgg']['files'][entry] = {'object_path': 'Events'}
# test_ready, test = dataset_tools.preprocess(
#     samples,
#     step_size=50_000,
#     skip_bad_files=True,
#     recalculate_steps=True,
#     save_form=False,
# )

In [77]:
s2['hbb']

{'files': {'/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1-1.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1-2.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_10.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_11.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_12.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_14.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_15.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_16.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_17.root': {'object_path': 'Events'},
  '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_18.root': {'object_path': 'Events'},
  '/project

In [83]:
test = {}
test['hbb'] = s2['hbb']

In [79]:
samples

{'Hgg': {'files': {'/project01/ndcms/cmoore24/signal/hgg/HJ_MINLO_Pt-200ToInf_0.root': {'object_path': 'Events'}}}}

In [84]:
test

{'hbb': {'files': {'/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1-1.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1-2.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_1.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_10.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_11.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_12.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_14.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_15.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_16.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_17.root': {'object_path': 'Events'},
   '/project01/ndcms/cmoore24/signal/hbb/nano_mc2017_18.root': {'object_path': 'Eve

In [81]:
test_ready, test = dataset_tools.preprocess(
    test,
    step_size=50_000,
    skip_bad_files=True,
    recalculate_steps=True,
    save_form=False,
)

In [82]:
@dask.delayed
def preprocess(set):
    preproed = dataset_tools.preprocess(
        set,
        step_size=50_000,
        skip_bad_files=True,
        recalculate_steps=True,
        save_form=False,
    )
    return preproed

In [85]:
test_ready, test = preprocess(test)

TypeError: Delayed objects of unspecified length are not iterable