In [1]:
# ========================
# Step 1: Imports
# ========================

import os
import gzip
import json
import dask
import awkward as ak
import numpy as np
import sys
import importlib

from coffea.nanoevents import NanoAODSchema
from coffea.dataset_tools import apply_to_fileset, max_chunks, max_files
from dask.distributed import Client
from pathlib import Path

# ------------------------
# Paths and run configuration
# ------------------------
current_dir = Path.cwd()

# Processors are looked up in the notebook's own working directory (they used
# to live in a sibling "processors" directory, which is no longer the case).
processors_dir = current_dir
# Directory containing the run-selection JSON (parent of the working directory).
path_to_run_on = current_dir.parent
# Directory containing the master dataset JSON; preprocessed fileset paths are
# resolved relative to it as well.
path_to_master_json = current_dir.parent.parent / "src" / "dataset_tools"

# Make the processor importable by name. The membership check keeps repeated
# runs of this cell from piling up duplicate sys.path entries.
if str(processors_dir) not in sys.path:
    sys.path.append(str(processors_dir))

processor_name = "test_processor" # without .py
run_on_name = "run_on_custom.json"

# Import the processor module dynamically so switching processors only means
# changing `processor_name` above.
processor_module = importlib.import_module(processor_name)
Processor = processor_module.Processor

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


/home/cms-jovyan/dwg_experimental/src


In [2]:
import json

# Master catalogue of datasets, indexed as datasets[AOD_type][year][sample_name].
# JSON is read as UTF-8 explicitly instead of relying on the platform locale.
with open(path_to_master_json / "datasets_master.json", encoding="utf-8") as master_file:
    datasets = json.load(master_file)

# Run selection: same nesting as the catalogue, with per-sample run options.
with open(path_to_run_on / run_on_name, encoding="utf-8") as run_on_file:
    run_on = json.load(run_on_file)
#print(run_on)

In [3]:
import cloudpickle

# ========================
# Run the processor over every sample enabled in the run-selection JSON
# ========================

# Address of the Dask scheduler used when a sample sets "use_client".
SCHEDULER_ADDRESS = "tls://localhost:8786"
# Forwarded to uproot via apply_to_fileset. Going by the option name, these
# exception types are reported rather than raised -- confirm in the coffea docs.
UPROOT_OPTIONS = {"allow_read_errors_with_report": (OSError, KeyError)}


def _select_fileset(entry, preprocessed_file):
    """Return the fileset to run on, trimmed when ``reduced_computation`` is set.

    When ``entry['reduced_computation']`` is truthy the fileset is limited with
    ``max_files`` and then ``max_chunks`` using ``entry['num_files']`` and
    ``entry['num_chunks']``. Otherwise ``preprocessed_file`` is returned
    unchanged and those two keys are not read.
    """
    if not entry['reduced_computation']:
        return preprocessed_file
    limited_files = max_files(preprocessed_file, entry['num_files'])
    return max_chunks(limited_files, entry['num_chunks'])


def _process_sample(entry, preprocessed_file):
    """Build the task graph for one sample and compute it.

    A distributed ``Client`` is opened only when ``entry['use_client']`` is
    truthy, and it is always closed again, even if building or computing the
    graph raises. No connection is attempted for samples that do not ask for it.

    Returns:
        The ``(result, report)`` pair produced by ``dask.compute``.
    """
    sample_client = None
    if entry["use_client"]:
        sample_client = Client(SCHEDULER_ADDRESS)
        print("using client")
    try:
        tg, rep = apply_to_fileset(
            data_manipulation=Processor(),
            fileset=_select_fileset(entry, preprocessed_file),
            schemaclass=NanoAODSchema,
            # Pass a copy so the shared constant can never be mutated downstream.
            uproot_options=dict(UPROOT_OPTIONS),
        )
        print("sample loaded, running, hold please.........")
        return dask.compute(tg, rep)
    finally:
        if sample_client is not None:
            sample_client.close()


# sample_name -> processor output for every sample that actually ran.
# NOTE: keyed by sample name only, so the same name under two years overwrites
# the earlier entry (the pickle files below behave the same way).
results = {}
pikl_result_path = Path('pikls') / processor_name

for AOD_type, samples_by_year in run_on.items():
    for year, samples in samples_by_year.items():
        for sample_name, entry in samples.items():
            print(f"sample in json: {sample_name}")

            if not entry['run']:
                print('skipping')
                continue

            # The catalogue stores the preprocessed fileset as a gzipped JSON
            # path relative to the master-JSON directory.
            preprocessed_file_path = (
                path_to_master_json /
                datasets[AOD_type][year][sample_name]['preprocessed_file']
            )
            with gzip.open(preprocessed_file_path, "rt") as f:
                preprocessed_file = json.load(f)

            # `result` stays bound after the loop (last processed sample), which
            # the inspection cells further down rely on.
            result, report = _process_sample(entry, preprocessed_file)
            results[sample_name] = result

            # Created lazily so no directory appears when nothing is run.
            pikl_result_path.mkdir(parents=True, exist_ok=True)
            with open(pikl_result_path / f"{sample_name}.pkl", "wb") as f:
                cloudpickle.dump(result, f)

            # TODO: `report` is currently not persisted anywhere.
            print(f"done with {sample_name}")

sample in json: SlepSnu_MN1-260_MN2-280_MC1-270
skipping
sample in json: SlepSnu_MN1-220_MN2-260_MC1-240
skipping
sample in json: SlepSnu_MN1-270_MN2-280_MC1-275


TypeError: QuickConstruct.Regular() missing 2 required positional arguments: 'start' and 'stop'

In [None]:
# `result` is the variable left behind by the run loop, i.e. the output of the
# last sample that was processed; pick one dataset out of it by its full name.
dataset_key = 'SlepSnuCascade_MN1-270_MN2-280_MC1-275_TuneCP5_13p6TeV_madgraphMLM-pythia8130X_mcRun3_2023_realistic_postBPix_v6-v3'
r = result[dataset_key]

In [None]:
# Display the selected dataset's output as this cell's result.
r

In [None]:
# Reduce the stored histogram to its 'pt' and 'qual_tag' axes and plot it.
pt_eta_hist = r['test_dict']['pt_eta_hist']
pt_eta_hist.project('pt', 'qual_tag').plot()

In [9]:
# Evenly spaced values from 15 to 100 (inclusive) in steps of 2.5, printed as a
# single comma-separated line.
first_value, last_value, spacing = 15, 100, 2.5
value_count = int((last_value - first_value) / spacing) + 1

values = []
for k in range(value_count):
    values.append(first_value + spacing * k)

print(",".join(map(str, values)))

15.0,17.5,20.0,22.5,25.0,27.5,30.0,32.5,35.0,37.5,40.0,42.5,45.0,47.5,50.0,52.5,55.0,57.5,60.0,62.5,65.0,67.5,70.0,72.5,75.0,77.5,80.0,82.5,85.0,87.5,90.0,92.5,95.0,97.5,100.0
