In [None]:
"""
    This script loads and configures a processor in processors/, running it either locally or with TaskVine.
    The output parquet files are saved in data/results/{processor}/{year}/{type}/{sample}.parq.
    Run this script for each sample_json/NanoAODUL_{year}_{type}.json.
    To process specific datasets or root files, adjust the JSON files as needed.
    Use screen_run.py to automatically run this script through multiple JSONs.
    Raymond Kil, September 2025 (jkil@nd.edu)
"""
import os, glob, json, warnings
import pandas as pd
import importlib.util
from coffea import processor
from coffea.nanoevents import NanoAODSchema

In [None]:
### Directory Setup ###
# The script-relative lookup below stays disabled (presumably because `__file__`
# is not available inside a notebook kernel -- TODO confirm); the output location
# is therefore a path relative to the current working directory.
#scriptPath = os.path.dirname(os.path.abspath(__file__))
outputPath = "data/results/makeDF/2018/mc/"
# Create the full directory chain; exist_ok lets this cell be re-run safely.
os.makedirs(name=outputPath, exist_ok=True)

In [None]:
### TaskVine Setup ###
#manager_name = f"{os.environ['USER']}-makeDF_2018_mc"
def parse_ports(ports_str, default=(9123, 9150)):
    """Convert a comma-separated port string into a list of ints.

    Parameters
    ----------
    ports_str : str
        Text such as "9123, 9150" or "9123". Whitespace around each entry and
        empty entries (e.g. from a trailing comma) are ignored.
    default : sequence of int
        Ports returned when ``ports_str`` contains no entries at all, e.g. when
        the environment variable is set but empty.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If any non-empty entry is not an integer.
    """
    tokens = [tok.strip() for tok in ports_str.split(",")]
    tokens = [tok for tok in tokens if tok]
    if not tokens:
        # "".split(",") yields [""], which int() cannot parse; use the default instead.
        return list(default)
    try:
        return [int(tok) for tok in tokens]
    except ValueError as err:
        raise ValueError(
            f"VINE_MANAGER_PORTS must be comma-separated integers, got {ports_str!r}"
        ) from err


# Both settings come from the environment; manager_name is None when
# VINE_MANAGER_NAME is not set.
manager_name = os.environ.get("VINE_MANAGER_NAME")
print(manager_name)
ports_str = os.environ.get("VINE_MANAGER_PORTS", "9123, 9150")
ports = parse_ports(ports_str)

In [None]:
# Collapse a one-element list to a bare int; longer lists are kept as a list of ints
# (presumably interpreted by TaskVine as a port range -- confirm against the coffea docs).
# The isinstance guard keeps this cell re-runnable: after a first run `ports` may
# already be an int, and calling len() on an int raises TypeError.
if isinstance(ports, (list, tuple)):
    if len(ports) == 1:
        ports = ports[0]
    else:
        ports = [int(p) for p in ports]

In [None]:
# Show the port setting (an int or a list of ints) that is handed to the executor.
print("Manager Ports:", ports)

In [None]:
# Executor selection: TaskVine when the flag is set, otherwise a FuturesExecutor.
# The flag is currently hard-wired to the TaskVine branch.
use_taskvine = True
progress_label = 'Processing makeDF'
if not use_taskvine:
    executor_args = {'workers': 10, 'desc': progress_label}
    executor = processor.FuturesExecutor(**executor_args)
else:
    executor_args = dict(
        desc=progress_label,
        manager_name=manager_name,
        port=ports,
#        environment_file=f"{scriptPath}/lfv-coffea-env.tar.gz",
        # everything directly under utils/ is passed along as an extra input file
        extra_input_files=glob.glob("utils/*"),
        retries=5,
        fast_terminate_workers=0,
    )
    executor = processor.TaskVineExecutor(**executor_args)

In [None]:
# os.cpu_count() returns None when the core count cannot be determined;
# int(None) would raise TypeError, so report "unknown" in that case.
n_cores = os.cpu_count()
cores_label = "unknown" if n_cores is None else int(n_cores)
# The worker count shown here is a fixed literal, matching the 'workers': 10
# setting of the FuturesExecutor branch.
print(f"nCores  : {cores_label}\nnWorkers: 10")

In [None]:
# Sample name -> list of ROOT files to run over.
samples = {
    "tree": [
        "data/samples/test/tree_1.root",
    ],
}

In [None]:
# module
# Load the processor source file by path and execute it as a module object
# registered under the name 'my_processor'.
processorpath = 'processors/makeDF.py'
spec = importlib.util.spec_from_file_location(name='my_processor', location=processorpath)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)

In [None]:
# processor
# Silence only the "Missing cross-reference..." warnings raised from the NanoAOD schema module.
warnings.filterwarnings(
    action="ignore",
    message=r"Missing cross-reference.*",
    module="coffea.nanoevents.schemas.nanoaod",
)
processor_instance = module.my_processor(year="2018", type="mc")
# Collect the Runner settings in one place, then build the runner from them.
runner_options = dict(
    executor=executor,
    schema=NanoAODSchema,
    maxchunks=None,
    chunksize=20000,
    xrootdtimeout=900,
    skipbadfiles=False,
)
run = processor.Runner(**runner_options)

In [None]:
### Running & Saving ###
result = run(samples, treename="Events", processor_instance=processor_instance)
for sampleName in samples.keys():
    # Keys of `result` are indexed as key[0] / key[1]; this assumes they are
    # (column, sample) pairs -- confirm against processors/makeDF.py.
    # Match the sample name exactly: a substring test would also pull in
    # entries from any other sample whose name merely contains this one
    # (e.g. "tree" vs "tree_2").
    dicts = {key[0]: value for key, value in result.items() if key[1] == sampleName}
    # One-row frame: each column holds the value accumulated for that key.
    parqResult = pd.DataFrame([dicts])
    # os.path.join avoids the "//" that arises when outputPath ends with a slash.
    parqPath = os.path.join(outputPath, f"{sampleName}.parq")
    parqResult.to_parquet(parqPath)
    print(f"Results saved in {parqPath}")