## Configuration

In [1]:
# ==================== ANALYSIS CONFIGURATION ====================

# Data Configuration
SAMPLES_PATH = '/users/mislam5/tmp-data/dv5-small'  # Path to input ROOT files
TRIGGERS_FILE = 'triggers.json'  # Path to triggers configuration
SAMPLES_READY_FILE = 'samples_ready.json'  # Preprocessed samples metadata

# Dataset Selection
PROCESS_ALL_DATASETS = False  # Set True to process all datasets, False for specific dataset
SUB_DATASET = 'hgg'  # Which dataset to process (e.g., 'hgg', 'hbb', 'qcd', etc.)

# Analysis Parameters
ECF_UPPER_BOUND = 3  # Calculate ECFs from n=2 to n=ECF_UPPER_BOUND (max: 6)
STEP_SIZE = 50_000  # Number of events per chunk for preprocessing

# Output Configuration
OUTPUT_DIR = 'output'  # Base directory for output parquet files

# TaskVine/Dask Configuration
# MANAGER_NAME = 'dv5-ecf-manager'  # Name for the TaskVine manager
# PORT_RANGE = [9123, 9128]  # Port range for TaskVine manager
RUN_INFO_PATH = 'vine-run-info'  # Directory for TaskVine logs
STAGING_PATH = '/tmp/ecf-staging'  # Temporary staging directory

# Processing Options
PREPROCESS_DATA = True  # Set True to run preprocessing step
SHOW_SAMPLES = False  # Set True to display available samples and exit

# Advanced TaskVine Tuning (optional)
MAX_WORKERS = 30000  # Maximum number of workers
MAX_RETRIEVALS = 10  # Max concurrent result retrievals
TEMP_REPLICA_COUNT = 1  # Replication count for temp files
PRUNE_DEPTH = 0  # Task graph pruning depth

print("Configuration loaded!")
print(f"Dataset: {SUB_DATASET if not PROCESS_ALL_DATASETS else 'ALL'}")
print(f"ECF range: n=2 to n={ECF_UPPER_BOUND}")

Configuration loaded!
Dataset: hgg
ECF range: n=2 to n=3


## Import Libraries

In [2]:
import json
import os
import sys
import time
import warnings

import dask
import dask_awkward as dak
import awkward as ak
import numpy as np

from coffea import dataset_tools
from coffea.nanoevents import PFNanoAODSchema
from ndcctools.taskvine.compat import DaskVine

# Import helper functions
from ecf_helpers import (
    preprocess_data,
    filter_existing_files,
    show_available_samples,
    analysis
)

# Suppress warnings
warnings.filterwarnings("ignore", "Found duplicate branch")
warnings.filterwarnings("ignore", "Missing cross-reference index for")
warnings.filterwarnings("ignore", "dcut")
warnings.filterwarnings("ignore", "Please ensure")
warnings.filterwarnings("ignore", "invalid value")

print("Libraries imported successfully!")

Libraries imported successfully!


## Initialize TaskVine Manager

In [3]:
#manager_name = f"{os.environ['USER']}-makeDF_2018_mc";
manager_name = os.environ.get("VINE_MANAGER_NAME")
print(manager_name)
ports_str = os.environ.get("VINE_MANAGER_PORTS", "9123, 9150")
ports = [int(p.strip()) for p in ports_str.split(",")]

if len(ports) == 1:
    ports = ports[0]
else:
    ports = [int(p) for p in ports]

print(f"Manager Ports: {ports}")

floability-406dd42c-53d4-4b70-9ee5-f563cd303715
Manager Ports: [9123, 9150]


In [4]:
# Create TaskVine manager for distributed computing
m = DaskVine(
    ports,
    name=manager_name,
    run_info_path=RUN_INFO_PATH,
    staging_path=STAGING_PATH,
)

# Configure TaskVine settings
m.tune("max-workers", MAX_WORKERS)
m.tune("max-retrievals", MAX_RETRIEVALS)
m.tune("transient-error-interval", 1)
m.tune("worker-source-max-transfers", 10000)
m.tune("transfer-temps-recovery", 0)
m.tune("attempt-schedule-depth", 100)
m.tune("watch-library-logfiles", 1)
m.tune("temp-replica-count", TEMP_REPLICA_COUNT)

0

In [5]:
print(f"TaskVine manager '{m.name}' initialized")
print(f"Listening on ports: {m.port}")
print(f"Run info path: {RUN_INFO_PATH}")

TaskVine manager 'floability-406dd42c-53d4-4b70-9ee5-f563cd303715' initialized
Listening on ports: 9123
Run info path: vine-run-info


## Data Preprocessing (Optional)

Run this cell if you need to preprocess the data files. This step:
- Scans the input directory structure
- Creates metadata for all ROOT files
- Saves results to `samples_ready.json`

Skip this if `samples_ready.json` already exists.

In [6]:
if PREPROCESS_DATA:
    print("Starting data preprocessing...")
    start_time = time.time()
    
    samples_ready = preprocess_data(
        SAMPLES_PATH,
        step_size=STEP_SIZE,
        manager=m
    )
    
    # Save preprocessed samples
    with open(SAMPLES_READY_FILE, 'w') as fout:
        json.dump(samples_ready, fout)
    
    elapsed = (time.time() - start_time) / 60
    print(f"Preprocessing complete! Time: {elapsed:.2f} minutes")
    print(f"Saved to: {SAMPLES_READY_FILE}")
else:
    print("Skipping preprocessing (PREPROCESS_DATA=False)")

Starting data preprocessing...
categories = ['hgg_1']
Preprocessing samples...
Computing preprocessing tasks...
Preprocessing complete!
Preprocessing complete! Time: 0.21 minutes
Saved to: samples_ready.json


## Load Preprocessed Samples

In [7]:
# Load samples metadata
if not os.path.exists(SAMPLES_READY_FILE):
    print(f"Error: {SAMPLES_READY_FILE} not found!")
    print("Please run preprocessing first (set PREPROCESS_DATA=True)")
    raise FileNotFoundError(SAMPLES_READY_FILE)

with open(SAMPLES_READY_FILE, 'r') as fin:
    samples_ready = json.load(fin)

print(f"Loaded {len(samples_ready)} sample categories")

Loaded 1 sample categories


## Show Available Samples (Optional)

In [8]:
if SHOW_SAMPLES:
    show_available_samples(samples_ready)

## Filter and Select Datasets

In [14]:
# Filter to only include files that exist
filtered_samples = filter_existing_files(samples_ready)

print(filtered_samples)

if not filtered_samples:
    print("Error: No valid files found in any dataset.")
    raise ValueError("No valid files")
PROCESS_ALL_DATASETS = True
# Select which datasets to process
if PROCESS_ALL_DATASETS:
    samples_to_process = filtered_samples
    print(f"Processing ALL datasets ({len(samples_to_process)} total)")
else:
    if SUB_DATASET not in filtered_samples:
        print(f"Error: Dataset '{SUB_DATASET}' not found!")
        print("Available datasets:")
        for name in filtered_samples.keys():
            print(f"  - {name}")
        raise ValueError(f"Dataset '{SUB_DATASET}' not found")
    
    samples_to_process = {SUB_DATASET: filtered_samples[SUB_DATASET]}
    print(f"Processing dataset: {SUB_DATASET}")

# Show file counts
print("\nSamples to process:")
for name, item in samples_to_process.items():
    print(f"  {name}: {len(item['files'])} files")

{'hgg_1': {'files': {'/users/mislam5/tmp-data/dv5-small/hgg_1/HJ_MINLO_Pt-200ToInf_9.root': {'object_path': 'Events', 'steps': [[0, 3000]], 'num_entries': 3000, 'uuid': '6f4df38e-6f58-11ed-abce-b0cde183beef'}, '/users/mislam5/tmp-data/dv5-small/hgg_1/HJ_MINLO_Pt-200ToInf_10.root': {'object_path': 'Events', 'steps': [[0, 3000]], 'num_entries': 3000, 'uuid': '653e7d32-6f58-11ed-9a32-c8cde183beef'}, '/users/mislam5/tmp-data/dv5-small/hgg_1/HJ_MINLO_Pt-200ToInf_1.root': {'object_path': 'Events', 'steps': [[0, 3000]], 'num_entries': 3000, 'uuid': '6556187a-6f58-11ed-8669-9bcde183beef'}, '/users/mislam5/tmp-data/dv5-small/hgg_1/HJ_MINLO_Pt-200ToInf_2.root': {'object_path': 'Events', 'steps': [[0, 2969]], 'num_entries': 2969, 'uuid': '5d79fde2-6f58-11ed-9c10-c2cde183beef'}, '/users/mislam5/tmp-data/dv5-small/hgg_1/HJ_MINLO_Pt-200ToInf_3.root': {'object_path': 'Events', 'steps': [[0, 3000]], 'num_entries': 3000, 'uuid': '6a4fb34a-6f58-11ed-a1fe-b9cde183beef'}, '/users/mislam5/tmp-data/dv5-smal

## Create Analysis Tasks

In [16]:
print("Creating analysis tasks...")

# Create a wrapper function with configured parameters
def analysis_wrapper(events):
    return analysis(
        events,
        ecf_upper_bound=ECF_UPPER_BOUND,
        triggers_file=TRIGGERS_FILE
    )

# Apply analysis to all selected datasets
tasks = dataset_tools.apply_to_fileset(
    analysis_wrapper,
    samples_to_process,
    uproot_options={"allow_read_errors_with_report": False},
    schemaclass=PFNanoAODSchema,
)

print(f"Analysis tasks created for {len(samples_to_process)} dataset(s)")

Creating analysis tasks...
Processing dataset: hgg_1
Signal: Higgs jets
Analysis tasks created for 1 dataset(s)


## Execute Analysis

This cell runs the distributed computation. Make sure workers are connected to the TaskVine manager.

**To start workers:**
```bash
vine_worker -M <manager_name> --cores 8 --memory 16000
```

In [20]:
!conda install -y conda-pack

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.9.1
    latest version: 26.1.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /users/mislam5/floability-base-dir/floability_instance_20260206_195706_796139/current_conda_env

  added / updated specs:
    - conda-pack


The following NEW packages will be INSTALLED:

  conda-pack         conda-forge/noarch::conda-pack-0.9.1-pyhcf101f3_0 



Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [23]:
!conda install -y threadpoolctl

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.9.1
    latest version: 26.1.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /users/mislam5/floability-base-dir/floability_instance_20260206_195706_796139/current_conda_env

  added / updated specs:
    - threadpoolctl


The following NEW packages will be INSTALLED:

  threadpoolctl      conda-forge/noarch::threadpoolctl-3.6.0-pyhecae5ae_0 



Downloading and Extracting Packages:

Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [None]:
print("="*60)
print("Starting computation...")
print("="*60)

start_time = time.time()

# Execute the analysis
computed = dask.compute(
    tasks,
    scheduler=m.get,
    resources_mode=None,
    prune_depth=PRUNE_DEPTH,
    worker_transfers=True,
    resources={"cores": 1},
    # lib_resources={'cores': 16, 'slots': 16},
    # task_mode="function-calls",
)

execution_time = time.time() - start_time

print("="*60)
print(f"COMPUTATION COMPLETE!")
print(f"Total execution time: {execution_time:.2f} seconds ({execution_time/60:.2f} minutes)")
print(f"Output saved to: {OUTPUT_DIR}/")
print("="*60)

Starting computation...


Output()

## Verify Output

In [None]:
# Check output directory
if os.path.exists(OUTPUT_DIR):
    print(f"\nOutput directory: {OUTPUT_DIR}")
    print("\nDatasets processed:")
    for item in os.listdir(OUTPUT_DIR):
        item_path = os.path.join(OUTPUT_DIR, item)
        if os.path.isdir(item_path):
            files = [f for f in os.listdir(item_path) if f.endswith('.parquet')]
            print(f"  {item}: {len(files)} parquet files")
else:
    print(f"Output directory {OUTPUT_DIR} not found")

## Load and Inspect Results (Optional)

In [None]:
# Example: Load results for inspection
import pyarrow.parquet as pq

dataset_name = SUB_DATASET
output_path = os.path.join(OUTPUT_DIR, dataset_name)

if os.path.exists(output_path):
    # Read parquet files
    result_table = pq.read_table(output_path)
    result_df = result_table.to_pandas()
    
    print(f"\nResults for {dataset_name}:")
    print(f"Shape: {result_df.shape}")
    print(f"\nColumns: {list(result_df.columns)}")
    print(f"\nFirst few rows:")
    display(result_df.head())
else:
    print(f"No output found for {dataset_name}")

## Summary

This notebook processed CMS NanoAOD data and calculated Energy Correlation Functions (ECFs) for jet substructure analysis.

**Key outputs:**
- ECFs calculated from n=2 to configured upper bound
- Color ring variable
- Jet kinematics (pT, mass, Î·)
- ParticleNet scores
- Gen-matching information (for MC samples)