In [1]:
from coffea.nanoevents import NanoAODSchema
from coffea.dataset_tools import apply_to_fileset, max_chunks, max_files, preprocess

import dask
import numpy as np
import awkward as ak

from template_processor import TestProcessor

from dask.distributed import Client

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


In [2]:
import gzip
import json
import os
# Define the base directory where the preprocessed files are stored (preprocessed files point to sample on DAS and define slicing of root files by event (chunks)
base_dir = "../dataset_tools/preprocessing/preprocessed"
sample = "2023_ttbar_100000_preprocessed_available.json.gz"
#sample = "2023_SlepSnu_MN1_220_100000_preprocessed_available.json.gz" # These preprocessed files are generated one time in advance, stored until needed now in analysis
file_path = os.path.join(base_dir, sample)

ntuple_name = sample.replace("_100000_preprocessed_available.json.gz", "_dwg_ntuple")

with gzip.open(file_path, "rt") as file:
    preprocessed_available = json.load(file)


In [3]:
#client = Client("tls://localhost:8786")
#client

In [4]:
### SWITCH HERE ###

reduced_computation = True

num_files = 1 # number of root files from DAS to run over
num_chunks = 3# number of events (chunks) per root file to run over (chunksize set during preprocessing, my default is 1 chunk = 100,000 events)

###################

In [5]:
if reduced_computation:
    
    test_preprocessed_files = max_files(preprocessed_available, num_files)
    test_preprocessed = max_chunks(test_preprocessed_files, num_chunks)

    small_tg, small_rep = apply_to_fileset(
        data_manipulation=TestProcessor(),
        fileset=test_preprocessed,
        schemaclass=NanoAODSchema,
        uproot_options={"allow_read_errors_with_report": (OSError, KeyError)},
    )
    computed, rep = dask.compute(small_tg, small_rep)

    
else:
    full_tg, full_rep = apply_to_fileset(
        data_manipulation=TestProcessor(),
        fileset=preprocessed_available,
        schemaclass=NanoAODSchema,
        uproot_options={"allow_read_errors_with_report": (OSError, KeyError)},
    )
    computed, rep = dask.compute(full_tg, full_rep)


In [6]:
computed.keys()

dict_keys(['/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM'])

In [7]:
sample_name = next(iter(computed))
sample_name 

'/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM'

In [8]:
results = computed[sample_name]

In [9]:
results #did this to get in front of the massive sample name, work with the results dictionary from here forward for simplicity

{'ntuple': {'num_tot_Events': 183000,
  'num_tot_ele': 234341,
  'num_tot_lpte': 251027,
  'num_tot_mu': 238525,
  'dataset': '/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM',
  'Electron': {'pt': <Array [[45.7], [10.6], [], ..., [156], [122]] type='183000 * var * float32...'>,
   'eta': <Array [[-0.0922], [0.0165], ..., [1.96]] type='183000 * var * float32[para...'>,
   'phi': <Array [[-0.397], [2.92], ..., [-1.32]] type='183000 * var * float32[parame...'>,
   'mass': <Array [[3.58e-07], ..., [-5.39e-06]] type='183000 * var * float32[paramete...'>,
   'charge': <Array [[-1], [-1], [], [], ..., [], [1], [1]] type='183000 * var * int32[p...'>,
   'dxy': <Array [[-0.0009], ..., [-0.00234]] type='183000 * var * float32[parameters...'>,
   'dz': <Array [[0.000651], [-0.00223], ..., [0.00699]] type='183000 * var * float3...'>,
   'pfRelIso03_all': <Array [[0.0863], [2.18], ..., [0.00192], [0]] type='183000 * var * float32...'>,


In [10]:
ak.sum(
results['ntuple']['LowPtElectron']['isBaseline']
)

12889

In [11]:
ak.sum(
results['ntuple']['LowPtElectron']['isGold']
)

1250

In [12]:

ak.flatten(results['ntuple']['LowPtElectron']['isGold'])


In [13]:
results['ntuple'].keys()

dict_keys(['num_tot_Events', 'num_tot_ele', 'num_tot_lpte', 'num_tot_mu', 'dataset', 'Electron', 'Muon', 'LowPtElectron'])

In [14]:
ak.sum(ak.num(results['ntuple']['Electron']['pt']))

118542

In [15]:
results['ntuple']['Electron']['pt'][:10]

In [34]:
%%time
chunk_size = 100000
ntuple = results['ntuple']
nentries = ntuple['num_tot_Events']
total_chunks = (nentries + chunk_size - 1) // chunk_size

os.makedirs(ntuple_name, exist_ok=True)

for chunk_idx, start in enumerate(range(0, nentries, chunk_size)):
    end = min(start + chunk_size, nentries)


    #sliced_events = {
    #    key: val[start:end]
    #    for key, val in ntuple["Event"].items()
    #}
    
    sliced_electron = {
        key: val[start:end]
        for key, val in ntuple["Electron"].items()
    }
    #sliced_electron["nEle"] = ak.sum(ak.num(sliced_electron["pt"], axis=1))

    sliced_lpte = {
        key: val[start:end]
        for key, val in ntuple["LowPtElectron"].items()
    }
    #sliced_lpte["nLpte"] = ak.sum(ak.num(sliced_lpte["pt"], axis=1))
    
    sliced_muon = {
        key: val[start:end]
        for key, val in ntuple["Muon"].items()
    }
    #sliced_muon["nMu"] = ak.sum(ak.num(sliced_muon["pt"], axis=1))
    
    slice_ntuple = {
        "numEvents": end - start,
        "dataset": ntuple["dataset"],
        #"Events": sliced_events,
        "Electron": sliced_electron,
        "Muon": sliced_muon,
        "LowPtElectron": sliced_lpte,
        
    }
    
    
    
    # Create output filename
    filename = f"{ntuple_name}/{ntuple_name}_{slice_ntuple['numEvents']}_events_chunk_{chunk_idx:03d}.parquet"
    
    # Save to Parquet
    ak.to_parquet(slice_ntuple, filename, compression="SNAPPY")
    print(f"Saved {filename}")
    print(f"chunk {chunk_idx} of {total_chunks}")
    with open(f"{ntuple_name}/upload_log.txt", "a") as log_file:
        log_file.write(
            f"Chunk_index: {chunk_idx} of: {total_chunks - 2} saved successfully. "
            f"Skim generated over {nentries} events of dataset: {ntuple['dataset']}.\n"
    )


Saved 2023_ttbar_dwg_ntuple/2023_ttbar_dwg_ntuple_100000_events_chunk_000.parquet
chunk 0 of 2
Saved 2023_ttbar_dwg_ntuple/2023_ttbar_dwg_ntuple_83000_events_chunk_001.parquet
chunk 1 of 2
CPU times: user 3min 36s, sys: 92.7 ms, total: 3min 36s
Wall time: 3min 36s


In [52]:
parquet_files = sorted(glob.glob("2023_ttbar_dwg_ntuple/*.parquet"))

test_ntuple = {}
for file in parquet_files:
    record = ak.from_parquet(file)        # load the ak.Record
    my_dict = ak.to_list(record)          # ✅ deep conversion to native dict
    test_ntuple |= my_dict    

test_ntuple.type

AttributeError: 'dict' object has no attribute 'type'

In [46]:
test_ntuple.keys()

dict_keys(['numEvents', 'dataset', 'Electron', 'Muon', 'LowPtElectron'])

In [54]:
test_ntuple.type

ScalarType(RecordType([NumpyType('int64'), ListType(NumpyType('uint8', parameters={'__array__': 'char'}), parameters={'__array__': 'string'}), RecordType([ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('int64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('float64'))), ListType(ListType(NumpyType('bool'))), ListType(ListType(NumpyType('bool'))), ListType(ListType(NumpyType('bool'))), ListType(ListType(NumpyType('bool'))), ListType(ListType(NumpyType('bool'))), ListType(ListType(NumpyType('bool')))], ['pt', 'eta', 'phi', 'mass', 'charge', 'dxy', 'dz', 'pfRelIso03_all', 'miniPFRelIso_all', 'sip3d', 'isGold', 'isSilver', 'isBronze', 'isSignal', 'isLightFake', 'isHeavyFake']), RecordType([Lis

In [53]:
#parquet_files = sorted(glob.glob("2023_ttbar_dwg_ntuple/*.parquet"))

#for file in parquet_files:
    
test_ntuple = ak.from_parquet("2023_ttbar_dwg_ntuple/2023_ttbar_dwg_ntuple_100000_events_chunk_000.parquet")   


test_ntuple

In [21]:
test_ntuple['Muon'].dxy

In [19]:
max(ak.flatten(test_ntuple.Muon.pt))

ValueError: the truth value of an array whose length is not 1 is ambiguous; use ak.any() or ak.all()

In [24]:
import sys
print(sys.version)


3.12.5 | packaged by conda-forge | (main, Aug  8 2024, 18:36:51) [GCC 12.4.0]


In [None]:
#init voms proxy in terminal before running this cell:
# voms-proxy-init -voms cms -vomses /etc/vomses
#!xrdcp -r {ntuple_name}/ root://xrootd-local.unl.edu:1094//store/user/dgrove/my_ntuple/


In [None]:
test_ntuple.Electron.nEle

In [None]:
test_ntuple.Muon.nMu

In [None]:
test_ntuple.LowPtElectron.nLpte