In [1]:
import dask
import numpy as np
import awkward as ak
import gzip
import json
import os

from coffea.nanoevents import NanoAODSchema
from coffea.dataset_tools import (
apply_to_fileset, max_chunks, max_files, preprocess
)

from dwg_ntupilizer_v12 import SkimNanoAODv12

from dask.distributed import Client

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


# READ ME

This skim makes an ntuple directly from the remote files on DAS. Configure what you need in the cell below, e.g. the dataset name (`sample`) and how many files and chunks to run over.

## Saving the parquet file takes a while and, for a massive enough sample, will run out of memory.

In [2]:
# Base directory where the preprocessed files are stored. Each preprocessed file
# points to a sample on DAS and defines how its ROOT files are sliced into chunks
# of events. These files are generated one time in advance and stored until
# needed here in the analysis.
base_dir = "/home/cms-jovyan/dwg_analysis_v3/dataset_tools/preprocessing/preprocessed"
sample = "2023_ttbar_100000_preprocessed_available.json.gz"
# Alternative sample (swap with the line above to use it):
#sample = "2023_SlepSnu_MN1_220_100000_preprocessed_available.json.gz"
file_path = os.path.join(base_dir, sample)


# Load the gzip-compressed JSON description of the preprocessed fileset.
with gzip.open(file_path, "rt") as file:
    preprocessed_available = json.load(file)

# Output name: the sample file name with its preprocessing suffix replaced by "_dwg_ntuple".
ntuple_name = sample.replace("_100000_preprocessed_available.json.gz", "_dwg_ntuple")

# When True, the run is restricted to `num_files` files and `num_chunks` chunks
# (see below); when False, the whole preprocessed fileset is processed.
reduced_computation = True

num_files = 1 # number of root files from DAS to run over
num_chunks = 5 # number of chunks per root file to run over
# (chunk size is set during preprocessing;
# my default is 1 chunk = 100,000 events)


In [3]:
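# Optional: uncomment to connect to an already-running Dask distributed scheduler.
# When left commented out, dask.compute below runs without a distributed client.
#client = Client("tls://localhost:8786")
#client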

In [4]:
# Pick the fileset to process: a truncated one for quick tests, or everything.
if reduced_computation:
    # Limit the run to at most `num_files` files and `num_chunks` chunks per file.
    test_preprocessed_files = max_files(preprocessed_available, num_files)
    test_preprocessed = max_chunks(test_preprocessed_files, num_chunks)
    fileset_to_run = test_preprocessed
else:
    fileset_to_run = preprocessed_available

# Build the task graph once, whichever fileset was selected, so the processor
# and uproot options cannot drift apart between the reduced and the full run.
to_compute, reports = apply_to_fileset(
    data_manipulation=SkimNanoAODv12(),
    fileset=fileset_to_run,
    schemaclass=NanoAODSchema,
    # OSError/KeyError raised while reading are collected in the report
    # instead of aborting the run; inspect `rep` if events appear to be missing.
    uproot_options={"allow_read_errors_with_report": (OSError, KeyError)},
)

# `computed` maps each dataset name to the processor output; `rep` is the read report.
computed, rep = dask.compute(to_compute, reports)


/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM


In [5]:
# Show which dataset names are present in the computed results.
computed.keys()

dict_keys(['/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM'])

In [6]:
# Take the first key of `computed`; only one dataset is present in this run.
sample_name = next(iter(computed))
sample_name

'/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM'

In [7]:
# Processor output for the selected dataset.
results = computed[sample_name]
results

{'ntuple': {'event_count': 183000,
  'dataset': '/TTto2L2Nu_TuneCP5_13p6TeV_powheg-pythia8/Run3Summer23NanoAODv12-130X_mcRun3_2023_realistic_v14-v2/NANOAODSIM',
  'Events': {'MET': <MissingETArray [MissingET, ...] type='183000 * MissingET[MetUnclustEnUpDel...'>},
  'Electron': {'signal': {'baseline': <ElectronArray [[{seediEtaOriX: -7, ...}], ...] type='183000 * var * Electr...'>,
    'gold': <ElectronArray [[{seediEtaOriX: -7, ...}], ...] type='183000 * var * Electr...'>,
    'silver': <ElectronArray [[], [], [], [], ..., [], [], [], []] type='183000 * var * E...'>,
    'bronze': <ElectronArray [[], [], [], [], ..., [], [], [], []] type='183000 * var * E...'>},
   'light_fakes': {'baseline': <ElectronArray [[], [{...}], [], [], ..., [], [], []] type='183000 * var * ...'>,
    'gold': <ElectronArray [[], [], [], [], ..., [], [], [], []] type='183000 * var * E...'>,
    'silver': <ElectronArray [[], [], [], [], ..., [], [], [], []] type='183000 * var * E...'>,
    'bronze': <ElectronArr

In [8]:
# Make sure the output directory exists so the write does not fail on a fresh checkout.
os.makedirs("dwg_ntuple_v12", exist_ok=True)
# NOTE(review): the recorded metadata reports num_rows: 1, i.e. the nested dict
# appears to be written as a single record -- confirm this layout is intended.
ak.to_parquet(results['ntuple'], f"dwg_ntuple_v12/{ntuple_name}.parquet", compression="GZIP")

<pyarrow._parquet.FileMetaData object at 0x7f24125d80e0>
  created_by: parquet-cpp-arrow version 17.0.0
  num_columns: 710
  num_rows: 1
  num_row_groups: 1
  format_version: 2.6
  serialized_size: 0

In [9]:

# Read back the ntuple for the configured sample. The path is derived from
# `ntuple_name` so that switching `sample` does not silently load a stale file.
test_ntuple = ak.from_parquet(f"dwg_ntuple_v12/{ntuple_name}.parquet")


In [10]:
# Display the array that was read back from the parquet file.
test_ntuple

In [15]:
# Show the event-level collections of the ntuple (closing parenthesis restored).
print(results['ntuple']['Events'])

{'MET': <MissingETArray [MissingET, ...] type='183000 * MissingET[MetUnclustEnUpDel...'>}
