In [1]:
import dask
import numpy as np
import awkward as ak
import gzip
import json
import os

from coffea.nanoevents import NanoAODSchema
from coffea.dataset_tools import (
apply_to_fileset, max_chunks, max_files, preprocess
)

from skimmer_nanoAOD_v12 import SkimNanoAODv12

from dask.distributed import Client

Issue: coffea.nanoevents.methods.vector will be removed and replaced with scikit-hep vector. Nanoevents schemas internal to coffea will be migrated. Otherwise please consider using that package!.
  from coffea.nanoevents.methods import vector


ModuleNotFoundError: No module named 'skimmer_nanoAOD_v12'

# READ ME

This skim is designed to SLIM DOWN a NanoAOD file to the core objects (Electrons, Muons, Jets, etc.) that a given analysis needs. Additionally, it only grabs the variables specified in the `skimmer_nanoAOD_v12.py` file.

### Why do this?

I think of this as akin to downloading the sample, or a portion of it. Rather than `xrdcp` a certain number of the ROOT files from DAS, store them somewhere, and make preprocessed files of them, let's just grab the subset of that sample we need, make our own ntuple, and store it locally for good. We can load this slimmed file later for further processing and analysis.

Ideally, you would only need to run this step once and then work with the Parquet file going forward.

## Saving the Parquet file takes a while and, for a massive enough sample, will run out of memory.

In [None]:
# --- Tunable settings -------------------------------------------------------
ntuple_name = "ttbar_2023_naodv12"

# When True, only a subset of the fileset is processed (see the two limits below).
reduced_computation = True
num_files = 10  # number of ROOT files from DAS to run over
num_chunks = 5  # number of chunks per ROOT file to run over
# (The chunk size is set during preprocessing; the author's default is
#  1 chunk = 100,000 events.)

# --- Locate the preprocessed fileset ----------------------------------------
# The preprocessed files point to the sample on DAS and define how each ROOT
# file is sliced by event into chunks. They are generated once in advance and
# stored until needed in the analysis.
base_dir = "/home/cms-jovyan/dwg_analysis_v3/tools/preprocessing/preprocessed"
sample = "2023_ttbar_100000_preprocessed_available.json.gz"
# Alternative sample:
#sample = "2023_SlepSnu_MN1_220_100000_preprocessed_available.json.gz"
file_path = os.path.join(base_dir, sample)

# --- Load it -----------------------------------------------------------------
# The file is gzip-compressed JSON; open it in text mode and parse it.
with gzip.open(file_path, "rt") as handle:
    preprocessed_available = json.load(handle)


In [None]:
# Connect to the Dask scheduler at the address below, then display the client
# as the cell output.
scheduler_address = "tls://localhost:8786"
client = Client(scheduler_address)
client

In [None]:
# Pick the fileset to process. The branch selects ONLY the input, so the skim
# itself is configured in exactly one place below (previously the whole
# apply_to_fileset/dask.compute call was duplicated in both branches).
if reduced_computation:
    # Limit to the first `num_files` files, then to `num_chunks` chunks per file.
    fileset_to_run = max_chunks(
        max_files(preprocessed_available, num_files),
        num_chunks,
    )
else:
    fileset_to_run = preprocessed_available

# Build the skim over the chosen fileset. With
# "allow_read_errors_with_report" set, the call is unpacked into two values:
# the objects to compute and a report (presumably of the OSError/KeyError read
# failures that were tolerated -- confirm against the coffea documentation).
skim_to_compute, report_to_compute = apply_to_fileset(
    data_manipulation=SkimNanoAODv12(),
    fileset=fileset_to_run,
    schemaclass=NanoAODSchema,
    uproot_options={"allow_read_errors_with_report": (OSError, KeyError)},
)

# Execute both in one dask.compute call. `computed` is consumed by the
# following cells; `rep` holds the computed report.
computed, rep = dask.compute(skim_to_compute, report_to_compute)


In [None]:
# Show the top-level keys of the computed result.
computed.keys()

In [None]:
# Take the first key of `computed` and display it; it is used below to look up
# that entry's results.
sample_name = next(iter(computed))
sample_name 

In [None]:
# Pull out the entry stored under the selected key and display it.
results = computed[sample_name]
results

In [None]:
# Write the 'ntuple' entry of the results to a GZIP-compressed Parquet file.
# The output directory is created first so that a missing "../ntuples" folder
# does not make the write fail after the (long) computation has finished.
ntuple_path = "../ntuples/dwg_ntuple_gzip.parquet"
os.makedirs(os.path.dirname(ntuple_path), exist_ok=True)
ak.to_parquet(results['ntuple'], ntuple_path, compression="GZIP")

In [None]:

# Read the ntuple back from the same path the write cell uses
# ("../ntuples/dwg_ntuple_gzip.parquet"). The previous "my_ntuple_gzip.parquet"
# did not match the file that is actually written.
test_ntuple = ak.from_parquet("../ntuples/dwg_ntuple_gzip.parquet")


In [None]:
# Display the array that was read back from the Parquet file.
test_ntuple