In [1]:
from pathlib import Path
import uproot as up
from multiprocessing import Pool
import h5py as h5
from tqdm import tqdm
import numpy as np

In [2]:
ntuple_cache_path = '/data/massive1/LLP/hi_met/cache_ntuple_2024.h5'
nano_cache_path = '/data/massive1/LLP/hi_met/cache_nano_2024.h5'

ntuple_files = Path('/data/massive1/LLP/store/group').rglob('*.root')
nano_files = Path('/data/massive1/LLP/store/data').rglob('*.root')

ntuple_k_map = {str(p.with_suffix('').relative_to(p.parents[5]).__str__().replace('/', '=')):p for p in ntuple_files}
nano_k_map = {str(p.with_suffix('').relative_to(p.parents[5]).__str__().replace('/', '=')):p for p in nano_files}

In [3]:
def load_key(k: str, fname: str|Path) -> np.ndarray:
    with h5.File(fname, 'r') as f:
        arr = f[k][()]
        return arr

def worker(x:tuple[str, str|Path]):
    key, fname = x
    return key, load_key(key, fname)

In [4]:
with h5.File(ntuple_cache_path, 'r') as f:
    ntuple_keys = list(f.keys())

with h5.File(nano_cache_path, 'r') as f:
    nano_keys = list(f.keys())

In [5]:
def era_from_path(path:Path) -> str:
    if path.parents[5].name == 'NANOAOD':
        return path.parents[7].name[3:]
    else:
        return path.parents[2].name.split('_')[4][3:8]

In [6]:
pool = Pool(4)

In [7]:
r = pool.imap(worker, [(k, ntuple_cache_path) for k in ntuple_keys])
ntuple = {}
for k,arr in tqdm(r, total=len(ntuple_keys), desc='Loading NTuples'):
    path = ntuple_k_map[k]
    era = era_from_path(path)
    ntuple.setdefault(era, {})[k] = set(tuple(x) for x in arr)

Loading NTuples: 100%|██████████| 2582/2582 [00:08<00:00, 296.09it/s]


In [8]:
# f = h5.File(nano_cache_path)

In [9]:
# len(f)

In [10]:
# f['NANOAOD_22Sep2023-v1_2550000_09bd0d81-c724-4762-abbd-3ebe89f0bbca']

In [11]:
len(nano_keys)

4657

In [12]:
# nano_k_map2 = {k.split('=')[-1]:v for k,v in nano_k_map.items()}

In [13]:
r = pool.imap(worker, [(k, nano_cache_path) for k in nano_keys])
nano = {}
for k,arr in tqdm(r, total=len(nano_keys), desc='Loading Nano'):
    path = nano_k_map[k.split('_')[-1]]
    era = era_from_path(path)
    nano.setdefault(era, {})[k] = arr
# nano = dict(tqdm(r, total=len(nano_keys), desc='Loading Nano'))

Loading Nano: 100%|██████████| 4657/4657 [00:48<00:00, 95.34it/s] 


In [27]:
n_events = {k:[sum(len(vv) for vv in v.values()),0] for k,v in ntuple.items()}

In [28]:
matchs = {}
with tqdm(total=sum(len(x) for x in nano.values())) as pbar:
    for era, d_nano in nano.items():
        if era not in ntuple:
            continue
        d_ntuple = ntuple[era]
        matchs[era] = {}
        for p_nano, arr_nano in d_nano.items():
            nano_set = set(tuple(x) for x in arr_nano)
            for p_ntuple, arr_ntuple in d_ntuple.items():
                ntuple_set = arr_ntuple
                n_match = len(ntuple_set & nano_set)
                if n_match > 0:
                    matchs[era].setdefault(p_nano, []).append(p_ntuple)
                    n_events[era][1] += n_match
            pbar.update(1)

 83%|████████▎ | 3883/4657 [20:27<04:04,  3.16it/s]


In [29]:
n_events

{'2024A': [0, 0],
 '2024B': [87633, 87633],
 '2024C': [2923349, 2923349],
 '2024D': [3366570, 3366570],
 '2024E': [5559226, 5559226],
 '2024F': [5700779, 5700779]}

In [9]:
import json

In [10]:
with open('/data/massive1/LLP/hi_met/matchs2.json', 'w') as f:
    json.dump(matchs, f)

In [11]:
with open('/data/massive1/LLP/hi_met/matchs2.json', 'r') as f:
    matchs = json.load(f)

In [12]:
path_matchs = {}
for era, d in matchs.items():
    path_matchs[era] = {}
    for nano_name, ntuple_names in d.items():
        ntuple_paths = [ntuple_k_map[n] for n in ntuple_names]
        nano_path = nano_k_map[nano_name]
        path_matchs[era][nano_path] = ntuple_paths

In [13]:
tmp_path = Path('/tmp/merge_ntuples')

In [14]:
cache_path = Path('/data/massive1/LLP/cache_2024')
out_base = Path('/data/massive1/analysis_data/LLP/hi_met/ntuple_merged_2024')
args = []
for era in path_matchs:
    out_dir = out_base / era
    out_dir.mkdir(parents=True, exist_ok=True)
    (tmp_path / 'lists'/ era).mkdir(parents=True, exist_ok=True)
    for nano_path, ntuple_paths in path_matchs[era].items():
        out_path = out_dir / nano_path.name
        inp_nano_path = tmp_path / 'lists'/ era / f'{nano_path.stem}_nano.txt'
        inp_ntuple_path = tmp_path / 'lists' / era / f'{nano_path.stem}_ntuple.txt'
        inp_nano_path.write_text(str(nano_path))
        inp_ntuple_path.write_text('\n'.join(str(x) for x in ntuple_paths))
        
        arg = f'{inp_ntuple_path} {inp_nano_path} {out_path} {cache_path}'
        args.append(arg)

In [15]:
inp_file = Path('/tmp/inp.txt')

In [16]:
inp_file.write_text('\n'.join(args))

722537