### TTree and RNTuple loading comparison while using uproot

Sources:
- 
- 

In [1]:
import awkward as ak
import numpy as np
import uproot
import pandas as pd


print(f"awkward: {ak.__version__}")
print(f"uproot: {uproot.__version__}")

awkward: 2.6.7
uproot: 5.3.13.dev30+g0a84fd8


  from pandas.core.computation.check import NUMEXPR_INSTALLED


### File loading

In [2]:
all_files = {}
events_list = []

## Remote files:
# all_files.append("root://eospublic.cern.ch//eos/root-eos/AGC/rntuple/nanoAOD/TT_TuneCUETP8M1_13TeV-amcatnlo-pythia8/cmsopendata2015_ttbar_19978_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1_60000_0004.root") # RNTuple remote
# all_files.append("root://eospublic.cern.ch//eos/root-eos/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-amcatnlo-pythia8/cmsopendata2015_ttbar_19978_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1_60000_0004.root") # TTree remote

# Files downloaded locally:
all_files["TT"] = "/home/cms-jovyan/my_root_files/ttree/cmsopendata2015_ttbar_19978_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1_60000_0004.root" # TTree local
# all_files["RN"] = "/home/cms-jovyan/my_root_files/rntuple/cmsopendata2015_ttbar_19978_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext1-v1_60000_0004.root"  # RNTuple local


all_files["632"] = "/home/cms-jovyan/my_root_files/rntuple_v6_632_0909.root" # RNTuple, ROOT_632 (works)
# all_files["6x"] = "/home/cms-jovyan/my_root_files/rntuple_v7_6_0909.root" # RNTuple, ROOT_6_X (does not work)


def load_files_with_uproot(files):
    for fl in files.values():
        with uproot.open(fl) as f:
            events = f["Events"]
            events_list.append(events)
            # print("File was loaded with uproot, event count: ", len(events.keys()))
            
            # NOTE: to access array: # events.arrays(["Electron_pt"])["Electron_pt"]
        
def load_files_with_coffea(files):
    for fl in files:
        events = NanoEventsFactory.from_root({fl: "Events"}, schemaclass=NanoAODSchema).events()
        events_list.append(events)
        print("File was loaded with coffea, fields count: ", len(events.fields))
        
load_files_with_uproot(all_files)

# load_files_with_coffea(all_files)





In [3]:
# # Various available properties:
# print("Name: ", events.name)
# print("header: ", events.header)
# print("footer: ", events.footer)
# print("num_entries: ", events.num_entries)
# print("len of field_names: ", len(events.field_names))
# print("keys: ", len(events.keys()))
# print(" field_names: ", events.fields)
# print("column_records: ", events.column_records[:10])
# print("keys: ", events.keys()[:10])
# print("_column_records_dict: ", events._column_records_dict)
# print("_related_ids: ", events._related_ids)
# print("page_list_envelopes: ", events.page_list_envelopes)

### timeit tests:

In [4]:
import timeit
events_dict = {}

def format_test_results(times):
    df = pd.DataFrame(times, columns =['data_type', 'func_name', 'time(s)'])
    df = df.sort_values(by=['func_name'])
    df['time(s)'] = df['time(s)'].round(4)
    df['func_name'] = df['func_name'].str.replace('_', ' ', regex=False)
    
    return df


def load_file(data_type, file):
    with uproot.open(file) as f:
        events = f["Events"]
        events_dict[data_type] = events

def load_arrays_for_each_key(events):
    for key in events.keys():
        events.arrays(filter_name=[key])[key]
        
def load_all_arrays(events):
    events.arrays()
    
def load_all_arrays_while_using_filter_name(events):
    chosen_keys = events.keys()
    events.arrays(filter_name=chosen_keys)[chosen_keys]

def load_array_while_using_filter_name(events):
    key = "nGenVisTau"
    events.arrays(filter_name=[key])[key]
    
    
def load_24_arrays_while_using_filter_name(events):
    chosen_keys = [
        "GenPart_pt", "GenPart_eta", "GenPart_phi", "CorrT1METJet_phi",
        "GenJet_pt", "CorrT1METJet_eta", "SoftActivityJet_pt",
        "Jet_eta", "Jet_phi", "SoftActivityJet_eta", "SoftActivityJet_phi", 
        "CorrT1METJet_rawPt", "Jet_btagDeepFlavB", "GenJet_eta", 
        "GenPart_mass", "GenJet_phi",
        "Jet_puIdDisc", "CorrT1METJet_muonSubtrFactor", "Jet_btagDeepFlavCvL",
        "Jet_btagDeepFlavQG", "Jet_mass", "Jet_pt", "GenPart_pdgId",
        "Jet_btagDeepFlavCvB", "Jet_cRegCorr"
        ]
    
    events.arrays(filter_name=chosen_keys)[chosen_keys]
        
def start_all_performance_tests():
    print("Starting to timeit on various functions: ")
    times = []
    
    for data_type, file in all_files.items():
        time_taken = timeit.timeit(lambda: load_file(data_type, file), number=1)
        times.append((data_type, "load_file", time_taken))

        time_taken = timeit.timeit(lambda: load_arrays_for_each_key(events_dict[data_type]), number=1)
        times.append((data_type, "load_arrays_for_each_key", time_taken))
        
        time_taken = timeit.timeit(lambda: load_all_arrays(events_dict[data_type]), number=1)
        times.append((data_type, "load_all_arrays", time_taken))
        
        time_taken = timeit.timeit(lambda: load_all_arrays_while_using_filter_name(events_dict[data_type]), number=1)
        times.append((data_type, "load_all_arrays_while_using_filter_name", time_taken))
        
        time_taken = timeit.timeit(lambda: load_24_arrays_while_using_filter_name(events_dict[data_type]), number=1)
        times.append((data_type, "load_24_arrays_while_using_filter_name", time_taken))
        
        time_taken = timeit.timeit(lambda: load_array_while_using_filter_name(events_dict[data_type]), number=1)
        times.append((data_type, "load_array_while_using_filter_name", time_taken))

    
    return format_test_results(times)


# results = start_all_performance_tests()
# print(results.to_string(index=False))

In [5]:
# This cell compares data between TTree and RNTuple for each key array, ensuring that RNTuple does not have corrupted data:
def compare_all_arrays(events_1, events_2, keys):
    ak_match_count = 0
    ak_mismatch_count = 0
    ak_error_count = 0
        
    for key in keys:
        arrays_1 = events_1.arrays([key])[key]
        arrays_2 = events_2.arrays([key])[key]

        # Check if arrays are equal:
        try:                
            # Custom function to compare NaN-aware equality
            def nan_equal(x, y):
                if isinstance(x, (list, ak.Array)) and isinstance(y, (list, ak.Array)):
                    return all(nan_equal(a, b) for a, b in zip(x, y))
                return (x == y) or (np.isnan(x) and np.isnan(y))
            # Check if the lengths of the outermost arrays are equal
            assert len(arrays_1) == len(arrays_2)

            # Compare the arrays using the custom function
            are_equal = nan_equal(arrays_1.tolist(), arrays_2.tolist())

            if are_equal:
                ak_match_count += 1
                print(f"[{key}]", "ak arrays are equal")
            elif not are_equal:
                ak_mismatch_count += 1
                print(f"[{key}]", "ak comparison MISMATCH")
                print("tt: ", arrays_1, f"Type: {ak.type(arrays_1)}.")
                print("rn: ", arrays_2, f"Type: {ak.type(arrays_2)}.")

        except:
            ak_error_count += 1
            print(f"[{key}]", "ak comparison ERROR")
            print("tt: ", arrays_1, f"Type: {ak.type(arrays_1)}")
            print("rn: ", arrays_2, f"Type: {ak.type(arrays_2)}")

    print(f"ak array comparison statistics: matched count: {ak_match_count}; mismatch count: {ak_mismatch_count}; errors: {ak_error_count}")
    
events_tt = events_list[0]
events_rn = events_list[1]

keys = [
        "GenPart_pt", "GenPart_eta", "GenPart_phi", "CorrT1METJet_phi",
        "GenJet_pt", "CorrT1METJet_eta", "SoftActivityJet_pt",
        "Jet_eta", "Jet_phi", "SoftActivityJet_eta", "SoftActivityJet_phi", 
        "CorrT1METJet_rawPt", "Jet_btagDeepFlavB", "GenJet_eta", 
        "GenPart_mass", "GenJet_phi",
        "Jet_puIdDisc", "CorrT1METJet_muonSubtrFactor", "Jet_btagDeepFlavCvL",
        "Jet_btagDeepFlavQG", "Jet_mass", "Jet_pt", "GenPart_pdgId",
        "Jet_btagDeepFlavCvB", "Jet_cRegCorr"
        ]

compare_all_arrays(events_tt, events_rn, keys)

    


[GenPart_pt] ak arrays are equal
tt:  [[0, 0, 53.6, 53.6, 42.2, 84.5, ..., 10.3, 9.91, 11.3, 10.1, 9.81, 9.06], ...] Type: 188600 * var * float32.
rn:  [[0, 0, 53.6, 53.6, 42.2, 84.5, ..., 10.3, 9.91, 11.3, 10.1, 9.81, 9.06], ...] Type: 188600 * var * float32.
[GenPart_eta] ak arrays are equal
tt:  [[2.28e+04, -2.32e+04, -2.53, -1.03, ..., -1.52, -0.779, -0.799, -1.48], ...] Type: 188600 * var * float32.
rn:  [[2.28e+04, -2.32e+04, -2.53, -1.03, ..., -1.52, -0.779, -0.799, -1.48], ...] Type: 188600 * var * float32.
[GenPart_phi] ak arrays are equal
tt:  [[0, 0, 2.96, -0.182, 3.14, ..., 0.0825, 0.0348, 2.63, 2.68, 0.0828], ...] Type: 188600 * var * float32.
rn:  [[0, 0, 2.96, -0.182, 3.14, ..., 0.0825, 0.0348, 2.63, 2.68, 0.0828], ...] Type: 188600 * var * float32.
[CorrT1METJet_phi] ak arrays are equal
tt:  [[2.49, -3, -3.09, -0.305], [1.55, 1.46, ..., -1.37], ..., [-2.81, 1.56, 0.604]] Type: 188600 * var * float32.
rn:  [[2.49, -3, -3.09, -0.305], [1.55, 1.46, ..., -1.37], ..., [-2.81

In [6]:
# Comparing only certain regions of arrays:
def compare_array_region(key, events_tt, events_rn, strt, end):
    arr_tt = events_tt.arrays(filter_name=[key], entry_start=strt, entry_stop=end)[key]
    arr_rn = events_rn.arrays(filter_name=[key], entry_start=strt, entry_stop=end)[key]

    try:
        # Custom function to compare NaN-aware equality
        def nan_equal(x, y):
            if isinstance(x, (list, ak.Array)) and isinstance(y, (list, ak.Array)):
                return all(nan_equal(a, b) for a, b in zip(x, y))
            return (x == y) or (np.isnan(x) and np.isnan(y))
        # Check if the lengths of the outermost arrays are equal
        assert len(arr_tt) == len(arr_rn)
        # Compare the arrays using the custom function
        comparison_result = nan_equal(arr_tt.tolist(), arr_rn.tolist())
        # Final assertion
        assert comparison_result
        return True
    except Exception as e:
        print(f"TT array: {arr_tt}")
        print(f"RN array: {arr_rn}")
        print(f"Failure limits: {(strt, end)}")
        print("")
        return False

# Collect all regions near cluster edges, where data does not match:
def collect_breaking_points(key):
    cluster_starts = [md.num_first_entry for md in events_rn.cluster_summaries][1:] # Skip first, because it is 0.
    print("Starts of clusters: ", cluster_starts)

    step = 4
    for cl_start in cluster_starts:
        for i in range (cl_start-9, cl_start+9, step):
            strt = i
            end = i + step
            result = compare_array_region(key, events_tt, events_rn, strt, end)
            print(f"Range: ({strt},{end}). Match result: {result}")

key = "Electron_hoe"
collect_breaking_points(key)
print("Finished cell.")


Starts of clusters:  [17224, 44433, 71709, 98997, 126243, 153500, 180739]
Range: (17215,17219). Match result: True
Range: (17219,17223). Match result: True
Range: (17223,17227). Match result: True
Range: (17227,17231). Match result: True
Range: (17231,17235). Match result: True
Range: (44424,44428). Match result: True
Range: (44428,44432). Match result: True
Range: (44432,44436). Match result: True
Range: (44436,44440). Match result: True
Range: (44440,44444). Match result: True
Range: (71700,71704). Match result: True
Range: (71704,71708). Match result: True
Range: (71708,71712). Match result: True
Range: (71712,71716). Match result: True
Range: (71716,71720). Match result: True
Range: (98988,98992). Match result: True
Range: (98992,98996). Match result: True
Range: (98996,99000). Match result: True
Range: (99000,99004). Match result: True
Range: (99004,99008). Match result: True
Range: (126234,126238). Match result: True
Range: (126238,126242). Match result: True
Range: (126242,12624

In [7]:
cluster_starts = [md.num_first_entry for md in events_632.cluster_summaries][1:] # Skip first, because it is 0.
print("Starts of clusters: ", cluster_starts)
events_632 = events_list[0]
events_6x = events_list[1]
print("Keys: ", events_6x.keys())
print("Keys: ", events_632.keys())





NameError: name 'events_632' is not defined