In [1]:
import uproot
import lumin
import numpy as np
import pandas as pd
import awkward as ak
from sklearn.model_selection import StratifiedKFold, KFold
import h5py

In [2]:
# Input location in uproot's "file:object" form; it is passed straight to uproot.open().
data_path = "/disk/atlas3/data_MC/ATLASOpen/jets/part1.root:JetRecoTree" 
# Branch-name filters handed to `filter_name` when reading; `*` is a wildcard.
keys = ["EventNumber", "EventWeight", "Clusters*", "TruthJets_R10*"]
# Directory prefix used when building the name of the saved fold file.
save_path = "./data"

In [3]:
def get_branches(rootfile:str, keys):
    """Read the branches selected by `keys` from `rootfile` as one zipped awkward array.

    The input is opened inside a `with` block, so it is closed again as soon
    as the function returns. The keys available in the opened object are
    printed before anything is read.

    Args:
        rootfile: path passed unchanged to `uproot.open`.
        keys: branch-name filter(s) forwarded as `filter_name`.

    Returns:
        The result of `arrays(..., library="ak", how="zip")` for the matching branches.
    """
    with uproot.open(rootfile) as tree:
        print(tree.keys())
        return tree.arrays(filter_name=keys, library="ak", how="zip")

# Read the configured branches, then attach the number of R10 truth jets per event.
data = get_branches(rootfile=data_path, keys=keys)
data["n_jets"] = ak.num(data["TruthJets_R10"], axis=1)

['EventNumber', 'RunNumber', 'EventWeight', 'mu_average', 'mu_actual', 'NPV', 'Tracks_pt', 'Tracks_eta', 'Tracks_phi', 'Tracks_m', 'Tracks_vtx', 'Clusters_pt', 'Clusters_eta', 'Clusters_phi', 'Clusters_m', 'Particles_pt', 'Particles_eta', 'Particles_phi', 'Particles_m', 'Particles_pdgID', 'RecoJets_R4_pt', 'RecoJets_R4_eta', 'RecoJets_R4_phi', 'RecoJets_R4_m', 'RecoJets_R4_jvf', 'TrackJets_R4_pt', 'TrackJets_R4_eta', 'TrackJets_R4_phi', 'TrackJets_R4_m', 'RecoJets_R10_pt', 'RecoJets_R10_eta', 'RecoJets_R10_phi', 'RecoJets_R10_m', 'RecoJets_R10_D2beta1', 'RecoJets_R10_tau32wta', 'RecoJets_R10_Trimmed_pt', 'RecoJets_R10_Trimmed_eta', 'RecoJets_R10_Trimmed_phi', 'RecoJets_R10_Trimmed_m', 'RecoJets_R10_Trimmed_D2beta1', 'RecoJets_R10_Trimmed_tau32wta', 'TruthJets_R4_pt', 'TruthJets_R4_eta', 'TruthJets_R4_phi', 'TruthJets_R4_m', 'TruthJets_R10_pt', 'TruthJets_R10_eta', 'TruthJets_R10_phi', 'TruthJets_R10_m', 'TruthJets_R10_D2beta1', 'TruthJets_R10_tau32wta', 'TruthJets_R10_Trimmed_pt', 'Tru

In [4]:
# Take the first 1000 events and force every event to exactly 10 clusters:
# shorter lists are padded with None, longer ones are cut (clip=True).
test = data[:1000]
test["Clusters"] = ak.pad_none(test["Clusters"], target=10, axis=1, clip=True)

In [5]:
from lumin.data_processing.file_proc import aarr2foldfile


# Save the padded sample as two folds; the per-cluster pt/eta/phi values are
# passed as a tensor feature and the jet multiplicity as an integer target.
aarr2foldfile(
    aarr=test,
    n_folds=2,
    cat_feats=["EventNumber", "EventWeight"],
    targ_feats=["n_jets"],
    tensor_feats=[["Clusters", ["pt", "eta", "phi"]]],
    savename=save_path + "/folds",
    targ_type='int',
)

Saving fold 0 with 500 events
Saving fold 1 with 500 events


In [6]:
# Inspect the fold file produced by the aarr2foldfile call. The file is opened
# in a `with` block so the handle is closed on exit instead of staying open for
# the rest of the kernel session, and the path is derived from `save_path` so
# it always points at the file that was just written.
with h5py.File(f"{save_path}/folds.hdf5", "r") as f:
    print(f.keys())
    print(f["meta_data"].keys())
    print(f["meta_data"]["matrix_feats"])
    print(f["fold_0"]["Clusters"])

<KeysViewHDF5 ['fold_0', 'fold_1', 'meta_data']>
<KeysViewHDF5 ['cat_feats', 'cont_feats', 'matrix_feats', 'targ_feats', 'tensor_feats']>
<HDF5 dataset "matrix_feats": shape (), type "|O">
<HDF5 dataset "Clusters": shape (500, 3, 10), type "<f8">


In [7]:
from awkward import Array as Array
# Prototype of the stratified fold split and of the cluster tensor layout,
# run on the first 30 events only.

# How many of the sampled events have exactly two truth jets.
print(ak.to_list(data["n_jets"][0:30]).count(2))

X = ak.Array(data[0:30])
y = np.array(data["n_jets"][0:30])

# shuffle=True with random_state=None: fold membership differs between runs.
kf = StratifiedKFold(n_splits=2, random_state=None, shuffle=True)

# Only the held-out indices of each split are used. After the loop `test`
# holds the events of the last fold.
for fold_idx, (_, fold) in enumerate(kf.split(X=X, y=y)):
    print(fold_idx, fold)
    test = X[fold]
print(ak.type(test))

# Pad with None / clip so that every event has exactly 10 clusters.
test["Clusters"] = ak.pad_none(test["Clusters"], target=10, axis=1, clip=True)

# Cluster fields to stack. A dedicated name is used so the branch-filter list
# `keys` from the configuration cell is not overwritten.
cluster_feats = ["eta", "pt", "phi"]
in_fields = np.array([feat in test["Clusters"].fields for feat in cluster_feats])
if in_fields.all():
    print("yep")
else:
    # Report which of the requested fields are absent.
    print(in_fields)
    print("missing:", [feat for feat, present in zip(cluster_feats, in_fields) if not present])

# Stack one padded matrix per field, then swap the first two axes so the event
# axis comes first and the field axis second.
matrix_data = np.swapaxes(
    np.array([ak.to_list(test["Clusters", feat]) for feat in cluster_feats]), 0, 1
)
print(matrix_data.shape)
print(type(matrix_data))

18
0 [ 2  3  4  6  7  8  9 10 15 16 21 23 26 27 28]
1 [ 0  1  5 11 12 13 14 17 18 19 20 22 24 25 29]
15 * {"EventNumber": uint64, "EventWeight": float32, "Clusters": var * {"pt": float32, "eta": float32, "phi": float32, "m": float32}, "TruthJets_R10": var * {"pt": float32, "eta": float32, "phi": float32, "m": float32, "D2beta1": float32, "tau32wta": float32, "Trimmed_pt": float32, "Trimmed_eta": float32, "Trimmed_phi": float32, "Trimmed_m": float32, "Trimmed_D2beta1": float32, "Trimmed_tau32wta": float32}, "n_jets": int64}
yep
(15, 3, 10)
<class 'numpy.ndarray'>




In [8]:
# Check whether the loaded events are a pandas DataFrame. isinstance is the
# idiomatic type test and also accepts DataFrame subclasses.
isinstance(data, pd.DataFrame)

False