In [None]:
import energyflow as ef
import fastjet as fj
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas
from pyjet import DTYPE_PTEPM, cluster

In [None]:
# Location of the HDF5 events file that the next cell reads with pandas.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
filepath = "/beegfs/desy/user/ewencedr/data/lhco/events_anomalydetection_v2.h5"

In [None]:
# Option 1: read the complete HDF5 table into one in-memory DataFrame.
df = pandas.read_hdf(filepath)
print(df.shape)
# Deep memory usage in bytes (index + every column), reported in units of 1024**3 bytes.
total_bytes = sum(df.memory_usage(deep=True))
print("Memory in GB:", total_bytes / (1024**3))

In [None]:
# Column 2100 is the per-event flag that the clustering loop later reads as `issignal`.
label_series = df[2100]
truth = np.array(label_series)
print(truth.shape)
# Cell output: the distinct label values and how many events carry each one.
np.unique(truth, return_counts=True)

In [None]:
# Shuffle the events (rows) so that the subset taken later is a random sample.
# A seeded generator keeps the selection identical across "Restart & Run All".
shuffle_rng = np.random.default_rng(42)
# len(df) gives the number of rows directly; converting the frame a second time
# just to measure its length would materialise another full copy in memory.
row_order = shuffle_rng.permutation(len(df))
events_combined_perm = np.array(df)[row_order]
print(np.shape(events_combined_perm))
# Label column of the shuffled events (numpy abbreviates long arrays when printing).
print(events_combined_perm[:, 2100])

In [None]:
# Work on the first 10000 shuffled events only.
n_events_kept = 10000
events_combined = events_combined_perm[:n_events_kept]
# Label values present in the subset (column 2100) and how often each occurs.
subset_labels = events_combined[:, 2100]
print(np.unique(subset_labels, return_counts=True))

In [None]:
# Take every third column of each event row and show the first 50 values of event 0.
# NOTE(review): the clustering cell reads pT from columns j*3 and phi from j*3 + 2,
# so despite its name this slice appears to hold the pT values — and it also picks up
# the label in column 2100, which is a multiple of 3. Confirm the intended offset.
phi = events_combined[:, ::3]
print(phi.shape)
print(phi[0, :50])

## Clustering

In [None]:
# Now, let's cluster some jets!
#
# Each event row stores 700 (pT, eta, phi) triplets in columns 0..2099, followed by
# the signal flag in column 2100. For every event we cluster the particles with
# pT > 0 using pyjet (R=1.0, p=-1) and keep the jets with pT >= 20.
#
# Outputs, each keyed by "background" / "signal":
#   leadpT     - pT of jets[0] for every event
#   first_jets - jets[0] for every event
#   alljets    - the full jet list for every event
leadpT = {}
first_jets = {}
alljets = {}
n_particles_max = 700
label_column = 3 * n_particles_max  # == 2100
for mytype in ["background", "signal"]:
    leadpT[mytype] = []
    alljets[mytype] = []
    first_jets[mytype] = []
    for i in range(np.shape(events_combined)[0]):
        if i % 1000 == 0:
            print(mytype, i)
        event = events_combined[i]
        issignal = event[label_column]
        if mytype == "background" and issignal:
            continue
        elif mytype == "signal" and issignal == 0:
            continue
        # Only the particle columns are reshaped; slicing the row with [::3] would
        # also pick up the label in column 2100 and count it as a particle for
        # signal events, producing one spurious all-zero pseudojet.
        particles = event[:label_column].reshape(n_particles_max, 3)
        particles = particles[particles[:, 0] > 0]
        # Fill the structured array field by field. Selecting the rows first keeps
        # this correct even when the non-zero particles are not stored contiguously.
        pseudojets_input = np.zeros(len(particles), dtype=DTYPE_PTEPM)
        pseudojets_input["pT"] = particles[:, 0]
        pseudojets_input["eta"] = particles[:, 1]
        pseudojets_input["phi"] = particles[:, 2]
        sequence = cluster(pseudojets_input, R=1.0, p=-1)
        jets = sequence.inclusive_jets(ptmin=20)
        # NOTE(review): jets[0] raises IndexError if no jet passes ptmin=20.
        leadpT[mytype] += [jets[0].pt]
        first_jets[mytype] += [jets[0]]
        alljets[mytype] += [jets]

In [None]:
# Scratch check of slice semantics: `[:2]` keeps the first two elements.
array = np.asarray((1, 2, 3, 4, 5))
array[:2]

In [None]:
# For every background event keep only the first two jets of its jet list and
# order that pair by mass, heaviest first.
# NOTE(review): this relies on the jet list being pT-ordered so that the first two
# entries are the two highest-pT jets — confirm against pyjet's inclusive_jets.
jets = alljets["background"]
mass_sorted_jets = [
    sorted(event_jets[:2], key=lambda candidate: candidate.mass, reverse=True)
    for event_jets in jets
]

In [None]:
# Split the mass-ordered jet pairs into the heavier jet (x) and the lighter jet (y).
# The loop variable has its own name: iterating with `for jets in jets` would rebind
# `jets` to the last pair and silently destroy the list for any later use.
jets = mass_sorted_jets
x_masses = []
y_masses = []
x_jets = []
for jet_pair in jets:
    # NOTE(review): events with fewer than two jets raise IndexError here.
    x_jet = jet_pair[0]
    y_jet = jet_pair[1]
    x_jets.append(x_jet)
    x_masses.append(x_jet.mass)
    y_masses.append(y_jet.mass)
x_jets = np.array(x_jets)

In [None]:
# Compare the mass distributions of the heavier (x) and lighter (y) jet of each pair.
plt.hist(x_masses, bins=100, histtype="step", label="highest mass")
# Legend label fixed: it previously read "second hight mass".
plt.hist(y_masses, bins=100, histtype="step", label="second highest mass")
plt.xlabel("mass")
plt.legend()
plt.show()

In [None]:
# Display the first jet (jets[0]) stored for the signal entry at index 2.
first_jets["signal"][2]

In [None]:
# Number of background events that were clustered, then the kinematics of the
# first stored background jet.
background_first_jets = first_jets["background"]
print(len(background_first_jets))
example_jet = background_first_jets[0]
print(f"Jet pt: {example_jet.pt}")
print(f"Jet eta: {example_jet.eta}")
print(f"Jet phi: {example_jet.phi}")
print(f"Jet mass: {example_jet.mass}")

In [None]:
# Build fixed-size per-jet constituent arrays for the jets in `x_jets`.
#
# For every jet:
#   constituents     - constituent rows, zero-padded to `len_padding` rows
#   rel_constituents - same rows with column 0 divided by the jet pT and columns 1, 2
#                      shifted by the jet eta / phi (phi difference wrapped to [-pi, pi])
#   mask             - 1 for real constituents, 0 for padding
#   len_constituents - number of real constituents
constituents = []
rel_constituents = []
len_constituents = []
mask = []
len_padding = 250
for jet in x_jets:
    # One row per constituent, one column per field of the structured array.
    unpadded_consts = np.array(
        [np.asarray(constituent_void).tolist() for constituent_void in jet.constituents_array()]
    )
    n_consts = len(unpadded_consts)
    if n_consts > len_padding:
        # np.pad would fail with an unhelpful "negative values" message otherwise.
        raise ValueError(
            f"jet has {n_consts} constituents but len_padding is only {len_padding}"
        )
    n_pad_rows = len_padding - n_consts
    row_padding = ((0, n_pad_rows), (0, 0))

    padded_mask = np.pad(np.ones(n_consts), (0, n_pad_rows), "constant", constant_values=0)
    padded_consts = np.pad(unpadded_consts, row_padding, "constant", constant_values=0)
    constituents.append(padded_consts)

    # Relative coordinates are computed on the real constituents only and padded
    # afterwards. Shifting after padding would turn the padded rows into
    # (-jet.eta, -jet.phi) instead of zeros, which breaks every later `!= 0`
    # selection of real particles.
    rel_consts = unpadded_consts.copy()
    rel_consts[:, 0] = rel_consts[:, 0] / jet.pt
    rel_consts[:, 1] = rel_consts[:, 1] - jet.eta
    rel_consts[:, 2] = rel_consts[:, 2] - jet.phi
    # Wrap the phi difference back into [-pi, pi].
    rel_consts[:, 2] = np.where(
        rel_consts[:, 2] > np.pi, rel_consts[:, 2] - 2 * np.pi, rel_consts[:, 2]
    )
    rel_consts[:, 2] = np.where(
        rel_consts[:, 2] < -np.pi, rel_consts[:, 2] + 2 * np.pi, rel_consts[:, 2]
    )
    rel_constituents.append(np.pad(rel_consts, row_padding, "constant", constant_values=0))

    mask.append(padded_mask)
    len_constituents.append(n_consts)
constituents = np.array(constituents)
rel_constituents = np.array(rel_constituents)
mask = np.array(mask)
print(f"max constituents: {np.max(len_constituents)}")
print(f"min constituents: {np.min(len_constituents)}")
print(constituents.shape)
print(mask.shape)

In [None]:
# Particle multiplicity per jet = number of unmasked entries in each mask row.
multiplicity = mask.sum(axis=-1)
plt.hist(multiplicity, bins=range(250))
plt.xlabel("particle multiplicity")
plt.show()

In [None]:
# Histogram of feature column `index` over all jets, ignoring entries that are exactly 0.
index = 2
feature_values = rel_constituents[:, :, index].flatten()
nonzero_values = feature_values[feature_values != 0]
plt.hist(nonzero_values, bins=100)
plt.xlabel("phi")
plt.show()

In [None]:
# Mass distribution of the jets collected in `x_jets`.
masses = [jet.mass for jet in x_jets]
plt.hist(masses, bins=100)
plt.xlabel("mass")
plt.show()

In [None]:
# Per-event jet lists of the background class; print how many events there are.
background = alljets["background"]
print(len(background))

In [None]:
# Print the jets[0].pt value stored for every background event.
# NOTE(review): this dumps the whole list; a slice would keep the output readable.
print(leadpT["background"])

In [None]:
# Number of jets stored for the background event at index 200.
print(len(background[200]))

In [None]:
# Invariant mass of the first two jets of every event, per class:
# m = sqrt(E^2 - px^2 - py^2 - pz^2) of the summed four-momentum.
mjj = {}
for mytype in ["background", "signal"]:
    mjj[mytype] = []
    for event_jets in alljets[mytype]:
        first, second = event_jets[0], event_jets[1]
        E = first.e + second.e
        px = first.px + second.px
        py = first.py + second.py
        pz = first.pz + second.pz
        mjj[mytype].append((E**2 - px**2 - py**2 - pz**2) ** 0.5)

In [None]:
# Dijet invariant mass for background and signal, drawn with shared bin edges:
# the edges `b` come from the background histogram and are reused for the signal.
fig, ax = plt.subplots()
n, b, p = ax.hist(mjj["background"], bins=50, facecolor="r", alpha=0.2, label="background")
ax.hist(mjj["signal"], bins=b, facecolor="b", alpha=0.2, label="signal")
ax.set_xlabel(r"$m_{JJ}$ [GeV]")
ax.set_ylabel("Number of events")
ax.legend(loc="upper right")
plt.show()

In [None]:
# Let's make some very simple plots.
# pT of the first jet per event for both classes; the signal histogram reuses the
# bin edges `b` computed from the background sample.
fig, ax = plt.subplots()
n, b, p = ax.hist(leadpT["background"], bins=50, facecolor="r", alpha=0.2, label="background")
ax.hist(leadpT["signal"], bins=b, facecolor="b", alpha=0.2, label="signal")
ax.set_xlabel(r"Leading jet $p_{T}$ [GeV]")
ax.set_ylabel("Number of events")
ax.legend(loc="upper right")
plt.show()

# Fastjet

In [None]:
def run(data, n_events=1000):
    """Cluster events with FastJet anti-kt (R=1.0) and return the first two jets of each.

    Args:
        data: array indexed as ``data[event, particle, feature]``. Features 0, 1 and 2
            are passed to ``PseudoJet.reset_PtYPhiM`` as pt, rapidity and phi; every
            particle is given mass 0.
        n_events: maximum number of events to process, starting from the first one.
            Values larger than ``len(data)`` are clamped to the number of available
            events instead of indexing past the end.

    Returns:
        A list with one ``[jet0, jet1]`` entry per processed event, holding the first
        two jets returned by the jet definition that have ``pt() > 30.0``.

    Raises:
        IndexError: if an event has fewer than two jets with ``pt() > 30.0``.
    """
    # The jet definition does not depend on the event, so build it once.
    R = 1.0
    jet_def = fj.JetDefinition(fj.antikt_algorithm, R)

    n_events = min(n_events, len(data))
    out = []

    # Loop over events
    for ievt in range(n_events):
        # Build a list of all particles
        pjs = []
        for i in range(data.shape[1]):
            pj = fj.PseudoJet()
            pj.reset_PtYPhiM(data[ievt, i, 0], data[ievt, i, 1], data[ievt, i, 2], 0)
            pjs.append(pj)

        # Cluster and keep only the jets above the pt threshold
        jets = [j for j in jet_def(pjs) if j.pt() > 30.0]
        if len(jets) < 2:
            raise IndexError(
                f"event {ievt} has {len(jets)} jet(s) with pt > 30.0, two are required"
            )

        # Save the two leading jets
        out.append([jets[0], jets[1]])

    return out

# Load data

In [None]:
# Read the preprocessed constituents ("data") and their padding mask ("mask") fully
# into memory; the file is closed again when the with-block exits.
# NOTE(review): this rebinds `rel_constituents` and `mask` from the clustering
# section above, and the path is absolute and user-specific.
path = "/beegfs/desy/user/ewencedr/data/lhco/events_anomalydetection_v2_processed.h5"
with h5py.File(path, "r") as f:
    rel_constituents, mask = f["data"][:], f["mask"][:]

In [None]:
# Shapes of the constituent array and of its mask as loaded from the processed file.
print(rel_constituents.shape)
print(mask.shape)

In [None]:
# Particle multiplicity per jet (sum over each mask row), shown on a log y-axis.
multiplicity = mask.sum(axis=-1)
plt.hist(multiplicity, bins=range(250))
plt.xlabel("particle multiplicity")
plt.yscale("log")
plt.show()

In [None]:
# Histogram of feature column `index` over all jets on a log y-axis; entries that
# are exactly 0 are dropped before plotting.
index = 2
feature_values = rel_constituents[:, :, index].flatten()
nonzero_values = feature_values[feature_values != 0]
plt.hist(nonzero_values, bins=100)
plt.xlabel("phi")
plt.yscale("log")
plt.show()

In [None]:
# Rebuild jet-level quantities from the constituents: convert the first three
# features of every constituent to four-vectors, sum them over the particle axis,
# and convert the summed four-vector back with energyflow.
constituent_features = rel_constituents[:, :, :3]
p4s = ef.p4s_from_ptyphims(constituent_features)
sum_p4 = p4s.sum(axis=-2)
jet_data = ef.ptyphims_from_p4s(sum_p4, phi_ref=0)

In [None]:
# Display the shape of the per-jet array computed from the summed four-vectors.
jet_data.shape

In [None]:
# Jet mass distribution on a log y-axis.
# energyflow's ptyphims_from_p4s orders its output as (pt, y, phi, m), so the mass is
# column 3; column 1, which was plotted here before, is the rapidity.
plt.hist(jet_data[:, 3], bins=100)
plt.yscale("log")
plt.xlabel("mass")
plt.show()

In [None]:
# Keep only the first three features of every constituent.
data = rel_constituents[..., :3]
print(data.shape)

In [None]:
# Append the mask as one extra trailing feature so that every constituent row
# carries its own "real particle" flag.
mask_as_feature = mask[..., np.newaxis]
data2 = np.concatenate([data, mask_as_feature], axis=-1)
print(data2.shape)