## _Track Evaluation_

1. Track Formation using DBScan or CCL, see **_trkx_from_gnn.py_** with its code breakdown in **_trkx_from_gnn.ipynb_**
2. Track Evaluation using Two-way Matching, see **_eval_reco_trkx.py_** with its code breakdown in **_eval_reco_trkx.ipynb_**
    
&nbsp;    
_**Problem:** Our track candidates use `track_id=-1` to hold unused hits; these need to be removed, otherwise the track purity goes above 100%._

_**Note:** Above scripts are adapted from `gnn4itk/scripts` with the same names._

In [None]:
import glob, os, sys, yaml

In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import torch
import time

In [None]:
from sklearn.cluster import DBSCAN
from multiprocessing import Pool
from functools import partial

In [None]:
# choose the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Put the parent directory on the import path; the `src` package imported in the
# next cell is presumably located there (the path is relative to the working directory).
sys.path.append("..")

In [None]:
from src import SttTorchDataReader, SttCSVDataReader
from src.drawing import detector_layout, draw_proc_event
from src.utils_math import polar_to_cartesian

In [None]:
# Stage directories, relative to this notebook:
#   raw_inputdir : output of the GNN stage (as in test/pred)
#   rec_inputdir : output of trkx_from_gnn.sh
#   outputdir    : output of eval_reco_trkx.sh
raw_inputdir = "../run_all/fwp_gnn_processed/pred"
rec_inputdir = "../run_all/fwp_gnn_segmenting/seg"
outputdir = "../run_all/fwp_gnn_segmenting/eval"

### _(1) - GNN Raw Events_

In [None]:
# event to investigate; the same id is used for both the raw GNN output
# and the reconstructed-track reader below
event_id = 10256

In [None]:
# reader over the GNN-stage output directory
raw_reader = SttTorchDataReader(raw_inputdir)

In [None]:
# load the selected event; the cells below read x, pid, hid, pt, vertex, ... from it
graph = raw_reader.read(event_id)

In [None]:
# number of particles
graph.pid.unique()

In [None]:
# split the node-feature matrix into its three per-hit columns
r, phi, ir = graph.x.T

# polar -> cartesian, working on plain numpy copies of the tensors
x, y = polar_to_cartesian(*(t.detach().numpy() for t in (r, phi)))

# undo the scaling of the third feature (factor 100)
ir = 100 * ir.detach().numpy()

In [None]:
# Draw the true event: one connected line plus one marker set per particle.
fig, ax = detector_layout(figsize=(8, 8))
e_id = int(graph.event_file[-10:])  # last 10 characters of event_file, parsed as an integer
p_ids = np.unique(graph.pid)

for pid in p_ids:
    idx = graph.pid == pid  # mask selecting the hits of this particle
    px, py = x[idx], y[idx]
    ax.plot(px, py, "-", linewidth=1.5)
    ax.scatter(px, py, label="particle_id: {}".format(int(pid)))

ax.set_title("Azimuthal View of STT, EventID # {}".format(e_id))
ax.legend(loc="best", fontsize=10)
fig.tight_layout()
# fig.savefig("true_track.png")

In [None]:
# prepare truth information: alias of the loaded graph, used to build the truth tables below
raw_data = graph

In [None]:
# Hit-level truth table: hit id and the id of the particle attached to that hit.
truth_columns = {
    "hit_id": raw_data.hid.numpy(),
    "particle_id": raw_data.pid.int().numpy(),
}
truth = pd.DataFrame(truth_columns, columns=list(truth_columns))

# Particle-level table: built from the same per-hit arrays and then reduced
# to one row per particle_id (the first occurrence is kept).
particle_columns = {
    "particle_id": raw_data.pid.int(),
    "pt": raw_data.pt,
    "vx": raw_data.vertex[:, 0],
    "vy": raw_data.vertex[:, 1],
    "vz": raw_data.vertex[:, 2],
    "q": raw_data.charge,
    "pdgcode": raw_data.pdgcode,
    "ptheta": raw_data.ptheta,
    "peta": raw_data.peta,
    "pphi": raw_data.pphi,
}
particles = pd.DataFrame(
    {name: values.numpy() for name, values in particle_columns.items()},
    columns=list(particle_columns),
).drop_duplicates(subset=["particle_id"])

### _(2) Tracks from GNN_

In [None]:
# reader over the track-building output directory (rec_inputdir)
reco_trkx_reader = SttTorchDataReader(rec_inputdir)

In [None]:
# fetch a single event
# NOTE(review): the reader object is called directly here, while the raw reader
# above goes through .read(event_id) -- confirm that both entry points are equivalent.
reco_data = reco_trkx_reader(event_id)

In [None]:
reco_data.head()

In [None]:
# number of reco tracks
np.unique(reco_data.track_id.values)

In [None]:
# renaming: `reconstructed` is the name used in the evaluation cells below
reconstructed = reco_data

### _(3) Track Evaluation_

- _Fixing `eval_reco_trkx.py`_

In [None]:
truth.head()

In [None]:
particles.head()

In [None]:
reconstructed.head()

In [None]:
# number of hits
truth.hit_id.count(), reconstructed.hit_id.count()

In [None]:
# filter missed hits
reconstructed[reconstructed["track_id"] == -1].hit_id.count()

In [None]:
reconstructed[reconstructed["track_id"] == -1].head(10)

- ALERT: We have a negative `track_id` (`-1`) that was added to hold unused hits. As these are not track candidates, they should be removed.

In [None]:
# drop the hits that were not assigned to any track candidate (track_id == -1);
# left in, they are counted as a reconstructed track and distort the purity.
# Rebinding (rather than mutating) keeps `reco_data` intact for the plots below.
reconstructed = reconstructed[reconstructed["track_id"] != -1]

In [None]:
reconstructed.head()

In [None]:
# display only: the result is not assigned, so `particles` itself is unchanged
# (it was already reduced to one row per particle_id when it was built)
particles.drop_duplicates(subset=["particle_id"])

In [None]:
np.unique(particles.particle_id.values)

In [None]:
truth.shape[0], particles.shape[0], reconstructed.shape[0]

In [None]:
# input to evaluate_reco_tracks()
reco_df = reconstructed  # reconstructed hits table, filtered further below
min_hits_truth = 7  # hits-per-particle threshold used for the is_trackable flag
min_hits_reco = 6  # minimum number of hits for a reco track to be kept
min_pt = 0.0  # NOTE(review): not used in the visible cells of this notebook
frac_reco_matched = 0.5  # threshold on n_common_hits / n_reco_hits
frac_truth_matched = 0.5  # threshold on n_shared / n_true_hits

In [None]:
# keep only hits with a positive particle_id
# (guards against particle_id == 0 entries in the truth table)
if "particle_id" in truth:
    truth = truth.loc[truth["particle_id"] > 0]

In [None]:
truth.shape

In [None]:
# get number of spacepoints in each reconstructed track
# -> columns [track_id, n_reco_hits].
# rename_axis + reset_index(name=...) yields the same columns on every pandas
# version; the older rename({"index": ..., "track_id": ...}) idiom breaks on
# pandas >= 2.0, where value_counts() names its result "count".
n_reco_hits = (
    reco_df["track_id"]
    .value_counts(sort=False)
    .rename_axis("track_id")
    .reset_index(name="n_reco_hits")
)

- ALERT: `track_id = -1`

In [None]:
n_reco_hits.head()

In [None]:
# discard track candidates that have fewer than min_hits_reco spacepoints
n_reco_hits = n_reco_hits.loc[n_reco_hits["n_reco_hits"] >= min_hits_reco]

In [None]:
n_reco_hits.head()

In [None]:
# keep only the hits that belong to a track candidate which survived the hit-count cut
reco_df = reco_df.loc[reco_df["track_id"].isin(n_reco_hits["track_id"].to_numpy())]

In [None]:
reco_df.head()

In [None]:
reco_df.track_id.unique()

- **Truth Info**

In [None]:
# attach the particle properties to every truth hit (left join on particle_id);
# the per-particle hit counts are derived from this table in the next cells
hits = pd.merge(truth, particles, how="left", on="particle_id")

In [None]:
hits.head()

In [None]:
# number of spacepoints left by each particle -> columns [particle_id, n_true_hits].
# rename_axis + reset_index(name=...) yields the same columns on every pandas
# version; the older rename({"index": ..., "particle_id": ...}) idiom breaks on
# pandas >= 2.0, where value_counts() names its result "count".
n_true_hits = (
    hits["particle_id"]
    .value_counts(sort=False)
    .rename_axis("particle_id")
    .reset_index(name="n_true_hits")
)

In [None]:
n_true_hits.head()

In [None]:
# attach the per-particle hit count; it drives the is_trackable flag below.
# Any n_true_hits column left over from a previous run of this cell is dropped
# first, so re-running does not produce n_true_hits_x / n_true_hits_y.
particles = particles.drop(columns="n_true_hits", errors="ignore").merge(
    n_true_hits, on=["particle_id"], how="left"
)

In [None]:
# a particle counts as trackable when it left at least min_hits_truth spacepoints
is_trackable = particles["n_true_hits"].ge(min_hits_truth)

In [None]:
is_trackable

In [None]:
# event has 3 columns [track_id, particle_id, hit_id].
# Built from reco_df (the candidates that passed the min_hits_reco cut), not from
# the unfiltered `reconstructed` table, so rejected candidates cannot enter the matching.
event = pd.merge(reco_df, truth, on=["hit_id"], how="left")

In [None]:
event.track_id.unique()

In [None]:
n_reco_hits.shape, n_true_hits.shape

In [None]:
# Count the hits shared by every (track_id, particle_id) pair.
# n_common_hits (here) and n_shared (truth side) are the same number for a given pair.
#
# A track candidate can share hits with several particles. It is matched to the
# particle that maximises n_common_hits / n_reco_hits, i.e. the particle that
# contributes the majority of the candidate's spacepoints.
# The reverse statement (that particle's hits are mostly in this candidate) need not hold.
reco_matching = event.groupby(["track_id", "particle_id"]).size().reset_index(
    name="n_common_hits"
)

In [None]:
reco_matching.head(10)

In [None]:
# Same pair counts seen from the truth side: a particle can share hits with
# several reconstructed tracks.
truth_matching = event.groupby(["particle_id", "track_id"]).size().reset_index(
    name="n_shared"
)

In [None]:
truth_matching.head(10)

In [None]:
# attach the total hit counts (per track / per particle) to the matching tables
reco_matching = pd.merge(reco_matching, n_reco_hits, how="left", on=["track_id"])
truth_matching = pd.merge(truth_matching, n_true_hits, how="left", on=["particle_id"])

In [None]:
reco_matching.head(10)

In [None]:
truth_matching.head(10)

In [None]:
# matching fractions:
#   purity_reco = shared hits / hits in the track candidate
#   purity_true = shared hits / hits of the particle
reco_matching = reco_matching.assign(
    purity_reco=lambda df: df["n_common_hits"] / df["n_reco_hits"]
)
truth_matching = truth_matching.assign(
    purity_true=lambda df: df["n_shared"] / df["n_true_hits"]
)

In [None]:
reco_matching.head(10)

In [None]:
truth_matching.head(10)

In [None]:
# select the best match: broadcast the per-group maximum purity back onto every row.
# The string "max" is used rather than the builtin `max`; recent pandas versions
# warn when a builtin is passed to transform.
reco_matching["purity_reco_max"] = reco_matching.groupby("track_id")[
    "purity_reco"
].transform("max")
# NOTE(review): the truth side is also grouped by track_id. Picking the best track
# for each particle would group by particle_id instead -- confirm which is intended.
truth_matching["purity_true_max"] = truth_matching.groupby("track_id")[
    "purity_true"
].transform("max")

In [None]:
reco_matching.head(10)

In [None]:
truth_matching.head(10)

In [None]:
# Keep, for each track, the best-matching row, provided its purity is strictly
# above the threshold (">=" was changed to ">" by Murnane).
# No backslash continuation is needed inside the brackets; a backslash followed
# by a comment is a syntax error.
matched_reco_tracks = reco_matching[
    (reco_matching.purity_reco_max > frac_reco_matched)
    & (reco_matching.purity_reco == reco_matching.purity_reco_max)
]

In [None]:
matched_reco_tracks

- **ALERT**: _`matched_reco_tracks` should never have more rows than `n_reco_hits`_

In [None]:
# sanity check: report when there are more matched rows than track candidates
if len(matched_reco_tracks) > len(n_reco_hits):
    print("shape:", len(matched_reco_tracks), len(n_reco_hits))

In [None]:
# Keep the best-matching row of each group, provided its purity is strictly
# above the threshold (">=" was changed to ">" by Murnane).
# No backslash continuation is needed inside the brackets; a backslash followed
# by a comment is a syntax error.
matched_true_particles = truth_matching[
    (truth_matching.purity_true_max > frac_truth_matched)
    & (truth_matching.purity_true == truth_matching.purity_true_max)
]

In [None]:
matched_true_particles

In [None]:
# Two-way matching: a (track_id, particle_id) pair is accepted only when it
# passes both the truth-side and the reco-side majority criteria, so the pair
# must be present in both matched tables (inner join on both keys).
combined_match = pd.merge(
    matched_true_particles,
    matched_reco_tracks,
    how="inner",
    on=["track_id", "particle_id"],
)

In [None]:
combined_match

In [None]:
# totals: track candidates that passed the hit-count cut, and rows of the particle table
n_reco_tracks = len(n_reco_hits)
n_true_tracks = len(particles)

In [None]:
n_reco_tracks, n_true_tracks

In [None]:
# For the GNN there is a non-negligible number of cases where a track candidate
# is matched to a particle that is not considered interesting, so matched_pids
# may contain particle ids that do not exist in `particles`.
matched_pids = np.unique(combined_match["particle_id"])

In [None]:
matched_pids

In [None]:
# particles that appear in the two-way match
is_matched = particles.particle_id.isin(matched_pids).values
n_matched_particles = np.sum(is_matched)

# Track candidates whose purity is strictly above the threshold. The strict ">"
# mirrors the matching cells above; with ">=" a candidate split exactly 50/50
# between two particles would be counted twice and push the purity above 100%.
n_matched_tracks = reco_matching[reco_matching.purity_reco > frac_reco_matched].shape[
    0
]
# same count, restricted to candidates matched to a particle listed in `particles`
n_matched_tracks_poi = reco_matching[
    (reco_matching.purity_reco > frac_reco_matched)
    & (reco_matching.particle_id.isin(particles.particle_id.values))
].shape[0]

In [None]:
is_matched

In [None]:
n_matched_particles

In [None]:
n_matched_tracks

In [None]:
n_matched_tracks_poi

In [None]:
# store the per-particle flags on the particle table
particles = particles.assign(is_matched=is_matched, is_trackable=is_trackable)

# candidates matched to a listed particle beyond the first one per particle
n_duplicated_tracks = n_matched_tracks_poi - n_matched_particles

In [None]:
particles

In [None]:
# Rename the counters to the names used in the summary below.
# n_true_tracks and n_reco_tracks already carry their final names.
n_matched_true_tracks = n_matched_particles
n_matched_reco_tracks = n_matched_tracks
n_duplicated_reco_tracks = n_duplicated_tracks
n_matched_reco_tracks_poi = n_matched_tracks_poi

### _Examine Output_

In [None]:
def _percent(numerator, denominator):
    """Return 100 * numerator / denominator, or NaN when the denominator is zero.

    Guards the summary against events where no reco or true track survives the cuts.
    """
    return 100 * numerator / denominator if denominator else float("nan")


# raw counters
print("               Truth tracks: {:>10}".format(n_true_tracks))
print("       Truth tracks matched: {:>10}".format(n_matched_true_tracks))
print("       Reconstructed tracks: {:>10}".format(n_reco_tracks))
print("       Reco. tracks matched: {:>10}".format(n_matched_reco_tracks))
print("Reco. tracks matched to POI: {:>10}".format(n_matched_reco_tracks_poi))
print("    Reco. tracks duplicated: {:>10}".format(n_duplicated_reco_tracks))

# derived rates, in percent
print(
    "        Tracking Efficiency: {:>10.4f}%".format(
        _percent(n_matched_true_tracks, n_true_tracks)
    )
)
print(
    "            Tracking Purity: {:>10.4f}%".format(
        _percent(n_matched_reco_tracks, n_reco_tracks)
    )
)
print(
    "                  Fake rate: {:>10.4f}%".format(
        100 - _percent(n_matched_reco_tracks, n_reco_tracks)
    )
)
print(
    "           Duplication Rate: {:>10.4f}%".format(
        _percent(n_duplicated_reco_tracks, n_reco_tracks)
    )
)

### _Let's View both True and Reconstructed Event_

In [None]:
# alias of the raw event, used by the true-event plot below
true_track = raw_data

In [None]:
# split the node-feature matrix into its three per-hit columns
r, phi, ir = true_track.x.T

# polar -> cartesian, working on plain numpy copies of the tensors
x, y = polar_to_cartesian(*(t.detach().numpy() for t in (r, phi)))

# undo the scaling of the third feature (factor 100)
ir = 100 * ir.detach().numpy()

In [None]:
# Draw the true event (one line plus one marker set per particle) and save it as a PNG.
fig, ax = detector_layout(figsize=(10, 10))
e_id = int(true_track.event_file[-10:])  # last 10 characters of event_file, parsed as an integer
p_ids = np.unique(true_track.pid)

for pid in p_ids:
    idx = true_track.pid == pid  # mask selecting the hits of this particle
    px, py = x[idx], y[idx]
    ax.plot(px, py, "-", linewidth=1.5)
    ax.scatter(px, py, label="particle_id: {}".format(int(pid)))

ax.set_title("Azimuthal View of STT, EventID # {}".format(e_id))
ax.legend(loc="best", fontsize=10)
fig.tight_layout()
fig.savefig("true_track.png")

In [None]:
# alias of the reconstructed event as loaded, used by the reco plot below
reco_track = reco_data

In [None]:
# Plot the reco event. (x, y, ir) come from the true event while idx comes from
# the reco event, so this assumes both list the hits in the same order.
# e_id is reused from the true-event plot above.
fig, ax = detector_layout(figsize=(10, 10))
t_ids = np.unique(reco_track.track_id)

for tid in t_ids:
    idx = reco_track.track_id == tid  # mask selecting the hits of this track candidate
    ax.plot(x[idx], y[idx], "-", linewidth=2)
    # the groups drawn here are track candidates, so label them as track_id
    ax.scatter(x[idx], y[idx], s=(ir[idx] * 100), label="track_id: {}".format(tid))

ax.set_title("Azimuthal View of STT, EventID # {}".format(e_id))
ax.legend(fontsize=10, loc="best")
fig.tight_layout()
fig.savefig("reco_track.png")

### _Read HDF5_

In [None]:
import pandas as pd

In [None]:
# Open the evaluation summary read-only. With the default mode ("a") a missing
# or mistyped path silently creates an empty file instead of raising.
path = "../run_all/fwp_gnn_segmenting/eval/all_particles.h5"
store = pd.HDFStore(path, mode="r")

In [None]:
# show the first rows of the "data" table (same as store.get('data').head())
store["data"].head()

In [None]:
# release the HDF5 file handle opened above
store.close()