## _Building True Edges_

1. _layerwise true edges_
2. _modulewise true edges_
3. _orderwise true edges (new for curly tracks)_

In [None]:
import glob, os, sys, yaml

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
import pprint
import seaborn as sns
import trackml.dataset

In [None]:
import torch
from torch_geometric.data import Data
import itertools

In [None]:
# append parent dir
# Puts ".." on the module search path; presumably this is what lets the
# `src` and `LightningModules` imports in later cells resolve — confirm.
# A relative entry like this is resolved against the current working directory.
sys.path.append("..")

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other cell visible here.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# local imports
from src import SttCSVDataReader, SttTorchDataReader
from src import detector_layout
from src import Build_Event, Build_Event_Viz, Visualize_Edges
from src.math_utils import polar_to_cartesian

### _Input Data_

In [None]:
# input data
input_dir = "../data_all"

In [None]:
# Find All Input Data Files (hits.csv, cells.csv, particles.csv, truth.csv)
all_files = os.listdir(input_dir)

# Extract File Prefixes (use e.g. xxx-hits.csv)
suffix = "-hits.csv"
file_prefixes = sorted(
    os.path.join(input_dir, f.replace(suffix, ""))
    for f in all_files
    if f.endswith(suffix)
)

print("Number of Files: ", len(file_prefixes))

In [None]:
# file_prefixes[:10]

In [None]:
# load an event
# hits, tubes, particles, truth = trackml.dataset.load_event(file_prefixes[0])

In [None]:
# hits.head()
# tubes.head()
# particles.head()
# truth.head()

### _Visualize Event_

In [None]:
# select event
# NOTE(review): this value is used in two ways further down — as an event
# number (commented-out Build_Event call) and as a list position
# (file_prefixes[event_id]). Confirm both meanings agree for this dataset.
event_id = 95191

In [None]:
# compose event is exactly the same as select_hits()
# event = Build_Event(input_dir, event_id, noise=False, skewed=False, selection=False)

In [None]:
# visualize event
# Build_Event_Viz(event, figsize=(10,10), fig_type="pdf", save_fig=False)

## _True Edges_

In [None]:
from LightningModules.Processing.utils.event_utils import select_hits
from LightningModules.Processing.utils.event_utils import get_layerwise_edges
from LightningModules.Processing.utils.event_utils import get_modulewise_edges

### _1. Layerwise True Edges_

**True Graph** is the ground truth for the GNN. It is built by creating edges between _`hits`_ that belong to the same particle and lie in adjacent layers. For this purpose, the function _`true_edges, hits = get_layerwise_edges(hits)`_ is provided in _`event_utils.py`_.

In [None]:
# get event prefix using event_id
# NOTE(review): event_id is used here as a position in the sorted file_prefixes
# list; it is not matched against the file name. This selects event 95191 only
# if the list holds one prefix per event, numbered contiguously from 0 — verify.
event_prefix = file_prefixes[event_id]

In [None]:
# select hits
# `kwargs` stays defined at notebook level and is reused by later select_hits calls.
kwargs = {"selection": False}
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# get layerwise true edges & new hits dataframe
# `hits` is rebound to the dataframe returned by the helper; the plotting cell
# that follows looks edge endpoints up in this returned frame by position (iloc).
true_edges, hits = get_layerwise_edges(hits)

In [None]:
# true_edges

In [None]:
# check dimensions
# true_edges.shape

In [None]:
# gives True
# senders == true_edges[0]

In [None]:
# gives True
# receivers  == true_edges[1]

In [None]:
# split as senders and receivers (row 0 and row 1 of true_edges)
senders, receivers = true_edges

### _Plotting Layerwise True Edges_

- I have hit pairs in two arrays
- Extract each pair (w/ `hit_id`) to plot
- How to plot hit pairs for one track?

In [None]:
# new (1): plotting true edges

# start from the detector layout figure
fig, ax = detector_layout(figsize=(10, 10))

# scatter the hits, one legend entry per particle
for pid in np.unique(hits.particle_id):
    track = hits[hits.particle_id == pid]
    ax.scatter(track.x.values, track.y.values, label="particle_id: %d" % pid)

# draw every true edge as a faint line; edge entries are row positions in `hits`
n_edges = true_edges.shape[1]
x_pos, y_pos = hits.x.values, hits.y.values
for k in range(n_edges):
    src, dst = true_edges[0][k], true_edges[1][k]
    ax.plot(
        [x_pos[src], x_pos[dst]],
        [y_pos[src], y_pos[dst]],
        color="k",
        alpha=0.3,
        lw=1.5,
    )

# legend, layout and output file
ax.legend(fontsize=12, loc="best")
fig.tight_layout()
fig.savefig("layerwise_true_edges.pdf")

### _Dissect `get_layerwise_edges(hits)` Function_

In [None]:
# select hits
# Reload the event's hits so the step-by-step walk-through below starts from a
# fresh select_hits() result; `kwargs` is the dict defined in an earlier cell.
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# layerwise true edges & new hits dataframe
# true_edges, hits = get_layerwise_edges(hits)

In [None]:
# Distance of every hit from the point (vx, vy, vz) — presumably its production
# vertex — stored as a new column R. The actual sorting happens in the next cell.
dx = hits.x - hits.vx
dy = hits.y - hits.vy
dz = hits.z - hits.vz
hits = hits.assign(R=np.sqrt(dx ** 2 + dy ** 2 + dz ** 2))

In [None]:
# re-indexing of hits dataframe, we get two extra columns: R and index
hits = hits.sort_values("R").reset_index(drop=True).reset_index(drop=False)

In [None]:
# assign particle_id=0 as NaN
hits.loc[hits["particle_id"] == 0, "particle_id"] = np.nan

In [None]:
hits.head()

In [None]:
# hit_list based on particle_id and layer_id
hit_list = (
    hits.groupby(["particle_id", "layer_id"], sort=False)["index"]
    .agg(lambda x: list(x))
    .groupby(level=0)
    .agg(lambda x: list(x))
)

In [None]:
hit_list

In [None]:
# get first row of hit list i.e. first particle
# row = hit_list.values[0]

In [None]:
# get elements of array from 0 to n-1 i.e. skipping the last element
# row[0:-1]

In [None]:
# get elements of array from 1 to n i.e. skipping the first element
# row[1:]

In [None]:
# now build layerwise true edges
true_edges = []
for row in hit_list.values:
    for i, j in zip(
        row[0:-1], row[1:]
    ):  # row is list: we take 0 to n-1 elements as row[0:-1], and 1 to n as row[1:]
        true_edges.extend(
            list(itertools.product(i, j))
        )  # extend() will add an iterable (list, set, etc) to the end of true_edges list, append() add one element to end of list.

In [None]:
true_edges = np.array(true_edges).T

In [None]:
true_edges.shape

In [None]:
true_edges[0].size, true_edges[1].size

* Now we have _`true_edges`_ and the corresponding _`hits`_ (re-ordered, _i.e._ sorted by the $R$ parameter).

In [None]:
# split as senders and receivers (row 0 and row 1 of true_edges)
senders, receivers = true_edges

In [None]:
senders.shape, receivers.shape

### _2. Modulewise True Edges_

In [None]:
# get event prefix using event_id
# NOTE(review): positional lookup into the sorted prefix list — verify that
# position event_id really corresponds to the event numbered event_id.
event_prefix = file_prefixes[event_id]

In [None]:
# select hits
kwargs = {"selection": False}
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
# get modulewise true edges
# Only the edge array is returned here, so `hits` keeps the frame (and index)
# produced by select_hits.
true_edges = get_modulewise_edges(hits)

In [None]:
# gives True
# senders == true_edges[0]

In [None]:
# gives True
# receivers  == true_edges[1]

In [None]:
# split as senders and receivers (row 0 and row 1 of true_edges)
senders, receivers = true_edges

### _Plotting Modulewise True Edges_

- I have hit pairs in two arrays
- Extract each pair (w/ `hit_id`) to plot
- How to plot hit pairs for one track?

In [None]:
# new (2): plotting true edges

# start from the detector layout figure
fig, ax = detector_layout(figsize=(10, 10))

# scatter the hits, one legend entry per particle
for pid in np.unique(hits.particle_id):
    track = hits[hits.particle_id == pid]
    ax.scatter(track.x.values, track.y.values, label="particle_id: %d" % pid)

# connect the two hits of every edge; edge entries are index labels of `hits`
xy_cols = ["x", "y"]
for src_label, dst_label in true_edges.T:
    src_xy = hits.loc[src_label, xy_cols].values
    dst_xy = hits.loc[dst_label, xy_cols].values
    edge_x = [src_xy[0], dst_xy[0]]
    edge_y = [src_xy[1], dst_xy[1]]
    ax.plot(edge_x, edge_y, "k-", linewidth=0.5)

# legend, layout and output file
ax.legend(fontsize=12, loc="best")
fig.tight_layout()
fig.savefig("modulewise_true_edges.pdf")

### _Dissect `get_modulewise_edges(hits)` Function_

In [None]:
# select hits
# Reload the event's hits so the step-by-step walk-through below starts from a
# fresh select_hits() result.
hits = select_hits(event_file=event_prefix, noise=False, skewed=False, **kwargs)

In [None]:
signal = hits[
    ((~hits.particle_id.isna()) & (hits.particle_id != 0)) & (~hits.vx.isna())
]

In [None]:
signal.head()

In [None]:
signal = signal.drop_duplicates(
    subset=["particle_id", "volume_id", "layer_id", "module_id"]
)

In [None]:
signal.head()

In [None]:
# Sort by increasing distance from production
signal = signal.assign(
    R=np.sqrt(
        (signal.x - signal.vx) ** 2
        + (signal.y - signal.vy) ** 2
        + (signal.z - signal.vz) ** 2
    )
)

In [None]:
signal.head()

In [None]:
# See DataFrame after sorting according to R
signal.sort_values("R").head()

In [None]:
# See DataFrame after sorting according to R and resetting index. The drop=False will keep the old index as a column (index)
signal.sort_values("R").reset_index(drop=False).head()

In [None]:
# Sort according to R and reset DataFrame index
# drop=False moves the pre-sort index into a column (named "index" when the
# index has no name of its own — the rename in the next cell relies on that).
signal = signal.sort_values("R").reset_index(drop=False)

In [None]:
# Handle re-indexing
# Rename the saved pre-sort index to "unsorted_index", then reset once more so
# the new 0..N-1 row positions become the "index" column used for grouping below.
signal = signal.rename(columns={"index": "unsorted_index"}).reset_index(drop=False)

In [None]:
signal.head()

In [None]:
# Handle noise i.e. particle_id==0
# NOTE(review): `signal` was already filtered with particle_id != 0 when it was
# created, so this mask should be empty and the assignment a no-op here.
signal.loc[signal["particle_id"] == 0, "particle_id"] = np.nan

- get list of indices of each particle as _`signal_list`_
- first groupby _`particle_id`_ and get _`index`_ column
- second aggregate all indices as a list, each list corresponds to a particle

In [None]:
# Group by particle ID and get list of indices of every particle (series of series).
signal_list = signal.groupby(["particle_id"], sort=False)["index"].agg(
    lambda x: list(x)
)

In [None]:
# see all series of indices named by particle_ids
signal_list

In [None]:
# access indices of a particle in signal_list e.g. particle_id=10
signal_list[10]

- build _`true_edges`_ by first looping over the series (_`signal_list.values`_ gives an array of lists), accessing each list as a _`row`_.
- convert each _`row`_ into two shifted copies: _`r1 = row[:-1]`_ omits the last element, _`r2 = row[1:]`_ omits the first element.
- now create the _`true_edges`_ list by pairing the indices of _`r1`_ and _`r2`_ element-wise, giving a list of index pairs.

In [None]:
true_edges = []
for row in signal_list.values:
    for i, j in zip(row[:-1], row[1:]):
        true_edges.append([i, j])

In [None]:
# covert list of lists into numpy array and transpose it.
true_edges = np.array(true_edges).T

In [None]:
# Restore to original order
true_edges = signal.unsorted_index.values[true_edges]

In [None]:
true_edges.shape

In [None]:
true_edges[0].size, true_edges[1].size

In [None]:
# split as senders and receivers (row 0 and row 1 of true_edges)
senders, receivers = true_edges

In [None]:
senders.shape, receivers.shape