### _Awkward Array_

Testing Awkward array to build track candidates.

- create a list of lists
- each inner list is flat: `[track_id, hit_id_1, hit_id_2, ...]`

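**NOTE**: Don't sort the values; just use `groupby()` to get the subgroups. (`groupby()` orders the groups by key, but rows within each group keep their original order.)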

In [1]:
import os
import glob
import torch

import scipy as sp
import numpy as np
import pandas as pd

import uproot
import awkward as ak

from multiprocessing import Pool
from functools import partial
from sklearn.cluster import DBSCAN

In [2]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
from trkx_from_gnn import tracks_from_gnn, process

In [4]:
# Directory of GNN prediction files; the listing below shows one entry per event id.
inputdir = "../run_all/gnn_processed/pred"
gnn_eval_files = glob.glob(os.path.join(inputdir, "*"))
gnn_eval_files.sort()  # lexicographic order of the paths
gnn_eval_files[:5]

['../run_all/gnn_processed/pred/110000',
 '../run_all/gnn_processed/pred/110001',
 '../run_all/gnn_processed/pred/110002',
 '../run_all/gnn_processed/pred/110003',
 '../run_all/gnn_processed/pred/110004']

In [5]:
# Take the first prediction file; its base name is the numeric event id.
filename = gnn_eval_files[0]
evtid = int(os.path.split(filename)[1])

In [6]:
# Display the event id parsed from the file name.
evtid

110000

In [7]:
# Load one event's saved GNN output onto the selected device.
# NOTE(review): torch.load unpickles the file, so only point it at trusted inputs.
gnn_data = torch.load(filename, map_location=device)
hit_id = gnn_data.hid
# Row 0 / row 1 of edge_index are the two endpoints of every edge.
senders, receivers = gnn_data.edge_index[0], gnn_data.edge_index[1]
# scores has twice the size of edge_index (flip(0) was used), so keep only
# the leading block: one score per edge_index column.
score = gnn_data.scores[: gnn_data.edge_index.shape[1]]

In [8]:
# Label every hit with a track id from the scored edges; the result is a
# DataFrame with `hit_id` and `track_id` columns.
predicted_tracks = tracks_from_gnn(hit_id, score, senders, receivers)
# Group by the scalar column name, not a one-element list. With
# groupby(["track_id"]) pandas warns on iteration and, from pandas 2.0 on,
# yields 1-tuples such as (0,) as group keys, which breaks int(g) when the
# track candidates are built and changes how get_group() must be called.
ptg = predicted_tracks.groupby("track_id")



In [9]:
# Display the hit -> track assignment table (pandas truncates long frames).
predicted_tracks

Unnamed: 0,hit_id,track_id
0,52,0
1,77,1
2,127,2
3,178,3
4,1,4
...,...,...
124,25,4
125,177,6
126,102,11
127,152,-1


In [10]:
# Mapping of each track_id to the row labels of predicted_tracks in that group.
ptg.groups

{-1: [69, 79, 80, 85, 91, 98, 99, 127], 0: [0, 6, 17, 26, 33, 41, 49, 55, 71, 76, 82, 88, 95, 102, 108, 116, 123], 1: [1, 9, 14, 23, 30, 39, 46, 58], 2: [2, 8, 15, 25, 31, 38, 47, 56], 3: [3, 7, 16, 24, 32, 40, 50, 57], 4: [4, 11, 19, 27, 35, 43, 52, 61, 72, 77, 83, 89, 96, 103, 110, 117, 124], 5: [5, 13, 21, 29, 37, 45, 54, 64, 75, 81, 87, 93, 100, 107, 115, 122], 6: [10, 18, 22, 34, 42, 51, 59, 62, 65, 78, 84, 90, 97, 104, 111, 118, 125], 7: [12, 20, 28, 36, 44, 48, 53, 60, 63, 66, 67, 68, 94, 101, 109, 112, 121, 128], 8: [70, 73, 74], 9: [86, 92], 10: [105, 106, 113, 120], 11: [114, 119, 126]}

In [11]:
# Inspect the rows assigned to track_id 0.
ptg.get_group(0)

Unnamed: 0,hit_id,track_id
0,52,0
6,53,0
17,54,0
26,55,0
33,56,0
41,57,0
49,58,0
55,59,0
71,68,0
76,69,0


In [12]:
# One flat list per track: the track id first, then that track's hit ids
# in the order they appear in predicted_tracks.
track_cand = [
    [int(track_id), *hits["hit_id"].tolist()]
    for track_id, hits in ptg
]

  for g, data in ptg:


In [13]:
# Show every candidate as [track_id, hit_id, hit_id, ...]; rows differ in length.
print(track_cand)

[[-1, 93, 145, 95, 96, 147, 148, 98, 152], [0, 52, 53, 54, 55, 56, 57, 58, 59, 68, 69, 70, 71, 72, 73, 74, 75, 76], [1, 77, 78, 79, 80, 81, 82, 83, 84], [2, 127, 128, 129, 130, 131, 132, 133, 134], [3, 178, 179, 180, 181, 182, 183, 184, 185], [4, 1, 2, 3, 4, 5, 6, 7, 8, 17, 18, 19, 20, 21, 22, 23, 24, 25], [5, 103, 104, 105, 106, 107, 108, 109, 110, 119, 120, 121, 122, 123, 124, 125, 126], [6, 153, 154, 155, 156, 157, 158, 159, 160, 161, 170, 171, 172, 173, 174, 175, 176, 177], [7, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 46, 47, 48, 49, 50, 51], [8, 143, 144, 94], [9, 146, 97], [10, 99, 149, 150, 151], [11, 100, 101, 102]]


In [14]:
# Wrap the variable-length (jagged) candidate lists in an Awkward Array.
ak.Array(track_cand)

In [16]:
# Short alias for the same DataFrame object (not a copy).
df = predicted_tracks

In [29]:
# Check the column dtypes: hit_id is int32 while track_id is int64.
df.dtypes

hit_id      int32
track_id    int64
dtype: object

In [32]:
# Preview the dtypes after casting every column to int64; the result is not
# assigned, so df itself is left as it was.
df.astype(np.int64).dtypes

hit_id      int64
track_id    int64
dtype: object

In [28]:
# Raw 1-D NumPy array of hit ids, in DataFrame row order.
df["hit_id"].values

array([ 52,  77, 127, 178,   1, 103,  53, 179, 128,  78, 153,   2,  26,
       104,  79, 129, 180,  54, 154,   3,  27, 105, 155,  80, 181, 130,
        55,   4,  28, 106,  81, 131, 182,  56, 156,   5,  29, 107, 132,
        82, 183,  57, 157,   6,  30, 108,  83, 133,  31,  58, 184, 158,
         7,  32, 109,  59, 134, 185,  84, 159,  33,   8, 160,  34, 110,
       161,  35,  36,  37,  93, 143,  68,  17, 144,  94, 119,  69,  18,
       170, 145,  95, 120,  70,  19, 171,  96, 146, 121,  71,  20, 172,
       147,  97, 122,  46,  72,  21, 173, 148,  98, 123,  47,  73,  22,
       174,  99, 149, 124,  74,  48,  23, 175,  49, 150, 100, 125,  75,
        24, 176, 101, 151,  50, 126,  76,  25, 177, 102, 152,  51],
      dtype=int32)

In [26]:
# Same hit ids with a new leading axis added, i.e. a single-row 2-D array.
df["hit_id"].values[np.newaxis, :]

array([[ 52,  77, 127, 178,   1, 103,  53, 179, 128,  78, 153,   2,  26,
        104,  79, 129, 180,  54, 154,   3,  27, 105, 155,  80, 181, 130,
         55,   4,  28, 106,  81, 131, 182,  56, 156,   5,  29, 107, 132,
         82, 183,  57, 157,   6,  30, 108,  83, 133,  31,  58, 184, 158,
          7,  32, 109,  59, 134, 185,  84, 159,  33,   8, 160,  34, 110,
        161,  35,  36,  37,  93, 143,  68,  17, 144,  94, 119,  69,  18,
        170, 145,  95, 120,  70,  19, 171,  96, 146, 121,  71,  20, 172,
        147,  97, 122,  46,  72,  21, 173, 148,  98, 123,  47,  73,  22,
        174,  99, 149, 124,  74,  48,  23, 175,  49, 150, 100, 125,  75,
         24, 176, 101, 151,  50, 126,  76,  25, 177, 102, 152,  51]],
      dtype=int32)

### More Robust Method:

In [33]:
# Input: GNN prediction files (same path as assigned to inputdir earlier in the notebook).
inputdir = "../run_all/gnn_processed/pred"
# Output directory; presumably where processed track candidates will be written — TODO confirm.
outputdir = "../run_all/seg_proc"

In [37]:
# Every entry under inputdir, in sorted path order; the cell shows the count.
all_files = sorted(glob.glob(os.path.join(inputdir, "*")))
len(all_files)

10000

In [36]:
# Peek at the first ten paths to confirm the ordering.
all_files[:10]

['../run_all/gnn_processed/pred/110000',
 '../run_all/gnn_processed/pred/110001',
 '../run_all/gnn_processed/pred/110002',
 '../run_all/gnn_processed/pred/110003',
 '../run_all/gnn_processed/pred/110004',
 '../run_all/gnn_processed/pred/110005',
 '../run_all/gnn_processed/pred/110006',
 '../run_all/gnn_processed/pred/110007',
 '../run_all/gnn_processed/pred/110008',
 '../run_all/gnn_processed/pred/110009']

In [38]:
n_tot_files = len(all_files)
# Requested number of events to process.
max_evts = 5000
# Anything outside 1..n_tot_files (zero, negative, or too large) means "use all files".
if not (0 < max_evts <= n_tot_files):
    max_evts = n_tot_files

In [39]:
# Report how many of the available events will actually be processed.
print("Out of {} events processing {} events.\n".format(n_tot_files, max_evts))

Out of 10000 events processing 5000 events.

