In [1]:
import os
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
particles_df = pd.read_csv('datasets/unarch/train_100_events/event000001000-particles.csv')
hits_df = pd.read_csv('datasets/unarch/train_100_events/event000001000-hits.csv')
cells_df = pd.read_csv('datasets/unarch/train_100_events/event000001000-cells.csv')
truth_df = pd.read_csv('datasets/unarch/train_100_events/event000001000-truth.csv')

In [3]:
print(particles_df.shape)
particles_df.head(4)

(12263, 9)


Unnamed: 0,particle_id,vx,vy,vz,px,py,pz,q,nhits
0,4503668346847232,-0.009288,0.009861,-0.077879,-0.055269,0.323272,-0.203492,-1,8
1,4503737066323968,-0.009288,0.009861,-0.077879,-0.948125,0.470892,2.01006,1,11
2,4503805785800704,-0.009288,0.009861,-0.077879,-0.886484,0.105749,0.683881,-1,0
3,4503874505277440,-0.009288,0.009861,-0.077879,0.257539,-0.676718,0.991616,1,12


In [4]:
print(hits_df.shape)
hits_df.head(4)

(120939, 7)


Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id
0,1,-64.4099,-7.1637,-1502.5,7,2,1
1,2,-55.3361,0.635342,-1502.5,7,2,1
2,3,-83.8305,-1.14301,-1502.5,7,2,1
3,4,-96.1091,-8.24103,-1502.5,7,2,1


In [5]:
print(cells_df.shape)
cells_df.head(4)

(664996, 4)


Unnamed: 0,hit_id,ch0,ch1,value
0,1,209,617,0.013832
1,1,210,617,0.079887
2,1,209,618,0.211723
3,2,68,446,0.334087


In [6]:
print(truth_df.shape)
truth_df.head(4)

(120939, 9)


Unnamed: 0,hit_id,particle_id,tx,ty,tz,tpx,tpy,tpz,weight
0,1,0,-64.4116,-7.16412,-1502.5,250710.0,-149908.0,-956385.0,0.0
1,2,22525763437723648,-55.3385,0.630805,-1502.5,-0.570605,0.02839,-15.4922,1e-05
2,3,0,-83.828,-1.14558,-1502.5,626295.0,-169767.0,-760877.0,0.0
3,4,297237712845406208,-96.1229,-8.23036,-1502.5,-0.225235,-0.050968,-3.70232,8e-06


In [7]:
detectors_df = pd.read_csv('datasets/unarch/detectors.csv')

In [8]:
print(detectors_df.shape)
detectors_df.head(4)

(18728, 21)


Unnamed: 0,volume_id,layer_id,module_id,cx,cy,cz,rot_xu,rot_xv,rot_xw,rot_yu,...,rot_yw,rot_zu,rot_zv,rot_zw,module_t,module_minhu,module_maxhu,module_hv,pitch_u,pitch_v
0,7,2,1,-65.7965,-5.1783,-1502.5,0.078459,-0.996917,0.0,-0.996917,...,0.0,0,0,-1,0.15,8.4,8.4,36,0.05,0.05625
1,7,2,2,-139.851,-6.46568,-1502.0,0.046183,-0.998933,0.0,-0.998933,...,0.0,0,0,-1,0.15,8.4,8.4,36,0.05,0.05625
2,7,2,3,-138.657,-19.3419,-1498.0,0.138156,-0.99041,0.0,-0.99041,...,0.0,0,0,-1,0.15,8.4,8.4,36,0.05,0.05625
3,7,2,4,-64.1764,-15.4074,-1498.0,0.233445,-0.97237,0.0,-0.97237,...,0.0,0,0,-1,0.15,8.4,8.4,36,0.05,0.05625


In [9]:
submission_df = pd.read_csv('datasets/unarch/sample_submission.csv')

In [10]:
print(submission_df.shape)
submission_df.head(4)

(13741466, 3)


Unnamed: 0,event_id,hit_id,track_id
0,0,1,0
1,0,2,0
2,0,3,0
3,0,4,0


In [11]:
train_dataset_dir = 'datasets/unarch/train_100_events'
dataset_filenames = os.listdir(train_dataset_dir)
print(dataset_filenames[:4])
event_ids = []
event_filenames = {}
for filename in dataset_filenames[:20]:
    event_id = filename[5:14]
    print(event_id)
    if event_id not in event_ids:
        event_ids.append(event_id)
    if event_id not in event_filenames:
        event_filenames[event_id] = [filename]
    else:
        event_filenames[event_id].append(filename)

['event000001015-truth.csv', 'event000001022-cells.csv', 'event000001056-cells.csv', 'event000001043-particles.csv']
000001015
000001022
000001056
000001043
000001061
000001035
000001026
000001004
000001081
000001076
000001008
000001057
000001066
000001019
000001083
000001053
000001090
000001001
000001074
000001085


In [12]:
def random_sublist_select(original_list, sublist_size):
    return [original_list[i] for i in random.sample(range(len(original_list)), sublist_size)]

In [13]:
def offset_sublist_select(original_list, sublist_size, offset):
    return original_list[offset:offset + sublist_size]

In [14]:
def select_random_indexses_subset(size, subset_size):
    return random.sample(tuple(range(size)), subset_size) 

In [15]:
def select_offset_indexses_subset(size, subset_size, offset):
    return tuple(range(size))[offset:offset + subset_size]

In [16]:
def read_dataset_filenames_from_dir(path_to_datasets_dir):
    dataset_filenames = os.listdir(path_to_datasets_dir)
    event_filenames = {}
    for filename in dataset_filenames:
        path_to_file = os.path.join(path_to_datasets_dir, filename)
        event_id = filename[5:14]
        if event_id not in event_filenames:
            event_filenames[event_id] = [path_to_file]
        else:
            event_filenames[event_id].append(path_to_file)
    return event_filenames

In [17]:
def select_events(indexes_list, event_names):
    return tuple(event_names[i] for i in indexes_list)

In [18]:
def random_events_select(event_names, subset_size):
    event_names_len = len(event_names)
    indexes = select_random_indexses_subset(event_names_len, subset_size)
    return select_events(indexes, event_names)

In [19]:
def offset_events_select(event_names, subset_size, offset):
    event_names_len = len(event_names)
    indexes = select_offset_indexses_subset(event_names_len, subset_size, offset)
    return select_events(indexes, event_names)

In [20]:
def read_dataset_filenames_random(directory_list, sample_size=0):
    event_grouped_dataset_filenames = {}
    for directory in directory_list:
        dataset_filenames = read_dataset_filenames_from_dir(directory)
        event_names = tuple(dataset_filenames)
        if sample_size > 0:
            event_names = random_events_select(event_names, sample_size)
        dataset_filenames = {event_name: sorted(dataset_filenames[event_name]) for event_name in event_names}
        event_grouped_dataset_filenames.update(dataset_filenames)
    return event_grouped_dataset_filenames    

In [21]:
def read_dataset_filenames_offset(directory_list, sample_size=0, offset=0):
    event_grouped_dataset_filenames = {}
    for directory in directory_list:
        dataset_filenames = read_dataset_filenames_from_dir(directory)
        event_names = tuple(dataset_filenames)
        if sample_size > 0:
            event_names = offset_events_select(tuple(dataset_filenames), sample_size, offset)
        dataset_filenames = {event_name: sorted(dataset_filenames[event_name]) for event_name in event_names}
        event_grouped_dataset_filenames.update(dataset_filenames)
    return event_grouped_dataset_filenames 

In [22]:
def random_select_dataset_filenames(dataset_filenames, sample_size):
    return {
        event_name: dataset_filenames[event_name] for event_name in random_events_select(
            tuple(dataset_filenames),
            sample_size
        )
    }

In [23]:
def offset_select_dataset_filenames(dataset_filenames, sample_size, offset):
    return {
        event_name: dataset_filenames[event_name] for event_name in offset_events_select(
            tuple(dataset_filenames),
            sample_size,
            offset
        )
    }

In [24]:
event_grouped_dataset_filenames = read_dataset_filenames_random(['datasets/unarch/train_1/'], 20)

In [25]:
event_grouped_dataset_filenames

{'000001038': ['datasets/unarch/train_1/event000001038-cells.csv',
  'datasets/unarch/train_1/event000001038-hits.csv',
  'datasets/unarch/train_1/event000001038-particles.csv',
  'datasets/unarch/train_1/event000001038-truth.csv'],
 '000001070': ['datasets/unarch/train_1/event000001070-cells.csv',
  'datasets/unarch/train_1/event000001070-hits.csv',
  'datasets/unarch/train_1/event000001070-particles.csv',
  'datasets/unarch/train_1/event000001070-truth.csv'],
 '000001097': ['datasets/unarch/train_1/event000001097-cells.csv',
  'datasets/unarch/train_1/event000001097-hits.csv',
  'datasets/unarch/train_1/event000001097-particles.csv',
  'datasets/unarch/train_1/event000001097-truth.csv'],
 '000001219': ['datasets/unarch/train_1/event000001219-cells.csv',
  'datasets/unarch/train_1/event000001219-hits.csv',
  'datasets/unarch/train_1/event000001219-particles.csv',
  'datasets/unarch/train_1/event000001219-truth.csv'],
 '000001288': ['datasets/unarch/train_1/event000001288-cells.csv',
 

In [26]:
event_grouped_dataset_filenames_1 = read_dataset_filenames_offset(['datasets/unarch/train_1/'], 20)

In [27]:
event_grouped_dataset_filenames_1

{'000001015': ['datasets/unarch/train_1/event000001015-cells.csv',
  'datasets/unarch/train_1/event000001015-hits.csv',
  'datasets/unarch/train_1/event000001015-particles.csv',
  'datasets/unarch/train_1/event000001015-truth.csv'],
 '000001127': ['datasets/unarch/train_1/event000001127-cells.csv',
  'datasets/unarch/train_1/event000001127-hits.csv',
  'datasets/unarch/train_1/event000001127-particles.csv',
  'datasets/unarch/train_1/event000001127-truth.csv'],
 '000001191': ['datasets/unarch/train_1/event000001191-cells.csv',
  'datasets/unarch/train_1/event000001191-hits.csv',
  'datasets/unarch/train_1/event000001191-particles.csv',
  'datasets/unarch/train_1/event000001191-truth.csv'],
 '000001287': ['datasets/unarch/train_1/event000001287-cells.csv',
  'datasets/unarch/train_1/event000001287-hits.csv',
  'datasets/unarch/train_1/event000001287-particles.csv',
  'datasets/unarch/train_1/event000001287-truth.csv'],
 '000001343': ['datasets/unarch/train_1/event000001343-cells.csv',
 

In [28]:
all_event_grouped_dataset_filenames = read_dataset_filenames_random(['datasets/unarch/train_1/'])

In [29]:
random_selected_dataset_filenames = random_select_dataset_filenames(all_event_grouped_dataset_filenames, 100)

In [30]:
print(len(random_selected_dataset_filenames))
random_selected_dataset_filenames

100


{'000001014': ['datasets/unarch/train_1/event000001014-cells.csv',
  'datasets/unarch/train_1/event000001014-hits.csv',
  'datasets/unarch/train_1/event000001014-particles.csv',
  'datasets/unarch/train_1/event000001014-truth.csv'],
 '000001039': ['datasets/unarch/train_1/event000001039-cells.csv',
  'datasets/unarch/train_1/event000001039-hits.csv',
  'datasets/unarch/train_1/event000001039-particles.csv',
  'datasets/unarch/train_1/event000001039-truth.csv'],
 '000001044': ['datasets/unarch/train_1/event000001044-cells.csv',
  'datasets/unarch/train_1/event000001044-hits.csv',
  'datasets/unarch/train_1/event000001044-particles.csv',
  'datasets/unarch/train_1/event000001044-truth.csv'],
 '000001122': ['datasets/unarch/train_1/event000001122-cells.csv',
  'datasets/unarch/train_1/event000001122-hits.csv',
  'datasets/unarch/train_1/event000001122-particles.csv',
  'datasets/unarch/train_1/event000001122-truth.csv'],
 '000001127': ['datasets/unarch/train_1/event000001127-cells.csv',
 

In [31]:
def create_event_df(event_id, particles_df, truth_df, cells_df, hits_df):
    #return pd.merge(truth_df, hits_df, on='hit_id')
    truth_hits_df = pd.merge(truth_df, hits_df, on='hit_id')
    particles_truth_hits_df = pd.merge(particles_df, truth_hits_df, on='particle_id', how='right')
    #return particles_truth_hits_df
    return pd.merge(cells_df, particles_truth_hits_df, on='hit_id', how='outer')

In [46]:
def read_dataset_to_grouped_by_event_dfs(selected_dataset_filenames):
    event_dfs = {}
    for event_id, event_filenames in selected_dataset_filenames.items():
        event_dfs[event_id] = create_event_df(
            event_id,
            pd.read_csv(event_filenames[2]),
            pd.read_csv(event_filenames[3]),
            pd.read_csv(event_filenames[0]),
            pd.read_csv(event_filenames[1])
        )
    return event_dfs        

In [49]:
def read_dataset_to_grouped_by_event_dfs(selected_dataset_filenames):
    grouped_by_event_dfs = {}
    for event_id, event_filenames in selected_dataset_filenames.items():
        grouped_by_event_dfs[event_id] = (
            pd.read_csv(event_filenames[2]),
            pd.read_csv(event_filenames[3]),
            pd.read_csv(event_filenames[0]),
            pd.read_csv(event_filenames[1])
        )
    return grouped_by_event_dfs 

In [34]:
def read_dataset_to_event_dfs(grouped_by_event_dfs):
    event_dfs = {}
    for event_id, dfs in grouped_by_event_dfs.items():
        event_dfs[event_id] = create_event_df(
            event_id,
            dfs[0],
            dfs[1],
            dfs[2],
            dfs[3]
        )
    return event_dfs 

In [35]:
all(hits_df['hit_id'] == truth_df['hit_id'])

True

In [36]:
event_df = create_event_df(0, particles_df, truth_df, cells_df, hits_df)

In [37]:
event_df.shape

(664996, 26)

In [50]:
grouped_by_event_datasets_df = read_dataset_to_grouped_by_event_dfs(event_grouped_dataset_filenames)

In [51]:
grouped_by_event_datasets_df.keys()

dict_keys(['000002354', '000001288', '000002231', '000001097', '000002740', '000001821', '000001637', '000001766', '000001686', '000001617', '000001525', '000001070', '000001781', '000001976', '000001038', '000001681', '000002620', '000001219', '000002545', '000001969'])

In [52]:
grouped_by_event_datasets_df['000002231']

(              particle_id          vx          vy           vz        px  \
 0        4503668346847232   -0.036088   -0.005899    -0.589940  1.601750   
 1        4504011944230912   -0.036088   -0.005899    -0.589940 -0.156046   
 2        4504080663707648   -0.036088   -0.005899    -0.589940 -0.473923   
 3        4504149383184384   -0.036088   -0.005899    -0.589940 -0.091837   
 4        4504355541614592   -0.036088   -0.005899    -0.589940 -0.284858   
 5        4504424261091328   -0.036088   -0.005899    -0.589940 -0.777792   
 6        4504492980568064   -0.036088   -0.005899    -0.589940 -0.711079   
 7        4504561700044800   -0.036088   -0.005899    -0.589940  0.195826   
 8        4504699138998272   -0.036088   -0.005899    -0.589940 -0.604959   
 9        4504767858475008   -0.036088   -0.005899    -0.589940 -0.016909   
 10       4504836577951744   -0.036088   -0.005899    -0.589940  0.148628   
 11       4505317614288896   -0.036088   -0.005899    -0.589940 -0.744794   

In [53]:
event_dfs = read_dataset_to_event_dfs(grouped_by_event_datasets_df)

In [54]:
event_dfs['000002231']

Unnamed: 0,hit_id,ch0,ch1,value,particle_id,vx,vy,vz,px,py,...,tpx,tpy,tpz,weight,x,y,z,volume_id,layer_id,module_id
0,1,132,839,0.262129,234200306043322368,0.005541,0.006762,-0.514969,-0.166157,-0.031275,...,-0.159125,0.013682,-3.204160,0.000007,-77.1231,-4.289230,-1502.5,7,2,1
1,2,79,844,0.326944,653034659071918080,-0.013272,-0.013146,14.754400,-0.705561,-0.038511,...,-0.707325,0.008192,-13.793500,0.000009,-77.6114,-1.669470,-1502.5,7,2,1
2,3,164,22,0.275527,184653150999805952,0.022183,-0.003298,5.989040,-0.196029,-0.020165,...,0.201429,-0.009293,-4.411550,0.000006,-31.1830,-2.278610,-1502.5,7,2,1
3,4,262,594,0.329261,135111218636521472,-0.013593,0.017873,0.188585,-0.640876,-0.077797,...,-0.634443,-0.120796,-15.246300,0.000012,-62.8743,-9.687930,-1502.5,7,2,1
4,5,312,1030,0.284308,49542138521714688,-0.002136,-0.004747,6.877430,-0.711064,-0.137113,...,-0.724587,-0.090109,-12.406800,0.000011,-87.1276,-14.104400,-1502.5,7,2,1
5,6,6,1132,0.286151,135111630953381888,-0.013593,0.017873,0.188585,-0.827635,0.037548,...,-0.822005,-0.023945,-13.187200,0.000011,-94.0511,0.697985,-1502.5,7,2,1
6,6,6,1133,0.017867,135111630953381888,-0.013593,0.017873,0.188585,-0.827635,0.037548,...,-0.822005,-0.023945,-13.187200,0.000011,-94.0511,0.697985,-1502.5,7,2,1
7,7,233,579,0.315824,423338502411780096,-0.002551,0.032404,-2.134870,-0.177247,-0.004932,...,-0.169220,-0.040929,-4.230970,0.000008,-62.1470,-8.176200,-1502.5,7,2,1
8,8,325,914,0.297094,265722204900032512,-0.012797,0.005356,6.561480,-0.800249,-0.166847,...,-0.814606,-0.117859,-15.099200,0.000010,-80.5717,-14.240500,-1502.5,7,2,1
9,9,189,525,0.272044,126108898464628736,0.001974,-0.020161,2.071310,-0.359383,-0.014962,...,-0.345777,-0.049168,-8.986070,0.000008,-59.2914,-5.744660,-1502.5,7,2,1
