In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from trackml.dataset import load_event
from sklearn import cluster, preprocessing
import glob
import os
from tqdm import tqdm

from trackml.score import score_event

In [2]:
def _event_prefixes(pattern):
    """Return the sorted, de-duplicated prefixes of the paths matching ``pattern``.

    The prefix of a path is everything before its first '-' character.
    """
    matched_paths = sorted(glob.glob(pattern))
    return np.unique([path.split('-')[0] for path in matched_paths])


# Event path prefixes of the training and test sets
train = _event_prefixes('./data/train_1/**')
test = _event_prefixes('./data/test/**')

# Detector description and the sample submission table
det = pd.read_csv('./data/detectors.csv')
sub = pd.read_csv('./data/sample_submission.csv')

# Report how many entries each input has
for label, table in (("Train:", train), ("Test:", test),
                     ("Detectors:", det), ("Sample Submission:", sub)):
    print(label, len(table))

Train: 1770
Test: 125
Detectors: 18728
Sample Submission: 13741466


In [3]:
def create_one_event_submission(event_id, hits, labels):
    """Assemble the submission rows of a single event.

    Parameters
    ----------
    event_id
        Value written into the ``event_id`` column of every row.
    hits
        Table exposing ``hit_id.values`` and ``len()``; one row per hit.
    labels
        One track label per row of ``hits``.

    Returns
    -------
    pandas.DataFrame
        Columns ``event_id``, ``hit_id`` and ``track_id``, all cast to int.
    """
    # One copy of the event id per hit
    event_column = np.full(len(hits), event_id)

    # Put the three columns side by side, then name them
    stacked = np.column_stack((event_column, hits.hit_id.values, labels))
    frame = pd.DataFrame(stacked, columns=["event_id", "hit_id", "track_id"])
    return frame.astype(int)

def get_training_sample(path_to_data, event_names):
    """Load several events and stack their hits into one training table.

    Each event is read with ``load_event``.  The ``particle_id`` values of its
    truth table are re-encoded as consecutive integer track ids; the counter
    keeps running across events, so ids of different events never collide.
    The re-encoded ids are stored in a new ``particle_id`` column of the
    event's hits table.

    Parameters
    ----------
    path_to_data : str
        Directory that is joined with each event name to form the event path.
    event_names : iterable of str
        Names of the events to load.

    Returns
    -------
    pandas.DataFrame
        Hits of every successfully processed event, concatenated row-wise
        (the per-event row indices are kept as they are).

    Raises
    ------
    ValueError
        If not a single event could be processed.
    """
    events = []
    track_id = 0  # next unused track id; keeps counting across events

    for name in tqdm(event_names):
        # Best effort: an event that cannot be read or labelled is reported
        # and skipped.  Only Exception is caught, so KeyboardInterrupt and
        # SystemExit still stop the loop instead of being swallowed.
        try:
            # Read an event
            hits, cells, particles, truth = load_event(os.path.join(path_to_data, name))

            # np.unique returns the sorted distinct particle ids and, for each
            # row, the position of its id in that sorted list.  Offsetting the
            # positions by track_id yields the consecutive ids in one step.
            unique_pids, pid_index = np.unique(truth.particle_id.values, return_inverse=True)
            hits['particle_id'] = track_id + pid_index
            track_id += len(unique_pids)

            # Collect hits
            events.append(hits)
        except Exception as err:
            # Show the cause as well, so a skipped event can be diagnosed
            print("Error with", name, "-", repr(err))
            continue

    if not events:
        raise ValueError("No events could be loaded from {!r}".format(path_to_data))

    # Put all hits into one sample with unique track ids
    return pd.concat(events, axis=0)

In [26]:
# Change this according to your directory preferred setting
path_to_train = "data/train_1"

# This event is in Train_1
event_prefix = "event000001000"

In [28]:
# Load n_train_samples consecutive events, beginning with start_event_id
n_train_samples = 10
start_event_id = 1000

# Event names are "event0000" followed by the zero-padded, 5-digit event number
event_numbers = range(start_event_id, start_event_id + n_train_samples)
train_event_names = list(map("event0000{:05d}".format, event_numbers))

train_data = get_training_sample(path_to_train, train_event_names)
print("train_data:", train_data.shape)

 30%|███       | 6/20 [00:03<00:08,  1.67it/s]

Error with event000001006


100%|██████████| 20/20 [00:11<00:00,  1.71it/s]

train_data: (2169884, 8)





In [7]:
# Show the first five rows of the combined training table
train_data.head(n=5)

Unnamed: 0,hit_id,x,y,z,volume_id,layer_id,module_id,particle_id
0,1,-64.409897,-7.1637,-1502.5,7,2,1,0
1,2,-55.336102,0.635342,-1502.5,7,2,1,477
2,3,-83.830498,-1.14301,-1502.5,7,2,1,0
3,4,-96.1091,-8.24103,-1502.5,7,2,1,3556
4,5,-62.673599,-9.3712,-1502.5,7,2,1,4811


In [22]:
# Build one labelled table from the first five training events.
scl = preprocessing.StandardScaler()
dbscan = cluster.DBSCAN(eps=0.00515, min_samples=1, algorithm='kd_tree', n_jobs=-1)
events = []
track_id = 0  # running counter, so track ids stay unique across events

for e in train[:5]:
    hits, cells, particles, truth = load_event(e)
    hits['event_id'] = int(e[-9:])  # last nine characters of the event path prefix

    # Average the cell readings of every hit.  The column subset has to be a
    # list: selecting several groupby columns with a bare tuple of names
    # (``[...]['ch0', 'ch1', 'value']``) is deprecated and rejected by
    # current pandas.
    cells = cells.groupby(by=['hit_id'])[['ch0', 'ch1', 'value']].agg(['mean']).reset_index()
    cells.columns = ['hit_id', 'ch0', 'ch1', 'value']  # flatten the two-level header left by agg(['mean'])
    hits = pd.merge(hits, cells, how='left', on='hit_id')
    # Every column except the two id columns (not used further down in this cell)
    col = [c for c in hits.columns if c not in ['event_id', 'hit_id']]

    # Generate new vector of particle id
    particle_ids = truth.particle_id.values
    particle2track = {}
    for pid in np.unique(particle_ids):
        particle2track[pid] = track_id
        track_id += 1
    hits['particle_id'] = [particle2track[pid] for pid in particle_ids]

    #https://www.kaggle.com/mikhailhushchyn/dbscan-benchmark
    # Overwrite x, y with the coordinates divided by the 3-D distance from the
    # origin, and z with z divided by the distance in the x-y plane.
    x = hits.x.values
    y = hits.y.values
    z = hits.z.values
    r = np.sqrt(x**2 + y**2 + z**2)
    hits['x'] = x/r
    hits['y'] = y/r
    r = np.sqrt(x**2 + y**2)
    hits['z'] = z/r

    events.append(hits)

events = pd.concat(events, axis=0)
y = events.particle_id.values
# NOTE(review): X keeps every remaining column, including hit_id and event_id
# -- confirm that all of them are meant to be clustering features.
X = events.drop("particle_id",axis=1)


In [23]:
# Fit the clusterer on the feature table; the fitted estimator is the cell output.
# NOTE(review): scikit-learn documents the `y` argument of DBSCAN.fit as ignored -- confirm.
dbscan.fit(X, y=y)

DBSCAN(algorithm='kd_tree', eps=0.00515, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=1, n_jobs=-1, p=None)

In [30]:
# Event number start_event_id + n_train_samples + 1, formatted like the training event names
event_name = "event0000{:05d}".format(start_event_id + n_train_samples + 1)
path_to_event = os.path.join(path_to_train, event_name)
hits, cells, particles, truth = load_event(path_to_event)

# Warning: it takes about 30s per one event
# NOTE(review): the hits table is passed as loaded, with all of its columns
# -- confirm this is the intended feature set.
labels = dbscan.fit_predict(hits)

In [None]:
# The x2 / y2 / z2 columns have to exist before they can be selected; derive
# them here with the same formulas the test-set cell uses: x and y divided by
# the 3-D distance from the origin, z divided by the distance in the x-y plane.
hit_x, hit_y, hit_z = hits.x.values, hits.y.values, hits.z.values
r_xyz = np.sqrt(hit_x**2 + hit_y**2 + hit_z**2)
r_xy = np.sqrt(hit_x**2 + hit_y**2)
hits['x2'] = hit_x / r_xyz
hits['y2'] = hit_y / r_xyz
hits['z2'] = hit_z / r_xy

# Standardise the three features, cluster them, and store the cluster label of every hit
hits['particle_id'] = dbscan.fit_predict(scl.fit_transform(hits[['x2', 'y2', 'z2']].values))

In [3]:
# Cluster the hits of every test event independently and collect the labels.
scl = preprocessing.StandardScaler()
dbscan = cluster.DBSCAN(eps=0.00515, min_samples=1, algorithm='kd_tree', n_jobs=-1)
df_test = []
for e in test:
    hits, cells = load_event(e, parts=['hits', 'cells'])
    hits['event_id'] = int(e[-9:])  # last nine characters of the event path prefix

    # Average the cell readings of every hit.  The column subset has to be a
    # list: selecting several groupby columns with a bare tuple of names
    # (``[...]['ch0', 'ch1', 'value']``) is deprecated and rejected by
    # current pandas.
    cells = cells.groupby(by=['hit_id'])[['ch0', 'ch1', 'value']].agg(['mean']).reset_index()
    cells.columns = ['hit_id', 'ch0', 'ch1', 'value']  # flatten the two-level header left by agg(['mean'])
    hits = pd.merge(hits, cells, how='left', on='hit_id')
    # Every column except the id/label columns (not used further down in this cell)
    col = [c for c in hits.columns if c not in ['event_id', 'hit_id', 'particle_id']]

    #https://www.kaggle.com/mikhailhushchyn/dbscan-benchmark
    # x2, y2: coordinates divided by the 3-D distance from the origin;
    # z2: z divided by the distance in the x-y plane.
    x = hits.x.values
    y = hits.y.values
    z = hits.z.values
    r = np.sqrt(x**2 + y**2 + z**2)
    hits['x2'] = x/r
    hits['y2'] = y/r
    r = np.sqrt(x**2 + y**2)
    hits['z2'] = z/r

    # Scaler and clusterer are re-fitted on each event's own three features
    hits['particle_id'] = dbscan.fit_predict(scl.fit_transform(hits[['x2', 'y2', 'z2']].values))

    # Keep only the columns needed for the submission
    df_test.append(hits[['event_id','hit_id','particle_id']].copy())
    print(e, len(hits['particle_id'].unique()))

./data/test/event000000000 77379
./data/test/event000000001 79199
./data/test/event000000002 70258
./data/test/event000000003 71722
./data/test/event000000004 77982
./data/test/event000000005 70888
./data/test/event000000006 71302
./data/test/event000000007 76779
./data/test/event000000008 75379
./data/test/event000000009 74109
./data/test/event000000010 76395
./data/test/event000000011 70509
./data/test/event000000012 68967
./data/test/event000000013 80624
./data/test/event000000014 68400
./data/test/event000000015 61685
./data/test/event000000016 64732
./data/test/event000000017 83900
./data/test/event000000018 65388
./data/test/event000000019 69323
./data/test/event000000020 55907
./data/test/event000000021 70486
./data/test/event000000022 78410
./data/test/event000000023 76052
./data/test/event000000024 76067
./data/test/event000000025 72464
./data/test/event000000026 64874
./data/test/event000000027 66324
./data/test/event000000028 71436
./data/test/event000000029 60388
./data/tes

In [4]:
# Stack the per-event frames into one table with a fresh 0..n-1 index
df_test = pd.concat(df_test, axis=0, ignore_index=True)

# Left join: keep every row of the sample submission and attach the cluster
# label found for the same (event_id, hit_id) pair
sub_new = sub.merge(df_test, on=['event_id', 'hit_id'], how='left')

In [5]:
# Track ids are the cluster labels shifted up by one
sub_new['track_id'] = sub_new['particle_id'].add(1)

# Write only the three submission columns, gzip-compressed, without the index
sub_new.to_csv(
    '20180627_dbscan_submission_04.csv.gz',
    columns=['event_id', 'hit_id', 'track_id'],
    index=False,
    compression='gzip',
)