In [1]:
from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from sklearn.cluster.dbscan_ import dbscan
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import timeit
import multiprocessing
from multiprocessing import Pool

In [2]:
def find_labels(params):
    """Cluster the hits of one event for a single azimuth-shift coefficient.

    Parameters
    ----------
    params : tuple
        ``(hits, dz, eps)`` packed into one argument. ``hits`` must expose
        ``'phi'``, ``'z'`` and ``'zr'`` columns; ``dz`` scales how much the
        azimuth is shifted per unit of ``|z|``; ``eps`` is handed to DBSCAN.

    Returns
    -------
    numpy.ndarray
        The DBSCAN label of every hit, shifted up by one.
    """
    hits, dz, eps = params
    phi = hits['phi'].values
    z = hits['z'].values

    # sign(z) * z is |z|: the azimuth shift grows with the distance from z = 0.
    shifted_phi = phi + np.sign(z) * dz * z

    # Encode the angle as (cos, sin) and add z/r as the third feature.
    features = np.column_stack([
        np.cos(shifted_phi),
        np.sin(shifted_phi),
        hits['zr'].values,
    ])
    scaled = StandardScaler().fit_transform(features)

    # dbscan returns (core_sample_indices, labels); only the labels are used.
    cluster_ids = dbscan(scaled, eps=eps, min_samples=1, n_jobs=-1)[1]
    return cluster_ids + 1

def add_count(l, max_track_size=20):
    """Pair every hit's label with the size of the cluster it belongs to.

    Parameters
    ----------
    l : array-like of int
        One cluster label per hit (1-D).
    max_track_size : int, optional
        Clusters with more members than this get their count reset to 0.
        Defaults to 20, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(l, c)`` where ``l`` is the input object, returned untouched, and
        ``c`` is a new integer array holding, for each hit, the number of
        hits sharing its label. The count is 0 for hits labelled 0 and for
        hits in clusters larger than ``max_track_size``.
    """
    # Compare through an ndarray so a plain list input is also handled
    # element-wise (a bare ``list == 0`` would collapse to a single bool).
    label_arr = np.asarray(l)

    # np.unique always returns (unique, inverse, counts) in this order,
    # whatever the order of the keyword arguments.
    _, inverse, sizes = np.unique(label_arr, return_inverse=True, return_counts=True)

    # Fancy indexing yields a fresh array, so the writes below never touch `sizes`.
    c = sizes[inverse]
    c[label_arr == 0] = 0
    c[c > max_track_size] = 0
    return (l, c)

def do_dbscan_predict(hits, eps=0.0035, processes=14):
    """Predict a track label per hit by merging DBSCAN runs over many shifts.

    ``find_labels`` is run once for every azimuth-shift coefficient
    ``dz = +/- i * 1e-5`` with ``i`` in ``0..59`` (119 runs, ``dz = 0`` only
    once). The runs are merged in that order: a hit switches to the cluster
    from a later run whenever that cluster's count (as computed by
    ``add_count``) is strictly larger than the count it currently holds.

    Parameters
    ----------
    hits : pandas.DataFrame
        Hits of one event with at least ``'x'``, ``'y'`` and ``'z'`` columns.
        Side effect: the columns ``'r'``, ``'zr'`` and ``'phi'`` are added to
        (or overwritten on) this frame.
    eps : float, optional
        DBSCAN neighbourhood radius forwarded to ``find_labels``.
    processes : int, optional
        Number of worker processes in the pool. Defaults to 14, the value
        that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        One integer label per hit.
    """
    start_time = timeit.default_timer()

    # Derived cylindrical coordinates are stored on the caller's frame.
    hits['r'] = np.sqrt(hits['x'] ** 2 + hits['y'] ** 2)
    hits['zr'] = hits['z'] / hits['r']
    hits['phi'] = np.arctan2(hits['y'], hits['x'])

    # find_labels reads only these three columns; sending just them keeps the
    # payload pickled for each of the tasks as small as possible.
    features = hits[['phi', 'z', 'zr']]

    n_steps = 60
    dz_step = 0.00001
    params = []
    for i in range(n_steps):
        dz = i * dz_step
        params.append((features, dz, eps))
        if i > 0:
            # dz == 0 has no mirrored counterpart.
            params.append((features, -dz, eps))

    # The context manager tears the workers down even when a task raises.
    with Pool(processes=processes) as pool:
        labels_for_all_steps = pool.map(find_labels, params)
    results = [add_count(l) for l in labels_for_all_steps]

    labels, counts = results[0]
    for l, c in results[1:]:
        idx = np.where(c - counts > 0)[0]
        # Offset by the current maximum so labels from this run cannot collide
        # with labels that are already assigned.
        labels[idx] = l[idx] + labels.max()
        counts[idx] = c[idx]

    print('time spent:', round(timeit.default_timer() - start_time))

    return labels

def create_one_event_submission(event_id, hits, labels):
    """Assemble the three-column submission table for a single event.

    Parameters
    ----------
    event_id : int
        Identifier repeated on every row.
    hits : sequence
        Hit identifiers, one per row.
    labels : sequence
        Track label for each hit, in the same order as ``hits``.

    Returns
    -------
    pandas.DataFrame
        Integer-typed frame with columns ``event_id``, ``hit_id`` and
        ``track_id``.
    """
    event_column = [event_id] * len(hits)
    table = np.column_stack((event_column, hits, labels))
    frame = pd.DataFrame(data=table, columns=["event_id", "hit_id", "track_id"])
    return frame.astype(int)

def run_dbscan(eps=0.0035, data_dir='./data/train_1', event_ids=None):
    """Score the DBSCAN pipeline on training events and print the results.

    Parameters
    ----------
    eps : float, optional
        DBSCAN neighbourhood radius forwarded to ``do_dbscan_predict``.
    data_dir : str, optional
        Directory holding the training event files. Defaults to the path
        that used to be hard-coded.
    event_ids : sequence of str, optional
        Zero-padded event identifiers to evaluate. Defaults to the four
        events that used to be hard-coded.

    Returns
    -------
    None
        Per-event scores and their mean are printed, not returned.
    """
    if event_ids is None:
        event_ids = ['000001000', '000001010', '000001200', '000001100']

    # Plain running totals; these names no longer shadow the builtin `sum`.
    total_score = 0
    n_events = 0
    for i, event_id in enumerate(event_ids):
        hits, cells, particles, truth = load_event(data_dir + '/event' + event_id)
        labels = do_dbscan_predict(hits, eps=eps)
        # The event id column is a constant 0 here -- presumably score_event
        # only looks at hit_id / track_id; confirm against trackml.score.
        submission = create_one_event_submission(0, hits['hit_id'].values, labels)
        score = score_event(truth, submission)
        print('[%2d] score : %0.8f' % (i, score))
        total_score += score
        n_events += 1

    print('--------------------------------------')
    if n_events == 0:
        # Nothing was evaluated; avoid dividing by zero.
        print("Mean: n/a (no events)")
        return
    print("Mean:", total_score / n_events)

In [4]:
# Evaluate the pipeline on training events, where the truth is available.
EVAL_EPS = 0.0045
print('estimate score by known events')
run_dbscan(eps=EVAL_EPS)

estimate score by known events
time spent: 18
[ 0] score : 0.41206948
time spent: 16
[ 1] score : 0.40140858
time spent: 18
[ 2] score : 0.42780646
time spent: 17
[ 3] score : 0.42559620
--------------------------------------
Mean: 0.4167201787127368


In [9]:
def generate_test_preds(path_to_test="./data/test", eps=0.0047, start=0):
    """Predict track labels for the test events and save one file per event.

    Each processed event is written to ``./<event_id as 9 digits>.helix_baseline.csv.gz``
    in the current directory and also collected in the returned list.

    Parameters
    ----------
    path_to_test : str, optional
        Location of the test dataset passed to ``load_dataset``.
    eps : float, optional
        DBSCAN neighbourhood radius forwarded to ``do_dbscan_predict``.
    start : int, optional
        Events whose id is below this value are skipped.

    Returns
    -------
    list of pandas.DataFrame
        One submission frame per processed event, in iteration order, each
        appearing exactly once.
    """
    test_dataset_submissions = []
    print('process test events')
    for event_id, hits in load_dataset(path_to_test, parts=['hits']):
        if event_id < start:
            continue

        print('Event ID: ', event_id)
        labels = do_dbscan_predict(hits, eps=eps)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits['hit_id'].values, labels)

        # Save each event as soon as it is done, so finished events are
        # already on disk if a later one fails.
        one_submission.to_csv('./%09d.helix_baseline.csv.gz' % event_id, index=False, compression='gzip')

        # Append once only: the previous code appended every frame twice.
        test_dataset_submissions.append(one_submission)

    return test_dataset_submissions

In [10]:
# Run the pipeline over the test events; one gzip CSV per event is written.
TEST_EPS = 0.0046
submissions = generate_test_preds(eps=TEST_EPS, start=0)

process test events
Event ID:  0
time spent: 18
Event ID:  1
time spent: 19
Event ID:  2
time spent: 16
Event ID:  3
time spent: 17
Event ID:  4
time spent: 19
Event ID:  5
time spent: 17
Event ID:  6
time spent: 17
Event ID:  7
time spent: 18
Event ID:  8
time spent: 19
Event ID:  9
time spent: 18
Event ID:  10
time spent: 18
Event ID:  11
time spent: 17
Event ID:  12
time spent: 16
Event ID:  13
time spent: 21
Event ID:  14
time spent: 16
Event ID:  15
time spent: 14
Event ID:  16
time spent: 15
Event ID:  17
time spent: 22
Event ID:  18
time spent: 15
Event ID:  19
time spent: 16
Event ID:  20
time spent: 12
Event ID:  21
time spent: 17
Event ID:  22
time spent: 19
Event ID:  23
time spent: 18
Event ID:  24
time spent: 19
Event ID:  25
time spent: 17
Event ID:  26
time spent: 15
Event ID:  27
time spent: 16
Event ID:  28
time spent: 17
Event ID:  29
time spent: 14
Event ID:  30
time spent: 18
Event ID:  31
time spent: 16
Event ID:  32
time spent: 13
Event ID:  33
time spent: 13
Even

In [11]:
# Read the per-event files back from disk (same name pattern that
# generate_test_preds writes) and merge them into one submission.
event_ids = list(range(125))
submissions = [
    pd.read_csv('./%09d.helix_baseline.csv.gz' % event_id, compression='gzip')
    for event_id in event_ids
]

# Create submission file
submission = pd.concat(submissions, axis=0)
submission.to_csv('20180701_helix_baseline_e_46.csv.gz', index=False, compression='gzip')
print(len(submission))

13741466
