In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import lightgbm as lgb
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

In [2]:
def get_training_sample(path_to_data, event_names):
    """Load several events and merge their hits into one training frame.

    Each event is read with ``load_event``. The ``particle_id`` values of its
    truth table are re-encoded as consecutive integer track ids that continue
    from one event to the next, so ids never collide across events.

    Parameters
    ----------
    path_to_data : str
        Directory containing the event files.
    event_names : iterable of str
        Event prefixes to load (joined onto ``path_to_data``).

    Returns
    -------
    pandas.DataFrame
        Hits of every successfully loaded event stacked row-wise (per-event
        indices are kept as they are), with an added ``particle_id`` column
        holding the re-encoded track id.

    Raises
    ------
    ValueError
        If none of the requested events could be loaded.
    """
    events = []
    track_id = 0  # first unused track id; keeps growing across events

    for name in tqdm(event_names):
        # Best effort: a broken event is reported and skipped. Only Exception
        # is caught so KeyboardInterrupt / SystemExit still stop the loop.
        try:
            # Read an event
            hits, cells, particles, truth = load_event(os.path.join(path_to_data, name))

            # np.unique yields the sorted distinct particle ids and, for each
            # row, the position of its id in that sorted list. Offsetting by
            # track_id makes the ids unique across all events.
            # Assumes truth rows are in the same order as hits rows.
            unique_pids, local_ids = np.unique(truth.particle_id.values, return_inverse=True)
            hits['particle_id'] = local_ids + track_id
            track_id += len(unique_pids)

            # Collect hits
            events.append(hits)
        except Exception as exc:
            print("Error with", name, "-", repr(exc))
            continue

    if not events:
        # pd.concat([]) would raise a much less helpful ValueError here.
        raise ValueError(
            "No event could be loaded from {!r}; nothing to concatenate".format(path_to_data)
        )

    # Put all hits into one sample with unique track ids
    return pd.concat(events, axis=0)

def create_one_event_submission(event_id, hits, labels):
    """Build the submission table for a single event.

    Parameters
    ----------
    event_id : int
        Identifier written into every row of the result.
    hits : pandas.DataFrame
        Hits of the event; only its ``hit_id`` column and its length are used.
    labels : array-like
        One track label per hit, in the same order as ``hits``.

    Returns
    -------
    pandas.DataFrame
        Integer-typed frame with columns ``event_id``, ``hit_id``, ``track_id``.
    """
    n_hits = len(hits)
    event_column = [event_id] * n_hits
    hit_column = hits.hit_id.values

    # Stack the three columns side by side, then cast the whole table to int
    stacked = np.column_stack((event_column, hit_column, labels))
    frame = pd.DataFrame(data=stacked, columns=["event_id", "hit_id", "track_id"])
    return frame.astype(int)

In [3]:
# Directory that holds the training events; adjust it to your local data layout.
path_to_train = "data/train_1"

# Prefix of one event located in train_1.
# NOTE(review): no other cell visible in this notebook reads event_prefix — confirm it is still needed.
event_prefix = "event000001000"

In [20]:
# Training set: n_train_samples consecutive events beginning at start_event_id
start_event_id = 1000
n_train_samples = 2

# Event prefixes are "event0000" followed by the zero-padded, 5-digit event number
train_event_names = list(
    map("event0000{:05d}".format, range(start_event_id, start_event_id + n_train_samples))
)

train_data = get_training_sample(path_to_train, train_event_names)
print("train_data:", train_data.shape)

100%|██████████| 2/2 [00:01<00:00,  1.79it/s]

train_data: (214619, 8)





In [21]:
# Pre-processing: scale every hit position to unit length, then standardise the result
hits = train_data  # same object, so the columns added below also appear on train_data
x, y, z = (hits[axis].values for axis in ("x", "y", "z"))

# Distance of each hit from the origin
r = np.sqrt(np.square(x) + np.square(y) + np.square(z))
hits['x2'], hits['y2'], hits['z2'] = x / r, y / r, z / r

# Feature matrix: the three normalised coordinates, standardised per column
ss = StandardScaler()
X = ss.fit_transform(hits[['x2', 'y2', 'z2']].values)

# y is rebound here: from now on it holds the track labels, not the y coordinate
y = hits['particle_id'].values

In [22]:
# Number of distinct track labels the classifier would have to separate
print("Num Classes:", np.unique(y).size)

Num Classes: 18266


In [23]:
# Multiclass gradient boosting with one class per track label in y.
# NOTE(review): the recorded output of this cell is LightGBMError: b'std::bad_alloc', with
# 18266 classes and 214619 training hits reported by the earlier cells. The fit runs out of
# memory at this label count; presumably the number of classes (or the training sample) has
# to be reduced before this cell can complete — confirm against the LightGBM documentation.
lgbc = LGBMClassifier(num_leaves=25, objective="multiclass", silent=False, min_split_gain=1e-1, n_estimators=5)
lgbc.fit(X, y)

LightGBMError: b'std::bad_alloc'

In [None]:
# Held-out event for evaluation. With the "+ 1" the index lands one past the event that
# directly follows the training range, so that event is neither trained on nor scored here.
path_to_event = os.path.join(
    path_to_train,
    "event0000{:05d}".format(start_event_id + n_train_samples + 1),
)
hits, cells, particles, truth = load_event(path_to_event)

In [None]:
%%time
# Warning: it takes about 30s per one event
labels = lgb_model1.predict(hits)

In [None]:
# Wrap the predicted labels into a submission table (event_id is passed as 0) and score it
submission = create_one_event_submission(event_id=0, hits=hits, labels=labels)
score = score_event(truth, submission)
print("Your score: ", score)

In [None]:
# Score the fitted classifier on several events and report the mean score
dataset_submissions = []
dataset_scores = []

for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=500, nevents=5):

    # Track pattern recognition.
    # Rebuild the features exactly as for training (unit-length coordinates passed through
    # the already-fitted scaler `ss`) and predict with `lgbc`, the estimator fitted above.
    xyz = hits[['x', 'y', 'z']].values
    features = ss.transform(xyz / np.linalg.norm(xyz, axis=1, keepdims=True))
    labels = lgbc.predict(features)

    # Prepare submission for an event
    one_submission = create_one_event_submission(event_id, hits, labels)
    dataset_submissions.append(one_submission)

    # Score for the event
    score = score_event(truth, one_submission)
    dataset_scores.append(score)

    print("Score for event %d: %.3f" % (event_id, score))

print('Mean score: %.3f' % (np.mean(dataset_scores)))