In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import math

from trackml.dataset import load_event, load_dataset
from trackml.score import score_event

from multiprocessing import Pool

from itertools import product

def create_one_event_submission(event_id, hits, labels):
    """Build the per-event submission table expected by the scorer.

    Parameters
    ----------
    event_id : identifier repeated on every row.
    hits : DataFrame with a ``hit_id`` column; one output row per hit.
    labels : sequence with one track label per hit, in the same order as ``hits``.

    Returns
    -------
    pandas.DataFrame with integer columns ``event_id``, ``hit_id``, ``track_id``.
    """
    event_column = [event_id] * len(hits)
    hit_column = hits.hit_id.values
    stacked = np.column_stack((event_column, hit_column, labels))
    frame = pd.DataFrame(data=stacked, columns=["event_id", "hit_id", "track_id"])
    return frame.astype(int)

In [2]:
# Directory that holds the training events; adjust to your local data layout.
path_to_train = "./data/train_1"

# File-name prefix of a single event that lives in the train_1 directory.
event_prefix = "event000001000"

In [3]:
# Candidate per-feature weight vectors. Each vector has six entries that scale,
# in order, the standardized clustering features
# ['sina1', 'cosa1', 'z1', 'z2', 'z3', 'x1'] used by Clusterer._init.
# They are listed in numeric order so that row N of `weights_arr` is visibly
# `weights_N`.
weights_0 = np.array([1.1, 1.1, .375, .25, 0.05, 0.05])
weights_1 = np.array([1.08, 1.08, .38, .25, 0.0001, 0.0001])
weights_2 = np.array([1.05, 1.05, .37, .25, .01, .045])
weights_3 = np.array([1.08, 1.08, .38, .25, 0.009, 0.0001])
weights_4 = np.array([1.1, 1.1, .5, .25, 0.008, 0.001])
weights_5 = np.array([1.01, 1.01, .4048, .2, 2e-16, 2e-16])
weights_6 = np.array([1.08, 1.08, .38, .25, 2e-4, 0])
weights_7 = np.array([1.08, 1.08, .40, .20, 0.0023, 0])
weights_8 = np.array([1.07, 1.07, .37, .25, 0.0002, 0.0001])
weights_9 = np.array([1.02, 1.02, .38, .25, 0.009, 0.02])
weights_10 = np.array([1.02, 1.02, .42, .24, 0.0002, 0])
weights_11 = np.array([1.08, 1.08, .40, .20, 0.0023, 0.0001])
weights_12 = np.array([1.01, 1.01, .42, .25, 0.0023, 0.001])
weights_13 = np.array([1.0, 1.0, .40, .20, 0.0023, 0.0001])
weights_14 = np.array([1.02, 1.02, .40, .24, 0.0002, 0])
weights_15 = np.array([1.01, 1.01, .42, .25, 0.0002, 0])
weights_16 = np.array([1.005, 1.005, .40, .25, 0.0002, 0])
weights_17 = np.array([1.02, 1.02, .42, .24, 0.02, 0])
weights_18 = np.array([1.033, 1.033, .376, .2, 0.0002, 0])

# Stack into a (19, 6) lookup table; the row index is the `weight` argument of Clusterer.
weights_arr = np.vstack([
    weights_0, weights_1, weights_2, weights_3, weights_4,
    weights_5, weights_6, weights_7, weights_8, weights_9,
    weights_10, weights_11, weights_12, weights_13, weights_14,
    weights_15, weights_16, weights_17, weights_18,
])

In [9]:
from sklearn.preprocessing import StandardScaler
import hdbscan
from scipy import stats
from tqdm import tqdm_notebook as tqdm
from sklearn.cluster import DBSCAN, AgglomerativeClustering

class Clusterer(object):
    """Track clusterer: repeated DBSCAN passes plus an optional HDBSCAN clean-up.

    `_init` runs DBSCAN `num_loops` times on a feature set that changes with the
    loop index and lets a hit move to a newly found cluster when that cluster is
    larger than its current one (but smaller than `max_size`). When
    `final_cluster` is true, `predict` then dissolves badly fitting / oddly
    sized clusters and re-clusters the freed hits with HDBSCAN.

    Label convention used throughout: 0 means "unassigned"; real clusters have
    labels >= 1.

    Parameters
    ----------
    rz_scales : three multipliers applied column-wise to the standardized
        (x2, y2, z2) features in `_preprocess`.
    eps : DBSCAN `eps` for the first loop.
    dz0, stepdz : control the alternating z shift / angle correction per loop.
    stepeps : amount added to `eps` per loop.
    num_loops : number of DBSCAN passes.
    final_cluster : run outlier elimination + HDBSCAN after the DBSCAN passes.
    weight : row index into `weight_arr`.
    weight_arr : 2-D array whose rows are six per-feature weights.
    dz_incr : stored on the instance; not read by any method of this class.
    max_size : exclusive upper bound on the size of a cluster that may take
        over hits from an existing cluster.
    """

    def __init__(self, rz_scales=[0.65, 0.965, 1.528], eps=0.0035, dz0=0.000300,
                 stepdz=0.000050, stepeps=1.000000e-09, num_loops=155,
                 final_cluster=True, weight=10, weight_arr=weights_arr,
                 dz_incr=False, max_size=19):
        # Copy so instances never alias the shared default list.
        self.rz_scales = list(rz_scales)
        self.epsilon = eps
        self.dz0 = dz0
        self.stepdz = stepdz
        self.stepeps = stepeps
        self.num_loops = num_loops
        self.final_cluster = final_cluster
        self.weights = weight
        self.weight_arr = weight_arr
        self.dz_incr = dz_incr
        self.max_size = max_size

    def _eliminate_outliers(self, labels, M):
        """Reset to 0 (unassigned) every cluster that fits poorly or has an odd size.

        For each non-zero label the quadric-fit residual of its rows in `M` and
        its hit count are computed. A cluster is dissolved when its residual
        exceeds five times the 95th percentile of all residuals, or when it has
        more than 25 or fewer than 6 hits. `self.clusters` is modified in place.
        """
        n_labels = len(labels)
        if n_labels == 0:
            # Nothing to inspect; np.percentile would fail on an empty array.
            return
        labels = np.asarray(labels)
        norms = np.zeros(n_labels, np.float32)
        sizes = np.zeros(n_labels, np.float32)
        for i, cluster in tqdm(enumerate(labels), total=n_labels):
            if cluster == 0:
                # 0 marks unassigned hits, not a real cluster.
                continue
            index = np.flatnonzero(self.clusters == cluster)
            sizes[i] = len(index)
            norms[i] = self._test_quadric(M[index])
        threshold1 = np.percentile(norms, 95) * 5
        threshold2 = 25
        threshold3 = 6
        rejected = (norms > threshold1) | (sizes > threshold2) | (sizes < threshold3)
        # One vectorized pass instead of one full scan of self.clusters per label.
        self.clusters[np.isin(self.clusters, labels[rejected])] = 0

    def _test_quadric(self, x):
        """Return the residual of the best quadric surface through the rows of `x`.

        The first three columns of `x` are expanded into the ten monomials of a
        general quadric (x^2, 2xy, 2xz, 2x, y^2, 2yz, 2y, z^2, 2z, 1). The right
        singular vector belonging to the smallest singular value of that design
        matrix is taken as the coefficient vector, and the squared L2 norm of
        the design matrix times that vector is returned. Returns 0 for empty or
        non-2-D input.
        """
        if x.size == 0 or len(x.shape) < 2:
            return 0
        Z = np.zeros((x.shape[0], 10), np.float32)
        Z[:, 0] = x[:, 0]**2
        Z[:, 1] = 2*x[:, 0]*x[:, 1]
        Z[:, 2] = 2*x[:, 0]*x[:, 2]
        Z[:, 3] = 2*x[:, 0]
        Z[:, 4] = x[:, 1]**2
        Z[:, 5] = 2*x[:, 1]*x[:, 2]
        Z[:, 6] = 2*x[:, 1]
        Z[:, 7] = x[:, 2]**2
        Z[:, 8] = 2*x[:, 2]
        Z[:, 9] = 1
        v, s, t = np.linalg.svd(Z, full_matrices=False)
        smallest_index = np.argmin(np.array(s))
        T = np.array(t)[smallest_index, :]
        norm = np.linalg.norm(np.dot(Z, T), ord=2)**2
        return norm

    def _preprocess(self, hits):
        """Return standardized, rescaled direction features for every hit.

        Adds columns `x2 = x/r3`, `y2 = y/r3` (r3 = 3-D distance from the origin)
        and `z2 = z/rt` (rt = distance from the z axis) to `hits`, standardizes
        them with `StandardScaler` and multiplies column i by `rz_scales[i]`.
        Note that `hits` is modified in place.
        """
        x = hits.x.values
        y = hits.y.values
        z = hits.z.values

        r = np.sqrt(x**2 + y**2 + z**2)
        hits['x2'] = x/r
        hits['y2'] = y/r

        r = np.sqrt(x**2 + y**2)
        hits['z2'] = z/r

        ss = StandardScaler()
        X = ss.fit_transform(hits[['x2', 'y2', 'z2']].values)
        for i, rz_scale in enumerate(self.rz_scales):
            X[:, i] = X[:, i] * rz_scale

        return X

    def _init(self, dfh):
        """Run the DBSCAN passes and return one cluster label (>= 1) per hit.

        `dfh` needs columns `hit_id`, `x`, `y`, `z`; helper columns are written
        into it in place. Returns a fresh 1-D integer array (not a view of
        `dfh`), so callers may modify it freely. With `num_loops == 0` the hit
        ids themselves are returned as labels.
        """
        dfh['s1'] = dfh.hit_id
        dfh['N1'] = 1
        dfh['stepped_z'] = dfh.z
        mm = 1
        dz0 = self.dz0

        # Loop-invariant quantities: x and y never change between passes.
        dfh['rt'] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2)
        dfh['a0'] = np.arctan2(dfh['y'].values, dfh['x'].values)
        cx = self.weight_arr[self.weights]

        for ii in range(self.num_loops):
            # These features use the z shift that was set at the end of the
            # previous pass (no shift on the first pass).
            dfh['r'] = np.sqrt(dfh['x'].values**2 + dfh['y'].values**2 + dfh['stepped_z'].values**2)
            dfh['z1'] = dfh['stepped_z'].values/dfh['rt'].values
            dfh['z2'] = dfh['stepped_z']/dfh['r']
            dfh['z3'] = 1/dfh['z1'].values
            mm = mm*(-1)  # alternate the sign of the shift every pass

            z_step = mm * dz0 * ii
            dfh['stepped_z'] = dfh['z'] + z_step
            dz = mm*(dz0 + ii*self.stepdz)

            dfh['a1'] = dfh['a0'].values*(dfh['rt'] + dz*dfh['rt']**2)/1000*(ii/2)/180*math.pi
            dfh['x1'] = dfh['a1'].values/dfh['z1'].values
            dfh['sina1'] = np.sin(dfh['a1'].values)
            dfh['cosa1'] = np.cos(dfh['a1'].values)
            ss = StandardScaler()
            dfs = ss.fit_transform(dfh[['sina1', 'cosa1', 'z1', 'z2', 'z3', 'x1']].values)
            dfs = np.multiply(dfs, cx)

            # min_samples=1 means every hit lands in some cluster (labels 0..k-1).
            clusters = DBSCAN(eps=self.epsilon + (ii*self.stepeps), min_samples=1, metric='euclidean', n_jobs=1).fit(dfs).labels_

            if ii == 0:
                # Shift by one so that label 0 stays reserved for "unassigned";
                # otherwise the first DBSCAN cluster is later mistaken for
                # unassigned hits by _eliminate_outliers / predict.
                dfh['s1'] = clusters + 1
                dfh['N1'] = dfh.groupby('s1')['s1'].transform('count')
            else:
                dfh['s2'] = clusters
                dfh['N2'] = dfh.groupby('s2')['s2'].transform('count')
                maxs1 = dfh['s1'].max()

                # A hit moves to its new cluster when that cluster is larger
                # than the current one but still smaller than max_size.
                n1 = dfh['N1'].values
                n2 = dfh['N2'].values
                takeover = (n2 > n1) & (n2 < self.max_size)

                # Copy: Series.values may be a read-only view (pandas copy-on-write).
                s1 = dfh['s1'].values.copy()
                # "+ 1" keeps new label 0 from colliding with existing label maxs1.
                s1[takeover] = dfh['s2'].values[takeover] + maxs1 + 1

                dfh['s1'] = s1
                dfh['s1'] = dfh['s1'].astype('int64')
                dfh['N1'] = dfh.groupby('s1')['s1'].transform('count')

        # Return a writable copy; predict() edits the labels in place.
        return dfh['s1'].values.copy()

    def predict(self, hits):
        """Cluster `hits` and return one integer label per hit.

        `hits` is modified in place (helper columns are added). When
        `final_cluster` is true, clusters rejected by `_eliminate_outliers` are
        re-clustered with HDBSCAN; hits that HDBSCAN marks as noise keep label 0.
        """
        self.clusters = self._init(hits)

        if self.final_cluster:
            X = self._preprocess(hits)

            min_cluster_size = 7
            cl = hdbscan.HDBSCAN(min_samples=1, min_cluster_size=min_cluster_size, metric='braycurtis', cluster_selection_method='leaf', algorithm='best', leaf_size=50)

            labels = np.unique(self.clusters)
            self._eliminate_outliers(labels, X)

            # Single re-clustering pass over the hits that are now unassigned.
            mask = self.clusters == 0
            # With fewer hits than min_cluster_size no cluster can form, so the
            # HDBSCAN call is skipped and those hits simply stay unassigned.
            if mask.sum() >= min_cluster_size:
                max_len = np.max(self.clusters)
                new_labels = cl.fit_predict(X[mask])
                # HDBSCAN reports noise as -1 and clusters as 0..k-1. Keep noise
                # at 0 and shift clusters past the current maximum so that they
                # cannot merge into existing clusters.
                self.clusters[mask] = np.where(new_labels < 0, 0, new_labels + max_len + 1)

        return self.clusters

In [10]:
# Load the table of earlier grid-search results. The same file name is written
# back by the persistence cells further down, so it must already exist here.
# NOTE(review): unpickling executes arbitrary code; only load a file you produced yourself.
results_df = pd.read_pickle("gs_results.pkl")
# results_df  # uncomment to inspect the loaded table

In [11]:
# Hyper-parameter grid for the Clusterer. A list with one entry pins that
# parameter; a list with several entries is swept.
# Notes kept from earlier runs:
#   defaults: eps=0.0035, dz0 = -0.00070, stepdz = 0.00001, stepeps = 0.000005
#   0.0035, 0.0005, 5e-05, 1e-09, 150, True, 10, True
eps_vals = [0.0035]           # other candidates noted: 0.0035, 0.00355
dz0_vals = [0.000300]
stepdz_vals = [0.000050]      # other candidates noted: 4e-6, 5e-6
stepeps_vals = [1e-9]         # other candidate noted: 0
loop_vals = [155, 153, 175]   # other candidates noted: 155, 160
weight_vals = [10]
final_cluster = [True, False]
dz_incr = [True]
size_vals = [19, 20, 21]
step = "eps"

# Cartesian product of all settings. The position of each value in the tuple
# must match the unpacking order used in grid_search_loop.
foo = product(
    eps_vals,
    dz0_vals,
    stepdz_vals,
    stepeps_vals,
    loop_vals,
    final_cluster,
    weight_vals,
    dz_incr,
    size_vals,
)

# Continue row numbering right after the highest index already in results_df.
offset = results_df.index.values.max() + 1

# One task per combination: [row index in results_df, parameter tuple].
iter_list = [[offset + i, item] for i, item in enumerate(foo)]

print("Length:", len(iter_list))

Length: 18


In [12]:
def grid_search_loop(params):
    """Score one hyper-parameter combination and return its result row.

    Parameters
    ----------
    params : two-element sequence ``[counter, combo]`` where ``counter`` is the
        row index for the results table and ``combo`` is the tuple
        ``(eps, dz0, stepdz, stepeps, loops, final_cluster, weights, dz_incr,
        max_size)``.

    Returns
    -------
    list ``[counter, [eps, dz0, stepdz, stepeps, loops, step, weights,
    final_cluster, max_size, mean_score]]``. ``dz_incr`` is passed to the
    Clusterer but is not part of the returned row. ``mean_score`` is NaN when
    the dataset loader yields no event.
    """
    counter = params[0]
    eps, dz0, stepdz, stepeps, loops, final_cluster, weights, dz_incr, max_size = params[1]
    step = "eps"
    dataset_scores = []
    # final_cluster is part of the message so that runs differing only in that
    # flag can be told apart in the (interleaved) worker output.
    print("Evaluating Counter:", counter, "Eps:", eps, "dz0:", dz0, "stepdz:", stepdz, "stepeps:", stepeps, "loops:", loops, "final cluster:", final_cluster, "weight:", weights, "max size:", max_size)

    # Read from the notebook-level `path_to_train` setting instead of repeating
    # the literal path, so changing the config cell actually takes effect.
    # skip/nevents are forwarded to load_dataset (presumably: skip the first 50
    # events, then read a single one — confirm against the trackml docs).
    for event_id, hits, cells, particles, truth in load_dataset(path_to_train, skip=50, nevents=1):
        # Track pattern recognition
        model = Clusterer(eps=eps, dz0=dz0, stepdz=stepdz, stepeps=stepeps, num_loops=loops, final_cluster=final_cluster, weight=weights, weight_arr=weights_arr, dz_incr=dz_incr, max_size=max_size)
        labels = model.predict(hits)

        # Prepare submission for an event
        one_submission = create_one_event_submission(event_id, hits, labels)

        # Score for the event
        score = score_event(truth, one_submission)
        dataset_scores.append(score)
        print(event_id, params[1], "score:", score)

    # np.mean([]) would emit a RuntimeWarning; make the "no events" case explicit.
    mean_score = np.mean(dataset_scores) if dataset_scores else np.nan

    # create and return our results
    result = [counter, [eps, dz0, stepdz, stepeps, loops, step, weights, final_cluster, max_size, mean_score]]
    print(result)

    return result

def grid_search(iter_list, results_df=None, start=0, end=5, processes=14):
    """Evaluate ``iter_list[start:end]`` in parallel with ``grid_search_loop``.

    Parameters
    ----------
    iter_list : list of ``[counter, parameter tuple]`` tasks.
    results_df : accepted for call compatibility; not used here.
    start, end : slice bounds into ``iter_list`` (``end`` may exceed its length).
    processes : maximum number of worker processes (integer, default 14).

    Returns
    -------
    list with one ``grid_search_loop`` result per task, in task order; an empty
    list when the slice selects nothing.
    """
    tasks = iter_list[start:end]
    if not tasks:
        # Nothing to do: avoid spawning workers at all.
        return []

    # Never start more workers than there are tasks.
    n_workers = max(1, min(processes, len(tasks)))

    # The context manager tears the pool down even if a worker raises,
    # so no processes are leaked.
    with Pool(processes=n_workers) as pool:
        results = pool.map(grid_search_loop, tasks)

    return results

In [None]:
# Run the sweep in parallel. The end bound may exceed len(iter_list);
# list slicing inside grid_search clamps it.
results = grid_search(iter_list, results_df, start=0, end=28)

# Every result is [row index, row values]; write each row into the results table.
for row_id, row_values in results:
    results_df.loc[row_id] = row_values

# Persist the table ordered by "acc" (best first), then display it.
ranked = results_df.sort_values("acc", ascending=False)
ranked.to_pickle("gs_results.pkl")
results_df

Evaluating Counter: 767 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 155 weight: 10 max size: 19
Evaluating Counter: 769 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 155 weight: 10 max size: 21
Evaluating Counter: 768 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 155 weight: 10 max size: 20
Evaluating Counter: 774 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 153 weight: 10 max size: 20
Evaluating Counter: 770 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 155 weight: 10 max size: 19
Evaluating Counter: 773 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 153 weight: 10 max size: 19
Evaluating Counter: 772 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 155 weight: 10 max size: 21
Evaluating Counter: 778 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 153 weight: 10 max size: 21
Evaluating Counter: 776 Eps: 0.0035 dz0: 0.0003 stepdz: 5e-05 stepeps: 1e-09 loops: 153 weight: 10 max s

In [23]:
# Show the results table ordered by the "acc" column, highest value first.
results_df.sort_values("acc", ascending=False)

Unnamed: 0,eps,dz0,stepdz,stepeps,loops,acc
404,0.00375,-0.0007,1e-05,3e-06,135.0,0.480309
402,0.00375,-0.0007,1e-05,3e-06,125.0,0.478326
386,0.00375,-0.0007,1e-05,5e-06,125.0,0.478138
383,0.00375,-0.00075,1e-05,5e-06,125.0,0.478138
380,0.00375,-0.00072,1e-05,5e-06,125.0,0.478138
405,0.00375,-0.0007,1e-05,4e-06,125.0,0.478088
403,0.00375,-0.0007,1e-05,3e-06,120.0,0.477423
387,0.00375,-0.0007,1e-05,5e-06,120.0,0.477136
384,0.00375,-0.00075,1e-05,5e-06,120.0,0.477136
381,0.00375,-0.00072,1e-05,5e-06,120.0,0.477136


In [12]:
# Write the results table, ordered by "acc" (highest first), back to the pickle
# file that the loading cell reads.
results_df.sort_values("acc", ascending=False).to_pickle("gs_results.pkl")