In [101]:
from sklearn.neighbors import KDTree
import pandas as pd
import numpy as np
import timeit
from scipy.spatial import distance
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.metrics import pairwise_distances_argmin

# KDTree

In [162]:
# Query point used by every search below: a single (x, y, z) row, kept 2-D
# because the neighbour-search APIs expect an array of query rows.
pred_hit = np.array([[13.0, -28.0, 12.78]])

In [13]:
# Load the hit coordinates; the CSV has no header row, so column names are supplied here.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
bag = pd.read_csv(
    "/data/track-ml/bracis/datasets/bag_test.bag.csv",
    header=None,
    names=["x", "y", "z"],
)

In [97]:
# Display the loaded hits table (columns x, y, z as named in read_csv).
bag

Unnamed: 0,x,y,z
0,13.880900,-28.452400,12.782100
1,31.948000,-64.647202,29.190100
2,51.642899,-102.942001,46.637100
3,78.363403,-153.121994,69.608597
4,119.526001,-226.973999,103.599998
...,...,...,...
299995,238.214996,-271.295990,-10.200000
299996,317.265991,-392.298004,-10.800000
299997,394.248993,-533.853027,-11.400000
299998,457.937012,-680.596985,-16.200001


In [98]:
# Independent copy of the hits for the KD-tree experiment, so `bag` itself stays untouched.
bag_kdtree = bag.copy(deep=True)

In [244]:
def kdtree(X=bag_kdtree, pred_hit=pred_hit, k=5, silent=True, leaf_size=3):
    """Build a KD-tree over ``X`` and query the ``k`` nearest neighbours of ``pred_hit``.

    The tree is rebuilt on every call, so timing this function measures
    construction plus query.

    Note: the defaults for ``X`` and ``pred_hit`` are bound when this cell is
    executed; re-run the cell after rebinding ``bag_kdtree`` or ``pred_hit``.

    Parameters
    ----------
    X : array-like of shape (n_points, n_dims)
        Points to index.
    pred_hit : array-like of shape (n_queries, n_dims)
        Query point(s).
    k : int
        Number of neighbours to return per query row.
    silent : bool
        When falsy, print the neighbour indices followed by their distances.
    leaf_size : int
        Passed through to ``KDTree``.

    Returns
    -------
    (dist, ind) : tuple of ndarrays, each of shape (n_queries, k)
        Distances to, and row positions of, the ``k`` nearest neighbours.
        Returned so callers no longer depend on leaked global state.
    """
    tree = KDTree(X, leaf_size=leaf_size)
    dist, ind = tree.query(pred_hit, k=k)
    if not silent:
        print(ind)  # indices of the k closest neighbours
        print(dist)  # distances to the k closest neighbours
    return dist, ind

In [239]:
# Run the KD-tree search verbosely so the neighbour indices and distances are printed.
kdtree(X=bag_kdtree, pred_hit=pred_hit, silent=False)

[[277180  40000 224700  60420      0]]
[[0.86324439 0.91806988 0.96723332 0.98760924 0.99028069]]


In [142]:
# Inspect the row at position 277180, the first index printed by the KD-tree query.
bag.iloc[277180]

x    12.8435
y   -28.8151
z    13.0173
Name: 277180, dtype: float64

In [143]:
# Total wall time for n_runs calls of kdtree() with its default arguments
# (each call rebuilds the tree, so construction cost is included).
n_runs = 100
print(timeit.Timer(kdtree).timeit(number=n_runs))

24.927744856104255


In [55]:
# First row of X as a 2-D array.
# NOTE(review): `X` is not assigned by any earlier visible cell — presumably the hits frame; confirm.
X.iloc[:1].values

array([[ 13.88090038, -28.45240021,  12.78209972]])

In [56]:
# Collect row 0 of the neighbour-index array into a plain Python list.
# NOTE(review): `ind` is not assigned at notebook level by any visible cell —
# presumably left over from an earlier KDTree query; confirm on a fresh kernel.
list_indexes = [ind[0, col] for col in range(ind.shape[1])]
list_indexes

[277180, 40000, 224700, 60420, 0, 31300, 131490, 23860, 142390, 94450]

In [58]:
# Show the rows of X at the neighbour positions collected in list_indexes.
X.iloc[list_indexes]

Unnamed: 0,x,y,z
277180,12.8435,-28.8151,13.0173
40000,12.784,-28.835899,12.4678
224700,13.2135,-28.685699,12.1321
60420,13.7393,-28.5019,12.3594
0,13.8809,-28.4524,12.7821
31300,12.5594,-28.9144,12.8116
131490,13.0785,-28.732901,13.6404
23860,12.465,-28.947399,12.2908
142390,13.0785,-28.732901,11.793
94450,12.3706,-28.9804,13.3609


In [62]:
# Recompute the Euclidean distance from pred_hit to each listed neighbour by hand,
# to compare against the distances reported by the tree query.
px, py, pz = pred_hit[0]
for idx in list_indexes:
    dx = X.loc[idx, "x"] - px
    dy = X.loc[idx, "y"] - py
    dz = X.loc[idx, "z"] - pz
    print(np.sqrt(dx**2 + dy**2 + dz**2))

0.8632443913689571
0.9180698826076227
0.9672333155721109
0.9876092426954545
0.9902806881716024
1.0155071616955493
1.1329578898619017
1.1929402807827183
1.2318579865310324
1.301834755079246


In [59]:
# Print the neighbour distances for comparison with the hand-computed values.
# NOTE(review): `dist` is not assigned at notebook level by any earlier visible cell —
# presumably left over from an earlier KDTree query; confirm on a fresh kernel.
print(dist)

[[0.86324439 0.91806988 0.96723332 0.98760924 0.99028069 1.01550716
  1.13295789 1.19294028 1.23185799 1.30183476]]


# Sort and min

In [104]:
# Independent copy of the hits for the brute-force (distance + argmin) experiment.
bag_dist_min = bag.copy(deep=True)

In [122]:
def dist_min_serial(X=bag_dist_min, pred_hit=pred_hit, k=5, silent=True):
    """Brute-force nearest neighbour: squared distance to every row of ``X``, then argmin.

    Note: the defaults for ``X`` and ``pred_hit`` are bound when this cell is
    executed; re-run the cell after rebinding those globals.

    Parameters
    ----------
    X : DataFrame or array-like of shape (n_points, n_dims)
        Candidate points.
    pred_hit : array-like of shape (n_queries, n_dims)
        Query point(s); only the first row is searched.
    k : int
        Accepted for signature parity with ``kdtree``; unused here.
    silent : bool
        When falsy, print the winning position and the matching row of ``X``.

    Returns
    -------
    int
        Row position in ``X`` of the point closest to ``pred_hit[0]``.
    """
    dist = distance.cdist(X, pred_hit, 'sqeuclidean')
    # Column 0 holds the distances to the first query row.
    ind = int(np.argmin(dist[:, 0]))
    if not silent:
        print(ind)
        # Look the row up in the data that was actually searched, not the global `bag`.
        print(X.iloc[ind, :] if hasattr(X, "iloc") else np.asarray(X)[ind])
    return ind

In [130]:
def dist_min_parallel(X=bag_dist_min, pred_hit=pred_hit, k=5, silent=True):
    """Nearest neighbour of ``pred_hit`` in ``X`` via ``pairwise_distances_argmin``.

    ``pairwise_distances_argmin(A, B)`` returns, for each row of ``A``, the
    index of the closest row of ``B``. The query must therefore be the first
    argument and the candidates the second. With the arguments the other way
    round and a single query row, every entry is 0 and the result is always
    position 0.

    Note: the defaults for ``X`` and ``pred_hit`` are bound when this cell is
    executed; re-run the cell after rebinding those globals.

    Parameters
    ----------
    X : DataFrame or array-like of shape (n_points, n_dims)
        Candidate points.
    pred_hit : array-like of shape (n_queries, n_dims)
        Query point(s); only the result for the first row is used.
    k : int
        Accepted for signature parity with ``kdtree``; unused here.
    silent : bool
        When falsy, print the winning position and the matching row of ``X``.

    Returns
    -------
    int
        Row position in ``X`` of the point closest to ``pred_hit[0]``.
    """
    nearest = pairwise_distances_argmin(pred_hit, X, metric='sqeuclidean')
    ind = int(nearest[0])
    if not silent:
        print(ind)
        # Look the row up in the data that was actually searched, not the global `bag`.
        print(X.iloc[ind, :] if hasattr(X, "iloc") else np.asarray(X)[ind])
    return ind

In [188]:
# Run the brute-force search with its default inputs and print the winning index and row.
dist_min_serial(silent=False)

277180
x    12.8435
y   -28.8151
z    13.0173
Name: 277180, dtype: float64


In [137]:
# Total wall time for n_runs calls of dist_min_serial() with its default arguments.
n_runs = 100
print(timeit.Timer(dist_min_serial).timeit(number=n_runs))

0.2630734834820032


In [138]:
# Total wall time for n_runs calls of dist_min_parallel() with its default arguments.
n_runs = 100
print(timeit.Timer(dist_min_parallel).timeit(number=n_runs))

0.6601543668657541


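# DBSCAN (preparation only: candidate pre-selection by squared distance to the origin; no DBSCAN clustering is run in this section)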

In [173]:
# Reference point for the radial pre-filter: the coordinate origin, as a single query row.
origin_hit = np.zeros((1, 3), dtype=int)

# Squared Euclidean distance of every hit from the origin (one column, one row per hit).
# NOTE: this rebinds the notebook-level name `dist`.
bag_tb_ordered = bag.copy(deep=True)
dist = distance.cdist(XA=bag_tb_ordered, XB=origin_hit, metric='sqeuclidean')

In [174]:
# Attach the squared distance-to-origin as a new column (flattened from its single-column 2-D form).
bag_tb_ordered["dist"] = dist.ravel()

In [175]:
# Display the hits together with their new "dist" column.
bag_tb_ordered

Unnamed: 0,x,y,z,dist
0,13.880900,-28.452400,12.782100,1.165601e+03
1,31.948000,-64.647202,29.190100,6.051997e+03
2,51.642899,-102.942001,46.637100,1.543906e+04
3,78.363403,-153.121994,69.608597,3.443252e+04
4,119.526001,-226.973999,103.599998,7.653662e+04
...,...,...,...,...
299995,238.214996,-271.295990,-10.200000,1.304519e+05
299996,317.265991,-392.298004,-10.800000,2.546721e+05
299997,394.248993,-533.853027,-11.400000,4.405613e+05
299998,457.937012,-680.596985,-16.200001,6.731810e+05


In [180]:
# Order the hits by ascending squared distance from the origin.
bag_ordered = bag_tb_ordered.sort_values("dist")

In [186]:
# Squared distance of the predicted hit from the origin (same metric as the "dist" column).
dist_pred = distance.cdist(XA=pred_hit, XB=origin_hit, metric='sqeuclidean')
dist_pred

array([[1116.3284]])

In [228]:
# Keep the hits whose squared distance to the origin is closest to that of pred_hit,
# then restore the original row order.
n_candidates = 10000  # size of the candidate shell
# Take the target from dist_pred rather than a hand-copied literal (was 1116.3284),
# so the filter stays correct when pred_hit changes.
target_dist = dist_pred[0, 0]
bag_nearest = bag_ordered.iloc[(bag_ordered['dist'] - target_dist).abs().argsort()[:n_candidates]].sort_index()
bag_nearest
# Row 277180 is the nearest neighbour found by the earlier searches; the next cell
# checks that it survives this pre-filter.

Unnamed: 0,x,y,z,dist
0,13.880900,-28.452400,12.78210,1165.600546
50,-32.281700,-2.928450,10.48750,1160.671644
80,3.086110,31.811100,-10.08080,1123.092689
130,-11.432900,-30.844400,3.86562,1097.031266
140,-4.485150,31.737400,8.75024,1103.945837
...,...,...,...,...
299780,31.540400,-7.886530,-10.37530,1164.641017
299870,-23.190500,21.440901,12.12060,1144.420468
299930,9.976400,30.926600,6.43096,1097.340370
299950,-25.031000,21.619101,-3.35938,1105.221911


In [229]:
# Does the known nearest neighbour (row label 277180) survive the pre-filter?
277180 in bag_nearest.index


True

In [238]:

# Inspect a positional slice of the candidates (positions 9250 to 9299).
bag_nearest.iloc[9250:9300]


Unnamed: 0,x,y,z,dist
277020,15.9527,27.211901,9.6165,1087.453237
277030,-21.390301,-23.990299,-10.3834,1140.894418
277070,-4.66945,-31.7218,-10.3227,1134.634473
277090,24.0963,20.9608,-10.9508,1139.906843
277120,2.89847,31.3258,10.6441,1103.003739
277150,-4.744,-31.7176,-6.34337,1068.750019
277170,-2.61714,-31.837601,-10.9269,1139.879383
277180,12.8435,-28.8151,13.0173,1164.715558
277200,17.9307,-26.139,11.8725,1145.713597
277290,-28.4513,14.5213,-10.1834,1124.046254


In [None]:
# NOTE(review): this re-defines dist_min_serial from the "Sort and min" section and
# silently shadows it when executed; keep a single definition once the notebook is tidied.
def dist_min_serial(X=bag_dist_min, pred_hit=pred_hit, k=5, silent=True):
    """Brute-force nearest neighbour: squared distance to every row of ``X``, then argmin.

    Note: the defaults for ``X`` and ``pred_hit`` are bound when this cell is
    executed; re-run the cell after rebinding those globals.

    Parameters
    ----------
    X : DataFrame or array-like of shape (n_points, n_dims)
        Candidate points.
    pred_hit : array-like of shape (n_queries, n_dims)
        Query point(s); only the first row is searched.
    k : int
        Accepted but unused.
    silent : bool
        When falsy, print the winning position and the matching row of ``X``.

    Returns
    -------
    int
        Row position in ``X`` of the point closest to ``pred_hit[0]``.
    """
    dist = distance.cdist(X, pred_hit, 'sqeuclidean')
    # Column 0 holds the distances to the first query row.
    ind = int(np.argmin(dist[:, 0]))
    if not silent:
        print(ind)
        # Look the row up in the data that was actually searched, not the global `bag`.
        print(X.iloc[ind, :] if hasattr(X, "iloc") else np.asarray(X)[ind])
    return ind

# Nearest Centroid Classifier

In [247]:
# Reload the hits (no header row in the CSV) for the nearest-centroid experiment.
# NOTE(review): same absolute path as the earlier load — consider a shared, configurable path.
data = pd.read_csv(
    "/data/track-ml/bracis/datasets/bag_test.bag.csv",
    header=None,
    names=["x", "y", "z"],
)

origin_hit = np.zeros((1, 3), dtype=int)

# Squared distance of each hit from the origin, stored as an extra column.
# NOTE: this rebinds the notebook-level name `dist`.
dist = distance.cdist(XA=data, XB=origin_hit, metric='sqeuclidean')
data["dist"] = dist.ravel()

In [277]:
# NOTE(review): these imports sit mid-notebook; consider moving them to the import cell at the top.
from sklearn.neighbors import NearestCentroid
import numpy as np
from sklearn import preprocessing
from sklearn import utils

# Features are every column but the last; the last column is the target.
# Assumes `data` still has the columns x, y, z, dist built in the previous cell.
X, y = data.iloc[:,:-1],data.iloc[:,-1]

# Encode each distinct target value as an integer class label so the classifier accepts it.
lab_enc = preprocessing.LabelEncoder()
y = lab_enc.fit_transform(y)

clf = NearestCentroid()
clf.fit(X, y)
# The stray `NearestCentroid()` statement (a pasted output repr that built and discarded
# an unused estimator) has been removed.
# Prints the encoded class label predicted for pred_hit.
print(clf.predict(pred_hit))

[20479]


In [278]:
# Display the integer-encoded labels produced by LabelEncoder in the previous cell.
y

array([ 20538,  54038,  82889, ..., 214710, 240835, 274211])