import numpy as np
from scipy.spatial import KDTree
from impyute.ops import matrix
from impyute.ops import wrapper
from impyute.ops import inverse_distance_weighting as idw
from . import mean
# pylint: disable=too-many-arguments
@wrapper.wrappers
@wrapper.checks
def fast_knn(data, k=3, eps=0, p=2, distance_upper_bound=np.inf, leafsize=10,
idw_fn=idw.shepards, init_impute_fn=mean):
""" Impute using a variant of the nearest neighbours approach
Basic idea: Impute array with a passed in initial impute fn (mean impute)
and then use the resulting complete array to construct a KDTree. Use this
KDTree to compute nearest neighbours. After finding `k` nearest
neighbours, take the weighted average of them. Basically, find the nearest
row in terms of distance
This approach is much, much faster than the other implementation (fit+transform
for each subset) which is almost prohibitively expensive.
Parameters
----------
    data: numpy.ndarray
        2D matrix to impute.
    k: int, optional
        Parameter passed when querying the KDTree. Number of neighbours
        used in the KNN query. Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    eps: nonnegative float, optional
        Parameter passed when querying the KDTree. From the SciPy docs:
        "Return approximate nearest neighbors; the kth returned value is
        guaranteed to be no further than (1+eps) times the distance to
        the real kth nearest neighbor". Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    p: float, 1<=p<=infinity, optional
        Parameter passed when querying the KDTree. Straight from the
        SciPy docs: "Which Minkowski p-norm to use. 1 is the
        sum-of-absolute-values 'Manhattan' distance. 2 is the usual
        Euclidean distance. infinity is the maximum-coordinate-difference
        distance". Refer to the docs for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    distance_upper_bound: nonnegative float, optional
        Parameter passed when querying the KDTree. Straight from the
        SciPy docs: "Return only neighbors within this distance. This is
        used to prune tree searches, so if you are doing a series of
        nearest-neighbor queries, it may help to supply the distance to
        the nearest neighbor of the most recent point". Refer to the docs
        for
        [`scipy.spatial.KDTree.query`]
        (https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.KDTree.query.html).
    leafsize: int, optional
        Parameter used when constructing the KDTree. Straight from the
        SciPy docs: "The number of points at which the algorithm switches
        over to brute-force. Has to be positive". Refer to the docs for
        [`scipy.spatial.KDTree`](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.KDTree.html)
        for more information.
    idw_fn: fn, optional
        Function that takes a list of distances and returns the
        corresponding normalized weights. You can define a custom one, or
        derive one from the functions in
        `impy.util.inverse_distance_weighting` using `functools.partial`,
        for example:
        `functools.partial(impy.util.inverse_distance_weighting.shepards, power=1)`.
    init_impute_fn: fn, optional
        Function used for the initial imputation pass. It takes the
        incomplete 2D matrix and returns a fully imputed copy, which the
        KDTree is then built from (defaults to mean imputation).

    Returns
    -------
    numpy.ndarray
        Imputed data.

    Examples
    --------
    >>> data = np.arange(25).reshape((5, 5)).astype(float)
    >>> data[0][2] = np.nan
    >>> data
    array([[ 0.,  1., nan,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])
    >> fast_knn(data, k=1)  # Weighted average (by distance) of nearest 1 neighbour
    array([[ 0.,  1.,  7.,  3.,  4.],
           [ 5.,  6.,  7.,  8.,  9.],
           [10., 11., 12., 13., 14.],
           [15., 16., 17., 18., 19.],
           [20., 21., 22., 23., 24.]])
    >> fast_knn(data, k=2)  # Weighted average of nearest 2 neighbours
    array([[ 0.        ,  1.        , 10.08608891,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])
    >> fast_knn(data, k=3)
    array([[ 0.        ,  1.        , 13.40249283,  3.        ,  4.        ],
           [ 5.        ,  6.        ,  7.        ,  8.        ,  9.        ],
           [10.        , 11.        , 12.        , 13.        , 14.        ],
           [15.        , 16.        , 17.        , 18.        , 19.        ],
           [20.        , 21.        , 22.        , 23.        , 24.        ]])
    >> fast_knn(data, k=5)  # Only 4 other rows exist, so k=5 raises an error
    ...
    IndexError: index 5 is out of bounds for axis 0 with size 5
"""
nan_xy = matrix.nan_indices(data)
data_c = init_impute_fn(data)
kdtree = KDTree(data_c, leafsize=leafsize)
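    # Note: neighbours are found in the *initially imputed* matrix, so the
    # choice of init_impute_fn influences which rows count as "nearest".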
for x_i, y_i in nan_xy:
distances, indices = kdtree.query(data_c[x_i], k=k+1, eps=eps,
p=p, distance_upper_bound=distance_upper_bound)
# Will always return itself in the first index. Delete it.
distances, indices = distances[1:], indices[1:]
# Add small constant to distances to avoid division by 0
distances += 1e-3
weights = idw_fn(distances)
# Assign missing value the weighted average of `k` nearest neighbours
data[x_i][y_i] = np.dot(weights, [data_c[ind][y_i] for ind in indices])
return data
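
# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the library API: passing a custom
# weighting function through `idw_fn`, as the docstring suggests with
# `functools.partial`. `idw.shepards` and its `power` keyword come from the
# import at the top of this module; the array shape, seed, and missingness
# rate are made up for the demo. Because of the relative import above, run
# this with the module's package context (e.g. via `python -m`).
if __name__ == "__main__":
    import functools

    rng = np.random.default_rng(0)
    demo = rng.normal(size=(50, 4))
    demo[rng.random(demo.shape) < 0.1] = np.nan  # knock out ~10% of entries

    # Library-default weighting (idw.shepards)
    filled_default = fast_knn(demo.copy(), k=3)

    # Flatter weighting: bind power=1 with functools.partial
    flat_idw = functools.partial(idw.shepards, power=1)
    filled_flat = fast_knn(demo.copy(), k=3, idw_fn=flat_idw)

    # Both runs should leave no missing values behind
    assert not np.isnan(filled_default).any()
    assert not np.isnan(filled_flat).any()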