In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KDTree as skKNN
from cuml import KNN as cumlKNN
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    """Reusable wall-clock timer, usable directly or as a context manager.

    After ``stop()`` (or on leaving a ``with`` block) the elapsed time in
    seconds is available as ``interval``. ``start_time`` and ``end`` hold the
    raw clock readings taken by ``start()`` and ``stop()``.
    """

    def __init__(self):
        self._timer = default_timer
        self.start_time = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        self.stop()

    def start(self):
        """Start the timer."""
        # Keep the reading under its own name: assigning to ``self.start``
        # would shadow this method and make the instance single-use.
        self.start_time = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if ``start()`` has not been called yet.
        """
        if self.start_time is None:
            raise RuntimeError('Timer.stop() called before Timer.start()')
        self.end = self._timer()
        self.interval = self.end - self.start_time

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz',source='mortgage'):
    """Build a pandas DataFrame with columns ``fea0 .. fea<k-1>``.

    If ``cached`` exists and ``source == 'mortgage'``, the gzip-compressed
    ``.npy`` array is loaded, ``nrows`` rows are drawn at random (with
    replacement) and the first ``ncols`` columns are kept. Otherwise an
    ``nrows x ncols`` float32 array of uniform random values is generated.

    Args:
        nrows: number of rows to return.
        ncols: number of feature columns to keep / generate.
        cached: path of the gzip-compressed ``.npy`` file.
        source: the cached file is only used when this equals ``'mortgage'``.

    Returns:
        pandas.DataFrame with missing values replaced by 0.
    """
    if os.path.exists(cached) and source=='mortgage':
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive, so use shape[0] (not shape[0]-1):
        # otherwise the last row can never be drawn and a one-row file fails.
        row_ids = np.random.randint(0, X.shape[0], nrows)
        X = X[row_ids, :ncols]
    else:
        print('use random data')
        X = np.random.random((nrows,ncols)).astype('float32')
    columns = ['fea%d' % i for i in range(X.shape[1])]
    return pd.DataFrame(X, columns=columns).fillna(0)

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=1e-2,with_sign=True,metric='mse'):
    """Return whether ``a`` and ``b`` are approximately equal.

    Both inputs are first passed through ``to_nparray``.

    Args:
        a, b: the two results to compare.
        threshold: the comparison passes when the error is strictly below it.
        with_sign: when false, absolute values are compared.
        metric: ``'mse'`` uses ``mean_squared_error``; any other value uses
            the fraction of element-wise mismatches.

    Returns:
        Truthy when the computed error is below ``threshold``.
    """
    a = to_nparray(a)
    b = to_nparray(b)
    if not with_sign:
        a,b = np.abs(a),np.abs(b)
    if metric=='mse':
        error = mean_squared_error(a,b)
    else:
        # Mean of the mismatch mask is the mismatch fraction for any rank;
        # dividing by shape[0]*shape[1] only worked for 2-D inputs.
        error = np.mean(a != b)
    return error < threshold

def to_nparray(x):
    """Convert supported containers to a NumPy array.

    NumPy arrays and pandas DataFrames are copied via ``np.array``; a
    ``np.float64`` scalar becomes a one-element array; cudf DataFrames and
    Series go through ``to_pandas().values``. Anything else is returned
    unchanged.
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        return x.to_pandas().values
    return x

# Run tests

In [5]:
%%time
nrows = 10
ncols = 2

X = load_data(nrows,ncols)
print('data',X.shape)

use mortgage data
data (15, 20)
CPU times: user 5.72 s, sys: 620 ms, total: 6.34 s
Wall time: 6.34 s


In [6]:
# Print the summary of the frame returned by load_data.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 2 columns):
fea0    15 non-null float32
fea1    15 non-null float32
dtypes: float32(2)
memory usage: 200.0 bytes


In [6]:
# Number of neighbours passed to both the scikit-learn and the cuML query calls.
n_neighbors = 2

In [8]:
%%time
knn_sk = skKNN(X)
D_sk,I_sk = knn_sk.query(X,n_neighbors)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 932 µs


In [7]:
%%time
X = cudf.DataFrame.from_pandas(X)

CPU times: user 1.53 s, sys: 992 ms, total: 2.52 s
Wall time: 2.98 s


In [None]:
%%time
# Create the cuML KNN estimator with default arguments and fit it on X.
# NOTE(review): X is presumably the cudf DataFrame produced by the
# conversion cell — confirm the cells are run in order.
knn_cuml = cumlKNN()
knn_cuml.fit(X)

In [10]:
# cuML's query returns (distances, indices), the same order as the
# scikit-learn KDTree query, so unpack distances first. Unpacking them the
# other way round would compare distances against indices further down.
D_cuml, I_cuml = knn_cuml.query(X, n_neighbors)

NameError: name 'knn_cuml' is not defined

In [12]:
# Show the cuML distances from position 10 onward.
# NOTE(review): with nrows = 10 this slice is empty if D_cuml has one row
# per input sample — confirm the intended range.
print(D_cuml[10:])

       
10  0.0
11  0.0
12  0.0
13  0.0
14  0.0
15  0.0
16  0.0
17  0.0
18  0.0
19  0.0


In [27]:
# Distances: does cuML agree with scikit-learn (array_equal defaults)?
passed = array_equal(D_sk, D_cuml)
message = 'compare knn: cuml vs sklearn distances %s' % ('NOT equal' if not passed else 'equal')
print(message)

# Neighbour indexes: same check.
passed = array_equal(I_sk, I_cuml)
message = 'compare knn: cuml vs sklearn indexes %s' % ('NOT equal' if not passed else 'equal')
print(message)

ValueError: Found input variables with inconsistent numbers of samples: [10, 20]