In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN as skDBSCAN
from cuml import DBSCAN as cumlDBSCAN
import cudf
import os

# Helper Functions

In [2]:
from timeit import default_timer

class Timer(object):
    """Reusable stopwatch that can also be used as a context manager.

    Usage::

        with Timer() as t:
            do_work()
        print(t.interval)

    Attributes:
        start_time: clock reading taken by the most recent ``start()``,
            or None if the timer has never been started.
        end: clock reading taken by the most recent ``stop()``, or None.
        interval: ``end - start_time`` in seconds for the most recent
            start/stop pair, or None if ``stop()`` has not run yet.
    """

    def __init__(self):
        self._timer = default_timer
        self.start_time = None
        self.end = None
        self.interval = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Returns None, so an exception raised inside the ``with`` body
        # still propagates after the interval has been recorded.
        self.stop()

    def start(self):
        """Start (or restart) the timer."""
        # The reading is kept in ``start_time`` rather than ``start``:
        # assigning to ``self.start`` would shadow this method on the
        # instance, so a second ``start()`` (or a second ``with`` block on
        # the same object) would fail with "'float' object is not callable".
        self.start_time = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if ``start()`` has not been called yet.
        """
        if self.start_time is None:
            raise RuntimeError('Timer.stop() called before Timer.start()')
        self.end = self._timer()
        self.interval = self.end - self.start_time

In [3]:
import gzip
def load_data(nrows, ncols, cached = 'data/mortgage.npy.gz'):
    """Build a pandas DataFrame with ``nrows`` rows of float features.

    If the gzip-compressed ``.npy`` file ``cached`` exists, ``nrows`` rows are
    drawn from it uniformly at random (with replacement, so duplicates are
    possible) and the first ``ncols`` columns are kept. If the cached array
    has fewer than ``ncols`` columns, the result simply has fewer columns.
    Otherwise a uniform random matrix of shape ``(nrows, ncols)`` is generated.

    Args:
        nrows: number of rows in the returned frame.
        ncols: maximum number of feature columns to keep.
        cached: path to the gzip-compressed ``.npy`` array.

    Returns:
        pandas.DataFrame with columns named ``fea0``, ``fea1``, ...
    """
    if os.path.exists(cached):
        print('use mortgage data')
        with gzip.open(cached) as f:
            X = np.load(f)
        # randint's upper bound is exclusive, so the bound must be the row
        # count itself. Using ``X.shape[0]-1`` never sampled the last row and
        # raised ValueError for a single-row file.
        row_idx = np.random.randint(0, X.shape[0], nrows)
        X = X[row_idx, :ncols]
    else:
        print('use random data')
        X = np.random.rand(nrows,ncols)
    df = pd.DataFrame({'fea%d'%i:X[:,i] for i in range(X.shape[1])})
    return df

In [4]:
from sklearn.metrics import mean_squared_error
def array_equal(a,b,threshold=5e-3,with_sign=True):
    """Report whether two array-likes agree up to a mean-squared-error bound.

    Both inputs are first passed through ``to_nparray``. When ``with_sign``
    equals False, absolute values are compared instead of the raw values.

    Returns:
        True when ``mean_squared_error`` of the two arrays is strictly below
        ``threshold``, otherwise False.
    """
    lhs, rhs = to_nparray(a), to_nparray(b)
    # Deliberately an equality test against False (not ``not with_sign``),
    # so values such as None still leave the signs untouched.
    if with_sign == False:
        lhs = np.abs(lhs)
        rhs = np.abs(rhs)
    return mean_squared_error(lhs, rhs) < threshold

def to_nparray(x):
    """Convert the supported container types to a NumPy array.

    * ``np.ndarray`` / ``pd.DataFrame`` -> ``np.array(x)``
    * ``np.float64`` scalar             -> one-element array
    * ``cudf.DataFrame`` / ``cudf.Series`` -> values of the pandas conversion
    * anything else is returned unchanged
    """
    if isinstance(x, (np.ndarray, pd.DataFrame)):
        return np.array(x)
    if isinstance(x, np.float64):
        return np.array([x])
    if isinstance(x, (cudf.DataFrame, cudf.Series)):
        host_copy = x.to_pandas()
        return host_copy.values
    return x

# Run tests

In [21]:
%%time
# Size of the benchmark input: number of samples and number of feature columns.
nrows = 1000
ncols = 128

# load_data returns a pandas DataFrame; it prints which data source it used.
X = load_data(nrows,ncols)
print('data',X.shape)

use mortgage data
data (1000, 128)
CPU times: user 5.78 s, sys: 708 ms, total: 6.48 s
Wall time: 6.48 s


In [22]:
# DBSCAN hyper-parameters, passed unchanged to both the scikit-learn and the
# cuML constructor so the two runs are configured identically.
eps = 0.3
min_samples = 2

In [23]:
%%time
# CPU reference run: fit scikit-learn's DBSCAN on X.
# NOTE(review): X must still be the pandas frame here; a later cell rebinds X.
clustering_sk = skDBSCAN(eps = eps, min_samples = min_samples)
clustering_sk.fit(X)

CPU times: user 200 ms, sys: 0 ns, total: 200 ms
Wall time: 198 ms


In [24]:
%%time
# Convert the pandas frame into a cuDF DataFrame for the cuML run.
# NOTE(review): this rebinds X, so the cell is order-dependent. After it runs,
# X is no longer the pandas frame, and re-running the scikit-learn cell (or
# this cell) would operate on the cuDF frame. A separate name would avoid that.
X = cudf.DataFrame.from_pandas(X)

CPU times: user 480 ms, sys: 0 ns, total: 480 ms
Wall time: 476 ms


In [None]:
%%time
# GPU run: fit cuML's DBSCAN with the same eps / min_samples as the CPU run.
# Expects X to be the cuDF DataFrame produced by the conversion cell.
clustering_cuml = cumlDBSCAN(eps = eps, min_samples = min_samples)
clustering_cuml.fit(X)

In [18]:
# scikit-learn labels; entries equal to 0 are filtered out before printing.
l = clustering_sk.labels_
print(l[l != 0])

[-1  1 -1  1  2 -1  3  4  5  6 -1  7  8  9 10 -1 -1 -1 11 -1 -1 -1  9 -1
  9 -1 12 12 13 14 15 -1 12 16  4 17  9  6 12  2 -1 13 -1 17 12 -1 -1 18
 16 -1 19 20  4  1 21 17 16 -1  6  8  9 17 20 20 -1 20 22 -1  9 23  2 -1
  5 -1 -1  6 -1 -1  4 -1  2 13 -1 22 -1 14 -1 24 -1 -1 -1 -1  6 25 -1 17
  6 -1 24 -1 10 10 -1  2 19 26  5  1  9 -1 20  4 26 -1  9 19 -1 -1 -1  4
 -1 18 -1 -1 -1 25 -1 -1 19 -1 -1 12 -1 27  4 -1 -1 17 -1 25  9 17 28 29
 19 15 30 -1 31  1 -1 32 33 -1 25 -1 -1 -1 20 34 -1 31 -1 28 -1 32 -1 -1
 30 -1 -1 -1 27 -1 33 26 -1 -1 -1  7  9 -1 35  4  1 -1  3  4 11 36 -1 12
 -1 -1 37  2 23 -1  5 -1 23 -1  9 -1 26 -1 -1 38 27 -1 -1 -1  9 10 36  4
 26 25 -1 39 37 -1 -1  9 -1 -1 12  7 39 -1 25 -1  7 -1 15 -1  7 21 -1 -1
 -1 -1 34  2 19  8 -1 38 -1 35  2 29 -1 10  6 30]


In [19]:
# cuML labels with the same "!= 0" filter, converted via to_array() for printing.
l2 = clustering_cuml.labels_
print(l2[l2 != 0].to_array())

[-1  1 -1  1  2 -1  3  4  5  6 -1  7  8  9 10 -1 -1 -1 11 -1 -1 -1  9 -1
  9 -1 12 12 13 14 15 -1 12 16  4 17  9  6 12  2 -1 13 -1 17 12 -1 -1 18
 16 -1 19 20  4  1 21 17 16 -1  6  8  9 17 20 20 -1 20 22 -1  9 23  2 -1
  5 -1 -1  6 -1 -1  4 -1  2 13 -1 22 -1 14 -1 24 -1 -1 -1 -1  6 25 -1 17
  6 -1 24 -1 10 10 -1  2 19 26  5  1  9 -1 20  4 26 -1  9 19 -1 -1 -1  4
 -1 18 -1 -1 -1 25 -1 -1 19 -1 -1 12 -1 27  4 -1 -1 17 -1 25  9 17 28 29
 19 15 30 -1 31  1 -1 32 33 -1 25 -1 -1 -1 20 34 -1 31 -1 28 -1 32 -1 -1
 30 -1 -1 -1 27 -1 33 26 -1 -1 -1  7  9 -1 35  4  1 -1  3  4 11 36 -1 12
 -1 -1 37  2 23 -1  5 -1 23 -1  9 -1 26 -1 -1 38 27 -1 -1 -1  9 10 36  4
 26 25 -1 39 37 -1 -1  9 -1 -1 12  7 39 -1 25 -1  7 -1 15 -1  7 21 -1 -1
 -1 -1 34  2 19  8 -1 38 -1 35  2 29 -1 10  6 30]


In [20]:
# Compare the two label vectors with the MSE-based array_equal helper and
# report the verdict.
passed = array_equal(clustering_sk.labels_, clustering_cuml.labels_)
verdict = 'equal' if passed else 'NOT equal'
message = 'compare dbscan: cuml vs sklearn labels_ %s' % verdict
print(message)

compare dbscan: cuml vs sklearn labels_ equal
