
# Simple profiling for dbscan 1d implementation

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

from dbscan1d import DBSCAN1D
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs


# Number of blob centers passed to ``create_blobs`` for every benchmark run.
centers = 2

# Sample sizes to benchmark; these also become the index of the results table.
n_points = [
    10,
    100,
    1_000,
    10_000,
    20_000,
    30_000,
    40_000,
]

In [None]:
# Build the one-feature test data used for profiling.
def create_blobs(num_points, centers):
    """Return the sample array of a one-feature ``make_blobs`` draw.

    ``num_points`` and ``centers`` are forwarded to ``make_blobs`` together
    with ``n_features=1``; only the first element of its return value is
    kept, everything else is discarded.
    """
    samples, *_ = make_blobs(num_points, centers=centers, n_features=1)
    return samples

In [None]:
# Estimators to profile, built with the same hyper-parameters (eps=0.5,
# min_samples=4) so their timings are comparable.
# Keyword arguments are used on purpose: scikit-learn's DBSCAN accepts only
# ``eps`` positionally (the remaining parameters are keyword-only), so the
# former ``DBSCAN(.5, 4)`` call raises TypeError on current releases.
db1 = DBSCAN1D(eps=0.5, min_samples=4)
db2 = DBSCAN(eps=0.5, min_samples=4)

In [None]:
# profile each stream type with each function
df = pd.DataFrame(columns=['dbscan', 'dbscan1d'], index=n_points)
for n_point in n_points:
    print(f'on {n_point}')
    X = create_blobs(n_point, centers)
    print('starting dbscan1d')
    ti1 = %timeit -o db1.fit_predict(X)
    df.loc[n_point, 'dbscan1d'] = ti1.best
    print('starting dbscan')
    ti2 = %timeit -o db2.fit_predict(X)
    df.loc[n_point, 'dbscan'] = ti2.best
    print()
    print()

## Plot results
Plot the timing results on log-log axes and save the figure to the docs.

In [None]:
# Decide where to write the figure. ``__file__`` is only defined when this code
# runs as a script; in a notebook/IPython kernel it does not exist and
# ``Path(__file__)`` raised NameError, so fall back to the working directory
# (presumably the notebook's own folder -- verify for your setup).
try:
    out_dir = Path(__file__).parent
except NameError:
    out_dir = Path.cwd()
out_path = out_dir / 'profile_results.png'

# Cast to float so plotting does not depend on how the results table was
# created (its columns may hold generic Python objects).
x = df.index.values
plt.loglog(x, df['dbscan'].values.astype(float), label='dbscan', color='r')
plt.loglog(x, df['dbscan1d'].values.astype(float), label='dbscan1d', color='b')

plt.xlabel('number of points')
plt.ylabel('run time (s)')

plt.legend()

# Save before showing: the figure is written to disk while it still exists.
plt.savefig(out_path)

plt.show()
