# Experiments for "DBSpan: Density-Based Spanner for Clustering Complex Data, With an Application to Persistence Diagrams"

To get started, we will do a little bit of path hackery to import the library.  There are better ways to do this, but for now, this is okay.

In [1]:
import random
import sys
import os

# Resolve the parent of the current working directory and put it at the front
# of sys.path, so the `import dbspan` that follows searches it first.
libpath = os.path.abspath('..')
sys.path.insert(0, libpath)

import dbspan

We will run a number of comparisons between DBSCAN and DBSpan, so it helps to have a helper that constructs both algorithms.

In [2]:
def make_algos(eps, min_samples, delta):
    """Build a DBSCAN / DBSpan pair that share one diagram metric.

    Both algorithms receive the same ``metric``, ``eps`` and ``min_samples``;
    ``delta`` is passed to DBSpan only.

    Returns:
        A ``(dbscan, dbspan)`` tuple of the two constructed algorithm objects.
    """
    diagram_metric = dbspan.topology.DiagramMetric()

    def index_one_bottleneck(left, right):
        # Only element [1] of each input is compared.
        # NOTE(review): presumably the dimension-1 diagram -- confirm against
        # what DiagramFactory returns.
        return diagram_metric.bottleneck(left[1], right[1])

    shared = dict(metric=index_one_bottleneck, eps=eps, min_samples=min_samples)
    return (
        dbspan.cluster.DBSCAN(**shared),
        dbspan.cluster.DBSpan(**shared, delta=delta),
    )

## Experiment 1: Point clouds of things in 4D

First, we will create some tools for building persistence diagrams (dgms) of shapes in 4D.  Each function produces a diagram from a Rips filtration of a "noisy" sphere, torus, or swiss roll.

In [3]:
# Point-set generator shared by the make_*_dgm helpers; built with a fixed seed.
data_factory = dbspan.topology.PointSetDataFactory(seed=72330)
# Diagram builder shared by the make_*_dgm helpers (via make_from_point_set).
dgm_factory = dbspan.topology.DiagramFactory()

# Separately seeded stdlib RNG.
# NOTE(review): not referenced by any cell visible here -- confirm it is still needed.
rng = random.Random(43081)

def make_sphere_dgm():
    """Return a diagram built from a newly generated sphere point set.

    Asks the shared ``data_factory`` for a sphere with ``dim=4``,
    ``num_points=100`` and ``noise=.1``, then hands the points to
    ``dgm_factory.make_from_point_set``.
    """
    settings = dict(dim=4, num_points=100, noise=.1)
    cloud = data_factory.make_sphere(**settings)
    return dgm_factory.make_from_point_set(cloud)

def make_torus_dgm():
    """Return a diagram built from a newly generated torus point set.

    Asks the shared ``data_factory`` for a torus with ``dim=4``,
    ``num_points=100`` and ``noise=.1``, then hands the points to
    ``dgm_factory.make_from_point_set``.
    """
    settings = dict(dim=4, num_points=100, noise=.1)
    cloud = data_factory.make_torus(**settings)
    return dgm_factory.make_from_point_set(cloud)

def make_swiss_roll_dgm():
    """Return a diagram built from a newly generated swiss-roll point set.

    Asks the shared ``data_factory`` for a swiss roll with ``dim=4``,
    ``num_points=100`` and ``noise=.1``, then hands the points to
    ``dgm_factory.make_from_point_set``.
    """
    settings = dict(dim=4, num_points=100, noise=.1)
    cloud = data_factory.make_swiss_roll(**settings)
    return dgm_factory.make_from_point_set(cloud)

Now that we have some functions for creating data, let's create our data set that we will cluster:

In [39]:
# Assemble the data set in a fixed order: 30 sphere diagrams, then 30 torus
# diagrams, then 14 swiss-roll diagrams (74 in total).  The order of the calls
# is kept because the helpers all draw from the same seeded data factory.
dgms = [
    *(make_sphere_dgm() for _ in range(30)),
    *(make_torus_dgm() for _ in range(30)),
    *(make_swiss_roll_dgm() for _ in range(14)),
]

And we are off!  Let's do some experiments

In [40]:
import time
import pandas as pd
from sklearn import metrics

def add_row(df, delta, num_edges, max_edges, rand_index, dbscan_time, dbspan_time):
    """Append one experiment result to ``df`` and return the combined frame.

    Two derived columns are computed here: the fraction of possible edges
    (``num_edges / max_edges``) and the speedup
    (``dbscan_time / dbspan_time``).

    Args:
        df: Results accumulated so far, or ``None`` for the first row
            (``pd.concat`` silently drops ``None`` entries).
        delta: The delta value used for this run.
        num_edges: Number of edges in the spanner.
        max_edges: Number of edges in the complete graph on the same nodes.
        rand_index: Rand index between the DBSCAN and DBSpan labelings.
        dbscan_time: DBSCAN wall time in seconds.
        dbspan_time: DBSpan wall time in seconds.

    Returns:
        A new DataFrame holding the rows of ``df`` followed by the new row.
        Existing indices are kept, so every row appended here has index 0.

    Raises:
        ZeroDivisionError: if ``max_edges`` or ``dbspan_time`` is zero.
    """
    # Raw strings: the LaTeX backslashes are not valid Python escape
    # sequences, so non-raw literals trigger an invalid-escape warning (and
    # are slated to become a syntax error).  The label text is unchanged.
    cols = [
        r'$\delta$',
        'Rand index',
        'Num Edges',
        'maxEdges',
        r'\% Possible Edges',
        'T_DBSCAN',
        'DBSpan time (sec)',
        'Speedup',
    ]
    data = [
        delta,
        rand_index,
        num_edges,
        max_edges,
        num_edges / max_edges,
        dbscan_time,
        dbspan_time,
        dbscan_time / dbspan_time,
    ]
    line = pd.DataFrame([data], columns=cols)
    return pd.concat([df, line])

def run_configuration(df, data, delta, cache=None):
    """Run DBSpan (and, when nothing is cached, DBSCAN) on ``data`` for one delta.

    Args:
        df: Results accumulated so far; forwarded to ``add_row``.
        data: The items to cluster.
        delta: DBSpan delta for this run.
        cache: Optional dict with ``'true_labels'`` and ``'dbscan_time'`` from
            an earlier call.  When it is truthy, DBSCAN is not re-run and the
            cached labels and timing are used instead.

    Returns:
        A ``(results, cache)`` pair: the frame with one more row, and a dict
        holding the DBSCAN labels and timing for reuse by the next call.
    """
    # Both algorithms use fixed eps / min_samples; only delta varies per run.
    dbscan_algo, dbspan_algo = make_algos(eps=.3, min_samples=15, delta=delta)

    # Time each fit with its own pair of perf_counter readings.
    started = time.perf_counter()
    if cache:
        true_labels = cache['true_labels']
    else:
        true_labels = dbscan_algo.fit(data)
    dbscan_done = time.perf_counter()
    dbspan_labels, debug_info = dbspan_algo.fit(data, dbg=True)
    dbspan_done = time.perf_counter()

    # The spanner graph comes from DBSpan's debug output.
    spanner = debug_info['neighborhood'].spanner

    # DBSCAN's labeling is the reference the DBSpan labeling is scored against.
    rand_index = metrics.rand_score(true_labels, dbspan_labels)
    if cache:
        dbscan_time = cache['dbscan_time']
    else:
        dbscan_time = dbscan_done - started
    dbspan_time = dbspan_done - dbscan_done

    # Edge count of the spanner versus the complete graph on the same nodes.
    num_edges = spanner.number_of_edges()
    node_count = spanner.number_of_nodes()
    max_edges = node_count * (node_count - 1) / 2

    updated = add_row(df, delta, num_edges, max_edges, rand_index, dbscan_time, dbspan_time)
    next_cache = {
        'true_labels': true_labels,
        'dbscan_time': dbscan_time,
    }
    return updated, next_cache

In [41]:
# Sweep delta, adding one result row per value.
results = None
cache = None
for delta in [.1, 1, 10, 50, 100, 500, 1000]:
    # The cache returned by one run is fed into the next, so DBSCAN is only
    # executed (and timed) on the first iteration.
    results, cache = run_configuration(results, dgms, delta, cache=cache)
    # Prints the whole accumulated table after every run.
    print(results)

   $\delta$  Rand index  Num Edges  maxEdges  \% Possible Edges    T_DBSCAN  \
0       0.1         1.0       2243    2701.0          0.8304332  74.9850758   

   DBSpan time (sec)    Speedup  
0         80.9130799  0.9267361  
   $\delta$  Rand index  Num Edges  maxEdges  \% Possible Edges    T_DBSCAN  \
0       0.1         1.0       2243    2701.0          0.8304332  74.9850758   
0       1.0         1.0       1114    2701.0          0.4124398  74.9850758   

   DBSpan time (sec)    Speedup  
0         80.9130799  0.9267361  
0         47.9950000  1.5623518  
   $\delta$  Rand index  Num Edges  maxEdges  \% Possible Edges    T_DBSCAN  \
0       0.1     1.00000       2243    2701.0          0.8304332  74.9850758   
0       1.0     1.00000       1114    2701.0          0.4124398  74.9850758   
0      10.0     0.98408        442    2701.0          0.1636431  74.9850758   

   DBSpan time (sec)    Speedup  
0         80.9130799  0.9267361  
0         47.9950000  1.5623518  
0         18.7

In [42]:
# Render the accumulated results table as LaTeX through the pandas Styler and print it.
tex_string = results.style.to_latex()
print(tex_string)

\begin{tabular}{lrrrrrrrr}
 & $\delta$ & Rand index & Num Edges & maxEdges & \% Possible Edges & T_DBSCAN & DBSpan time (sec) & Speedup \\
0 & 0.100000 & 1.000000 & 2243 & 2701.000000 & 0.830433 & 74.985076 & 80.913080 & 0.926736 \\
0 & 1.000000 & 1.000000 & 1114 & 2701.000000 & 0.412440 & 74.985076 & 47.995000 & 1.562352 \\
0 & 10.000000 & 0.984080 & 442 & 2701.000000 & 0.163643 & 74.985076 & 18.743303 & 4.000633 \\
0 & 50.000000 & 0.844502 & 398 & 2701.000000 & 0.147353 & 74.985076 & 16.582940 & 4.521820 \\
0 & 100.000000 & 0.857830 & 375 & 2701.000000 & 0.138837 & 74.985076 & 15.657587 & 4.789057 \\
0 & 500.000000 & 0.954461 & 421 & 2701.000000 & 0.155868 & 74.985076 & 18.624877 & 4.026071 \\
0 & 1000.000000 & 0.844502 & 388 & 2701.000000 & 0.143650 & 74.985076 & 17.437679 & 4.300175 \\
\end{tabular}

