# Experiments for "DBSpan: Density-Based Spanner for Clustering Complex Data, With an Application to Persistence Diagrams"

To get started, we will do a little bit of path hackery to import the library.  There are better ways to do this, but for now, this is okay.

In [None]:
import random
import sys
import os

# Put the parent of the current working directory at the front of sys.path so
# that the `import dbspan` that follows is looked up there first.
libpath = os.path.abspath('..')
sys.path.insert(0, libpath)

import dbspan

We will do a bunch of comparisons of DBSCAN and DBSpan, so it will help to have some code for constructing both algorithms.

In [None]:
def make_algos(eps, min_samples, delta):
    """Build a DBSCAN / DBSpan pair that share one metric and parameter set.

    Both algorithms receive the same ``eps`` and ``min_samples`` and the same
    distance callable; only DBSpan additionally receives ``delta``.

    The distance callable takes two items, indexes each with ``[1]`` and
    passes the results to ``DiagramMetric.bottleneck``.
    (Presumably index 1 selects the dimension-1 diagram; confirm against the
    diagram factory's output format.)

    Returns:
        A ``(dbscan, dbspan)`` tuple, in that order.
    """
    diagram_metric = dbspan.topology.DiagramMetric()

    def dgm1_metric(left, right):
        # Compare only the entry stored at index 1 of each item.
        return diagram_metric.bottleneck(left[1], right[1])

    # Settings common to both algorithms, so they cannot drift apart.
    shared_settings = dict(metric=dgm1_metric, eps=eps, min_samples=min_samples)

    baseline = dbspan.cluster.DBSCAN(**shared_settings)
    spanner_based = dbspan.cluster.DBSpan(delta=delta, **shared_settings)
    return baseline, spanner_based

## Experiment 1: Point clouds of things in 4D

First, we will create some tools for creating persistence diagrams (dgms) of things in 4D.  Each function will produce a diagram from a Rips filtration of a "noisy" sphere, torus, or swiss roll.

In [None]:
# Shared factories used by the make_*_dgm helpers defined right after this.
# The point-set factory is constructed with a fixed seed -- presumably so the
# generated point clouds are reproducible; confirm against PointSetDataFactory.
data_factory = dbspan.topology.PointSetDataFactory(seed=72330)
dgm_factory = dbspan.topology.DiagramFactory()

# Separately seeded stdlib RNG.
# NOTE(review): not referenced anywhere in the visible part of this notebook --
# confirm whether it is still needed.
rng = random.Random(43081)

def make_sphere_dgm():
    """Return the diagram built from one freshly sampled noisy sphere.

    Asks ``data_factory`` for a sphere sample (dim=4, 100 points, noise=.1)
    and hands it straight to ``dgm_factory.make_from_point_set``.
    """
    return dgm_factory.make_from_point_set(
        data_factory.make_sphere(dim=4, num_points=100, noise=.1)
    )

def make_torus_dgm():
    """Return the diagram built from one freshly sampled noisy torus.

    Asks ``data_factory`` for a torus sample (dim=4, 100 points, noise=.1)
    and hands it straight to ``dgm_factory.make_from_point_set``.
    """
    return dgm_factory.make_from_point_set(
        data_factory.make_torus(dim=4, num_points=100, noise=.1)
    )

def make_swiss_roll_dgm():
    """Return the diagram built from one freshly sampled noisy swiss roll.

    Asks ``data_factory`` for a swiss-roll sample (dim=4, 100 points,
    noise=.1) and hands it straight to ``dgm_factory.make_from_point_set``.
    """
    return dgm_factory.make_from_point_set(
        data_factory.make_swiss_roll(dim=4, num_points=100, noise=.1)
    )

Now that we have some functions for creating data, let's create our data set that we will cluster:

In [None]:
# Build the data set: 10 sphere diagrams, then 10 torus diagrams, then 4
# swiss-roll diagrams.  The generation order is kept exactly as listed because
# all three helpers draw from the same seeded data_factory (presumably sharing
# one random stream -- confirm), so reordering could change the samples.
dgms = []
dgms.extend(make_sphere_dgm() for _ in range(10))
dgms.extend(make_torus_dgm() for _ in range(10))
dgms.extend(make_swiss_roll_dgm() for _ in range(4))

And we are off!  Let's do some experiments.

In [None]:
import time
import pandas as pd
from sklearn import metrics

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def add_row(df, delta, num_edges, max_edges, rand_index, dbscan_time, dbspan_time):
    """Append one experiment result to the results table and return the table.

    Args:
        df: The results DataFrame accumulated so far, or ``None`` for the
            first row.
        delta: The DBSpan ``delta`` used for this configuration.
        num_edges: Number of edges in the spanner.
        max_edges: Number of edges of the complete graph on the same nodes.
        rand_index: Rand index between the DBSCAN and DBSpan labelings.
        dbscan_time: DBSCAN running time.
        dbspan_time: DBSpan running time.

    Returns:
        A new DataFrame holding the previous rows followed by the new one,
        indexed 0..n-1.  The ``'%Edges'`` column stores the plain fraction
        ``num_edges / max_edges`` (not multiplied by 100) and ``'T_Ratio'``
        stores ``dbscan_time / dbspan_time``; either is NaN when its
        denominator is zero.
    """
    cols = ['Delta', 'RandIdx', 'nEdges', 'maxEdges', '%Edges', 'T_DBSCAN', 'T_DBSPAN', 'T_Ratio']
    data = [
        delta,
        rand_index,
        num_edges,
        max_edges,
        _safe_ratio(num_edges, max_edges),
        dbscan_time,
        dbspan_time,
        _safe_ratio(dbscan_time, dbspan_time),
    ]
    line = pd.DataFrame([data], columns=cols)

    # First row: nothing to concatenate with.
    if df is None:
        return line

    # ignore_index gives each row its own label; without it every appended row
    # keeps label 0 and label-based lookups return the whole table.
    return pd.concat([df, line], ignore_index=True)

def run_configuration(df, data, delta, cache=None):
    """Run DBSCAN and DBSpan on ``data`` for one ``delta`` and record a row.

    Both algorithms are built with eps=.3 and min_samples=5.  When a truthy
    ``cache`` is supplied, the DBSCAN labels and DBSCAN timing are taken from
    it instead of re-running DBSCAN; DBSpan is always run and timed.

    Args:
        df: Results table accumulated so far (passed through to ``add_row``).
        data: The items to cluster.
        delta: The ``delta`` handed to DBSpan.
        cache: Optional dict with keys ``'true_labels'`` and ``'dbscan_time'``
            as returned by a previous call.

    Returns:
        A 2-tuple ``(table, cache)``: the table returned by ``add_row`` and a
        dict holding the DBSCAN labels and timing for reuse by later calls.
    """
    baseline, spanner_algo = make_algos(eps=.3, min_samples=5, delta=delta)

    # Time the two fits back to back; the baseline fit is skipped when cached.
    t_start = time.perf_counter()
    if cache:
        true_labels = cache['true_labels']
    else:
        true_labels = baseline.fit(data)
    t_mid = time.perf_counter()
    span_labels, debug_info = spanner_algo.fit(data, dbg=True)
    t_end = time.perf_counter()

    # The spanner graph lives on the debug payload's neighborhood object.
    graph = debug_info['neighborhood'].spanner

    # DBSCAN's labeling is treated as the reference for the Rand index.
    agreement = metrics.rand_score(true_labels, span_labels)

    if cache:
        dbscan_time = cache['dbscan_time']
    else:
        dbscan_time = t_mid - t_start
    dbspan_time = t_end - t_mid

    edge_count = graph.number_of_edges()
    node_count = graph.number_of_nodes()
    # Edge count of the complete graph on the same node set.
    complete_edge_count = node_count * (node_count - 1) / 2

    table = add_row(
        df, delta, edge_count, complete_edge_count, agreement, dbscan_time, dbspan_time
    )
    reusable = {
        'true_labels': true_labels,
        'dbscan_time': dbscan_time,
    }
    return table, reusable

In [None]:
# Sweep delta over several orders of magnitude, accumulating one result row per
# value.  `results` and `cache` both start as None: the first call runs DBSCAN
# and returns its labels and timing in `cache`, which every later call reuses
# instead of running DBSCAN again.
results = None
cache = None
for delta in [.1, 1, 10, 50, 100, 500, 1000]:
    results, cache = run_configuration(results, dgms, delta, cache=cache)
    # Show the table accumulated so far after each configuration.
    print(results)

In [None]:
# Display the DBSCAN labels and timing carried over from the sweep.
cache