# Setup

## Init

In [85]:
import os
import glob
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.cluster import adjusted_rand_score

## Conf

In [187]:
# Keys naming the two sides of every comparison; all per-side settings
# below are dicts keyed by these two values.
baseline_key = "baseline"
ours_key = "ours"

default_dataset = "geonames"

# Algorithm variants.
baseline_variant_name = "mrdbscan"
ours_variant_name = "dDBGSCAN"
default_variants = {baseline_key: baseline_variant_name, ours_key: ours_variant_name}
# Both sides run our variant, so only the other settings (e.g. partitioner) differ.
partitioner_only_variants = dict.fromkeys((baseline_key, ours_key), ours_variant_name)

# Partitioners.
baseline_partitioner_name = "cost"
ours_partitioner_name = "S2"
default_partitioners = {baseline_key: baseline_partitioner_name, ours_key: ours_partitioner_name}

# Experiment index, partition level and max-points settings.
default_exp_index = 1
default_exp_indices = dict.fromkeys((baseline_key, ours_key), default_exp_index)
dummy_partition_level = 99
default_partition_level = 13
default_partition_levels = dict.fromkeys((baseline_key, ours_key), default_partition_level)
default_1part_points = 100000000
default_dummy_max_points = 256
default_max_points = {baseline_key: default_1part_points, ours_key: default_dummy_max_points}

In [50]:
# Roots of the experiment output tree: remote source and local copy.
remote_output_base_path = "s3://waze.tmp/dmarcous/clustering/output/"
local_output_base_path = "/tmp/output/"
# local_output_base_path="~/git/dDBGSCAN/src/test/resources/output/" # Test
# Per-experiment sub-path appended to either root; exp_index is zero-padded to 2 digits.
experiment_path_suffix = (
    "{dataset}/{variant}/{partitioner}/"
    "partlvl_{partition_level}/maxp_{maxpoints}/exp_{exp_index:02d}/"
)

# Read Data

In [51]:
def buildExperimentSpec(dataset, variant, partitioner, partition_level, maxpoints, exp_index):
    """Bundle the parameters that identify one experiment run into a dict.

    The returned dict has the keys 'dataset', 'variant', 'partitioner',
    'partition_level', 'maxpoints' and 'exp_index', each holding the
    argument of the same name.
    """
    return dict(
        dataset=dataset,
        variant=variant,
        partitioner=partitioner,
        partition_level=partition_level,
        maxpoints=maxpoints,
        exp_index=exp_index,
    )

In [52]:
def formatExperimentPath(base_path, experiment_spec):
    """Return the output path of an experiment under ``base_path``.

    ``base_path`` is prefixed to the module-level ``experiment_path_suffix``
    template, which is then filled in from ``experiment_spec`` (a dict with
    the keys produced by ``buildExperimentSpec``).
    """
    path_template = base_path + experiment_path_suffix
    field_names = ('dataset', 'variant', 'partitioner',
                   'partition_level', 'maxpoints', 'exp_index')
    # Pick out exactly the fields the template needs, in template order.
    fields = {name: experiment_spec[name] for name in field_names}
    return path_template.format(**fields)

In [53]:
def downloadResultFiles(experiment_spec):
    """Make sure the result files of an experiment are available locally.

    Copies the experiment's remote output directory to the matching local
    directory with ``aws s3 cp --recursive``, unless the local directory
    already exists, in which case the download is skipped.

    Parameters:
        experiment_spec: dict as produced by ``buildExperimentSpec``.

    Returns:
        The local directory path (with a leading ``~`` expanded).

    Raises:
        RuntimeError: if the ``aws`` command exits with a non-zero status.
    """
    import subprocess  # local import: keeps this cell self-contained

    remote_path = formatExperimentPath(remote_output_base_path, experiment_spec)
    # Neither os.path.isdir nor glob expands "~", so resolve it once here.
    local_path = os.path.expanduser(
        formatExperimentPath(local_output_base_path, experiment_spec))
    # Argument list (no shell): path contents are never interpreted by a shell.
    download_cmd = ["aws", "s3", "cp", "--recursive", remote_path, local_path]
    print(" ".join(download_cmd))
    if os.path.isdir(local_path):
        print("Skipping, local already exists")
        return local_path
    return_code = subprocess.run(download_cmd).returncode
    if return_code != 0:
        # Fail here instead of letting a missing or partial download surface
        # later as a confusing read error or a wrong comparison.
        raise RuntimeError(
            "Download failed with exit code " + str(return_code)
            + ": " + " ".join(download_cmd))
    return local_path

In [54]:
def readClusteringResults(data_path):
    """Load all clustering result files under ``data_path`` into one DataFrame.

    Every path matching ``data_path + '*'`` is read as a header-less CSV with
    the columns ``recordId``, ``cluster`` and ``instanceStatus``; the pieces
    are concatenated and the result is sorted by ``recordId``.

    Parameters:
        data_path: path prefix of the result files (the callers in this
            notebook pass a directory path ending with a separator).

    Returns:
        A DataFrame with the three columns above, sorted by ``recordId``.

    Raises:
        ValueError: if no file matches ``data_path + '*'``.
    """
    column_names = ['recordId', 'cluster', 'instanceStatus']
    # Sorted so the concatenation order does not depend on directory listing order.
    result_files = sorted(glob.glob(data_path + '*'))
    if not result_files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError("No clustering result files found under " + str(data_path))
    frames = [pd.read_csv(f, index_col=None, names=column_names) for f in result_files]
    df = pd.concat(frames, axis=0, ignore_index=True, sort=False)
    return df.sort_values(by=['recordId'])

In [55]:
def prepareDataForAnaysis(dataset=default_dataset, variant=ours_variant_name,
                          partitioner=ours_partitioner_name,
                          partition_level=default_partition_level, maxpoints=default_1part_points,
                          exp_index=default_exp_index):
    """Fetch (if needed) and load the clustering results of one experiment.

    Builds the experiment spec from the arguments, prints it, makes sure the
    result files are present locally via ``downloadResultFiles`` and returns
    them as loaded by ``readClusteringResults``.
    """
    spec = buildExperimentSpec(dataset, variant, partitioner,
                               partition_level, maxpoints, exp_index)
    print(spec)
    return readClusteringResults(downloadResultFiles(spec))

# Compare

In [56]:
def resultsToLabels(df):
    """Return the ``cluster`` values of ``df`` as a list, ordered by ``recordId``."""
    ordered = df.sort_values(by='recordId')
    return ordered['cluster'].tolist()

In [57]:
def compareClusteringResults(baseline, ours):
    """Compute the adjusted Rand index between two clustering results.

    Both arguments are DataFrames with ``recordId`` and ``cluster`` columns.
    Labels are paired by position after ordering each frame by ``recordId``,
    so the two frames must describe exactly the same records.

    Returns:
        The value of ``adjusted_rand_score`` for the two label sequences.

    Raises:
        AssertionError: if the frames differ in row count or in their
            sorted ``recordId`` values.
    """
    assert baseline.shape[0] == ours.shape[0], \
        "Results differ in size: " + str(baseline.shape[0]) + " vs " + str(ours.shape[0])
    # Equal row counts alone do not guarantee the same records; positional
    # pairing of labels is only meaningful when the sorted ids match too.
    baseline_ids = baseline['recordId'].sort_values().reset_index(drop=True)
    ours_ids = ours['recordId'].sort_values().reset_index(drop=True)
    assert baseline_ids.eq(ours_ids).all(), \
        "Results do not cover the same recordId values"
    return adjusted_rand_score(resultsToLabels(baseline), resultsToLabels(ours))

In [109]:
def adjustClusterIds(df):
    """Relabel every cluster with the smallest ``recordId`` it contains.

    Parameters:
        df: DataFrame with ``recordId`` and ``cluster`` columns.

    Returns:
        A copy of ``df`` whose ``cluster`` column holds, for each row, the
        minimum ``recordId`` among the rows that shared its original cluster
        value. ``df`` itself is not modified.
    """
    df_adjusted = df.copy()
    # Compute all new labels from the ORIGINAL labels in one pass. Relabelling
    # in place cluster by cluster would merge two clusters whenever a new
    # label (a min recordId) equals a cluster id that is still to be processed.
    min_record_per_cluster = df.groupby('cluster')['recordId'].min()
    df_adjusted['cluster'] = df['cluster'].map(min_record_per_cluster)
    return df_adjusted

# Main

## Chain

In [173]:
def compare(dataset,
            variants=partitioner_only_variants,
            partitioners=default_partitioners,
            partition_levels=default_partition_levels,
            max_points=default_max_points,
            exp_indices=default_exp_indices):
    """Load the baseline and "ours" results for ``dataset`` and compare them.

    Every settings argument is a dict keyed by ``baseline_key`` and
    ``ours_key`` giving the value to use for each side. Returns whatever
    ``compareClusteringResults`` returns for the two loaded result sets.
    """
    def _load(side):
        # Resolve each setting for the requested side and load its results.
        return prepareDataForAnaysis(dataset,
                                     variants[side], partitioners[side],
                                     partition_levels[side], max_points[side],
                                     exp_indices[side])

    baseline = _load(baseline_key)
    ours = _load(ours_key)
    return compareClusteringResults(baseline, ours)



In [59]:
def printCompareResults(dataset, ARI):
    """Print a one-line summary with the dataset name and its ARI value."""
    pieces = ["Dateset : ", dataset, " , ARI: ", str(ARI)]
    print("".join(pieces))

## Conf

In [19]:
# Datasets compared with the default settings of compare().
small_datasets = [
    "geo54k",
    "geo108k",
    "geo005",
]

## Run Comparisons

### Ours (S2 partitioner) VS ours with the cost-based partitioner - scale

In [20]:
# Compare each small dataset using compare()'s defaults: both sides run the
# dDBGSCAN variant, the baseline with the 'cost' partitioner and ours with 'S2'.
for dataset in small_datasets:
    ARI = compare(dataset)
    printCompareResults(dataset, ARI)

aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/ /tmp/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo54k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geo54k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
Dateset : geo54k , ARI: 1.0
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo108k/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/ /tmp/output/geo108k/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
Dateset : geo108k , ARI: 0.9999987538300177
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo005/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/ /tmp/output/geo005/dDBGSCAN/cost/partlvl_13/maxp_100000000/exp_01/
aws s3 cp --recursive s3://waze

In [25]:
# Parallel lists: large_max_points[i] is the baseline max-points value used
# for large_datasets[i] (they are paired with zip below).
large_datasets = ["geo015", "geoquarter", "geosmall"]
large_max_points = [1659300, 2765500, 5531000]

for dataset, maxp in zip(large_datasets, large_max_points):
    # Only the baseline max-points changes per dataset; ours keeps the default.
    cur_max_points = {baseline_key: maxp, ours_key: default_dummy_max_points}
    ARI = compare(dataset, max_points=cur_max_points)
    printCompareResults(dataset, ARI)

aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo015/dDBGSCAN/cost/partlvl_13/maxp_1659300/exp_01/ /tmp/output/geo015/dDBGSCAN/cost/partlvl_13/maxp_1659300/exp_01/
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo015/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geo015/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
Dateset : geo015 , ARI: 0.9999999391659478
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geoquarter/dDBGSCAN/cost/partlvl_13/maxp_2765500/exp_01/ /tmp/output/geoquarter/dDBGSCAN/cost/partlvl_13/maxp_2765500/exp_01/
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geoquarter/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geoquarter/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
Dateset : geoquarter , ARI: 0.9999990655379201
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geosmall/dDBGSCAN/cost/partlvl_13/maxp_5531000/exp_01/ /tmp/output/geosmall/dDBGSCAN/cost/partlvl_13/maxp_5531000/exp_01/
aws s3 cp

### Ours - different partition levels

In [184]:
# Both sides use our partitioner and the same max-points setting, so only
# the partition level differs between them in this experiment.
dataset = "geonames"
ours_partitioner = dict.fromkeys((baseline_key, ours_key), ours_partitioner_name)
ours_max_points = dict.fromkeys((baseline_key, ours_key), default_dummy_max_points)
part_levels = [9]  # 11, 13

In [186]:
# Compare each partition level in part_levels against a reference run whose
# partition level is fixed at 7.
for lvl in part_levels:
    cur_partition_levels = {baseline_key: 7, ours_key: lvl}
    ARI = compare(dataset, partitioners=ours_partitioner, partition_levels=cur_partition_levels, max_points=ours_max_points)
    printCompareResults(dataset, ARI)

{'dataset': 'geonames', 'variant': 'dDBGSCAN', 'partitioner': 'S2', 'partition_level': 7, 'maxpoints': 256, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geonames/dDBGSCAN/S2/partlvl_7/maxp_256/exp_01/ /tmp/output/geonames/dDBGSCAN/S2/partlvl_7/maxp_256/exp_01/
{'dataset': 'geonames', 'variant': 'dDBGSCAN', 'partitioner': 'S2', 'partition_level': 11, 'maxpoints': 256, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geonames/dDBGSCAN/S2/partlvl_11/maxp_256/exp_01/ /tmp/output/geonames/dDBGSCAN/S2/partlvl_11/maxp_256/exp_01/
Dateset : geonames , ARI: 0.9999991362803113


### Ours - different runs of same experiment

In [178]:
# Both sides use our partitioner and the same max-points setting, so only
# the experiment index differs between them in this experiment.
dataset = "geo108k"
ours_partitioner = dict.fromkeys((baseline_key, ours_key), ours_partitioner_name)
ours_max_points = dict.fromkeys((baseline_key, ours_key), default_dummy_max_points)
experiments = [2, 3]

In [179]:
# Compare each experiment index in experiments against the run with the
# default experiment index, all other settings being equal.
for exp in experiments:
    cur_exps = {baseline_key: default_exp_index, ours_key: exp}
    ARI = compare(dataset, partitioners=ours_partitioner, max_points=ours_max_points, exp_indices=cur_exps)
    printCompareResults(dataset, ARI)

{'dataset': 'geo108k', 'variant': 'dDBGSCAN', 'partitioner': 'S2', 'partition_level': 13, 'maxpoints': 256, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
{'dataset': 'geo108k', 'variant': 'dDBGSCAN', 'partitioner': 'S2', 'partition_level': 13, 'maxpoints': 256, 'exp_index': 2}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_02/ /tmp/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_02/
Dateset : geo108k , ARI: 1.0
{'dataset': 'geo108k', 'variant': 'dDBGSCAN', 'partitioner': 'S2', 'partition_level': 13, 'maxpoints': 256, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/ /tmp/output/geo108k/dDBGSCAN/S2/partlvl_13/maxp_256/exp_01/
Skipping, local already exists
{'dataset': 'geo108k', 'variant': 'dDBGSCAN', 'partitio

### Ours - cost partitioner - different maxp

In [180]:
# Both sides use the cost partitioner here (despite the variable name);
# only the max-points value differs between them.
dataset = "geo54k"
ours_partitioner = dict.fromkeys((baseline_key, ours_key), baseline_partitioner_name)
maxps = [27000, 13500, 6750]

In [181]:
# Compare each max-points value in maxps against a reference run whose
# max-points is fixed at 54000.
for maxp in maxps:
    ours_max_points = {baseline_key: 54000, ours_key: maxp}
    ARI = compare(dataset, partitioners=ours_partitioner, max_points=ours_max_points)
    printCompareResults(dataset, ARI)

{'dataset': 'geo54k', 'variant': 'dDBGSCAN', 'partitioner': 'cost', 'partition_level': 13, 'maxpoints': 54000, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_54000/exp_01/ /tmp/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_54000/exp_01/
{'dataset': 'geo54k', 'variant': 'dDBGSCAN', 'partitioner': 'cost', 'partition_level': 13, 'maxpoints': 27000, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_27000/exp_01/ /tmp/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_27000/exp_01/
Dateset : geo54k , ARI: 1.0
{'dataset': 'geo54k', 'variant': 'dDBGSCAN', 'partitioner': 'cost', 'partition_level': 13, 'maxpoints': 54000, 'exp_index': 1}
aws s3 cp --recursive s3://waze.tmp/dmarcous/clustering/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_54000/exp_01/ /tmp/output/geo54k/dDBGSCAN/cost/partlvl_13/maxp_54000/exp_01/
Skipping, local already exists
{'dataset': 'geo54k', 'varian