In [None]:
%matplotlib notebook
import numpy as np
from mlcv import greedy_search, generate_points, render_pcl, render_stats, distance_to_origin
from tqdm.notebook import trange, tqdm
import pandas as pd

np.random.seed(1) # for reproducibility

def cf_prime(data, combs):
    """Return the result of `distance_to_origin(data, combs)` shifted down by a fixed threshold.

    The threshold is hard-coded to 0.1, so the returned value is negative
    wherever the helper reports a distance below 0.1.
    NOTE(review): shape and meaning of `combs` are defined by
    `mlcv.distance_to_origin` -- confirm against that helper.
    """
    distance_threshold = 0.1
    distances = distance_to_origin(data, combs)
    return distances - distance_threshold

def cf(data, combs):
    """Return an all-zero array with the same shape as `combs`.

    `data` is accepted for signature compatibility with `cf_prime` but is
    not read.
    """
    zero_costs = np.zeros(shape=combs.shape)
    return zero_costs

def stop(idx, indexing, target_partition_count=0):
    """Decide whether the search should terminate.

    Never stops while `idx` is below 100 (presumably the iteration counter
    supplied by the search -- confirm against `greedy_search`). Afterwards,
    stops exactly when the number of distinct values in `indexing` equals
    `target_partition_count`.
    """
    still_warming_up = idx < 100
    if still_warming_up:
        return False
    distinct_labels = np.unique(indexing)
    return len(distinct_labels) == target_partition_count

This notebook runs multiple instances of the greedy search algorithm on various datasets and stores the results in the `./experiments` subdirectory. For this purpose, three files are created:

* `meta.csv` stores the settings for each run, with the columns
    * `didx`, which contains an index for the used dataset
    * `partinit`, which is the number of random partitions the search algorithm is initialized with
    * `N`, number of neighbours considered in each iteration. If `NaN` or `None`, then all neighbours are compared
    * `M`, number of samples drawn in order to compute the reduced cost. If `NaN` or `None`, then the costs are computed explicitly
    * `settingname`, name or short description for the search instance
* `runs.csv` stores the run (i.e., every iteration) of each search instance, with the columns
    * `run`, which links the iteration to the corresponding run
    * `n_neighbours`, which stores the current number of considered neighbours
    * `partcount`, which contains the current number of partitions
    * `rc`, which stores the computed reduced costs
    * `t_neighbours`, which stores the overall time required to compute all neighbours
    * `t_rcs_mean`, which stores the mean time required to compute the reduced cost per neighbour
    * `t_rcs_std`, which contains the respective standard deviation and
    * `t_rcs_sum`, which contains the overall time required to compute every cost for every neighbour
* `data.csv` stores the used datasets (i.e. a row for every point) with ground-truth and computed partitions
    * `didx`, links every point to its corresponding dataset
    * `x`, `y`, `z` contain the x, y and z-coordinate of every point
    * `gt` contains the ground-truth partition index of this point
    * one column per considered run (named after its `settingname`), which contains the computed partition index for this point
    
(Overall execution time is approx. 1 h 10 min.)

In [None]:
# Five synthetic datasets, generated in a fixed order so that the seeded RNG
# yields the same points on every execution. Each entry below is
# (value, length): `distribution` is `[value] * length` and `noise` is
# `[0.01] * length`.
# NOTE(review): presumably `length` is the number of clusters and `value` the
# number of points per cluster -- confirm against `mlcv.generate_points`.
dataset = [
    generate_points(noise=[0.01] * length, distribution=[value] * length)
    for value, length in ((30, 3), (80, 3), (150, 3), (250, 4), (300, 5))
]

# Search settings, one row per greedy-search run. Columns (in order):
#   dataset index into `dataset`,
#   number of initial random partitions,
#   N (neighbours considered per iteration; None = all neighbours),
#   M (samples drawn for the reduced cost; None = explicit computation),
#   name of the setting (also used as a column name in the data table).
# NOTE(review): names appear to follow <dataset>-<N mode>-<M mode>, with
# "S" = sampled and "E" = exhaustive/explicit -- confirm with the author.
combs = [
    # dataset, #initial_random_partitions, N, M, name_of_setting
    [0, 1, 40,      1000, "0-S-S"],
    [0, 1, None,    1000, "0-E-S"],
    [0, 1, 40,      None, "0-S-E"],
    [0, 1, None,    None, "0-E-E"],
    
    [1, 1, 40,      5000, "1-S-S"],
    [1, 1, None,    5000, "1-E-S"],
    [1, 1, 40,      None, "1-S-E"],
    [1, 1, None,    None, "1-E-E"],
    
    # Larger datasets, sampled neighbours and sampled costs only.
    [2, 1, 50,      5000, "2-S-S"],
    [3, 1, 50,     10000, "3-S-S"],
    [4, 1, 50,     15000, "4-S-S"],
    
    # Dataset 2 with a reduced M (500) and a reduced N (5), respectively.
    [2, 1, 50,       500, "2-S-SL"],
    [2, 1,  5,      5000, "2-SL-S"],
    
    # Dataset 2 with 10, 50 and 100 initial random partitions.
    [2, 10, 50,     5000, "2-S-S-IP10"],
    [2, 50, 50,     5000, "2-S-S-IP50"],
    [2,100, 50,     5000, "2-S-S-IP100"]
]

import os

# Create the output directory up front so that a missing folder cannot make
# the final `to_csv` calls fail after the (long) search runs have finished.
output_dir = "./experiments"
os.makedirs(output_dir, exist_ok=True)

setting_names = [c[4] for c in combs]

# meta: one row per search setting.
dfmeta = pd.DataFrame(combs, columns=["didx", "partinit", "N", "M", "settingname"])
# data: one row per point, plus one (initially empty) column per setting.
dfdata = pd.DataFrame(columns=["didx", "x", "y", "z", "gt"] + setting_names)
# runs: one dict per search iteration. Rows are collected in a plain list and
# turned into a DataFrame once at the end, because `DataFrame.append` was
# removed in pandas 2.0 and copied the whole frame on every call.
run_rows = []

pbar = tqdm(combs, total=len(combs))
already_stored = set()
for didx, indexing_init, N, M, name in pbar:
    data, ground_truth = dataset[didx]

    # Store the points of each dataset only once, regardless of how many
    # settings use it.
    if didx not in already_stored:
        tmp = pd.DataFrame()
        tmp["x"] = data[:, 0]
        tmp["y"] = data[:, 1]
        tmp["z"] = data[:, 2]
        tmp["didx"] = didx
        tmp["gt"] = ground_truth
        tmp[setting_names] = None
        # ignore_index keeps the row index unique across datasets.
        dfdata = pd.concat((dfdata, tmp), ignore_index=True)
        already_stored.add(didx)

    # Random initial assignment of every point to one of `indexing_init`
    # partitions (labels 0 .. indexing_init - 1).
    current_indexing = np.random.choice(indexing_init, data.shape[0])
    partcount_gt = np.unique(ground_truth).shape[0]
    # Stop once the number of partitions matches the ground truth.
    stopping_criteria = lambda idx, indexing: stop(idx, indexing, target_partition_count=partcount_gt)

    alg = greedy_search(data, current_indexing, cf, cf_prime, stop=stopping_criteria, N=N, M=M)
    for indexing, v, k, stats_bm in alg:
        # Keep the most recent indexing; after the loop it is the final result.
        current_indexing = indexing
        run_rows.append({**stats_bm, "run": name})

    # Write the final partition of this run into its own column, restricted
    # to the rows of the dataset it was computed on.
    dfdata.loc[dfdata["didx"] == didx, name] = current_indexing

dfruns = pd.DataFrame(run_rows)

dfruns.to_csv("./experiments/runs.csv", index=False)
dfmeta.to_csv("./experiments/meta.csv", index=False)
dfdata.to_csv("./experiments/data.csv", index=False)