## Quick Python code to benchmark how fast the program is

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

### List the source files and ensure a t-SNE reduction is available

In [None]:
# Absolute paths of every dataset used by the benchmark, resolved relative
# to the current working directory.
source_files = [
    Path(relative_path).resolve()
    for relative_path in (
        "../data/abalone/abalone.csv",
        "../data/absenteeism/absenteeism.csv",
        "../data/diabetes/diabetes.csv",
        "../data/epileptic/epileptic.csv",
        "../data/happiness/happiness.csv",
        "../data/seismic/seismic.csv",
        "../data/wbc/wbc.csv",
        "../data/bank/bank.csv",
        "../data/winequality/winequality-src-both.csv",
        "../data/cube/cube100k.csv",
    )
]

# Stop at the first dataset that is not present on disk.
for file in source_files:
    assert file.exists(), f"{file} does not exist"

In [None]:
# Ensure a 2D and a 3D reduction exists on disk for every source file and
# collect the (source, reduced) pairs to benchmark in `bench_files`.
# A missing reduction is computed with t-SNE, or with PCA when the dataset has
# more than 50,000 rows.
bench_files = []
for source_file in source_files:
    df = None  # loaded lazily: only needed when a reduction has to be computed

    for n_components in (2, 3):
        # NOTE: the file keeps the "-TSNE-" name even when PCA produced it.
        reduced_file = (
            source_file.parent / f"{source_file.stem}-TSNE-{n_components}d.csv"
        ).resolve()

        if not reduced_file.exists():
            if df is None:
                df = pd.read_csv(source_file, delimiter=';')

            if len(df) > 50_000:
                print(f"Missing: {reduced_file}, running PCA (because file there are so many points)")
                res = PCA(n_components=n_components).fit_transform(df)
            else:
                print(f"Running TSNe {n_components}D for {reduced_file}")
                res = TSNE(n_components=n_components, verbose=10, n_jobs=-5).fit_transform(df)

            # Write the reduction next to its source file.
            pd.DataFrame(res).to_csv(reduced_file, index=False, sep=';')

        bench_files.append((source_file, reduced_file))


### Now that we have all the files we can start benchmarking

In [None]:
# Ensure pointctl is installed before benchmarking; never benchmark with a debug build.
!pointctl --help

In [None]:
from subprocess import Popen, PIPE
import time, tempfile
def run_benchmark(source_df, reduced_df, r, c, d):
    """Time one ``pointctl explain`` run on a random sample of the data.

    Draws ``c`` row positions (with replacement), takes those rows from both
    frames, writes them to two temporary ';'-separated CSV files and runs
    pointctl on those files.

    NOTE(review): a later cell defines another ``run_benchmark`` with a
    different signature; once that cell has run it replaces this function.

    Args:
        source_df: DataFrame with the original points.
        reduced_df: DataFrame with the reduced points; expected to have one
            row per row of ``source_df``, in the same order.
        r: Value forwarded to pointctl's ``-r`` option.
        c: Number of rows to sample.
        d: Currently unused; kept so existing callers keep working.

    Returns:
        Elapsed wall-clock seconds of the pointctl process.

    Raises:
        ValueError: If the two frames do not have the same number of rows.
    """
    if len(source_df) != len(reduced_df):
        raise ValueError(
            f"Files should contain the same amount of points. Source: {len(source_df)}, Reduced: {len(reduced_df)}"
        )

    # Sample positions rather than index labels: `.iloc` is positional, so
    # labels would pick the wrong rows for any non-default index.
    positions = np.random.choice(len(source_df), c, replace=True)
    sampled_source = source_df.iloc[positions]
    sampled_reduced = reduced_df.iloc[positions]

    # Temp files the sampled frames are written to and pointctl loads them from.
    with tempfile.NamedTemporaryFile(mode='w+') as source_file, tempfile.NamedTemporaryFile(mode='w+') as reduced_file:
        sampled_source.to_csv(source_file, index=False, sep=';')
        sampled_reduced.to_csv(reduced_file, index=False, sep=';')
        # Push buffered data to disk; otherwise the child process may read
        # an empty or truncated file.
        source_file.flush()
        reduced_file.flush()

        command = [
            "pointctl", "explain", "-r", str(r),
            "--input", source_file.name,
            "--reduced", reduced_file.name,
            "./foobar.csv",
        ]
        start = time.perf_counter()
        p = Popen(command, stdout=PIPE, stderr=PIPE)
        output = p.communicate()
        timing = time.perf_counter() - start

    # Printed after the clock has stopped so it does not skew the timing.
    print(output)
    return timing

In [None]:
import time
from subprocess import Popen, PIPE

def run_benchmark(source, reduced, r, m):
    """Time one ``pointctl explain`` run and print the result.

    Args:
        source: Path of the source CSV, passed as ``--input``.
        reduced: Path of the reduced CSV, passed as ``--reduced``.
        r: Value forwarded to pointctl's ``-r`` option.
        m: Label printed next to the timing.
            NOTE(review): ``m`` is not forwarded to pointctl, so every label
            is timed with the same command line -- confirm this is intended.

    Returns:
        Elapsed wall-clock seconds of the pointctl process.
    """
    command = [
        "pointctl", "explain", "-r", str(r), "-t", "0.90",
        "--input", str(source),
        "--reduced", str(reduced),
        "./foobar.csv",
    ]
    # perf_counter is monotonic, so clock adjustments cannot distort the timing.
    start = time.perf_counter()
    p = Popen(command, stdout=PIPE, stderr=PIPE)
    _, stderr = p.communicate()
    timing = time.perf_counter() - start

    # A failed run usually exits early; flag it so its timing is not mistaken
    # for a valid measurement.
    if p.returncode != 0:
        print(f"\tpointctl exited with code {p.returncode}: {stderr.decode(errors='replace').strip()}")

    print(f"\t{timing:.3f} seconds for {m} / {r}")
    return timing

# Run the benchmark on every (source, reduced) pair, once per combination of
# method label and -r value.
for (source, reduced) in bench_files:
    source_df = pd.read_csv(source, delimiter=';')
    print(f"Running on {source.name}. Rows: {len(source_df)} - Dimensions: {len(source_df.columns)}")
    for m, r in [
        (method, value)
        for method in ["silva_variance", "driel_sum"]
        for value in ["0.1", "0.2", "0.3"]
    ]:
        run_benchmark(source, reduced, r, m)
