In [None]:
from sketching import settings
from sketching.datasets import Dataset, Covertype_Sklearn, KDDCup_Sklearn, Webspam_libsvm, Synthetic_Dataset

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib

In [None]:
def get_results_df(dataset: Dataset, time_column):
    """Load the per-method result CSVs of a dataset and aggregate them by size.

    For each of the methods "uniform", "l2s" and "sketching" this reads
    ``settings.RESULTS_DIR / "<dataset name>_<method>.csv"``, keeps the columns
    ``time_column``, "size" and "ratio", and takes the median of the kept
    columns within every "size" group.

    Parameters
    ----------
    dataset : Dataset
        Only ``dataset.get_name()`` is used, to build the CSV file names.
    time_column : str
        Name of the timing column to keep (the callers in this notebook pass
        "sampling_time_s" or "total_time_s").

    Returns
    -------
    pandas.DataFrame
        The per-method medians stacked on top of each other with a fresh
        integer index; the "method" column tells which method a row belongs to.

    Raises
    ------
    KeyError
        If one of the CSV files lacks ``time_column``, "size" or "ratio".
    """
    wanted_columns = [time_column, "size", "ratio"]
    df_list = []

    for method in ["uniform", "l2s", "sketching"]:
        raw_df = pd.read_csv(settings.RESULTS_DIR / (dataset.get_name() + f"_{method}.csv"))
        df = (
            # Select explicitly instead of DataFrame.filter(items=...): filter
            # silently skips labels that are absent, which would hide a
            # misspelled or missing time column until plotting time.
            raw_df[wanted_columns]
            .groupby(["size"], as_index=False)
            .median()
            .assign(method=method)
        )
        df_list.append(df)

    return pd.concat(df_list, ignore_index=True)

# Preview the aggregated Covertype results for the "sampling_time_s" column.
get_results_df(dataset=Covertype_Sklearn(), time_column="sampling_time_s")

In [None]:
def make_plot(dataset, x_min, x_max, y_min, y_max, sampling_time=False, font_size=18, font_size_title=23):
    """Scatter the median approximation ratio against a median time, per method.

    The data comes from ``get_results_df(dataset, time_column=...)``; one
    scatter series is drawn for each of the methods "l2s", "uniform" and
    "sketching", and the figure is shown with ``plt.show()``.

    Note: this sets ``plt.rcParams["text.usetex"] = True`` and the rc font
    size; neither setting is restored before returning.

    Parameters
    ----------
    dataset
        Object passed on to ``get_results_df``; its ``get_name()`` also selects
        the plot title.
    x_min, x_max, y_min, y_max
        Axis limits, forwarded to ``ax.set_xlim`` / ``ax.set_ylim``. The
        notebook passes ``None`` where no limit should be fixed.
    sampling_time : bool
        If true, the x-axis shows the "sampling_time_s" column, otherwise the
        "total_time_s" column.
    font_size
        Font size set through ``plt.rc("font", size=...)``.
    font_size_title
        Font size of the axes title.

    Returns
    -------
    None
    """
    if sampling_time:
        time_column = "sampling_time_s"
        x_label = "median sampling time (s)"
    else:
        time_column = "total_time_s"
        x_label = "median absolute running time (s)"

    results_df = get_results_df(dataset, time_column=time_column)

    # use TeX for typesetting
    plt.rcParams["text.usetex"] = True
    plt.rc("font", size=font_size)

    fig, ax = plt.subplots()

    # matplotlib.cm.get_cmap was deprecated in matplotlib 3.7 and removed in
    # 3.9; pyplot.get_cmap returns the same colormap on old and new releases.
    colormap = plt.get_cmap("tab20")
    colors = {
        "uniform": colormap(0),
        "sketching": colormap(2),
        "l2s": colormap(4),
    }

    labels = {
        "uniform": "Uniform",
        "sketching": "Sketch",
        "l2s": "L2S",
    }

    titles = {
        "covertype_sklearn": "Covertype",
        "kddcup_sklearn": "Kddcup",
        "webspam_libsvm_desparsed": "Webspam",
        "synthetic_n_100000": "Synthetic"
    }

    for cur_method in ["l2s", "uniform", "sketching"]:
        cur_results = results_df.loc[results_df["method"] == cur_method]
        ax.scatter(
            cur_results[time_column],
            cur_results["ratio"],
            color=colors[cur_method],
            label=labels[cur_method],
        )

    ax.set_xlim(left=x_min, right=x_max)
    ax.set_ylim(bottom=y_min, top=y_max)

    ax.set_xlabel(x_label)
    ax.set_ylabel("median approximation ratio")

    # Datasets without a curated title (e.g. a synthetic dataset with another
    # number of rows) fall back to their raw name instead of raising KeyError.
    # Underscores are escaped because the text is typeset by TeX.
    dataset_name = dataset.get_name()
    title = titles.get(dataset_name, dataset_name.replace("_", r"\_"))
    ax.set_title(title, fontsize=font_size_title)

    ax.legend(loc="upper right", frameon=True)

    fig.tight_layout()

    plt.show()


In [None]:
# Covertype: one figure per time measure (total running time first, then
# sampling time), with no axis limits fixed.
dataset = Covertype_Sklearn()
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=None, sampling_time=use_sampling_time)

In [None]:
# KDDCup: the ratio axis starts at 1 in both figures; the upper limit is 10
# for the total running time plot and 6 for the sampling time plot.
dataset = KDDCup_Sklearn()
for use_sampling_time, ratio_cap in ((False, 10), (True, 6)):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=ratio_cap, sampling_time=use_sampling_time)

In [None]:
# Webspam: both figures use the ratio range [1, 2]; total running time first,
# then sampling time.
dataset = Webspam_libsvm()
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=1, y_max=2, sampling_time=use_sampling_time)

In [None]:
# Synthetic data with 100000 rows: one figure per time measure (total running
# time first, then sampling time), with no axis limits fixed.
dataset = Synthetic_Dataset(n_rows=100000)
for use_sampling_time in (False, True):
    make_plot(dataset, x_min=None, x_max=None, y_min=None, y_max=None, sampling_time=use_sampling_time)