In [None]:
from sketching import settings
from sketching.datasets import Dataset, Covertype_Sklearn, KDDCup_Sklearn, Webspam_libsvm, Synthetic_Dataset, NoisyDataset

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt
import matplotlib

In [None]:
# Make sure the output directory for the PDF plots exists before any figure is
# saved. `parents=True` also creates missing parent directories, and
# `exist_ok=True` replaces the separate exists() check, so there is no window
# in which the directory can appear between the check and the creation.
settings.PLOTS_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
def get_results_df(dataset: Dataset):
    """Collect the median approximation ratios of all methods for one dataset.

    Reads ``<name>_<method>.csv`` from ``settings.RESULTS_DIR`` for the methods
    "uniform", "l2s" and "sketching", and ``<name>_sgd.csv`` for SGD, where
    ``<name>`` is ``dataset.get_name()``.

    Returns
    -------
    pandas.DataFrame
        All per-method frames concatenated with a fresh integer index.
        Assuming the CSVs provide ``ratio`` and ``size`` columns (``ratio``
        only for SGD), the frame has one row per (method, size) holding the
        median ratio, plus a single SGD row whose ``size`` is NaN.
    """

    def _read_results(method_name):
        # Result files are named "<dataset name>_<method>.csv".
        file_name = dataset.get_name() + f"_{method_name}.csv"
        return pd.read_csv(settings.RESULTS_DIR / file_name)

    frames = []

    # Size-dependent methods: take the median over all rows sharing a size.
    for method_name in ("uniform", "l2s", "sketching"):
        raw = _read_results(method_name)
        medians = (
            raw.filter(items=["ratio", "size"])
            .groupby(["size"], as_index=False)
            .median()
        )
        frames.append(medians.assign(method=method_name))

    # SGD has no size: tag every row with the method name and a NaN size, then
    # collapse all rows into one median row by grouping on the constant method.
    sgd_raw = _read_results("sgd")
    sgd_tagged = sgd_raw.filter(items=["ratio"]).assign(method="sgd", size=np.nan)
    frames.append(sgd_tagged.groupby(["method"], as_index=False).median())

    return pd.concat(frames, ignore_index=True)

In [None]:
def make_plot(dataset, x_min, x_max, y_min, y_max, font_size=15, font_size_title=23):
    """Plot the median approximation ratio against the reduced size for one dataset.

    Draws one curve per method ("l2s", "uniform", "sketching") from the frame
    returned by ``get_results_df(dataset)``, plus a horizontal line at the
    median SGD ratio. The figure is saved as ``<dataset name>_ratio_plot.pdf``
    in ``settings.PLOTS_DIR`` and then shown.

    Parameters
    ----------
    dataset
        Object providing ``get_name()``; the name selects the result files,
        the plot title and the output file name.
    x_min, x_max
        Limits of the x-axis; also the horizontal extent of the SGD line.
    y_min, y_max
        Limits of the y-axis.
    font_size
        Base font size, applied globally through ``plt.rc``.
    font_size_title
        Font size of the plot title.

    Notes
    -----
    This function enables ``text.usetex`` and changes the font size in the
    global matplotlib rcParams; both settings stay active after it returns.
    """
    dataset_name = dataset.get_name()
    results_df = get_results_df(dataset)

    # use TeX for typesetting
    plt.rcParams["text.usetex"] = True
    plt.rc("font", size=font_size)

    fig, ax = plt.subplots()

    # `matplotlib.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in
    # 3.9; `plt.get_cmap` is available in both old and new releases.
    colormap = plt.get_cmap("tab20")
    colors = {
        "uniform": colormap(0),
        "sketching": colormap(2),
        "l2s": colormap(4),
        "sgd": colormap(7),
    }

    labels = {
        "uniform": "Uniform",
        "sketching": "Sketch",
        "l2s": "L2S",
        "sgd": "SGD",
    }

    # Raw strings: "\%" is the TeX escape for a literal percent sign, and in a
    # normal string literal it would be an invalid Python escape sequence.
    titles = {
        "covertype_sklearn": "Covertype",
        "covertype_sklearn_noisy": r"Covertype, 1\% noisy",
        "kddcup_sklearn": "Kddcup",
        "kddcup_sklearn_noisy": r"Kddcup, 1\% noisy",
        "webspam_libsvm_desparsed": "Webspam",
        "webspam_libsvm_desparsed_noisy": r"Webspam, 1\% noisy",
        "synthetic_n_100000": "Synthetic",
    }

    # add SGD as a horizontal reference line across the visible x-range
    sgd_ratios = results_df.loc[results_df["method"] == "sgd", "ratio"]
    if not sgd_ratios.empty:
        # get_results_df aggregates SGD into a single row; pass its value as a
        # plain float instead of a one-element Series.
        median_sgd = float(sgd_ratios.iloc[0])
        ax.plot(
            [x_min, x_max],
            [median_sgd, median_sgd],
            label=labels["sgd"],
            color=colors["sgd"],
        )

    for cur_method in ["l2s", "uniform", "sketching"]:
        cur_results = results_df.loc[results_df["method"] == cur_method]
        ax.plot(
            cur_results["size"],
            cur_results["ratio"],
            color=colors[cur_method],
            label=labels[cur_method],
        )

    ax.set_xlim(left=x_min, right=x_max)
    ax.set_ylim(bottom=y_min, top=y_max)

    ax.set_xlabel("reduced size")
    ax.set_ylabel("median approximation ratio")

    # Datasets without a hand-written title fall back to their name instead of
    # failing with a KeyError; underscores are escaped because the title is
    # typeset by TeX.
    title = titles.get(dataset_name, dataset_name.replace("_", r"\_"))
    ax.set_title(title, fontsize=font_size_title)

    ax.legend(loc="upper right", frameon=True)

    fig.tight_layout()

    fig.savefig(settings.PLOTS_DIR / f"{dataset_name}_ratio_plot.pdf")

    plt.show()

In [None]:
# Ratio plot for the Covertype dataset.
dataset = Covertype_Sklearn()
make_plot(
    dataset,
    x_min=0,
    x_max=15000,
    y_min=1,
    y_max=1.2,
)

In [None]:
# Ratio plot for the KDDCup dataset.
dataset = KDDCup_Sklearn()
make_plot(
    dataset,
    x_min=0,
    x_max=30000,
    y_min=1,
    y_max=5,
)

In [None]:
# Ratio plot for the Webspam dataset.
dataset = Webspam_libsvm()
make_plot(
    dataset,
    x_min=0,
    x_max=15000,
    y_min=1,
    y_max=2,
)

In [None]:
# Ratio plot for the synthetic dataset with 100000 rows.
dataset = Synthetic_Dataset(n_rows=100000)
make_plot(
    dataset,
    x_min=0,
    x_max=3000,
    y_min=1,
    y_max=35,
)

In [None]:
# Ratio plot for the noisy variant of Webspam (percentage=0.01, std=10).
dataset = NoisyDataset(
    dataset=Webspam_libsvm(),
    percentage=0.01,
    std=10,
)
make_plot(
    dataset,
    x_min=0,
    x_max=15000,
    y_min=1,
    y_max=10,
)

In [None]:
# Ratio plot for the noisy variant of Covertype (percentage=0.01, std=10).
dataset = NoisyDataset(
    dataset=Covertype_Sklearn(),
    percentage=0.01,
    std=10,
)
make_plot(
    dataset,
    x_min=0,
    x_max=15000,
    y_min=1,
    y_max=1.5,
)

In [None]:
# Ratio plot for the noisy variant of KDDCup (percentage=0.01, std=10).
dataset = NoisyDataset(
    dataset=KDDCup_Sklearn(),
    percentage=0.01,
    std=10,
)
make_plot(
    dataset,
    x_min=0,
    x_max=30000,
    y_min=1,
    y_max=5,
)