In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever one this
# installation provides (newest name first).
for _style in ("seaborn-v0_8-colorblind", "seaborn-colorblind"):
    if _style in plt.style.available:
        plt.style.use(_style)
        break

# NOTE(review): this silences *every* warning for the rest of the notebook,
# including deprecation notices from pandas/matplotlib.
warnings.filterwarnings("ignore")

In [2]:
# Matplotlib colour used for each initialisation method in the plots below.
method_colours = dict(
    cao="tab:green",
    huang="tab:blue",
    matching="tab:orange",
)

# Plotting results


In [3]:
def violin_plots(name, column, root, destination=None, nseeds=None):
    """Draw a box plot overlaid on a violin plot of `column` per initialisation.

    Parameters
    ----------
    name : str
        Dataset name; results are read from ``f"{root}{name}_results.csv"``.
    column : str
        Column of the results file to plot. Underscores are replaced by
        spaces for the y-axis label.
    root : str
        Path prefix of the results file. Its last path component is shown in
        brackets in the title.
    destination : str, optional
        Path prefix under which ``{name}_{column}_violinplot.pdf`` is saved.
        Nothing is saved when ``None``.
    nseeds : int, optional
        When truthy, only rows with ``seed < nseeds`` are plotted.
    """

    results = pd.read_csv(f"{root}{name}_results.csv")
    if nseeds:
        results = results[results["seed"] < nseeds]

    fig, ax = plt.subplots(figsize=(8, 4), dpi=300)

    groups = []
    for pos, (group, values) in enumerate(results.groupby("initialisation")):
        groups.append(group)
        ax.boxplot(values[column], positions=[pos], sym=".")
        v = ax.violinplot(values[column], [pos], showextrema=False)
        for body in v["bodies"]:
            body.set_facecolor(method_colours[group])

    # Label the ticks from the groups that were actually drawn, so a results
    # file with fewer, more, or different methods is never mislabelled.
    ax.set_xticks(list(range(len(groups))))
    ax.set_xticklabels([str(group).capitalize() for group in groups])
    ax.set_xlabel("Initialisation method")
    ax.set_ylabel(column.replace("_", " ").capitalize())

    # Last path component of `root`; tolerant of a root without any slash.
    experiment = root.rstrip("/").split("/")[-1]
    ax.set_title(name.replace("_", " ").capitalize() + f" ({experiment})")

    if destination is not None:
        plt.tight_layout()
        plt.savefig(destination + f"{name}_{column}_violinplot.pdf", transparent=True)


In [4]:
def empirical_cdf_plots(name, column, root, destination=None, nseeds=None):
    """Plot the cumulative distribution of `column` for each initialisation.

    `column` selects which quantity is shown (e.g. the initial or the final
    cost). The "cao" group is drawn as a single vertical line at its median;
    every other group is drawn as a normalised cumulative histogram with
    `nseeds` bins.

    Results are read from ``f"{root}{name}_results.csv"`` and, when `nseeds`
    is truthy, restricted to rows with ``seed < nseeds``. If `destination` is
    not ``None`` the figure is saved there as ``{name}_{column}_cdfplot.pdf``.
    """

    results = pd.read_csv(f"{root}{name}_results.csv")
    if nseeds:
        results = results[results["seed"] < nseeds]

    fig, ax = plt.subplots(figsize=(8, 4), dpi=300)

    for method, rows in results.groupby("initialisation"):
        observations = rows[column]
        shared = {"color": method_colours[method], "label": method.capitalize()}

        if method == "cao":
            ax.vlines(x=observations.median(), ymin=0, ymax=1, **shared)
            continue

        ax.hist(
            observations,
            bins=nseeds,
            cumulative=True,
            density=True,
            histtype="bar",
            alpha=0.25,
            **shared,
        )

    ax.set_xlabel(column.replace("_", " ").capitalize())
    ax.set_ylabel("Likelihood of observation")
    ax.set_ylim(0, 1)
    ax.legend()

    if destination is not None:
        plt.tight_layout()
        plt.savefig(destination + f"{name}_{column}_cdfplot.pdf", transparent=True)


In [5]:
def scatter_plots(name, root, destination=None, nseeds=None):
    """Scatter the final cost against the initial cost, one series per method.

    Results are read from ``f"{root}{name}_results.csv"`` and, when `nseeds`
    is truthy, restricted to rows with ``seed < nseeds``. The "cao" group is
    drawn on top with large, opaque "x" markers; all other groups use the
    default marker at half opacity. A grey diagonal is added and both axes
    share the same limits. If `destination` is not ``None`` the figure is
    saved there as ``{name}_cost_scatterplot.pdf``.
    """

    results = pd.read_csv(f"{root}{name}_results.csv")
    if nseeds:
        results = results[results["seed"] < nseeds]

    fig, ax = plt.subplots(figsize=(8, 8), dpi=300)

    default_style = {"alpha": 0.5, "zorder": 1, "marker": None, "s": None}
    cao_style = {"alpha": 1, "zorder": 2, "marker": "x", "s": 125}

    for method, rows in results.groupby("initialisation"):
        colour = method_colours[method]
        style = cao_style if method == "cao" else default_style
        ax.scatter(
            rows["initial_cost"],
            rows["final_cost"],
            edgecolor="None",
            facecolor=colour,
            label=method.capitalize(),
            **style,
        )

    ax.set_xlabel("Initial cost")
    ax.set_ylabel("Final cost")

    # Smallest and largest value over both axes, so the diagonal spans the plot.
    bounds = np.array([ax.get_xlim(), ax.get_ylim()])
    limits = [bounds.min(), bounds.max()]

    ax.plot(limits, limits, "gray", alpha=0.5, zorder=0)

    ax.set_aspect("equal")
    ax.set_xlim(limits)
    ax.set_ylim(limits)

    ax.legend()

    if destination is not None:
        plt.tight_layout()
        plt.savefig(destination + f"{name}_cost_scatterplot.pdf", transparent=True)


In [None]:
# Number of seeds (repetitions) used for every plot and summary table below.
nseeds = 250

# Write the repetition count to a .tex file (presumably so the write-up can
# include it -- confirm against the LaTeX sources).
with open("../tex/repetitions.tex", "w") as f:
    f.write(str(nseeds))

for root in ("elbow/", "nclasses/"):
    for name in ("breast_cancer", "soybean", "mushroom", "nursery"):
        scatter_plots(name, f"../data/{root}", destination=f"../img/{root}", nseeds=nseeds)
        for column in ("initial_cost", "final_cost"):
            empirical_cdf_plots(name, column, f"../data/{root}", destination=f"../img/{root}", nseeds=nseeds)

        # Display and release this dataset's figures before moving on, rather
        # than keeping all 24 high-resolution figures open until the cell ends.
        plt.show()
        plt.close("all")


# Result tables


In [None]:
def get_summary(name, root, destination=None, nseeds=None):
    """Print (and optionally save) a mean/std summary table per initialisation.

    For each initialisation method found in the results, the mean and the
    standard deviation of the initial cost, final cost, number of iterations
    and time are formatted as ``"mean (std)"``.

    Parameters
    ----------
    name : str
        Dataset name; results are read from ``f"{root}{name}_results.csv"``.
    root : str
        Path prefix of the results file. Its last path component is shown in
        the printed heading.
    destination : str, optional
        Path prefix under which ``{name}_summary.tex`` is written with
        ``DataFrame.to_latex``. Nothing is written when ``None``.
    nseeds : int, optional
        When truthy, only rows with ``seed < nseeds`` are summarised.
    """

    results = pd.read_csv(f"{root}{name}_results.csv")
    if nseeds:
        results = results[results["seed"] < nseeds]

    # Source column -> heading used in the summary table.
    headings = {
        "initial_cost": "Initial cost",
        "final_cost": "Final cost",
        "n_iterations": "No. iterations",
        "time": "Time",
    }

    grouped = results.groupby("initialisation")[list(headings)]
    means, stds = grouped.mean(), grouped.std()

    # Row labels come from the groups actually present, so the table stays
    # correct (and does not raise) when a method is missing or added.
    summary = pd.DataFrame(
        {
            heading: [
                f"{mean:.2f} ({std:.3f})"
                for mean, std in zip(means[column], stds[column])
            ]
            for column, heading in headings.items()
        },
        index=[str(group).capitalize() for group in means.index],
    )

    if destination is not None:
        summary.to_latex(f"{destination}{name}_summary.tex")

    # Last path component of `root`; tolerant of a root without any slash.
    experiment = root.rstrip("/").split("/")[-1]
    print(f"{name} ({experiment}) summary:\n", summary, "\n\n")


In [None]:
# Print and save the summary table for every experiment/dataset combination.
# Relies on `nseeds` having been set by an earlier cell.
for root in ("elbow/", "nclasses/"):
    for name in ("breast_cancer", "soybean", "mushroom", "nursery"):
        get_summary(
            name,
            root=f"../data/{root}",
            destination=f"../tex/{root}",
            nseeds=nseeds,
        )
