# Summarise the benchmark datasets

This notebook contains the code to summarise the benchmark datasets in Section 3, including finding the "optimal" number of clusters in each dataset by locating the knee of its k-modes cost curve.

In [None]:
from collections import namedtuple

import numpy as np
import pandas as pd
import tqdm
from kmodes.kmodes import KModes
from yellowbrick.utils import KneeLocator

In [None]:
# For every benchmark dataset, fit k-modes over a range of cluster counts and
# record the final cost of each fit, both in `dataset_costs` and as a CSV file.
dataset_costs = {}
for dataset_name in ("breast_cancer", "mushroom", "soybean", "nursery"):

    # "?" and "dna" are parsed as missing values. The class label is removed
    # and incomplete rows are discarded before clustering.
    df = pd.read_csv(f"../data/{dataset_name}.csv", na_values=["?", "dna"])
    df = df.drop(columns="class").dropna()

    # Candidate cluster counts: 2 up to floor(sqrt(number of complete rows)).
    cluster_range = range(2, int(np.sqrt(len(df))) + 1)

    costs = []
    for k in tqdm.tqdm(cluster_range):
        # Cao initialisation with a fixed random_state for repeatable runs.
        km = KModes(n_clusters=k, init="cao", random_state=0)
        km.fit(df)
        costs.append(km.cost_)

    dataset_costs[dataset_name] = costs

    # One row per candidate k; the range bounds are repeated on every row so
    # the file records which values of k the costs belong to.
    cost_df = pd.DataFrame({"cost": costs}).assign(
        min_nclusters=min(cluster_range),
        max_nclusters=max(cluster_range),
    )
    cost_df.to_csv(f"../data/elbow/{dataset_name}_costs.csv", index=False)

In [None]:
# One record per dataset. The field names are reused as labels when the
# summary table is built from `summaries`, so they are kept in display order.
Summary = namedtuple(
    "Summary",
    [
        "Name",
        "N",
        "m",
        "No_classes",
        "Missing_values",
        "Adjusted_N",
        "Adjusted_no_classes",
        "No_clusters_found",
    ],
)

summaries = []
for name in ("breast_cancer", "mushroom", "nursery", "soybean"):

    # Read the cost curve from the directory the cost-computation cell writes
    # to ("elbow"); reading from a different directory breaks a fresh run.
    costs = pd.read_csv(f"../data/elbow/{name}_costs.csv")
    df = pd.read_csv(f"../data/{name}.csv", na_values=["?", "dna"])

    # Row i of the cost file holds the cost for k = min_nclusters + i. Take the
    # offset from the file rather than hard-coding the start of the range.
    min_nclusters = int(costs["min_nclusters"].iloc[0])

    # Cost decreases as k grows, so look for the knee of a convex,
    # decreasing curve. `kl.knee` is the k at which the knee is located.
    kl = KneeLocator(
        costs.index.values + min_nclusters,
        costs["cost"].values,
        curve_nature="convex",
        curve_direction="decreasing",
    )

    classes = df["class"]
    df = df.drop("class", axis=1)

    # "Adjusted" figures describe the data after rows with any missing
    # attribute have been removed.
    dropped_df = df.dropna()
    missing_values = len(dropped_df) != len(df)

    nrows, ncols = df.shape
    nclasses = classes.nunique()
    adjusted_nrows = len(dropped_df)
    # `dropped_df.index` holds index labels, so select the surviving class
    # labels by label (`.loc`) rather than by position (`.iloc`).
    adjusted_nclasses = classes.loc[dropped_df.index].nunique()

    summaries.append(
        Summary(
            Name=name,
            N=nrows,
            m=ncols,
            No_classes=nclasses,
            Missing_values=missing_values,
            Adjusted_N=adjusted_nrows,
            Adjusted_no_classes=adjusted_nclasses,
            No_clusters_found=kl.knee,
        )
    )

In [None]:
# Collect the per-dataset records into a table keyed by dataset name.
summary_df = pd.DataFrame(summaries).set_index("Name")

# Human-readable labels, e.g. "breast_cancer" -> "Breast cancer".
summary_df.index = [
    label.replace("_", " ").capitalize() for label in summary_df.index
]

# Field names become captions, e.g. "No_classes" -> "No. classes": a full stop
# is inserted after any "o" that precedes an underscore, then underscores
# become spaces.
summary_df.columns = [
    field.replace("o_", "o._").replace("_", " ") for field in summary_df.columns
]

# Write the table transposed: one column per dataset, one row per statistic.
summary_df.T.to_latex("../tex/dataset_summary.tex")