# Generate the $k$-by-classes results

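This notebook contains the code to generate the data used in Section 3.2. For each dataset, the number of clusters $k$ is set to the number of distinct classes in the data, and the clustering is repeated over a range of seeds for each initialisation method.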

In [None]:
from collections import namedtuple
import time
import warnings

from kmodes.kmodes import KModes
from kmodes.util import dissim

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm


In [None]:
# One row of the results table: the outcome of a single clustering run.
Record = namedtuple(
    "Record",
    (
        "n_clusters",  # number of clusters requested for the run
        "initialisation",  # name of the initialisation method used
        "seed",  # random seed of the run
        "initial_cost",  # first entry of the fitted model's epoch costs
        "final_cost",  # last entry of the fitted model's epoch costs
        "n_iterations",  # iteration count reported by the fitted model
        "time",  # wall-clock duration of the run (time.perf_counter units)
    ),
)


def find_clustering(data, n_clusters, initialisation, seed):
    """Fit a single k-modes model and report its costs, iterations and runtime.

    Parameters
    ----------
    data
        The observations to cluster; passed unchanged to ``KModes.fit``.
    n_clusters
        Number of clusters to request from ``KModes``.
    initialisation
        Value forwarded as the ``init`` argument of ``KModes``.
    seed
        Value forwarded as the ``random_state`` argument of ``KModes``.

    Returns
    -------
    tuple
        ``(initial_cost, final_cost, n_iterations, elapsed)`` where the two
        costs are the first and last entries of the model's ``epoch_costs_``,
        ``n_iterations`` is its ``n_iter_`` and ``elapsed`` is the difference
        between two ``time.perf_counter`` readings taken around model
        construction and fitting.
    """
    began = time.perf_counter()

    # A single initialisation per call so that each seed maps to one run.
    model = KModes(n_clusters, init=initialisation, n_init=1, random_state=seed)
    model.fit(data)

    elapsed = time.perf_counter() - began

    costs = model.epoch_costs_
    return costs[0], costs[-1], model.n_iter_, elapsed


def run_experiment(data, initialisation, repetitions):
    """Cluster ``data`` once per seed with one initialisation and tabulate the runs.

    Rows containing missing values are dropped, the number of clusters is set
    to the number of distinct values in the ``"class"`` column, and that
    column is removed before clustering.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset that includes a ``"class"`` column.
    initialisation
        Initialisation method, forwarded to ``find_clustering``.
    repetitions : int
        Number of runs; the seeds used are ``0, 1, ..., repetitions - 1``.

    Returns
    -------
    pandas.DataFrame
        One row per seed with the fields of ``Record`` as columns. The columns
        are present even when ``repetitions`` is zero.
    """
    data = data.dropna()
    n_clusters = data["class"].nunique()
    data = data.drop("class", axis=1)

    results = []
    for seed in tqdm.tqdm(range(repetitions)):
        # Bound to `elapsed` rather than `time` so the imported `time` module
        # is not shadowed inside this function.
        initial_cost, final_cost, n_iter, elapsed = find_clustering(
            data, n_clusters, initialisation, seed
        )
        results.append(
            Record(
                n_clusters,
                initialisation,
                seed,
                initial_cost,
                final_cost,
                n_iter,
                elapsed,
            )
        )

    # Naming the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=Record._fields)


def main(name, repetitions=250, root="../data/", destination=None):
    """Run every initialisation on one dataset and optionally save the results.

    Parameters
    ----------
    name : str
        Dataset name; the file ``<root>/<name>.csv`` is read.
    repetitions : int, default 250
        Number of seeded runs per initialisation.
    root : str, default "../data/"
        Directory containing the dataset CSV. A trailing separator is optional.
    destination : str or None, default None
        Directory in which ``<name>_results.csv`` is written. It is created if
        it does not exist. Nothing is written when ``None``.

    Returns
    -------
    pandas.DataFrame
        The concatenated results of the "cao", "huang" and "matching"
        initialisations, in that order, with a fresh integer index.
    """
    import os  # local import keeps this block self-contained

    # "?" and "dna" are read as missing values.
    data = pd.read_csv(os.path.join(root, f"{name}.csv"), na_values=["?", "dna"])

    dfs = [
        run_experiment(data, initialisation, repetitions)
        for initialisation in ("cao", "huang", "matching")
    ]
    df = pd.concat(dfs, axis=0, ignore_index=True)

    if destination is not None:
        # An empty string means the current directory, which needs no creation.
        if destination:
            os.makedirs(destination, exist_ok=True)
        # os.path.join avoids gluing the file name onto the directory name
        # when `destination` has no trailing separator.
        df.to_csv(os.path.join(destination, f"{name}_results.csv"), index=False)

    return df


In [None]:
# Run the full experiment for each dataset in turn, printing the dataset name
# as a progress marker and saving each result table under ../data/nclasses/.
for name in ("breast_cancer", "mushroom", "nursery", "soybean"):
    print(name)
    main(
        name,
        root="../data/",
        destination="../data/nclasses/",
    )
