In [1]:
from collections import namedtuple
import os
import time
import warnings

from kmodes.kmodes import KModes
from kmodes.util import dissim

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm


In [2]:
# One row of experiment output: the outcome of a single clustering run for a
# given number of clusters, initialisation method and seed. Field order matters
# because `run_experiment` fills the record positionally.
Record = namedtuple(
    "Record",
    "n_clusters initialisation seed initial_cost final_cost n_iterations time",
)


def find_clustering(data, n_clusters, initialisation, seed):
    """Fit a single KModes model and report its costs, iterations and run time.

    Parameters
    ----------
    data
        The observations handed to ``KModes.fit``.
    n_clusters
        Number of clusters to fit.
    initialisation
        Value passed as the ``init`` argument of ``KModes``.
    seed
        Value passed as ``random_state``.

    Returns
    -------
    tuple
        ``(initial_cost, final_cost, n_iterations, elapsed)`` where
        ``initial_cost`` is the first entry of ``epoch_costs_``, ``final_cost``
        is ``cost_``, ``n_iterations`` is ``n_iter_`` and ``elapsed`` is the
        ``time.perf_counter`` difference around construction and fitting.
    """
    # The clock covers both building the estimator and fitting it.
    tic = time.perf_counter()
    model = KModes(n_clusters, init=initialisation, n_init=1, random_state=seed)
    model.fit(data)
    toc = time.perf_counter()

    first_epoch_cost = model.epoch_costs_[0]
    return first_epoch_cost, model.cost_, model.n_iter_, toc - tic


def run_experiment(dataset, initialisation, repetitions, n_clusters=None):
    """Repeat one clustering set-up over several seeds and tabulate the results.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a ``"class"`` column; that column is dropped before
        clustering and only used to pick a default number of clusters.
    initialisation
        Initialisation method forwarded to ``find_clustering``.
    repetitions : int
        Number of runs; the seeds used are ``0, 1, ..., repetitions - 1``.
    n_clusters : int, optional
        Number of clusters. Defaults to the number of distinct values in the
        ``"class"`` column.

    Returns
    -------
    pandas.DataFrame
        One row per seed with the ``Record`` fields as columns. The columns are
        present even when ``repetitions`` is 0.
    """
    data = dataset.drop("class", axis=1)
    n_clusters = dataset["class"].nunique() if n_clusters is None else n_clusters

    results = []
    for seed in tqdm.tqdm(range(repetitions)):
        # Bound to `elapsed` rather than `time` so the imported `time` module
        # is not shadowed inside this function.
        initial_cost, final_cost, n_iter, elapsed = find_clustering(
            data, n_clusters, initialisation, seed
        )
        results.append(
            Record(
                n_clusters,
                initialisation,
                seed,
                initial_cost,
                final_cost,
                n_iter,
                elapsed,
            )
        )

    # Explicit columns keep the schema stable for an empty result list; without
    # them an empty list would yield a frame with no columns at all.
    return pd.DataFrame(results, columns=list(Record._fields))


def main(name, repetitions=50, root="../data/", destination=None, n_clusters=None):
    """Run the initialisation comparison for one dataset and optionally save it.

    Reads ``{root}{name}.csv`` (treating ``"?"`` and ``"dna"`` as missing),
    drops every row with a missing value, runs ``run_experiment`` once for each
    of the ``"cao"``, ``"huang"`` and ``"matching"`` initialisations and stacks
    the three result frames.

    Parameters
    ----------
    name : str
        Dataset name; used for both the input and the output file name.
    repetitions : int
        Number of seeded runs per initialisation.
    root : str
        Prefix of the input path. It is concatenated directly with the file
        name, so a directory must end with a path separator.
    destination : str, optional
        Prefix of the output path, concatenated directly with
        ``"{name}_results.csv"`` (so a directory must end with a path
        separator). When ``None`` nothing is written.
    n_clusters : int, optional
        Forwarded to ``run_experiment``.

    Returns
    -------
    pandas.DataFrame
        The concatenated results of all three initialisations.
    """
    data = pd.read_csv(f"{root}{name}.csv", na_values=["?", "dna"])
    dataset = data.dropna()

    output_path = None
    if destination is not None:
        output_path = destination + f"{name}_results.csv"
        # Create the output directory before the (slow) experiments start, so
        # the final write cannot fail on a missing directory and discard them.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    dfs = [
        run_experiment(dataset, initialisation, repetitions, n_clusters)
        for initialisation in ("cao", "huang", "matching")
    ]

    df = pd.concat(dfs, axis=0, ignore_index=True)
    if output_path is not None:
        df.to_csv(output_path, index=False)

    return df


In [3]:
# Cluster counts per dataset (elbow method from Cao's clustering), listed in
# the same order as the dataset names below.
optimal_nclusters = (10, 4, 6, 6)
dataset_names = ("breast_cancer", "mushroom", "soybean", "zoo")

# Run the comparison for every dataset; `main` writes each result table to a
# CSV file whose path starts with the given destination.
for name, n_clusters in zip(dataset_names, optimal_nclusters):
    main(name, root="../data/", destination="../data/elbow/", n_clusters=n_clusters)


100%|██████████| 50/50 [00:18<00:00,  2.71it/s]
100%|██████████| 50/50 [00:12<00:00,  3.99it/s]
100%|██████████| 50/50 [00:10<00:00,  4.85it/s]
100%|██████████| 50/50 [01:33<00:00,  1.87s/it]
100%|██████████| 50/50 [02:12<00:00,  2.66s/it]
100%|██████████| 50/50 [01:32<00:00,  1.84s/it]
100%|██████████| 50/50 [00:11<00:00,  4.19it/s]
100%|██████████| 50/50 [00:11<00:00,  4.34it/s]
100%|██████████| 50/50 [00:07<00:00,  6.63it/s]
100%|██████████| 50/50 [00:02<00:00, 22.47it/s]
100%|██████████| 50/50 [00:02<00:00, 24.35it/s]
100%|██████████| 50/50 [00:01<00:00, 30.90it/s]
