In [10]:
import pprint
from collections import namedtuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tqdm
from kmodes.kmodes import KModes
from yellowbrick.utils import KneeLocator

%matplotlib inline

In [2]:
# One record per dataset: its name, the raw row/column/class counts, whether
# any row has a missing value, and the row/class counts once those rows are
# dropped.
Summary = namedtuple(
    "Summary",
    "name num_rows num_cols num_classes missing_values "
    "adjusted_num_rows adjusted_num_classes",
)

# Build one Summary per dataset: the raw shape and class count, plus the same
# figures after discarding every row that contains a missing feature value.
summaries = []
for dataset_name in ("breast_cancer", "mushroom", "soybean", "nursery"):

    # "?" and "dna" are read as NaN so that dropna() below removes those rows.
    df = pd.read_csv(f"../data/{dataset_name}.csv", na_values=["?", "dna"])

    # Split the label column off from the feature columns.
    classes = df["class"]
    df = df.drop("class", axis=1)

    dropped_df = df.dropna()
    missing_values = len(dropped_df) != len(df)

    nrows, ncols = df.shape
    nclasses = classes.nunique()
    adjusted_nrows = len(dropped_df)
    # dropped_df.index holds index *labels* of the surviving rows, so the
    # matching class values must be looked up by label (.loc), not by position
    # (.iloc). The two only coincide while the frame keeps a default RangeIndex.
    adjusted_nclasses = classes.loc[dropped_df.index].nunique()

    summaries.append(
        Summary(
            dataset_name,
            nrows,
            ncols,
            nclasses,
            missing_values,
            adjusted_nrows,
            adjusted_nclasses,
        )
    )


In [3]:
# Tabulate the per-dataset summaries, keyed by dataset name.
summary_df = pd.DataFrame(summaries).set_index("name")

# Turn snake_case identifiers into readable labels ("breast_cancer" ->
# "Breast cancer", "num_rows" -> "No. rows"). Plain lists are assigned so the
# resulting index carries no name.
summary_df.index = [
    dataset.replace("_", " ").capitalize() for dataset in summary_df.index
]
summary_df.columns = [
    field.replace("num", "no.").replace("_", " ").capitalize()
    for field in summary_df.columns
]

# Export the table for the write-up, then display it as the cell output.
summary_df.to_latex("../tex/dataset_summary.tex")
summary_df


Unnamed: 0,No. rows,No. cols,No. classes,Missing values,Adjusted no. rows,Adjusted no. classes
Breast cancer,699,10,2,True,683,2
Mushroom,8124,22,2,True,5644,2
Soybean,307,35,19,True,266,15
Nursery,12960,8,5,False,12960,5


In [7]:
# Elbow-curve costs: for every dataset, fit k-modes for each k from 2 up to
# floor(sqrt(n_rows)) and record the final clustering cost of each fit.
#
# The sweep is slow (tens of minutes for the larger datasets), so each
# dataset's costs are cached under ../data/elbow/ and reloaded when the file
# exists. Delete a cached CSV to force that dataset to be recomputed.
dataset_costs = {}
for dataset_name in ("breast_cancer", "mushroom", "soybean", "nursery"):

    cost_path = f"../data/elbow/{dataset_name}_costs.csv"
    try:
        # Cache hit: reuse the saved costs instead of refitting every model.
        dataset_costs[dataset_name] = pd.read_csv(cost_path)["cost"].tolist()
        continue
    except FileNotFoundError:
        pass  # no cached costs yet; compute them below

    df = pd.read_csv(f"../data/{dataset_name}.csv", na_values=["?", "dna"])
    df = df.drop("class", axis=1).dropna()

    costs = []
    cluster_range = range(2, int(np.sqrt(len(df))) + 1)
    for k in tqdm.tqdm(cluster_range):
        # Fixed init method and random_state so repeated runs use the same settings.
        km = KModes(n_clusters=k, init="cao", random_state=0).fit(df)
        costs.append(km.cost_)

    dataset_costs[dataset_name] = costs

    # Store the k-range next to the costs so row i can be mapped back to
    # k = min_nclusters + i when the file is read later.
    cost_df = pd.DataFrame(
        {"cost": costs, "min_nclusters": min(cluster_range), "max_nclusters": max(cluster_range)}
    )
    cost_df.to_csv(cost_path, index=False)

100%|██████████| 25/25 [00:10<00:00,  2.40it/s]
100%|██████████| 74/74 [14:41<00:00, 11.91s/it]
100%|██████████| 15/15 [00:04<00:00,  3.68it/s]
100%|██████████| 112/112 [35:54<00:00, 19.24s/it]


In [24]:
# Write each dataset's costs to ../data/elbow/<name>_costs.csv as a single
# "cost" column, without the row index.
# NOTE(review): relies on `dataset_costs` already being populated in the
# kernel; confirm the cell that builds it has run before this one.
for name, costs in dataset_costs.items():
    cost_df = pd.DataFrame(data={"cost": costs})
    cost_df.to_csv(path_or_buf=f"../data/elbow/{name}_costs.csv", index=False)