# Write results to `tex` files
---

This notebook contains the workings out for generating our results in the $k$-modes write-up.

We will examine a selection of well-known categorical datasets and run our algorithm against them to test its ability in:

- **clustering** (using the objective function)
- **predictive modelling** (typical classification-esque metrics)
- computational speed

### Each set of results will be found by manipulating some `pandas.DataFrame` and then written to a `tex` file.

In [1]:
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, \
                            homogeneity_score, completeness_score

import pandas as pd
import numpy as np
import time

# Read in datasets

In [101]:
# Load each benchmark dataset and immediately drop every row that contains a
# missing value. Chaining `dropna` (rather than mutating the frames in place
# inside a loop) binds each name exactly once to its cleaned frame, so the
# cell is trivially re-runnable and leaves no stray loop variable behind.
# Only the soybean file is read with '?' mapped to NaN.
soybean = pd.read_csv('../data/soybean.csv', na_values='?').dropna(axis=0)
mushroom = pd.read_csv('../data/mushroom.csv').dropna(axis=0)
zoo = pd.read_csv('../data/zoo.csv').dropna(axis=0)
breast_cancer = pd.read_csv('../data/breast_cancer.csv').dropna(axis=0)
vehicle = pd.read_csv('../data/vehicle.csv').dropna(axis=0)

# Define functions to get results tables for each approach

In [103]:
def split(data):
    """Separate a dataset into its attribute matrix and its label vector.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a column named 'class'.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the values of every column except
        'class' and ``y`` holds the values of the 'class' column.
    """
    labels = data['class']
    features = data.drop(columns='class')

    return features.values, labels.values

def get_predictive_results(X, y, init, n_clusters, seed):
    """Fit k-modes on a training split and score its labelling of the test split.

    The global NumPy random state is seeded with ``seed`` and the data is
    divided into a stratified 70/30 train/test split using the same seed.
    A ``KModes`` model (25 initialisations) is fitted on the training part
    and used to assign the test rows to clusters; those assignments are then
    compared with the true test labels.

    Parameters
    ----------
    X : array-like
        Attribute matrix.
    y : array-like
        Class labels aligned with the rows of ``X``.
    init : str
        Initialisation method passed to ``KModes``.
    n_clusters : int
        Number of clusters to fit.
    seed : int
        Seed for both the global NumPy random state and the split.

    Returns
    -------
    pandas.DataFrame
        A single row (indexed by the empty string) holding the
        initialisation name, adjusted Rand index, adjusted mutual
        information, homogeneity, completeness and the elapsed time in
        seconds for fitting plus predicting.
    """

    np.random.seed(seed)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=seed,
                                                        stratify=y)

    # `time.clock()` was removed in Python 3.8; `time.perf_counter()` is the
    # high-resolution replacement for measuring elapsed time.
    start = time.perf_counter()
    km = KModes(n_clusters, init=init, n_init=25)
    km.fit(X_train)
    y_pred = km.predict(X_test)
    time_taken = time.perf_counter() - start

    adj_rand_idx = adjusted_rand_score(y_test, y_pred)
    adj_mutual_info = adjusted_mutual_info_score(y_test, y_pred)
    homogeneity = homogeneity_score(y_test, y_pred)
    completeness = completeness_score(y_test, y_pred)

    result = pd.DataFrame({'initialisation': init,
                           'adjusted_rand_index': adj_rand_idx,
                           'adjusted_mutual_information': adj_mutual_info,
                           'homogeneity': homogeneity,
                           'completeness': completeness,
                           'time': time_taken}, index=[''])
    return result

def get_clustering_results(data, init, n_clusters, seed):
    """Cluster a whole dataset with k-modes and record its fit statistics.

    The global NumPy random state is seeded with ``seed``, the 'class'
    column is removed, and a ``KModes`` model (25 initialisations) is run on
    the remaining columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing a 'class' column, which is dropped before fitting.
    init : str
        Initialisation method passed to ``KModes``.
    n_clusters : int
        Number of clusters to fit.
    seed : int
        Seed for the global NumPy random state.

    Returns
    -------
    pandas.DataFrame
        A single row (indexed by the empty string) holding the
        initialisation name, the model's ``cost_`` (as 'objective'), its
        ``n_iter_`` (as 'iterations') and the elapsed time in seconds.
    """

    np.random.seed(seed)

    data = data.drop('class', axis=1)

    # `time.clock()` was removed in Python 3.8; `time.perf_counter()` is the
    # high-resolution replacement for measuring elapsed time.
    start = time.perf_counter()
    km = KModes(n_clusters, init=init, n_init=25)
    km.fit_predict(data)
    time_taken = time.perf_counter() - start

    result = pd.DataFrame({'initialisation': init,
                           'objective': km.cost_,
                           'iterations': km.n_iter_,
                           'time': time_taken}, index=[''])
    return result

def get_init_results(data, init, n_clusters, max_seed):
    """Repeat the clustering and predictive experiments over a range of seeds.

    For every seed in ``range(max_seed)`` the clustering experiment is run on
    the full dataset, followed by the predictive experiment on the
    attribute/label split of the same dataset.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset containing a 'class' column.
    init : str
        Initialisation method forwarded to both experiments.
    n_clusters : int
        Number of clusters forwarded to both experiments.
    max_seed : int
        Number of seeds to run; seeds ``0 .. max_seed - 1`` are used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(clust_results, pred_results)``, each the concatenation of the
        per-seed result rows.
    """
    X, y = split(data)

    # Each entry pairs the clustering row with the predictive row for one
    # seed; the tuple is evaluated left to right, so clustering runs first.
    per_seed = [
        (get_clustering_results(data, init, n_clusters, seed),
         get_predictive_results(X, y, init, n_clusters, seed))
        for seed in range(max_seed)
    ]

    clust_frames = [pair[0] for pair in per_seed]
    pred_frames = [pair[1] for pair in per_seed]

    return pd.concat(clust_frames), pd.concat(pred_frames)

def latex_formatting(results):
    """Summarise results per initialisation as a mean/std table ready for LaTeX.

    Parameters
    ----------
    results : pandas.DataFrame
        Result rows containing an 'initialisation' column plus the metric
        columns to be summarised.

    Returns
    -------
    pandas.DataFrame
        One row per metric and one column per (initialisation, 'mean' | 'std')
        pair, rounded to four decimal places.
    """
    by_init = results.groupby('initialisation')

    summaries = []
    for label, summary in (('mean', by_init.mean()), ('std', by_init.std())):
        summary = summary.reset_index()
        summary[''] = label  # the unnamed column tags the statistic
        summaries.append(summary)

    stacked = pd.concat(summaries)
    # Every (initialisation, statistic) pair occurs once, so the sum simply
    # re-indexes the stacked rows by that pair.
    table = stacked.groupby(['initialisation', '']).sum()

    return table.T.round(4)

# Collect results

In [None]:
%%time

n_clusters = 15
max_seed = 5
inits = ['cao', 'huang', 'random', 'matching_best', 'matching_worst', 'matching_random']

# Run both experiments on the soybean data for every initialisation method.
clust_results, pred_results = [], []
for init in inits:
    clust_result, pred_result = get_init_results(soybean, init, n_clusters, max_seed)
    clust_results.append(clust_result)
    pred_results.append(pred_result)

cluster_results = pd.concat(clust_results)
predict_results = pd.concat(pred_results)

# Write one LaTeX table per experiment. The column format is derived from the
# table itself (one centred column for the row labels plus one per data
# column) so it stays correct if the list of initialisations changes.
for results, tex_path in [(cluster_results, '../test/soybean_clust_results.tex'),
                          (predict_results, '../test/soybean_pred_results.tex')]:
    table = latex_formatting(results)
    table.to_latex(tex_path, column_format='c' * (len(table.columns) + 1))