# Write results to `tex` files
---

This notebook contains the workings out for generating our results in the $k$-modes write-up.

We will examine a selection of well-known categorical datasets and run our algorithm against each of them to assess its performance in:

- **clustering** (using the objective function)
- **predictive modelling** (typical classification-esque metrics)
- computational speed

### Each set of results will be found by manipulating some `pandas.DataFrame` and then written to a `tex` file.

In [17]:
from kmodes.kmodes import KModes
from sklearn.model_selection import train_test_split
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, \
                            homogeneity_score, completeness_score, classification_report

import pandas as pd
import numpy as np
import time

# Read in datasets

In [58]:
# Load the five datasets from CSV. Only the soybean file is read with '?'
# registered as a missing-value marker, so its '?' entries become NaN.
soybean = pd.read_csv('../data/soybean.csv', na_values='?')
mushroom = pd.read_csv('../data/mushroom.csv')
zoo = pd.read_csv('../data/zoo.csv')
breast_cancer = pd.read_csv('../data/breast_cancer.csv')
vehicle = pd.read_csv('../data/vehicle.csv')

# Drop every row that contains at least one missing value. `inplace=True`
# mutates the frames bound to the names above rather than creating copies.
for dataset in [soybean, mushroom, zoo, breast_cancer, vehicle]:
    dataset.dropna(axis=0, inplace=True)

# Define functions to get results tables for each approach

In [63]:
def get_predictive_results(X, y, init, n_clusters, seed):
    """Fit k-modes on a stratified training split and score it on the held-out split.

    Parameters
    ----------
    X : array-like
        Data to cluster; forwarded to ``train_test_split`` and ``KModes``.
    y : array-like
        Labels used both to stratify the split and as the reference
        labelling for the agreement scores.
    init : str
        Initialisation name passed to ``KModes``. It is also reported in the
        result with underscores replaced by spaces and title-cased.
    n_clusters : int
        Number of clusters to fit.
    seed : int
        Seeds NumPy's global RNG and the train/test split.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``''``) with the columns 'Initialisation',
        'Adjusted Rand index', 'Adjusted mutual information', 'Homogeneity',
        'Completeness' and 'Time taken (s)'.
    """
    # KModes is built without an explicit random_state below, so the global
    # NumPy RNG is seeded here.
    # NOTE(review): presumably KModes draws from the global RNG in that case
    # -- confirm against the kmodes documentation.
    np.random.seed(seed)

    # 70/30 split that keeps the class proportions of `y` in both parts.
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=seed,
                                                        stratify=y)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time. The timed region covers both
    # fitting and predicting.
    start = time.perf_counter()
    km = KModes(n_clusters, init=init, n_init=25)
    km.fit(X_train)
    y_pred = km.predict(X_test)
    time_taken = time.perf_counter() - start

    result = pd.DataFrame({'Initialisation': init.replace('_', ' ').title(),
                           'Adjusted Rand index': adjusted_rand_score(y_test, y_pred),
                           'Adjusted mutual information': adjusted_mutual_info_score(y_test, y_pred),
                           'Homogeneity': homogeneity_score(y_test, y_pred),
                           'Completeness': completeness_score(y_test, y_pred),
                           'Time taken (s)': time_taken}, index=[''])
    return result

def get_clustering_results(data, init, n_clusters, seed):
    """Cluster a labelled dataset with k-modes and report cost, iterations and time.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset including a 'class' column. That column is dropped before
        clustering; the caller's frame is not modified.
    init : str
        Initialisation name passed to ``KModes``. It is also reported in the
        result with underscores replaced by spaces and title-cased.
    n_clusters : int
        Number of clusters to fit.
    seed : int
        Seeds NumPy's global RNG before fitting.

    Returns
    -------
    pandas.DataFrame
        A single row (index ``''``) with the columns 'Initialisation',
        'Objective function' (``km.cost_``), 'No. of iterations'
        (``km.n_iter_``) and 'Time taken (s)'.
    """
    # KModes is built without an explicit random_state below, so the global
    # NumPy RNG is seeded here.
    # NOTE(review): presumably KModes draws from the global RNG in that case
    # -- confirm against the kmodes documentation.
    np.random.seed(seed)

    # Rebinds the local name only; `drop` returns a new frame.
    data = data.drop('class', axis=1)

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start = time.perf_counter()
    km = KModes(n_clusters, init=init, n_init=25)
    # The predicted labels are not needed here, only the fitted attributes.
    km.fit_predict(data)
    time_taken = time.perf_counter() - start

    result = pd.DataFrame({'Initialisation': init.replace('_', ' ').title(),
                           'Objective function': km.cost_,
                           'No. of iterations': km.n_iter_,
                           'Time taken (s)': time_taken}, index=[''])
    return result

def get_init_results(data, init, result_type, max_seed):
    """Repeat one experiment type over seeds ``0 .. max_seed - 1`` and stack the rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Dataset with a 'class' column. The number of clusters is taken to be
        the number of distinct values in that column.
    init : str
        Initialisation name forwarded to the per-seed result function.
    result_type : {'clustering', 'prediction'}
        Which experiment to run for each seed.
    max_seed : int
        Number of seeds to run.

    Returns
    -------
    pandas.DataFrame
        The per-seed single-row frames concatenated in seed order.

    Raises
    ------
    ValueError
        If ``result_type`` is neither 'clustering' nor 'prediction'.
    """
    X = data.drop('class', axis=1)
    y = data['class']
    n_clusters = len(np.unique(y))

    if result_type == 'clustering':
        # get_clustering_results drops the 'class' column itself, so it is
        # handed the full frame.
        def run(seed):
            return get_clustering_results(data, init, n_clusters, seed)
    elif result_type == 'prediction':
        # get_predictive_results takes features and labels separately. The
        # previous call passed `data` alone, leaving the function one
        # positional argument short.
        def run(seed):
            return get_predictive_results(X, y, init, n_clusters, seed)
    else:
        raise ValueError('result_type must be one of "clustering" or "prediction"')

    return pd.concat([run(seed) for seed in range(max_seed)])

def format_results(results):
    """Summarise per-run results as a table of mean and std per initialisation.

    `results` is grouped on its 'Initialisation' column. The returned frame
    has one row per remaining column of `results` and a two-level column
    index of (initialisation, 'mean' | 'std'), rounded to four decimals.
    """
    by_init = results.groupby('Initialisation')

    labelled = []
    for label, summary in (('mean', by_init.mean()), ('std', by_init.std())):
        summary = summary.reset_index()
        # The unnamed column records which statistic each row holds.
        summary[''] = label
        labelled.append(summary)

    stacked = pd.concat(labelled)
    # Each (initialisation, statistic) pair occurs once, so the sum only
    # moves both keys into the index.
    table = stacked.groupby(['Initialisation', '']).sum()
    return table.T.round(4)

# Collect results

In [67]:
# Number of seeds (0 .. max_seed - 1) run for each initialisation method.
max_seed = 25

# One frame of per-seed clustering results on soybean for every initialisation.
clust_results = [
    get_init_results(soybean, init, 'clustering', max_seed)
    for init in ('cao', 'huang', 'random',
                 'matching_best', 'matching_worst', 'matching_random')
]

# Stack the per-initialisation frames and summarise them as mean/std columns.
cluster_results = pd.concat(clust_results)
result = format_results(cluster_results)

# TODO: the predictive results and the LaTeX export are not wired in yet.
# The original placeholders were:
#     pred_results.append(pred_result)
#     predict_results = pd.concat(pred_results)
#     latex_formatting(predict_results).to_latex('../test/soybean_pred_results.tex')

In [68]:
# Display the formatted mean/std summary of the soybean clustering runs.
result

Initialisation,Cao,Cao,Huang,Huang,Matching Best,Matching Best,Matching Random,Matching Random,Matching Worst,Matching Worst,Random,Random
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
No. of iterations,2.0,0.0,4.16,0.7461,4.56,1.044,3.0,0.0,4.4,0.866,4.0,1.0408
Objective function,1314.0,0.0,1328.6,22.8236,1330.96,23.5221,1358.48,2.6,1322.12,18.9326,1283.0,14.7366
Time taken (s),6.6685,0.6718,8.6313,0.5054,6.1882,0.2343,6.5231,0.4726,6.6328,0.2927,2.6883,0.1467
