In [1]:
import pandas as pd
import numpy as np
import itertools
import sympy as sym

from copy import deepcopy
from collections import defaultdict
from kmodes.kmodes import init_huang

In [2]:
def dissim(X, x):
    """Count, for every row of ``X``, how many attributes differ from ``x``.

    ``x`` is compared element-wise against each row of ``X`` and the
    mismatches are summed along axis 1, giving one count per row.
    """
    mismatches = X != x
    return np.sum(mismatches, axis=1)
    
def summed_dissim(X, x):
    """Return the total number of attribute mismatches between ``x`` and all rows of ``X``."""
    per_row = dissim(X, x)
    return np.sum(per_row)

def fraction(A, total=10):
    """Format a count ``A`` as the unevaluated fraction ``A / total``.

    Parameters
    ----------
    A : int
        The count to display.
    total : int, optional
        Denominator of the fraction. Defaults to 10, the value that was
        previously hard-coded (presumably the number of rows in the example
        data set).

    Returns
    -------
    sympy expression or str
        For a non-zero ``A``, the sympified string ``"A / total"`` built with
        ``evaluate=False`` so the fraction is not reduced. For ``A == 0`` the
        plain string ``'0'`` is returned instead of a fraction.
    """
    if A != 0:
        return sym.S(f'{A} / {total}', evaluate=False)
    return f'{A}'

In [71]:
# Load the example data with dtype=object and shift the row labels so they
# start at 1 instead of 0.
df = pd.read_csv('../../data/vehicle.csv', dtype=object)
df.index = df.index + 1

# Array form of the data, taken in the CSV's own column order.
X = df.to_numpy()

In [72]:
# Human-readable labels, assigned by position: this list has to follow the
# current column order of df.
# NOTE: because the columns are reordered right below, running this cell a
# second time without reloading df would attach the labels to the wrong columns.
csv_labels = ['No. doors', 'Eco-friendly', 'Maintenance costs',
              'No. passengers', 'Buying price', 'No. wheels']
df.columns = csv_labels

# Column order used for the exported table.
table_order = ['Buying price', 'Maintenance costs', 'No. doors',
               'No. passengers', 'No. wheels', 'Eco-friendly']
df = df[table_order]

df.to_latex('../../tex/example_table.tex', column_format='ccccccc')

## Relative frequency table

In [73]:
# Count how often each value occurs in every column and place the counts side
# by side: one column per attribute, one row per distinct value.
# `.rename(col)` pins the column label. From pandas 2.0 on, `value_counts()`
# names its result "count", which would give every concatenated column the
# same label.
dfs = [df[col].value_counts().rename(col) for col in df.columns]
# Values that never occur in a column come out of the concat as NaN; turn them
# into 0 and restore the integer dtype.
freq_df = pd.concat(dfs, axis=1).fillna(0).astype(int)

In [74]:
# Frequency table for the two columns that are later reindexed to L/M/H/V.
# `.rename(col)` keeps each count series labelled with its attribute name;
# from pandas 2.0 on, `value_counts()` would otherwise name every series "count".
dfs = [df[col].value_counts().rename(col)
       for col in ['Buying price', 'Maintenance costs']]
num_freq_df = pd.concat(dfs, axis=1).fillna(0).astype(int)

# Frequency table for the remaining four columns, built the same way.
dfs = [df[col].value_counts().rename(col)
       for col in ['No. wheels', 'No. passengers', 'No. doors', 'Eco-friendly']]
cat_freq_df = pd.concat(dfs, axis=1).fillna(0).astype(int)

In [75]:
# Put the rows in the order L, M, H, V. Labels that are not present in the
# table are added as rows of NaN by reindex.
level_order = ['L', 'M', 'H', 'V']
num_freq_df = num_freq_df.reindex(index=level_order)

In [76]:
# Stack the two tables row-wise. Cells that have no count in one of the tables
# become NaN, so fill them with 0 and cast everything back to int.
stacked = pd.concat([cat_freq_df, num_freq_df])
freq_df = stacked.fillna(0).astype(int)

In [77]:
# Display the combined frequency table (rows of cat_freq_df first, then num_freq_df).
freq_df

Unnamed: 0,Buying price,Eco-friendly,Maintenance costs,No. doors,No. passengers,No. wheels
0,0,6,0,2,0,0
1,0,4,0,0,1,0
2,0,0,0,5,2,2
3,0,0,0,0,1,0
4,0,0,0,3,2,7
5,0,0,0,0,3,0
7,0,0,0,0,1,0
8,0,0,0,0,0,1
L,2,0,2,0,0,0
M,2,0,5,0,0,0


In [78]:
# Arrange the frequency columns in the same order as the exported data table.
column_order = ['Buying price', 'Maintenance costs', 'No. doors',
                'No. passengers', 'No. wheels', 'Eco-friendly']
freq_df = freq_df[column_order]

In [79]:
# Display the frequency table again, now with its columns reordered.
freq_df

Unnamed: 0,Buying price,Maintenance costs,No. doors,No. passengers,No. wheels,Eco-friendly
0,0,0,2,0,0,6
1,0,0,0,1,0,4
2,0,0,5,2,2,0
3,0,0,0,1,0,0
4,0,0,3,2,7,0
5,0,0,0,3,0,0
7,0,0,0,1,0,0
8,0,0,0,0,1,0
L,2,2,0,0,0,0
M,2,5,0,0,0,0


In [80]:
# Convert every count into its display form via `fraction`, column by column,
# and export the result as a LaTeX table.
rel_freq_df = pd.DataFrame(
    {col: freq_df[col].apply(fraction) for col in freq_df.columns}
)

rel_freq_df.to_latex('../../tex/relative_freq.tex', column_format='ccccccc')

## Get virtual modes

In [27]:
def get_virtual_modes(X, n_clusters, dissim, df=None):
    """Sample ``n_clusters`` virtual modes from the attribute frequencies of ``X``.

    Each attribute (column) of a mode is drawn independently, with
    replacement, from the values observed in that column, so a value is
    picked with probability proportional to its frequency. Sampling uses the
    global NumPy random state (``np.random.choice``); seed it beforehand for
    reproducible results.

    Parameters
    ----------
    X : numpy.ndarray
        Two-dimensional data array, one row per data point.
    n_clusters : int
        Number of modes to sample.
    dissim : callable
        Unused here; kept so existing calls keep working.
    df : optional
        Unused here; kept so existing calls keep working. The default is
        ``None`` rather than a notebook global, so defining the function no
        longer requires ``df`` to exist.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(n_clusters, X.shape[1])``.
    """
    n_attrs = X.shape[1]
    centroids = np.empty((n_clusters, n_attrs), dtype='object')
    # determine frequencies of attributes
    for iattr in range(n_attrs):
        freq = defaultdict(int)
        for curattr in X[:, iattr]:
            freq[curattr] += 1
        # Sample centroids using the probabilities of attributes.
        # (I assume that's what's meant in the Huang [1998] paper; it works,
        # at least)
        # Note: sampling using population in static list with as many choices
        # as frequency counts. Since the counts are small integers,
        # memory consumption is low.
        choices = [chc for chc, wght in freq.items() for _ in range(wght)]
        # So that we are consistent between Python versions,
        # each with different dict ordering.
        choices = sorted(choices)
        centroids[:, iattr] = np.random.choice(choices, n_clusters)

    return centroids

In [32]:
# Seed the global NumPy RNG so the sampled virtual modes are reproducible,
# then draw three of them.
np.random.seed(0)
mu_tilde = get_virtual_modes(X, n_clusters=3, dissim=dissim)

In [42]:
# Show the sampled virtual modes (one row per mode).
mu_tilde

array([[2, 0, 'L', 4, 'V', 4],
       [0, 1, 'M', 5, 'V', 4],
       [2, 1, 'L', 5, 'H', 4]], dtype=object)

In [33]:
# Show the virtual modes as a table labelled with df's column names.
# NOTE(review): mu_tilde follows the column order of X, which was taken from df
# before its columns were renamed and reordered. When the cells run top to
# bottom, df.columns is already in the reordered order here, so the labels may
# not line up with the values — confirm.
pd.DataFrame(mu_tilde, columns=df.columns)

Unnamed: 0,doors,eco_friendly,maintenance,passengers,price,wheels
0,2,0,L,4,V,4
1,0,1,M,5,V,4
2,2,1,L,5,H,4


In [45]:
# Render the virtual modes as a LaTeX `aligned` equation, one mode per line.
# LaTeX fragments that contain sequences such as `\m`, `\{` or `\ ` are written
# as raw strings: in a normal string literal these are invalid escape sequences
# (a SyntaxWarning on recent Python versions). The generated text is unchanged.
mode_rows = []
for row in mu_tilde:
    # Components of one mode, wrapped in \text{} and separated by ", \ ".
    components = r', \ '.join(r'\text{' + str(compt) + '}' for compt in row)
    mode_rows.append(r' & \left[' + components + r'\right]')

tilde = '\\begin{equation}\n'
tilde += '\\nonumber\n'
tilde += '\\begin{aligned}\n'
tilde += r'\tilde{\mu} = \{ '
# Modes are separated by ", \\ " (a LaTeX line break); joining them avoids
# having to trim a trailing separator afterwards.
tilde += r', \\ '.join(mode_rows)
tilde += r'\} \\ ' + '\n'
tilde += '\\end{aligned}\n'
tilde += '\\end{equation}'

In [46]:
# Inspect the generated LaTeX source for the virtual modes.
tilde

'\\begin{equation}\n\\nonumber\n\\begin{aligned}\n\\tilde{\\mu} = \\{  & \\left[\\text{2}, \\ \\text{0}, \\ \\text{L}, \\ \\text{4}, \\ \\text{V}, \\ \\text{4}\\right], \\\\  & \\left[\\text{0}, \\ \\text{1}, \\ \\text{M}, \\ \\text{5}, \\ \\text{V}, \\ \\text{4}\\right], \\\\  & \\left[\\text{2}, \\ \\text{1}, \\ \\text{L}, \\ \\text{5}, \\ \\text{H}, \\ \\text{4}\\right]\\} \\\\ \n\\end{aligned}\n\\end{equation}'

In [47]:
# Save the LaTeX equation for the virtual modes.
with open('../../tex/huang_virtual_modes.tex', mode='w') as tex_file:
    tex_file.write(tilde)

## Replace the virtual modes $\tilde{\mu}$ with data points

In [83]:
# Work on a copy so that df itself does not gain the extra column.
mu_df = df.copy(deep=True)
# Number of mismatching attributes between every data point and the first
# virtual mode.
mu_df['Dissimilarity'] = dissim(X, mu_tilde[0])

# Export the data points ordered by their dissimilarity.
sorted_mu_df = mu_df.sort_values(by='Dissimilarity')
sorted_mu_df.to_latex('../../tex/huang_dissim.tex', column_format='cccccccc')

In [38]:
# Reset the global NumPy seed to the same value used before sampling the
# virtual modes, then compute the initial modes with kmodes' init_huang.
# NOTE(review): presumably init_huang draws from the global NumPy RNG, so the
# seed makes its result reproducible — confirm against the installed kmodes version.
np.random.seed(0)
mu_bar = init_huang(X, 3, dissim)

In [39]:
# For each initial mode, find the position of the first row of X that equals it
# in every attribute.
idxs = [np.where(np.all(mu == X, axis=1))[0][0] for mu in mu_bar]

# Look the matching rows up in df by position.
init_modes = df.iloc[idxs]

In [40]:
# Display the rows of df that were selected as initial modes.
init_modes

Unnamed: 0,doors,eco_friendly,maintenance,passengers,price,wheels
6,2,1,L,4,M,4
5,2,1,M,5,M,4
4,4,1,L,5,H,4


In [48]:
# Show the array of initial modes returned by init_huang.
mu_bar

array([[2, 1, 'L', 4, 'M', 4],
       [2, 1, 'M', 5, 'M', 4],
       [4, 1, 'L', 5, 'H', 4]], dtype=object)

In [49]:
# Render the initial modes as a LaTeX `aligned` equation, one mode per line.
# LaTeX fragments that contain sequences such as `\m`, `\{` or `\ ` are written
# as raw strings: in a normal string literal these are invalid escape sequences
# (a SyntaxWarning on recent Python versions). The generated text is unchanged.
mode_rows = []
for row in mu_bar:
    # Components of one mode, wrapped in \text{} and separated by ", \ ".
    components = r', \ '.join(r'\text{' + str(compt) + '}' for compt in row)
    mode_rows.append(r' & \left[' + components + r'\right]')

bar = '\\begin{equation}\n'
bar += '\\nonumber\n'
bar += '\\begin{aligned}\n'
bar += r'\bar{\mu} = \{ '
# Modes are separated by ", \\ " (a LaTeX line break); joining them avoids
# having to trim a trailing separator afterwards.
bar += r', \\ '.join(mode_rows)
bar += r'\} \\ ' + '\n'
bar += '\\end{aligned}\n'
bar += '\\end{equation}'

In [50]:
# Save the LaTeX equation for the initial modes.
with open('../../tex/huang_initial_modes.tex', mode='w') as tex_file:
    tex_file.write(bar)