# K-Modes
* Outlook (self-learning): K-prototypes; a combination of K-Means and K-Modes

In [1]:
import numpy as np
import pandas as pd

In [2]:
def summed_mismatches(x1, x2):
    """Return the number of positions at which ``x1`` and ``x2`` differ.

    This is the dissimilarity measure used by K-Modes: the element-wise
    comparison yields a boolean mask, and summing it counts the mismatches.
    """
    mismatch_mask = x1 != x2
    return np.sum(mismatch_mask)

def get_assignments(X, c):
    """Assign every row of ``X`` to its closest cluster centre.

    The distance between a row and a centre is the number of attributes in
    which they differ (the same measure as ``summed_mismatches``). When
    several centres are equally close, the one with the lowest index wins.

    Parameters
    ----------
    X : np.ndarray
        2-D array with one data point per row.
    c : np.ndarray
        2-D array with one cluster centre per row and the same number of
        columns as ``X``.

    Returns
    -------
    C : list of list of int
        ``C[k]`` holds the row indices of ``X`` assigned to centre ``k``.
    cost : int
        Sum over all rows of the distance to the assigned centre.

    Raises
    ------
    ValueError
        If there are rows to assign but ``c`` contains no centres.
    """
    M = X.shape[0]
    K = c.shape[0]
    if M > 0 and K == 0:
        raise ValueError("at least one cluster centre is required")

    C = [[] for _ in range(K)]
    cost = 0
    for i in range(M):
        # Broadcasting compares row i against all K centres at once; summing
        # over the attribute axis gives one mismatch count per centre.
        dists = np.sum(X[i] != c, axis=1)
        # argmin returns the first minimum, so ties go to the lowest index.
        best_k = int(np.argmin(dists))
        cost += dists[best_k]
        C[best_k].append(i)
    return C, cost

def get_modes(X, C, c_prev=None):
    """Compute the mode (most frequent value) per attribute for each cluster.

    Parameters
    ----------
    X : np.ndarray
        2-D array with one data point per row.
    C : list of list of int
        ``C[k]`` holds the row indices of ``X`` that belong to cluster ``k``.
    c_prev : array-like, optional
        Previous cluster centres, one per row. When given, an empty cluster
        keeps its previous centre instead of raising an error.

    Returns
    -------
    np.ndarray
        Object array of shape ``(len(C), X.shape[1])`` holding the new
        cluster centres.

    Raises
    ------
    ValueError
        If a cluster is empty and ``c_prev`` is not provided.
    """
    K = len(C)
    D = X.shape[1]
    # Object dtype so the centres can hold arbitrary values (like strings).
    c = np.empty((K, D), "object")
    for k, members in enumerate(C):
        if len(members) == 0:
            # The mode of an empty set is undefined.
            if c_prev is None:
                raise ValueError(
                    "cluster {} is empty; pass c_prev to keep its previous "
                    "centre".format(k)
                )
            c[k] = c_prev[k]
            continue
        for d in range(D):
            # np.unique returns the unique values sorted, so on a tie in the
            # counts argmax deterministically picks the smallest value.
            uniques, counts = np.unique(X[members, d], return_counts=True)
            c[k, d] = uniques[np.argmax(counts)]
    return c

def kmodes(X, c, max_epochs=100, verbose=False):
    """Cluster the rows of ``X`` with the K-Modes algorithm.

    Each epoch assigns every row to its closest centre and then recomputes
    each centre as the per-attribute mode of its cluster. Iteration stops as
    soon as the assignments no longer change, or after ``max_epochs`` epochs.

    Parameters
    ----------
    X : np.ndarray
        2-D array with one data point per row.
    c : np.ndarray
        Initial cluster centres, one per row. The array is never modified;
        the centres are rebound to freshly computed arrays.
    max_epochs : int, optional
        Maximum number of epochs to run; must be at least 1.
    verbose : bool, optional
        If true, print the assignments and centres after every epoch.

    Returns
    -------
    t : int
        Zero-based index of the last epoch that was run.
    C : list of list of int
        Final assignments; ``C[k]`` holds the row indices of cluster ``k``.
    c : np.ndarray
        Final cluster centres.
    cost : int
        Total mismatch cost from the last assignment step. It is measured
        against the centres used for that step, which equal the returned
        centres once the algorithm has converged.

    Raises
    ------
    ValueError
        If ``max_epochs`` is smaller than 1.
    """
    if max_epochs < 1:
        raise ValueError("max_epochs must be at least 1")

    # There is no previous assignment before the first epoch; a list never
    # compares equal to None, so the convergence test cannot fire at t == 0.
    C_prev = None
    for t in range(max_epochs):
        C, cost = get_assignments(X, c)
        c = get_modes(X, C)
        if verbose:
            print("t =", t, "assignments:\n", C, "\ncluster centres:\n", c)
        if C == C_prev:
            break  # converged: assignments are unchanged since last epoch
        # get_assignments builds new lists on every call, so keeping a
        # reference is enough; no copy is needed.
        C_prev = C
    return t, C, c, cost

In [3]:
# Toy data: not representative of our population in any way...
# Each row is one person: [marital status, academic degree]. Note that "None"
# is an ordinary string category here, not Python's None.
data = [["married", "PhD"], 
        ["single", "PhD"], 
        ["married", "None"], 
        ["divorced", "Master"], 
        ["divorced", "None"], 
        ["single", "Bachelor"], 
        ["married", "Bachelor"], 
        ["single", "Bachelor"], 
        ["divorced", "None"], 
        ["single", "None"]]
df = pd.DataFrame(data, columns=["marital","academic degree"])
# Bare expression: the notebook displays the DataFrame as the cell output.
df

Unnamed: 0,marital,academic degree
0,married,PhD
1,single,PhD
2,married,None
3,divorced,Master
4,divorced,None
5,single,Bachelor
6,married,Bachelor
7,single,Bachelor
8,divorced,None
9,single,None


In [4]:
# Example assignments and modes
# Convert the DataFrame to a NumPy array with one row per person.
X = np.array(df)
# Hand-picked assignment of row indices to two clusters; the trailing comment
# lists the modes expected for the first and the second cluster.
C = [[0,1,2,5,6,7,9], [3,4,8]] # single + Bachelor; divorced + None
c = get_modes(X, C)
print(c)

[['single' 'Bachelor']
 ['divorced' 'None']]


In [5]:
# Initialise the cluster centres. Rows 0 and 3 of X are picked by hand here
# rather than at random.
# Random alternative (M = number of rows and K = number of clusters would
# have to be defined first):
# ind = np.random.choice(np.arange(M), K, replace=False)
# c = X[ind,]
c = np.array([X[0], X[3]])
print("Initial cluster centres:\n", c)

Initial cluster centres:
 [['married' 'PhD']
 ['divorced' 'Master']]


In [6]:
# Run K-Modes from the initial centres, printing the assignments and centres
# after every epoch; the notebook then displays the returned
# (t, C, c, cost) tuple.
kmodes(X, c, verbose=True)

t = 0 assignments:
 [[0, 1, 2, 5, 6, 7, 9], [3, 4, 8]] 
cluster centres:
 [['single' 'Bachelor']
 ['divorced' 'None']]
t = 1 assignments:
 [[0, 1, 5, 6, 7, 9], [2, 3, 4, 8]] 
cluster centres:
 [['single' 'Bachelor']
 ['divorced' 'None']]
t = 2 assignments:
 [[0, 1, 5, 6, 7, 9], [2, 3, 4, 8]] 
cluster centres:
 [['single' 'Bachelor']
 ['divorced' 'None']]


(2,
 [[0, 1, 5, 6, 7, 9], [2, 3, 4, 8]],
 array([['single', 'Bachelor'],
        ['divorced', 'None']], dtype=object),
 7)