In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(42)

### Task 2.1
* Implementing Kmeans, GMM

In [46]:
def Kmeans(X, k=3, max_iter=1000):
	"""
	X: data
	k: number of clusters
	max_iter: maximum number of iterations
	"""
	key = X[np.random.choice(X.shape[0], k, replace=False)]
	for _ in range(max_iter):
		dis = np.linalg.norm(X[:, None, :] - key[None, :, :], axis=2)
		typ = np.argmin(dis, axis=1)
		key_ = np.array([np.mean(X[typ == i], axis=0) for i in range(k)])
		if np.all(np.linalg.norm(key - key_, axis=1) < 1e-5):
			break
		key = key_
	return typ, key

def __normal_distribution(X, mu, sigma):
	"""
	X: data
	mu: mean
	sigma: covariance matrix
	Note that the result ignored the constant term (2*pi) ** (-0.5)
	"""
	return np.linalg.det(sigma) ** (-0.5) * np.exp(-0.5 * np.sum((X - mu) @ np.linalg.inv(sigma) * (X - mu), axis=1))
def GMM(X, k=3, max_iter=1000):
	"""
	X: data
	k: number of clusters
	max_iter: maximum number of iterations
	"""
	n, d = X.shape
	mu = X[np.random.choice(n, k, replace=False)]
	sigma = np.array([np.eye(d) for _ in range(k)])
	pi = np.ones(k) / k
	for _ in range(max_iter):
		# E step
		p = np.array([pi[i] * __normal_distribution(X, mu[i], sigma[i]) for i in range(k)])
		p /= np.sum(p, axis=0, keepdims=True)
		# M step
		pi = np.mean(p, axis=1)
		mu = np.sum(p[:, :, None] * X[None, :, :], axis=1) / np.sum(p, axis=1, keepdims=True)
		sigma = np.sum(p[:, :, None, None] * (X[None, :, :, None] - mu[:, None, :, None]) * (X[None, :, None, :] - mu[:, None, None, :]), axis=1) / np.sum(p, axis=1).reshape(-1, 1, 1)
	return np.argmax(p, axis=0), mu

* implementing silhouette coefficient, RI, NMI

In [47]:
def silhouette(X, typ):
	"""
	X: data
	typ: cluster type
	"""
	if np.unique(typ).shape[0] == 1:
		return "N/A"
	n = X.shape[0]
	dis = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=2)
	dis_cluster = np.array([np.mean(dis[typ == i], axis=0) for i in range(np.max(typ) + 1)])
	a = np.array([dis_cluster[typ[i], i] for i in range(n)])
	b = np.array([np.min(np.delete(dis_cluster[:, i], typ[i])) for i in range(n)])
	return np.mean((b - a) / np.maximum(a, b))

def rand_index(cls1, cls2):
	"""
	cls1: cluster type 1
	cls2: cluster type 2
	"""
	n = cls1.shape[0]
	a = np.sum((cls1[:, None] == cls1[None, :]) & (cls2[:, None] == cls2[None, :])) - n
	b = np.sum((cls1[:, None] != cls1[None, :]) & (cls2[:, None] != cls2[None, :]))
	return (a + b) / (n * (n - 1))

def __entropy(cls):
	n = cls.shape[0]
	typ = np.unique(cls)
	return -np.sum([np.sum(cls == i) / n * np.log2(np.sum(cls == i) / n) for i in typ])
def __mutual_info(cls1, cls2):
	n = cls1.shape[0]
	typ1, typ2 = np.unique(cls1), np.unique(cls2)
	return np.sum([
		np.sum((cls1 == i) & (cls2 == j)) / n * # P(i, j)
		np.log2(n * np.sum((cls1 == i) & (cls2 == j)) / np.sum(cls1 == i) / np.sum(cls2 == j)) # log(P(i, j) / P(i) / P(j))
		for i in typ1 for j in typ2 if np.sum((cls1 == i) & (cls2 == j)) > 0
	])
def NMI(cls1, cls2):
	"""
	cls1: cluster type 1
	cls2: cluster type 2
	"""
	if np.unique(cls1).shape[0] == 1 or np.unique(cls2).shape[0] == 1:
		return "N/A"
	return __mutual_info(cls1, cls2) / np.sqrt(__entropy(cls1) * __entropy(cls2))

In [48]:
print("Testing \"seeds.csv\"...")
df = pd.read_csv("seeds.csv")
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

for k in range(1, 4):
	typ, key = Kmeans(X, k)
	print(f"k = {k} | K-means | silhouette = {silhouette(X, typ)} | RI = {rand_index(y, typ)} | NMI = {NMI(y, typ)}")
	typ, key = GMM(X, k)
	print(f"k = {k} |   GMM   | silhouette = {silhouette(X, typ)} | RI = {rand_index(y, typ)} | NMI = {NMI(y, typ)}")
	print()

Testing "seeds.csv"...
k = 1 | K-means | silhouette = N/A | RI = 0.33014354066985646 | NMI = N/A
k = 1 |   GMM   | silhouette = N/A | RI = 0.33014354066985646 | NMI = N/A

k = 2 | K-means | silhouette = 0.5228955002005704 | RI = 0.7309637730690363 | NMI = 0.552245032504209
k = 2 |   GMM   | silhouette = 0.46407877368925843 | RI = 0.7356573251310093 | NMI = 0.571759581708631

k = 3 | K-means | silhouette = 0.47570673587797513 | RI = 0.8713602187286398 | NMI = 0.7100683008832274
k = 3 |   GMM   | silhouette = 0.4625726308049252 | RI = 0.8350877192982457 | NMI = 0.6542633325693802



In [49]:
print("Testing \"Vowel.csv\"...")
df = pd.read_csv("Vowel.csv")
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values
uni = list(np.unique(y))
y = np.array([uni.index(i) for i in y])

for k in range(1, 4):
	typ, key = Kmeans(X, k)
	print(f"k = {k} | K-means | silhouette = {silhouette(X, typ)} | RI = {rand_index(y, typ)} | NMI = {NMI(y, typ)}")
	typ, key = GMM(X, k)
	print(f"k = {k} |   GMM   | silhouette = {silhouette(X, typ)} | RI = {rand_index(y, typ)} | NMI = {NMI(y, typ)}")
	print()

Testing "Vowel.csv"...
k = 1 | K-means | silhouette = N/A | RI = 0.08998988877654196 | NMI = N/A
k = 1 |   GMM   | silhouette = N/A | RI = 0.08998988877654196 | NMI = N/A

k = 2 | K-means | silhouette = 0.49158686786902217 | RI = 0.49767441860465117 | NMI = 0.0
k = 2 |   GMM   | silhouette = 0.4805860969603036 | RI = 0.4979277098589535 | NMI = 0.0010904873313119184

k = 3 | K-means | silhouette = 0.38378410361741594 | RI = 0.6359959555106168 | NMI = 0.0
k = 3 |   GMM   | silhouette = 0.3560268360194179 | RI = 0.6077723646985528 | NMI = 0.002008103084793696

