## K-Means Clustering 
1. Choose the number of clusters (K) and obtain the data points 
2. Place the centroids c_1, c_2, ..., c_k randomly 
3. Repeat steps 4 and 5 until convergence or until the end of a fixed number of iterations
4. for each data point x_i:
       - find the nearest centroid among c_1, c_2, ..., c_k 
       - assign the point to that cluster 
5. for each cluster j = 1..k:
       - new centroid = mean of all points assigned to that cluster
6. End 

In [84]:
import numpy as np
import random

def dist(a, b):
    """Return the Euclidean distance between ``a`` and ``b``.

    Both arguments are converted with ``np.asarray`` and promoted to a
    common floating-point dtype before subtracting, so plain Python
    sequences are accepted and unsigned-integer inputs cannot wrap around.
    For inputs with more than one dimension the result is the norm of the
    flattened difference (the ``np.linalg.norm`` default).

    Args:
        a: Array-like of numbers.
        b: Array-like of numbers, broadcast-compatible with ``a``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    a = np.asarray(a)
    b = np.asarray(b)
    # Promote to at least float64 (complex stays complex) so that integer
    # subtraction can neither overflow nor wrap for unsigned dtypes.
    common = np.result_type(a, b, np.float64)
    return np.linalg.norm(a.astype(common) - b.astype(common))

def nearest_cluster(centroids, point):
    """Return the index of the centroid that lies closest to ``point``.

    Closeness is measured with ``dist``. When several centroids are
    equally close, the one with the lowest index is returned.
    """
    gaps = [dist(center, point) for center in centroids]
    # min() keeps the first of equal candidates, matching list.index().
    return min(range(len(gaps)), key=gaps.__getitem__)
		
def KMeans(data, k=3, max_epochs=100):
    """Cluster ``data`` into ``k`` groups and return the final centroids.

    Centroids start at random positions drawn uniformly from the
    per-feature bounding box of ``data``, so the function works for any
    number of features. Each epoch assigns every point to its nearest
    centroid (Euclidean distance) and then moves each centroid to the mean
    of its assigned points. A centroid that receives no points keeps its
    previous position. Iteration stops when an epoch leaves every centroid
    unchanged, or after ``max_epochs`` epochs.

    Args:
        data: Array-like of shape (n_points, n_features).
        k: Number of clusters; must be at least 1.
        max_epochs: Upper bound on the number of assign/update rounds.

    Returns:
        A list of ``k`` centroids, each a list of ``n_features`` floats.

    Raises:
        ValueError: If ``data`` is not a non-empty 2-D array or ``k < 1``.

    Side effects:
        Prints "Converged in <epoch>" when the centroids stop moving.
        Uses NumPy's global random state for the initial placement.
    """
    points = np.asarray(data, dtype=float)
    if points.ndim != 2 or points.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array of points")
    if k < 1:
        raise ValueError("k must be at least 1")

    # Random start inside the data's bounding box, one row per centroid.
    low = points.min(axis=0)
    high = points.max(axis=0)
    centers = np.random.uniform(low, high, size=(k, points.shape[1]))

    for epoch in range(1, max_epochs + 1):
        # gaps[i, j] is the distance from point i to centroid j.
        gaps = np.linalg.norm(
            points[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2
        )
        # argmin picks the lowest index on ties.
        labels = gaps.argmin(axis=1)

        previous = centers.copy()
        for j in range(k):
            members = points[labels == j]
            # An empty cluster keeps its old centroid.
            if members.size:
                centers[j] = members.mean(axis=0)

        # Identical assignments yield identical means, so exact equality
        # is a reliable convergence signal here.
        if np.array_equal(centers, previous):
            print("Converged in " + str(epoch))
            break

    return centers.tolist()

In [85]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Load the dataset and split off the 'Name' column; `classes` is kept
# aligned with the rows but is not used by the clustering below.
df = pd.read_csv('../datasets/iris.csv')
classes = df['Name']
df = df.drop(['Name'],axis=1)
data = df.values.tolist()
data = np.array(data)
# Shuffle rows and labels with the same permutation.
data,classes = shuffle(data,classes)
# First 135 rows are used to fit the centroids, the remainder to evaluate.
train_data = data[:135]
test_data = data[135:]

# NOTE: despite its name, `clusters` holds the centroids returned by KMeans.
clusters = KMeans(train_data, 3)

# Label every held-out point with the index of its nearest centroid, using
# the shared helper instead of repeating the distance loop inline.
pred = [nearest_cluster(clusters, point) for point in test_data]

print(pred)
# NOTE(review): silhouette_score needs at least two distinct labels in
# `pred`; with a small test split that is not guaranteed -- confirm.
score = silhouette_score(test_data, pred)
print(score)

Converged in 5
[1, 1, 2, 2, 2, 1, 0, 2, 1, 2, 0, 0, 1, 2, 1]
0.422643406413528
