# Unsupervised Learning - Clustering

In [None]:
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

In [None]:
# Load the student-grades CSV into a NumPy array; skip_header=1 drops the first line of the file.
data = np.genfromtxt(
    'grades_km_input.csv',
    delimiter=',',
    skip_header=1,
)

In [None]:
# Display the array exactly as loaded, before any column selection.
data

In [None]:
# Keep only the columns at 0-based positions 1, 2 and 3 (i.e. the 2nd, 3rd and 4th columns).
# NOTE: this rebinds `data` to the 3-column result, so re-running this cell without
# reloading the file first would index a column that no longer exists.
data = data[:, [1, 2, 3]]
data

In [None]:
# 3-D scatter plot of the three selected columns, one axis per subject.
x, y, z = data[:, 0], data[:, 1], data[:, 2]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.scatter(x, y, z)
ax.set(xlabel='English', ylabel='Maths', zlabel='Science')
plt.show()

#### The Elbow Method

The k-means clustering method requires the number of clusters as an input. How can we decide what that number should be?

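The Elbow method is a "rule-of-thumb" approach to finding the optimal number of clusters. Here, we look at the cluster dispersion for different values of k and look for the point beyond which adding more clusters no longer reduces the dispersion substantially.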

In [None]:
# Elbow method: fit k-means for each candidate k and record the Within Sum of Squares (WSS),
# which scikit-learn exposes as `inertia_`.
# WSS is the sum of the squares of the distances from each point to the centroid of its cluster.
# WSS generally keeps decreasing as k grows, so we do NOT simply pick the smallest value;
# instead we look for the "elbow" where adding another cluster stops giving a large reduction.
k_values = range(1, 11)  # single source of truth for both the loop and the x-axis
WSS = []
for k in k_values:
    # n_init is set explicitly because its default differs between scikit-learn releases.
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(data)
    WSS.append(km.inertia_)

plt.plot(k_values, WSS, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('WSS')
plt.show()

Then, we pick the value that resembles the "pit of an elbow." As we can see, this would be k=3 in this case, which makes sense given our earlier visual inspection of the dataset.

In [None]:
# k-means clustering - 3 clusters
# n_init is set explicitly because its default differs between scikit-learn releases.
clusters = KMeans(n_clusters=3, n_init=10, random_state=42).fit(data)
# Cluster index assigned to each row of `data`.
clusters.labels_

In [None]:
# 3-D scatter of the three columns, with each point coloured by its k-means cluster label.
x, y, z = data[:, 0], data[:, 1], data[:, 2]

fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.scatter(x, y, z, c=clusters.labels_)
ax.set(xlabel='English', ylabel='Maths', zlabel='Science')
plt.show()