# Unsupervised Machine Learning by K-Means Algorithm

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist

## Step 1: Data Preparation

In [None]:
!pwd

In [None]:
# Load input data
# Alternative source (disabled): read the sales CSV and drop its first column
# x = pd.read_csv('../data/sales.csv', delimiter=',')
# X = x.values
# X = X[:,1:]

# Read the comma-separated text file into a NumPy array.
# NOTE(review): this is a machine-specific absolute path; point it to your own copy of the file.
X = np.loadtxt('/Users/tdi/Documents/Teaching/Data/data_clustering.txt', delimiter=',')
print(X)

In [None]:
# Visualise the raw observations before clustering

# Range of the data along each axis, taken from the first two columns of X.
# The names x_min/x_max/y_min/y_max are kept because the mesh-grid and
# boundary-plot cells further down read them again.
x_min, y_min = X[:, :2].min(axis=0)
x_max, y_max = X[:, :2].max(axis=0)

plt.figure()

# draw the observations as hollow black circles
plt.scatter(X[:, 0], X[:, 1], s=30, marker='o', facecolors='none', edgecolors='black')

# leave a margin of one unit around the data on every side
plt.xlim(x_min - 1, x_max + 1)
plt.ylim(y_min - 1, y_max + 1)

plt.title('Input data')
plt.show()

## Step 2: Implement KMeans Algorithm for Training a Prediction Model

### Determine K by Elbow Method

In [None]:
# Determine k by minimizing the distortion.
# Distortion here is the mean Euclidean distance between each observation
# and its nearest centroid. (It is NOT the sum of squared distances; that
# quantity is available from a fitted model as `model.inertia_`.)
distortions = []
K = range(2, 10)
for k in K:
    # fit exactly once per candidate k (fit() returns the fitted estimator)
    model = KMeans(n_clusters=k, n_init=10).fit(X)
    # distances from every observation to every centroid -> keep the nearest
    nearest = cdist(X, model.cluster_centers_, 'euclidean').min(axis=1)
    # average over all observations
    distortions.append(nearest.sum() / X.shape[0])
print("Distortion: ", distortions)

In [None]:
# Draw distortion against K; the "elbow" is the K after which
# the curve stops dropping sharply
plt.plot(K, distortions, 'bx-')
plt.title('Elbow Method for Optimal K')
plt.ylabel('Distortion')
plt.xlabel('K')
plt.show()

In [None]:
# Optimal number of clusters K
# NOTE(review): presumably read off the elbow plot by eye; change it here to try another K
num_clusters = 5

In [None]:
# Create an instance of the KMeans clustering estimator (not trained yet)
kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=20)
# init: method for choosing the initial locations of the centroids
# n_init: the algorithm runs n_init times with different initial centroids and the best of those results is kept

In [None]:
# Train the KMeans clustering model on the input data X
kmeans.fit(X)

### Determine K by Silhouette Score
A silhouette score is a measure of cluster quality in terms of __cohesion__ (small distances between the data points within one cluster) and __separation__ (large distances between the clusters). <br>
It is calculated for each point separately, and the overall score is the average of the individual scores.<br>
The value is a number in the range [-1, +1]. The best result is +1, while -1 means wrong clustering. The peripheral points of a cluster produce a result close to 0.<br>
We can apply the method for choosing the right number of clusters, as well as for comparing different methods of clustering.
We always search for the maximum silhouette score.

In [None]:
# Score every candidate number of clusters;
# the preferred k is the one with the highest silhouette score
K = range(2,10)
scores = []
for k in K:
    # train a fresh model for this k (fit() returns the fitted estimator)
    model = KMeans(n_clusters=k, n_init=10).fit(X)
    # silhouette score of the resulting labelling; sample_size equals the
    # number of observations in X
    score = metrics.silhouette_score(
        X,
        model.labels_,
        metric='euclidean',
        sample_size=len(X),
    )
    print("\nNumber of clusters =", k)
    print("Silhouette score =", score)
    scores.append(score)

In [None]:
# Draw the silhouette score against K; the preferred K is at the maximum
plt.plot(K, scores, 'bx-')
plt.title('Silhouette Score Method for Discovering the Optimal K')
plt.ylabel('Silhouette Score')
plt.xlabel('K')
plt.show()

## Step 3: Implement the Trained Model for Prediction

In [None]:
# Predict the cluster index of every observation in X with the trained model
y = kmeans.predict(X)
# bare expression: shows the array of predicted labels as the cell output
y

In [None]:
# See the labels of the clusters
# cluster labels are stored in the attribute 'kmeans.labels_'
print(kmeans.labels_)

# NOTE(review): expected to show the same values as print(y) — confirm

## Step 4: Review the Results

### Clusters seen one by one

In [None]:
# Show every cluster in a figure of its own (for demo purposes only)
for i in range(num_clusters):
    # boolean mask selecting the observations predicted to be in cluster i
    mask = (y == i)
    cluster = X[mask]
    # report how many observations (rows) the cluster holds
    print("Cluster ", i, ": ", cluster.shape)
    # one figure per cluster, with a grid behind the points
    plt.grid(True)
    plt.scatter(cluster[:, 0], cluster[:, 1])
    plt.show()

### All Clusters in One Plot

In [None]:
# Draw all observations in one figure, coloured by their predicted cluster
#   c    - marker colour, here the predicted cluster index of each point
#   cmap - colour map used to turn the cluster index into a colour
#          (viridis, see https://matplotlib.org/users/colormaps.html)
#   s    - marker size, here the size of the dots
plt.grid(True)
plt.scatter(X[:, 0], X[:, 1], s=50, cmap='viridis', c=y)
plt.show()

In [None]:
# See the cluster centres found during training
# they are stored in the attribute 'cluster_centers_', one row per cluster
print(kmeans.cluster_centers_)

### Plot Boundaries of Clusters

In [None]:
# Build a dense grid of points over the data range; predicting a cluster for
# every grid point lets us draw a smooth border between the clusters

# distance between neighbouring grid points
step_size = 0.01

# evenly spaced coordinates along each axis, from the minimum of the data
# up to (but not including) the maximum
x_coord = np.arange(start=x_min, stop=x_max, step=step_size)
y_coord = np.arange(start=y_min, stop=y_max, step=step_size)

# meshgrid() expands the two coordinate vectors into two matrices that
# together hold the (x, y) coordinates of every grid point
x_vals, y_vals = np.meshgrid(x_coord, y_coord, indexing='xy')

In [None]:
# Assign every grid point to a cluster
# flatten the two coordinate matrices into 1D arrays
xx = x_vals.ravel()
yy = y_vals.ravel()

# stack them as two columns (one (x, y) row per grid point), predict the
# cluster of each row, and fold the result back into the shape of the grid
labels = kmeans.predict(np.column_stack((xx, yy))).reshape(x_vals.shape)

In [None]:
# Plot the cluster regions, the data points and the centroids
# create a new, empty plot area
plt.figure()

plt.title('Boundaries of clusters')

# plot the frame
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)

# plot the cluster regions
# origin='lower' puts row 0 of `labels` (the smallest y of the grid) at the
# bottom of the image; with the default origin='upper' the regions would be
# drawn upside down relative to the points and centroids plotted on top
plt.imshow(labels, interpolation='nearest',
           extent=(x_vals.min(), x_vals.max(), y_vals.min(), y_vals.max()),
           cmap='viridis', aspect='auto', origin='lower')
# plot the points on top of the regions they belong to
plt.scatter(X[:,0], X[:,1], marker='o', facecolors='none', edgecolors='white', s=30)

# plot the centroids as black stars with a yellow outline
centers = kmeans.cluster_centers_
plt.scatter(centers[:,0], centers[:,1], s=200, linewidths=2, marker='*', zorder=3,
            facecolors='black', edgecolors='yellow')

# annotate each centroid with its cluster index, one unit above the centroid
for i, center in enumerate(centers):
    plt.annotate(str(i), center + [0.0, 1.0],
                 size=15, zorder=1, color='yellow', weight='bold',
                 horizontalalignment='center', verticalalignment='center',)
plt.show()

For labeling clusters see also https://nikkimarinsek.com/blog/7-ways-to-label-a-cluster-plot-python

In [None]:
# Repeat the training and clustering with different number of clusters K
# Compare the images
# Select the one with most compact clusters

## Step 5: Validate the Model

In [None]:
#!pip install yellowbrick

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

In [None]:
# Fit a separate KMeans model for the validation step
# NOTE(review): k is hard-coded to 5 instead of using num_clusters, and this is
# a new model, not the `kmeans` instance trained earlier — confirm this is intended
k = 5
model = KMeans(n_clusters=k, n_init=10)
# fit the model on X; the returned cluster index of each sample is the cell output
model.fit_predict(X)

In [None]:
# Silhouette score of the labelling found by `model`, using Euclidean distances
score = silhouette_score(X, model.labels_, metric='euclidean')
# print the score rounded to three decimals
print('Silhouette Score: %.3f' % score)

In [None]:
# Visualize the silhouette scores of all points
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
# NOTE(review): fit() presumably re-fits `model` on X before computing the scores — confirm in the yellowbrick docs
visualizer.fit(X)
# render the figure
visualizer.show()

Interpretation: Each component of the figure represents one cluster, drawn as a horizontal bar chart with one bar per point of the cluster. <br>
The clusters appear almost equally thick and long, i.e. they are similar. No cluster has a silhouette score lower than the average (the red line). <br>
The score is greater than 0.5, which is relatively good.

## Step 6: Implement the Model on New Data

In [None]:
# predict the cluster of a new, previously unseen data point
test = kmeans.predict([[5.8, 4.5]])
print(test)

In [None]:
# check whether another new point is assigned to cluster 3 (prints a boolean array)
# NOTE(review): no random_state is set on `kmeans`, so the numbering of the
# clusters presumably changes between runs and index 3 may not denote the same
# cluster every time — confirm before relying on this check
test = kmeans.predict([[6.0, 8.0]]) == 3
print(test)

## Step 7: Store the Model in a File

In [None]:
!pip3 install joblib

In [None]:
# For serialization and deserialization of data to/from a file
# legacy import path, kept for reference:
# from sklearn.externals import joblib
import joblib
# write the trained model to disk so it can be loaded again later
joblib.dump(kmeans, '../data/kmmodel.pkl')

## Practice

Apply the procedure to the data set in the file __sales.csv__ to segment the sales and discover patterns.
Observe the recommended number of clusters and the initial location of the centroids.