# KMeans Intro

## Generate Some Random Data

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Number of blob centres to synthesise; the KMeans cells below choose k
# relative to this value.
num_centers = 5

# 1000 two-dimensional samples spread around `num_centers` centres.
X, y = make_blobs(centers=num_centers, n_features=2, n_samples=1000)

print('X.shape:', X.shape)
print('y.shape:', y.shape)

# Colour every point by its label in `y` so the generated clusters stand out.
plt.scatter(
    X[:, 0],
    X[:, 1],
    c=y,
    s=25,
    marker='o',
    edgecolor='k',
)
plt.show()

## KMeans

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Exercise: the cluster count should equal `num_centers`, but try the two
# mismatched settings (one fewer, one more) before the matching one.
# k = num_centers - 1
# k = num_centers + 1
k = num_centers

# fit() hands back the estimator, so construction and training can be chained.
kmeans = KMeans(n_clusters=k).fit(X)

# Ask the fitted model for a cluster label for every sample.
y_pred = kmeans.predict(X)

print('k:', k, ', wssse:', kmeans.inertia_)
# Ten labels drawn at random from the predictions (not the first ten).
print('predicted y_pred (10) = ', np.random.choice(y_pred, size=10))
print("cluster centers:\n", kmeans.cluster_centers_)

In [None]:
## Plot the KMeans-predicted clusters.
## Compare the colour coding with the scatter plot of the generated data:
## do the groups match?
plt.scatter(X[:, 0], X[:, 1], c=y_pred)

# One row per centroid; columns 0 and 1 are its coordinates. Plain 2-D
# indexing replaces the redundant `[:][0:, i]` double slice.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='^', c='red')

# Render explicitly, as the data-generation cell does, so the cell does not
# echo the PathCollection repr.
plt.show()

## WSSSE

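Let's iterate over a range of k values and record the WSSSE for each (the within-cluster sum of squared errors, read here from the fitted model's `inertia_`).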

In [None]:
# Candidate cluster counts: 2 up to and including num_centers + 2.
kvals = list(range(2, num_centers + 3))
wssses = []

for k in kvals:
    # Fixed seed so the inertia reported for each k is repeatable.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(X)
    print("k={},  wssse={}".format(k, kmeans.inertia_))
    wssses.append(kmeans.inertia_)

In [None]:
import pandas as pd

# Tabulate WSSSE against k; the bare final expression renders the table.
df_wssse = pd.DataFrame(dict(k=kvals, wssse=wssses))
df_wssse

In [None]:
# Line chart of WSSSE versus k.
df_wssse.plot.line(x="k")

## Silhouette Score

- Silhouette score ranges from -1 to +1.
- The lower the value, the worse the clustering; values near +1 are best
- Scores close to 0 will indicate overlapping clusters

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html


In [None]:
from  sklearn.metrics import silhouette_score

# Silhouette score for whatever labelling `y_pred` currently holds.
sil_score = silhouette_score(X, labels=y_pred)

print("silhoutte_score : ", sil_score)

In [None]:
# Sweep the same k range as the WSSSE loop and record the silhouette score
# of each fitted model.
sil_scores = []

for k in range(2, num_centers + 3):
    # random_state=0 matches the WSSSE loop, so both sweeps score the same
    # clusterings and the numbers are repeatable across runs.
    model = KMeans(n_clusters=k, random_state=0).fit(X)
    # Loop-local names: the `y_pred` and `kmeans` produced in the KMeans
    # section stay intact for the cells that follow this one.
    labels = model.predict(X)
    score = silhouette_score(X, labels)
    sil_scores.append(score)
    print("silhoutte_score : ", score)

In [None]:
import pandas as pd

# Pair each k with its silhouette score. `kvals` is the list built by the
# WSSSE sweep, so the two sweeps must cover the same range.
df_sil = pd.DataFrame(dict(k=kvals, sil_score=sil_scores))
df_sil

In [None]:
# Line chart of silhouette score versus k.
df_sil.plot.line(x="k")

## Calinski-Harabasz Index

- The score is higher when clusters are dense and well separated, which relates to a standard concept of a cluster.
- The score is fast to compute.
- The Calinski-Harabasz index is generally higher for convex clusters than other concepts of clusters, such as density based clusters like those obtained through DBSCAN.

https://scikit-learn.org/stable/modules/clustering.html#calinski-harabasz-index

In [None]:
from sklearn.metrics import calinski_harabasz_score

# Calinski-Harabasz score for whatever labelling `y_pred` currently holds.
ch_score = calinski_harabasz_score(X, labels=y_pred)
print("calinski_harabasz_score : ", ch_score)

In [None]:
# Sweep the same k range as the WSSSE loop and record the
# Calinski-Harabasz score of each fitted model.

ch_scores = []

for k in range(2, num_centers + 3):
    # random_state=0 matches the WSSSE loop, so both sweeps score the same
    # clusterings and the numbers are repeatable across runs.
    model = KMeans(n_clusters=k, random_state=0).fit(X)
    # Loop-local names: avoid overwriting the notebook-level `y_pred` and
    # `kmeans` as a side effect of the sweep.
    labels = model.predict(X)
    score = calinski_harabasz_score(X, labels)
    ch_scores.append(score)
    print("calinski_harabasz_score : ", score)

In [None]:
import pandas as pd

# Pair each k with its Calinski-Harabasz score. `kvals` is the list built by
# the WSSSE sweep, so the two sweeps must cover the same range.
df_ch = pd.DataFrame(dict(k=kvals, ch_score=ch_scores))
df_ch

In [None]:
# Line chart of Calinski-Harabasz score versus k.
df_ch.plot.line(x="k")

## Choose the Optimal K

From the experiments above, choose the k that scores highest.

For example, k=4.

Then visualize the resulting clusters.

In [None]:
import numpy as np

# Silhouette: take the k on the row holding the highest score.
best_sil_row = df_sil['sil_score'].idxmax()
optimal_k = df_sil.loc[best_sil_row, 'k']
print("From Silhouette Score, optimal k=", optimal_k)

# Calinski-Harabasz: same selection. This assignment is the one that
# survives, so cells below see the Calinski-Harabasz pick in `optimal_k`.
best_ch_row = df_ch['ch_score'].idxmax()
optimal_k = df_ch.loc[best_ch_row, 'k']
print("From Calinski-Harabasz, optimal k=", optimal_k)


In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Refit using the k selected in the previous cell.
kmeans = KMeans(n_clusters=optimal_k)
kmeans.fit(X)

# now kmeans will predict clusters
y_pred = kmeans.predict(X)

plt.scatter(X[:, 0], X[:, 1], c=y_pred)

# One row per centroid; columns 0 and 1 are its coordinates. Plain 2-D
# indexing replaces the redundant `[:][0:, i]` double slice.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], marker='^', c='red')

# Render explicitly, as the data-generation cell does, so the cell does not
# echo the PathCollection repr.
plt.show()

## End

---

## Combine Scores

In [None]:
# Score three cluster counts around the true number of centres with both
# metrics in a single pass.
sil_scores = []
ch_scores = []

k_vals = (num_centers - 1, num_centers, num_centers + 1)

for k in k_vals:
    # random_state=0 matches the WSSSE loop, making the scores repeatable
    # across runs.
    model = KMeans(n_clusters=k, random_state=0).fit(X)
    # Loop-local names: the `kmeans` and `y_pred` of the final model chosen
    # earlier in the notebook are left untouched.
    labels = model.predict(X)

    sil_score = silhouette_score(X, labels)
    sil_scores.append(sil_score)

    ch_score = calinski_harabasz_score(X, labels)
    ch_scores.append(ch_score)

    print("k={},  silhoutte_score={}, calinski_harabasz_score={}".format(k, sil_score, ch_score))

In [None]:
import pandas as pd

# One row per k with both metrics side by side; the bare final expression
# renders the table.
df = pd.DataFrame(
    dict(
        k=k_vals,
        silhoutte_score=sil_scores,
        calinski_harabasz_score=ch_scores,
    )
)
df

In [None]:
%matplotlib inline
from matplotlib import pyplot

# Grouped bars per k. Calinski-Harabasz is drawn against a secondary y-axis,
# presumably because the two metrics are on very different scales.
df.plot.bar(
    x="k",
    y=["silhoutte_score", "calinski_harabasz_score"],
    secondary_y=['calinski_harabasz_score'],
    figsize=(9, 6),
)