# MeanShift

## Step-1: Generate Some Data

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt

# Number of blob centers requested from make_blobs.
num_centers = 5

# 1000 two-feature samples (X) together with the blob index of each sample (y).
X, y = make_blobs(centers=num_centers, n_features=2, n_samples=1000)

print(f"X.shape: {X.shape}")
print(f"y.shape: {y.shape}")

# Color each point by its generated blob label; remember this coloring so it
# can be compared with the clustering result later on.
plt.scatter(*X.T, c=y, marker="o", s=25, edgecolor="k")
plt.show()

## Step-2: Run MeanShift Clustering

In [None]:
from sklearn.cluster import MeanShift

# Build the estimator with a fixed bandwidth of 2, then fit it on the data.
ms = MeanShift(bandwidth=2)
ms.fit(X)

# Report how many cluster centers were found and where they are.
print(f"num clusters :  {len(ms.cluster_centers_)}")
print(ms.cluster_centers_)

## Step-3: Visualize Clusters

In [None]:
# Predict a cluster label for every sample with the fitted model.
y_pred = ms.predict(X)

# Color the points by predicted cluster and compare with the Step-1 plot:
# do the groupings match?
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
# Overlay the cluster centers as red triangles (column 0 = x, column 1 = y).
plt.scatter(ms.cluster_centers_[:, 0], ms.cluster_centers_[:, 1], marker='^', c='red')
# Render explicitly, as the Step-1 cell does, so the figure also appears when
# the code is run outside a notebook.
plt.show()

## Experiment

Change the `bandwidth` value in Step-2, then re-run Step-2 and Step-3.  Does it change the predicted clusters (how many there are, and which points belong to each)?

## Iterate over bandwidth (silhouette_score)

In [None]:
from sklearn.cluster import MeanShift
from  sklearn.metrics import silhouette_score

# Sweep over candidate bandwidths and record the silhouette score of each
# resulting clustering.
bandwidths = [0.5, 1, 1.5, 2, 2.5, 3]
sil_scores = []
for bandwidth in bandwidths:
    ms = MeanShift(bandwidth=bandwidth).fit(X)
    y_pred = ms.predict(X)
    # silhouette_score is only defined for 2 .. n_samples-1 distinct labels and
    # raises ValueError otherwise; record NaN for such a bandwidth so that one
    # degenerate clustering does not abort the whole sweep.
    n_labels = len(set(y_pred))
    if 1 < n_labels < len(X):
        sil_score = silhouette_score(X, y_pred)
    else:
        sil_score = float("nan")
    sil_scores.append(sil_score)
    print("bandwidth={}, num clusters={}, silhouette_score={}".format(
        bandwidth, len(ms.cluster_centers_), sil_score))

In [None]:
import pandas as pd
# One row per tried bandwidth, paired with the silhouette score it produced.
df_sil = pd.DataFrame(dict(bandwidth=bandwidths, sil_score=sil_scores))
df_sil

In [None]:
# Silhouette score as a function of bandwidth (bandwidth on the x-axis).
df_sil.set_index("bandwidth").plot()

## Iterate over bandwidth (Calinski-Harabasz)

In [None]:
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import MeanShift

# Sweep over candidate bandwidths and record the Calinski-Harabasz score of
# each resulting clustering.  Only `ch_scores` is reset here, so the silhouette
# results collected earlier in `sil_scores` stay intact.
bandwidths = [0.5, 1, 1.5, 2, 2.5, 3]
ch_scores = []
for bandwidth in bandwidths:
    ms = MeanShift(bandwidth=bandwidth).fit(X)
    y_pred = ms.predict(X)
    # calinski_harabasz_score is only defined for 2 .. n_samples-1 distinct
    # labels and raises ValueError otherwise; record NaN for such a bandwidth
    # so that one degenerate clustering does not abort the whole sweep.
    n_labels = len(set(y_pred))
    if 1 < n_labels < len(X):
        ch_score = calinski_harabasz_score(X, y_pred)
    else:
        ch_score = float("nan")
    ch_scores.append(ch_score)
    print("bandwidth={}, num clusters={}, ch_score={}".format(
        bandwidth, len(ms.cluster_centers_), ch_score))

In [None]:
import pandas as pd
# One row per tried bandwidth, paired with the Calinski-Harabasz score it produced.
df_ch = pd.DataFrame(dict(bandwidth=bandwidths, ch_score=ch_scores))
df_ch

In [None]:
# Calinski-Harabasz score as a function of bandwidth (bandwidth on the x-axis).
df_ch.set_index("bandwidth").plot()

## Find the Optimal Bandwidth

In [None]:
import numpy as np

# Choose the bandwidth from the Calinski-Harabasz results: find the row
# position of the highest ch_score, then read the bandwidth stored at that
# same position.
max_index_of_score = df_ch['ch_score'].argmax()
optimal_bandwidth = df_ch['bandwidth'].iat[max_index_of_score]
print("From Calinski-Harabasz, optimal bandwidth=", optimal_bandwidth)


In [None]:
from sklearn.cluster import MeanShift

# Refit MeanShift using the bandwidth held in `optimal_bandwidth`.
ms = MeanShift(bandwidth=optimal_bandwidth).fit(X)
print("num clusters : ", len(ms.cluster_centers_))
y_pred = ms.predict(X)

# Points colored by predicted cluster, with the cluster centers overlaid as
# red triangles (column 0 = x, column 1 = y).
plt.scatter(X[:, 0], X[:, 1], c=y_pred)
plt.scatter(ms.cluster_centers_[:, 0], ms.cluster_centers_[:, 1], marker='^', c='red')
# Render explicitly, as the Step-1 cell does, so the figure also appears when
# the code is run outside a notebook.
plt.show()