## In this practice session, we will implement both the K-Means and Gaussian mixture model algorithms in Python and compare them to decide which one to choose for a particular problem in an unsupervised setting

In [None]:
# import some libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's "white" style to all subsequent plots.
sns.set(style="white", color_codes=True)
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

# show plots inline
%matplotlib inline

In [None]:
# Feature matrix from scikit-learn's bundled iris dataset.
X = load_iris().data
X.shape

In [None]:
# Ground-truth class labels; used only for external evaluation of the clusterings.
y = load_iris().target

In [None]:
# Display both shapes side by side to compare features against labels.
(X.shape, y.shape)

In [None]:
from sklearn import preprocessing

# Standardize the features before clustering; `scaler` stays fitted on X so
# it can be reused later.
scaler = preprocessing.StandardScaler()
X_scaled_array = scaler.fit_transform(X)

# Wrap the scaled array in a DataFrame so the columns keep the iris feature names.
X_scaled = pd.DataFrame(X_scaled_array, columns=load_iris()['feature_names'])

# Fixed random_state so the previewed rows are identical on every run.
X_scaled.sample(5, random_state=0)

## **K Means Clustering**

In [None]:
from sklearn.cluster import KMeans

nclusters = 3 # this is the k in kmeans
seed = 0

# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4, so leaving it implicit makes the clustering depend on the
# installed library version.
km = KMeans(n_clusters=nclusters, n_init=10, random_state=seed)
km.fit(X_scaled)

# predict the cluster for each data point
y_cluster_kmeans = km.predict(X_scaled)
y_cluster_kmeans.shape

In [None]:
import matplotlib.patches as mpatches

# Cluster ids returned by K-Means are arbitrary integers and carry no species
# meaning, so the legend names the clusters themselves. Each legend entry is
# built from the same palette that colours the points, so the two cannot
# drift apart.
palette = ['blue', 'red', 'green']
colors = np.array(palette)
cluster_patches = [
    mpatches.Patch(color=color, label='Cluster {}'.format(idx))
    for idx, color in enumerate(palette)
]

fig, ax = plt.subplots()
ax.scatter(X_scaled.iloc[:, 2], X_scaled.iloc[:, 3], c=colors[y_cluster_kmeans])

# The plotted values are standardized, so they are no longer in centimetres.
ax.set_xlabel("Petal length (standardized)")
ax.set_ylabel("Petal width (standardized)")
ax.set_title("K-Means clusters")

ax.legend(handles=cluster_patches, edgecolor='k')
plt.show();

In [None]:
from sklearn import metrics

# Overall silhouette coefficient of the K-Means labelling (internal metric,
# does not use the true labels).
score = metrics.silhouette_score(X_scaled, labels=y_cluster_kmeans)
print('Silhouette Coefficient : ', score)

In [None]:
# Per-sample silhouette values for the K-Means labelling.
scores = metrics.silhouette_samples(X_scaled, y_cluster_kmeans)

# sns.distplot is deprecated and scheduled for removal; histplot with a KDE
# overlay on a density scale is its documented replacement.
ax = sns.histplot(scores, kde=True, stat="density")
ax.set(
    xlabel="Silhouette coefficient",
    ylabel="Density",
    title="Per-sample silhouette (K-Means)",
);

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Compare the K-Means labelling against the true classes.
score = adjusted_rand_score(labels_true=y, labels_pred=y_cluster_kmeans)
print('Adjusted Rand Score : ', score)

## **Gaussian Mixture Modelling**

In [None]:
from sklearn.mixture import GaussianMixture

# Seed the mixture model: its initialisation is random, so without
# random_state the cluster labels (and every metric computed from them)
# change from run to run.
gmm = GaussianMixture(n_components=nclusters, random_state=seed)
gmm.fit(X_scaled)

# predict the cluster for each data point
y_cluster_gmm = gmm.predict(X_scaled)
y_cluster_gmm.shape

In [None]:
# Mixture component ids are arbitrary integers and carry no species meaning,
# so the legend names the clusters themselves. Each legend entry is built
# from the same palette that colours the points.
palette = ['blue', 'red', 'green']
colors = np.array(palette)
cluster_patches = [
    mpatches.Patch(color=color, label='Cluster {}'.format(idx))
    for idx, color in enumerate(palette)
]

fig, ax = plt.subplots()
ax.scatter(X_scaled.iloc[:, 2], X_scaled.iloc[:, 3], c=colors[y_cluster_gmm])

# The plotted values are standardized, so they are no longer in centimetres.
ax.set_xlabel("Petal length (standardized)")
ax.set_ylabel("Petal width (standardized)")
ax.set_title("Gaussian mixture clusters")

ax.legend(handles=cluster_patches, edgecolor='k')
plt.show()

In [None]:
from sklearn import metrics

# Overall silhouette coefficient of the Gaussian-mixture labelling.
score = metrics.silhouette_score(X_scaled, labels=y_cluster_gmm)
print('Silhouette Coefficient : ', score)

In [None]:
from sklearn.metrics.cluster import adjusted_rand_score

# Compare the Gaussian-mixture labelling against the true classes.
score = adjusted_rand_score(labels_true=y, labels_pred=y_cluster_gmm)
print('Adjusted Rand Score : ', score)