In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import cluster, datasets, metrics, model_selection


# Data


## Create synthetic data

In [None]:
# Generate a synthetic dataset scattered around five blob centers. The second
# return value (the generated labels) is discarded into `_`.
X, _ = datasets.make_blobs(centers=5, random_state=42)

In [None]:
# Scatter the first two feature columns of the synthetic data.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1])
ax.set_xlabel("Feature 0")
ax.set_ylabel("Feature 1")
ax.grid()

## Load the MNIST data

We will load the data using the [Pandas](https://pandas.pydata.org/) library. I highly recommend the most recent edition of [*Python for Data Analysis*](https://learning.oreilly.com/library/view/python-for-data/9781491957653/) by Pandas creator Wes McKinney for anyone interested in learning how to use Pandas.

In [None]:
%%bash
# List the contents of the local sample_data directory that the next cell reads from.
ls ./sample_data

In [None]:
# The CSV has no header row: name the first column "label" and the remaining
# 784 columns p0..p783.
_mnist_column_names = ["label", *(f"p{i}" for i in range(784))]
mnist_train_df = pd.read_csv(
    "./sample_data/mnist_train_small.csv",
    header=None,
    names=_mnist_column_names,
)

# Split the frame into the feature columns and the target column.
mnist_train_features_df = mnist_train_df.drop(columns="label")
mnist_train_target = mnist_train_df["label"]

In [None]:
# Print pandas' summary of the feature frame as a sanity check on the load.
mnist_train_features_df.info()

In [None]:
# Peek at the first few entries of the "label" column.
mnist_train_target.head()

# K-Means

In [None]:
# IPython introspection: show the KMeans docstring (interactive aid only).
cluster.KMeans?

In [None]:
# K-Means settings: five clusters (the synthetic data was drawn around five
# centers) and a fixed seed so repeated runs give the same result.
_hyperparameters = dict(n_clusters=5, n_init="auto", random_state=42)

# The estimator is used as a feature extractor in the cells that follow.
feature_extractor = cluster.KMeans(**_hyperparameters)

In [None]:
# Fit K-Means on X and keep the transformed representation of every sample.
# NOTE(review): per the scikit-learn docs this is the cluster-distance space — confirm.
Z = feature_extractor.fit_transform(X)

In [None]:
# Check the dimensions of the transformed data.
Z.shape

In [None]:
# Show only the first few rows; the full matrix has one row per sample and
# would flood the cell output.
Z[:5]

In [None]:
# Ask the fitted model for the cluster assignment of every sample in X.
cluster_labels = feature_extractor.predict(X)

In [None]:
# Display the predicted cluster index of each sample.
cluster_labels

In [None]:
# Display the labels stored on the fitted estimator, for comparison with the
# predict() output shown in the previous cell.
feature_extractor.labels_

In [None]:
# Display the cluster centers learned by the fitted estimator.
feature_extractor.cluster_centers_

In [None]:
# Scatter the first two feature columns, colored by assigned cluster.
fig, ax = plt.subplots()
ax.scatter(X[:, 0], X[:, 1], c=cluster_labels)
ax.set_xlabel("Feature 0")
ax.set_ylabel("Feature 1")
ax.grid()

### Exercise: Centroid Initialization

Load the California house price data and then use Google Maps to find the latitude and longitude of the 5 largest cities in California. Use these latitude and longitude values to initialize the KMeans algorithm.

Fit the KMeans algorithm with your "good" initialization and then use the trained model to create 5 new features. Add these new features to the original features and compute the correlation between the new features and the house price. Are these new features useful?

### Solution

## Finding the optimal number of clusters

In [None]:
# Fit a 5-cluster K-Means model. The seed is fixed (matching the other cells in
# this notebook) so the reported score is the same on every run.
kmeans = cluster.KMeans(n_clusters=5, n_init="auto", random_state=42)
_ = kmeans.fit(X)

# Silhouette score of the resulting assignment, computed on the training data.
print(f"Silhouette Score: {metrics.silhouette_score(X, kmeans.labels_)}")

In [None]:
# Candidate numbers of clusters to compare (2 through 10 inclusive).
n_clusters = list(range(2, 11))

# One silhouette score per candidate, in the same order as `n_clusters`.
scores = []
for n_cluster in n_clusters:
    # Fixed seed: without it each run draws different initial centroids and the
    # score curve changes from one execution to the next.
    kmeans = cluster.KMeans(n_clusters=n_cluster, n_init="auto", random_state=42)
    _ = kmeans.fit(X)
    scores.append(metrics.silhouette_score(X, kmeans.labels_))


In [None]:
# Plot the silhouette score against the number of clusters tried.
fig, ax = plt.subplots()
ax.plot(n_clusters, scores)
ax.set_xlabel("Number of clusters")
ax.set_ylabel("Silhouette Score")
ax.grid()

In [None]:
from yellowbrick.cluster import silhouette_visualizer


# Draw the silhouette plot for a 4-cluster K-Means fit on X.
# NOTE(review): the yellowbrick docs describe the return value as the fitted
# visualizer (which wraps the fitted estimator) rather than the bare model —
# confirm for the installed version before treating `kmeans` as a KMeans instance.
kmeans = silhouette_visualizer(
    cluster.KMeans(n_clusters=4, n_init="auto", random_state=42),
    X,
    colors="yellowbrick",
)

In [None]:
# Silhouette plot for a 5-cluster K-Means fit on X; the return value is not needed.
_ = silhouette_visualizer(
    cluster.KMeans(n_clusters=5, n_init="auto", random_state=42),
    X,
    colors="yellowbrick",
)

### Exercise

Use KMeans to cluster the MNIST dataset. Try three different values for the number of clusters and compare the results of Silhouette scores and plots to determine the "correct" number of clusters.

### Solution

In [None]:
# Cluster the MNIST pixel features into 10 groups. The seed is fixed (matching
# the other cells in this notebook) so the clustering is reproducible.
kmeans = cluster.KMeans(n_clusters=10, n_init="auto", random_state=42)
_ = kmeans.fit(mnist_train_features_df)

In [None]:
# Transformed representation and cluster assignment for every MNIST sample.
Z = kmeans.transform(mnist_train_features_df)
cluster_labels = kmeans.predict(mnist_train_features_df)

# Scatter the first two transformed coordinates, colored by assigned cluster.
fig, ax = plt.subplots()
ax.scatter(Z[:, 0], Z[:, 1], c=cluster_labels)
ax.set_xlabel("Z0")
ax.set_ylabel("Z1")
ax.grid()

In [None]:
# K-Means cluster ids are arbitrary integers, so scoring them directly against
# the digit labels is meaningless. Map each cluster to the most common true
# digit among its members first, then score the mapped predictions.
_cluster_to_digit = (
    mnist_train_target
    .groupby(cluster_labels)
    .agg(lambda digits: digits.mode().iloc[0])
)
_predicted_digits = _cluster_to_digit.loc[cluster_labels].to_numpy()

# zero_division=0: a digit that no cluster maps to has no predictions, so its
# precision is undefined; report it as 0 instead of emitting a warning.
_report = metrics.classification_report(
    mnist_train_target,
    _predicted_digits,
    zero_division=0,
)
print(_report)

## Using clustering for semi-supervised learning

In [None]:
# Stratified split: test_size=0.8 sends 80% of the samples to the "unlabeled"
# side and keeps the remaining 20% as the labeled set. The targets of the
# unlabeled side are discarded into `_`.
(
    mnist_labeled_features_df,
    mnist_unlabeled_features_df,
    mnist_labeled_target,
    _,
) = model_selection.train_test_split(
    mnist_train_features_df,
    mnist_train_target,
    test_size=0.8,
    random_state=42,
    stratify=mnist_train_target,
)

In [None]:
# Cluster the unlabeled MNIST samples into 50 groups. The seed is fixed
# (matching the other cells in this notebook) so the centroids are reproducible.
kmeans = cluster.KMeans(n_clusters=50, n_init="auto", random_state=42)
_ = kmeans.fit(mnist_unlabeled_features_df)

In [None]:
# Check the dimensions of the learned cluster-center array.
kmeans.cluster_centers_.shape

In [None]:
# Draw every cluster center as an image on a 5 x 10 grid of panels.
# plt.subplots (plural) is the call that returns the figure together with the
# array of axes; the panels share both axes.
fig, axes = plt.subplots(5, 10, sharex=True, sharey=True)

# Reshape the 50 flat centroids back into 28 x 28 images and pair each one with
# a panel. axes.flat walks the grid row by row, so no (row, column) index
# arithmetic is needed.
centroid_images = kmeans.cluster_centers_.reshape(50, 28, 28)
for ax, cluster_center in zip(axes.flat, centroid_images):
    ax.imshow(cluster_center)
