In [None]:
import time
import warnings

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

from helper import get_datasets, COLORS, MARKERS
from matplotlib.lines import Line2D
from ipywidgets import interactive
from IPython.display import display
# NOTE(review): two matplotlib backends are requested back to back; the later
# `inline` selection is presumably the one in effect, which would make the
# `widget` line redundant — confirm and drop whichever is not needed.
%matplotlib widget
%matplotlib inline

# Seed NumPy's global random generator so estimators created without an
# explicit random_state start from a known state.
np.random.seed(0)


In [None]:
def fit_knn(X, n_clusters):
    """Fit a mini-batch k-means model on ``X`` and time the fit.

    Despite its name, this helper runs k-means clustering
    (``cluster.MiniBatchKMeans``), not a k-nearest-neighbours model.

    Parameters
    ----------
    X : array-like
        Samples to cluster; passed unchanged to ``MiniBatchKMeans.fit``.
    n_clusters : int
        Number of clusters requested from the estimator.

    Returns
    -------
    tuple
        ``(name, estimator, elapsed)``: the display label of the algorithm,
        the fitted estimator, and the duration of the fit in seconds.
    """
    two_means = cluster.MiniBatchKMeans(n_clusters=n_clusters)

    # perf_counter() is monotonic and high resolution, so the measured
    # duration cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    t0 = time.perf_counter()

    # Silence two known UserWarnings about graph connectivity while fitting.
    # NOTE(review): both messages refer to connectivity / spectral embedding;
    # they look inherited from a multi-algorithm comparison and are probably
    # never emitted by MiniBatchKMeans — confirm before removing.
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="the number of connected components of the "
            + "connectivity matrix is [0-9]{1,2}"
            + " > 1. Completing it to avoid stopping the tree early.",
            category=UserWarning,
        )
        warnings.filterwarnings(
            "ignore",
            message="Graph is not fully connected, spectral embedding"
            + " may not work as expected.",
            category=UserWarning,
        )
        two_means.fit(X)

    t1 = time.perf_counter()
    return ("MiniBatch\nKMeans", two_means, t1 - t0)

# Part 1 - Intuition

We have six datasets, each with its own structure.

This interactive demo lets you explore the K-Means algorithm.

We can visualize how the clustering algorithm partitions the different datasets.

In [None]:
# get_datasets() is imported from the local `helper` module. In the cells
# below, `plot_datasets` is iterated as (dataset, algo_params) pairs and
# `default_base_params` is copied and updated per dataset, so it is presumably
# a dict of default parameters — verify against helper.get_datasets.
plot_datasets, default_base_params = get_datasets()

In [None]:
# ============
# Create the figure that every redraw of the demo reuses
# ============
# Interactive mode is turned off; the figure is shown explicitly through
# display(fig) at the end of plot_boundary.
plt.ioff()
fig = plt.figure(figsize=(9 * 2 + 3, 13))
fig.subplots_adjust(
    left=0.02,
    right=0.98,
    bottom=0.001,
    top=0.95,
    wspace=0.05,
    hspace=0.01,
)
def plot_boundary(n_clusters):
    """Cluster every demo dataset and redraw the shared figure.

    Each dataset in the notebook-level ``plot_datasets`` is standardised,
    clustered with ``fit_knn`` and drawn as one panel of a 3x2 grid on the
    notebook-level ``fig``, which is then displayed.

    Parameters
    ----------
    n_clusters : int
        Number of clusters passed to ``fit_knn`` for every dataset.
    """
    palette = [
        "#377eb8",
        "#ff7f00",
        "#4daf4a",
        "#f781bf",
        "#a65628",
        "#984ea3",
        "#999999",
        "#e41a1c",
        "#dede00",
    ]

    fig.clear(True)

    # The per-dataset algorithm parameters are not needed by k-means, so no
    # neighbour graph / connectivity matrix is built here.
    for plot_num, (dataset, _algo_params) in enumerate(plot_datasets, start=1):
        X, _ = dataset

        # normalize dataset so the fixed axis limits below fit every panel
        X = StandardScaler().fit_transform(X)

        name, algorithm, delta = fit_knn(X, n_clusters)

        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # Create the axes on `fig` explicitly instead of relying on pyplot's
        # "current figure", which other cells may close or replace.
        ax = fig.add_subplot(3, 2, plot_num)
        if plot_num == 1:
            ax.set_title(name, size=18)

        # One colour per predicted label, cycling through the palette.
        colors = np.array(list(islice(cycle(palette), int(max(y_pred) + 1))))
        # Trailing black entry: a label of -1 (outlier, if any) indexes it.
        colors = np.append(colors, ["#000000"])
        ax.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        ax.set_xlim(-2.5, 2.5)
        ax.set_ylim(-2.5, 2.5)
        ax.set_xticks(())
        ax.set_yticks(())
        # Fit duration in the lower-right corner of the panel.
        ax.text(
            0.99,
            0.01,
            ("%.2fs" % (delta)).lstrip("0"),
            transform=ax.transAxes,
            size=15,
            horizontalalignment="right",
        )

    display(fig)

# Selection widget over the candidate cluster counts; plot_boundary is called
# with the chosen value.
inter = interactive(plot_boundary, n_clusters=[1, 3, 5, 7, 9])
display(inter)

Recall how K-Means splits the space.

Which datasets are most suitable for this algorithm?

## Part 2 - Train and Fit

In [None]:
# 1. Load the iris dataset: https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets
# and assign its features to X and its labels to y

In [None]:
# 2. Instantiate a KMeans (or even a MiniBatchKMeans) estimator
# Hint: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html?highlight=kmeans#sklearn.cluster.KMeans

In [None]:
# 3. Make predictions on X

In [None]:
# Iris data: feature matrix in X, class labels in y.
dataset = datasets.load_iris()
X, y = dataset.data, dataset.target



In [None]:
# random_state pins the centroid initialisation. Without it the estimator
# draws from NumPy's global generator, whose state depends on how often the
# interactive demo above was re-run, so the clusters would not be reproducible.
est = cluster.KMeans(n_clusters=5, random_state=0)
est.fit(X)
y_pred = est.predict(X)

## Extra - Check clusters against classes

In [None]:
# Close the current pyplot figure before building a new one.
plt.close()
import seaborn as sns
from matplotlib.lines import Line2D

# NOTE(review): not used in this cell; kept in case a later cell relies on it.
cmap_bold = ["darkorange", "c", "darkblue"]

# Use a dedicated name for this figure: the interactive demo's plot_boundary
# reads the notebook-level `fig`, so rebinding `fig` here would make later
# widget redraws clear and overwrite this plot.
iris_fig, ax = plt.subplots(figsize=(10, 7))

# Columns of X shown on the horizontal and vertical axes.
plot_x = 2
plot_y = 3

legends = []

# One marker shape per true class; the point colour is its predicted cluster.
for i in range(max(dataset.target) + 1):
    index = np.where(y == i)[0]
    col = [COLORS[y_pred[j]] for j in index]
    ax.scatter(
        x=X[:, plot_x][index],
        y=X[:, plot_y][index],
        c=col,
        marker=MARKERS[i],
        label=dataset.target_names[i],
    )
    # Colourless proxy handle so the legend shows the marker shape only.
    legends.append(
        Line2D(
            [0],
            [0],
            marker=MARKERS[i],
            color="w",
            label=dataset.target_names[i],
            markerfacecolor="w",
            markersize=12,
            markeredgecolor="black",
        )
    )

# One legend entry and one highlighted centre per predicted cluster.
for i in range(max(y_pred) + 1):
    legends.append(
        Line2D(
            [0],
            [0],
            marker="o",
            color=COLORS[i],
            label=f"cluster {i+1}",
            markerfacecolor=COLORS[i],
            markersize=12,
            markeredgecolor="black",
        )
    )
    ax.scatter(
        x=est.cluster_centers_[i, plot_x],
        y=est.cluster_centers_[i, plot_y],
        color=COLORS[i],
        marker="o",
        edgecolor="black",
        s=100,
    )

# Label the axes with the plotted feature names so the figure stands alone.
ax.set_xlabel(dataset.feature_names[plot_x])
ax.set_ylabel(dataset.feature_names[plot_y])
ax.set_title("Predicted clusters (colour) vs. true classes (marker)")
ax.legend(handles=legends)
plt.show()