# K-means clustering for the Iris data set

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
# this cell has been tagged with "parameters"
# NOTE(review): presumably the tool that executes this notebook overrides the
# values assigned in "parameters" cells — confirm against how it is launched.

# TODO fetch iris dataset
# Placeholder only. Later cells call iris.head() / iris.describe() and index
# iris["Species"] plus the four "Sepal/Petal length/width (cm)" columns, so
# unless a table with those columns is assigned (or injected) here, every
# cell below fails on a fresh kernel.
iris = None

In [None]:
# this cell has been tagged with "parameters"
# Number of clusters handed to sklearn.cluster.KMeans further down.
# NOTE(review): the legend of the final scatter-plot grid hard-codes three
# cluster entries ("Class 0" .. "Class 2"); changing this value means that
# legend needs updating as well.
n_clusters = 3

In [None]:
# Preview the first rows to sanity-check the columns and values.
iris.head()

In [None]:
# List the distinct labels present in the "Species" column.
iris["Species"].unique()

In [None]:
# Summary statistics of the dataset's columns.
iris.describe()

In [None]:
# TODO - fill out with our first analysis of the data

Famously, while it's easy to see from these plots that _Iris setosa_ is linearly separable from the other two species, the _versicolor_ and _virginica_ flowers are not linearly separable. We can see this another way by looking at box plots of the measured attributes:

In [None]:
# One panel per measurement on a 2x2 grid: a box plot of the measurement
# grouped by species, with the individual observations overlaid as a swarm.
measurement_columns = [
    "Sepal length (cm)",
    "Sepal width (cm)",
    "Petal length (cm)",
    "Petal width (cm)",
]

fig, axs = plt.subplots(ncols=2, nrows=2, figsize=(12, 12))

# axs.flat walks the grid row by row, so the panels are filled in the order
# (0, 0), (0, 1), (1, 0), (1, 1) — one measurement each.
for panel, measurement in zip(axs.flat, measurement_columns):
    sns.boxplot(x="Species", y=measurement, data=iris, ax=panel)
    # Dark grey points keep the raw observations visible on top of the boxes.
    sns.swarmplot(x="Species", y=measurement, data=iris, color=".25", ax=panel)

If we try to cluster this data using a method such as _k_-means, we'll be unsuccessful. Consider:

In [None]:
import sklearn.cluster

# Cluster on the four measurements only; the "Species" label is not passed to
# the estimator.
feature_columns = [
    "Sepal length (cm)",
    "Sepal width (cm)",
    "Petal length (cm)",
    "Petal width (cm)",
]

# k-means starts from randomly initialised centroids, so without a fixed seed
# the cluster numbering (and occasionally the partition itself) differs from
# run to run, which would invalidate the hand-labelled "Class 0/1/2" legend and
# the discussion below. n_init is pinned explicitly because scikit-learn's
# default for it differs between releases.
estimator = sklearn.cluster.KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
estimator.fit(iris[feature_columns])

In [None]:
# Store each row's fitted cluster label as a new column (added to `iris` in
# place) so the scatter plots below can use it as the marker-style variable
# alongside the "Species" hue.
iris["K-means cluster assignment"] = estimator.labels_

In [None]:
import matplotlib.lines as mlines
import matplotlib.patches as mpatches

fig, axs = plt.subplots(ncols=3, nrows=3, figsize=(12, 12))

# (x column, y column, grid position) for every pair of distinct measurements.
# Only the upper triangle of the 3x3 grid is used; the lower-left cells are
# blanked below and one of them hosts the legend.
scatter_panels = [
    ("Sepal width (cm)", "Sepal length (cm)", (0, 0)),
    ("Petal length (cm)", "Sepal length (cm)", (0, 1)),
    ("Petal width (cm)", "Sepal length (cm)", (0, 2)),
    ("Petal length (cm)", "Sepal width (cm)", (1, 1)),
    ("Petal width (cm)", "Sepal width (cm)", (1, 2)),
    ("Petal width (cm)", "Petal length (cm)", (2, 2)),
]

# Colour encodes the true species, marker style encodes the k-means cluster,
# so a species drawn with mixed marker styles was split across clusters.
for x_column, y_column, position in scatter_panels:
    sns.scatterplot(
        x=x_column,
        y=y_column,
        data=iris,
        ax=axs[position],
        hue="Species",
        style="K-means cluster assignment",
        legend=False,  # a single shared legend is built by hand below
        marker="x",
        alpha=0.5,
    )

# Hide the unused lower-triangle axes.
for position in [(1, 0), (2, 0), (2, 1)]:
    axs[position].axis("off")

# Hand-built legend shared by all panels.
# NOTE(review): the colours assume the three species are mapped to the first
# three entries of the current seaborn palette in the order setosa,
# versicolor, virginica — confirm against the order they appear in `iris`.
palette = sns.color_palette()
setosa = mpatches.Patch(color=palette[0], label="Iris setosa", alpha=0.5)
versicolor = mpatches.Patch(color=palette[1], label="Iris versicolor", alpha=0.5)
virginica = mpatches.Patch(color=palette[2], label="Iris virginica", alpha=0.5)

# NOTE(review): the marker shapes are assumed to match the styles seaborn
# assigns to cluster labels 0, 1 and 2, and exactly three clusters are listed
# regardless of `n_clusters` — verify both if the parameter is changed.
class_0 = mlines.Line2D(
    [], [], marker="o", color="lightgrey", linestyle="None", markersize=10, label="Class 0"
)
class_1 = mlines.Line2D(
    [], [], marker="X", color="lightgrey", linestyle="None", markersize=10, label="Class 1"
)
class_2 = mlines.Line2D(
    [], [], marker="s", color="lightgrey", linestyle="None", markersize=10, label="Class 2"
)

# The legend is drawn in the blanked bottom-left cell; the trailing semicolon
# keeps the Legend repr out of the cell output.
axs[2, 0].legend(handles=[setosa, versicolor, virginica, class_0, class_1, class_2]);

We can see that _versicolor_ and _virginica_ individuals are incorrectly classified by the _k_-means estimator.