# Instructor Do: The K-Means Algorithm

In [None]:
# Initial imports
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import plotly.express as px
import hvplot.pandas
from pathlib import Path

In [None]:
# Load the iris CSV into a DataFrame and preview its first ten rows
file_path = Path("data/new_iris_data.csv")
df_iris = pd.read_csv(filepath_or_buffer=file_path)
df_iris.head(n=10)

## Using K-Means

In [None]:
# Build the K-Means estimator with K = 3, because the iris data is already
# known to contain three classes of plants; random_state fixes the seed.
model = KMeans(
    random_state=5,
    n_clusters=3,
)

In [None]:
# Train the K-Means model on the iris DataFrame
model.fit(X=df_iris)

In [None]:
# Ask the fitted model which cluster each row belongs to, then show the labels
predictions = model.predict(X=df_iris)
print(predictions)

In [None]:
# Store the cluster label assigned to each row in a new "class" column,
# then preview the first five rows of the updated DataFrame
df_iris["class"] = model.labels_
df_iris.head(n=5)

In [None]:
# Two-feature view of the clusters: sepal length vs. sepal width,
# with points grouped by their assigned "class"
df_iris.hvplot.scatter(
    by="class",
    y="sepal_width",
    x="sepal_length",
)

In [None]:
# Two-feature view of the clusters: petal length vs. petal width,
# with points grouped by their assigned "class"
df_iris.hvplot.scatter(
    by="class",
    y="petal_width",
    x="petal_length",
)

In [None]:
# Three-feature view of the clusters. Colour and marker symbol both follow
# the "class" column, and marker size follows sepal_width.
fig = px.scatter_3d(
    data_frame=df_iris,
    z="petal_length",
    y="sepal_length",
    x="petal_width",
    size="sepal_width",
    symbol="class",
    color="class",
    width=800,
)
# Position the legend at x=0, y=1
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

## Finding the best value for _k_ using the Elbow Curve

In [None]:
# Cluster on the measurement columns only. An earlier cell stores the K-Means
# labels in df_iris["class"]; leaving that column in would feed the previous
# clustering result back in as a feature and distort the inertia values.
# errors="ignore" keeps this cell runnable when "class" has not been added yet.
elbow_features = df_iris.drop(columns=["class"], errors="ignore")

inertia = []
k = list(range(1, 11))

# Looking for the best k: fit one model per candidate k and record its inertia
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(elbow_features)
    inertia.append(km.inertia_)

# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)