**Import needed packages/modules**

In [None]:
# Cell 1
from dataclasses import dataclass
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive

**Mount your Google Drive and determine the path to this notebook**

In [None]:
# Cell 2
# Mount Google Drive inside the Colab VM (re-mounting if it is already mounted)
drive.mount("/content/gdrive", force_remount=True)

# Folder on the mounted drive that holds this notebook and its data file
notebook_path = (
    Path("/content/gdrive/MyDrive/SciComp101-GC") / "Session 20 - Machine Learning"
)
notebook_path

**Define a Python `dataclass` to store each data point in the data file**
1. A dataclass automatically generates all of the standard special methods expected of Python classes
2. These double underscore (dunder) methods include \_\_init\_\_, \_\_repr\_\_, \_\_eq\_\_, etc.
3. Each `DataPoint` object stores its $(x,y)$ 2D Cartesian coordinate
4. It also stores a reference to which `Cluster` object this data point currently belongs

In [None]:
# Cell 3
@dataclass
class DataPoint:
    """A single 2D sample together with the cluster it currently belongs to."""

    # NOTE: @dataclass only turns *annotated* class attributes into fields.
    # Without the annotations below, the generated __init__/__repr__/__eq__
    # would see no fields at all (every instance would compare equal and
    # DataPoint(1, 2) would raise TypeError).

    # Cartesian coordinates of the sample
    x: float = 0.0
    y: float = 0.0
    # Cluster this point is currently assigned to (None until assigned)
    cluster: "Cluster | None" = None

**Define a Python `dataclass` to store information about each cluster**
1. Each `Cluster` object has a unique <u>integer</u> (index) to identify that cluster
2. Each cluster has a unique matplotlib color name, stored as a string
3. Each cluster stores the $(x,y)$ 2D Cartesian coordinate of its center
4. Its current population (the number of data points assigned to that cluster)
5. The mean distance from the cluster's center to the data points assigned to it

In [None]:
# Cell 4
@dataclass
class Cluster:
    """State of one k-means cluster."""

    # NOTE: the type annotations are what make these attributes dataclass
    # fields; un-annotated class attributes are ignored by @dataclass, which
    # would leave __init__/__repr__/__eq__ with nothing to work on.

    # Integer identifier of the cluster (its position in the cluster list)
    index: int = 0
    # matplotlib color name used when plotting this cluster
    color: str = ""
    # Cartesian coordinates of the cluster's center
    x: float = 0.0
    y: float = 0.0
    # Number of data points currently assigned to this cluster
    population: int = 0
    # Mean distance from the center to the assigned data points
    mean_distance: float = 0.0

**Define a function to read the data points from the CSV data file**

In [None]:
# Cell 5
def init_points(include_outliers):
    """Load the sample coordinates from the CSV data file.

    Reads ``cluster_samples.csv`` from ``notebook_path`` and wraps the first
    two columns of every row in a ``DataPoint`` (column 0 -> x, column 1 -> y).

    Args:
        include_outliers: When falsy, the last point read from the file is
            discarded before returning (presumably the outlier row - verify
            against the data file).

    Returns:
        A list of ``DataPoint`` objects in file order.
    """
    csv_path = notebook_path / "cluster_samples.csv"
    rows = np.genfromtxt(csv_path, delimiter=",")

    points = []
    for row in rows:
        point = DataPoint()
        point.x = row[0]
        point.y = row[1]
        points.append(point)

    if include_outliers:
        return points

    # Drop the final point read from the file
    points.pop()
    return points

**Define a function to initialize the clusters**
1. The function accepts a parameter indicating the number of clusters to create
2. The *maximum* number of clusters is **six** based upon the number of defined color names
3. Each cluster gets a unique index value

In [None]:
# Cell 6
def init_clusters(num_clusters):
    """Create ``num_clusters`` clusters, each with its own index and color.

    Cluster ``i`` receives ``index == i`` and the ``i``-th entry of a fixed
    six-color palette, so requesting more than six clusters raises
    ``IndexError``.

    Args:
        num_clusters: Number of clusters to create.

    Returns:
        A list of ``Cluster`` objects ordered by index.
    """
    palette = ("red", "blue", "green", "purple", "yellow", "orange")

    def build(position):
        # One cluster whose identifier and color both come from its position
        cluster = Cluster()
        cluster.index = position
        cluster.color = palette[position]
        return cluster

    return [build(position) for position in range(num_clusters)]

**Define a function to assign each data point to an <u>initial</u> cluster**
1. Lacking any other information, the data points are assigned to clusters in a *round-robin* fashion
2. As we assign each data point to a cluster, we must increment that cluster's population count

In [None]:
# Cell 7
def init_assign(pts, cs):
    """Give every point an initial cluster, cycling through ``cs`` in order.

    Point ``i`` is attached to cluster ``i mod len(cs)``, and that cluster's
    ``population`` counter is incremented to match.

    Args:
        pts: Points to assign; each one's ``cluster`` attribute is set.
        cs: Clusters to distribute the points over.
    """
    for position, point in enumerate(pts):
        chosen = cs[position % len(cs)]
        point.cluster = chosen
        chosen.population += 1

**Define a function to implement the k-means clustering algorithm**\
The function receives three parameters:
1. The list of data points
2. The list of clusters
3. A multiple of each cluster's mean point-to-center distance, used as the threshold for deciding whether a point should be **evicted** from its cluster


The function returns True or False to indicate if the clusters have **converged**

In [None]:
# Cell 8
def reassign(pts, cs, mean_multiple):
    """Perform one k-means iteration, updating ``pts`` and ``cs`` in place.

    The iteration has up to three phases:

    1. Move each cluster's center to the arithmetic mean (centroid) of the
       points currently assigned to it.
    2. Reassign each point to the cluster whose center is nearest, unless
       that would take the last point away from its current cluster.
    3. Only when phases 1 and 2 changed nothing and ``mean_multiple > 0``:
       remove ("evict") from ``pts`` every point whose distance to its
       cluster's center is not less than ``mean_multiple`` times that
       cluster's mean point-to-center distance. A cluster's last remaining
       point is never evicted.

    Args:
        pts: Mutable list of point objects with ``x``, ``y`` and ``cluster``
            attributes. Evicted points are removed from this list in place.
        cs: List of cluster objects with ``index``, ``x``, ``y``,
            ``population`` and ``mean_distance`` attributes. The nearest
            cluster is looked up as ``cs[index]``, so ``cs[i].index`` must
            equal ``i``.
        mean_multiple: Eviction threshold expressed as a multiple of a
            cluster's mean distance; a value ``<= 0`` disables phase 3.

    Returns:
        True if nothing changed during this call (no center moved and no
        point was reassigned or evicted), otherwise False.
    """
    # Phase I: Calculate the new arithmetic mean (centroid) of each
    # cluster based upon current data point assignments
    converged = True
    for c in cs:
        if c.population <= 0:
            # An empty cluster has no centroid; keep its current center
            # rather than dividing by zero.
            continue
        nx, ny = 0.0, 0.0
        for p in pts:
            if p.cluster.index == c.index:
                nx += p.x
                ny += p.y
        nx /= c.population
        ny /= c.population
        if c.x != nx or c.y != ny:
            c.x, c.y = nx, ny
            converged = False

    # Phase II: Assign data points to nearest cluster
    for p in pts:
        min_d = np.finfo(np.float64).max
        min_i = 0
        for c in cs:
            d = np.hypot(p.x - c.x, p.y - c.y)
            if d < min_d:
                min_d = d
                min_i = c.index
        # Never move the last remaining point out of a cluster
        if p.cluster.index != min_i and p.cluster.population > 1:
            p.cluster.population -= 1
            p.cluster = cs[min_i]
            p.cluster.population += 1
            converged = False

    # Phase III - Evict any point too far away from its cluster's center
    if converged and mean_multiple > 0:
        # Calculate mean distance from each cluster's center
        # to the assigned points for that cluster
        for c in cs:
            d = 0.0
            for p in pts:
                if p.cluster.index == c.index:
                    d += np.hypot(p.x - c.x, p.y - c.y)
            c.mean_distance = d / c.population if c.population > 0 else 0.0

        # Only keep points where the distance to its assigned cluster's
        # center is less than a multiple of that cluster's mean distance
        # to its assigned points
        kept = []
        for p in pts:
            c = p.cluster
            d = np.hypot(p.x - c.x, p.y - c.y)
            if d < c.mean_distance * mean_multiple or c.population <= 1:
                # Within the threshold, or the cluster's last point. The last
                # point must be kept explicitly: a single-point cluster has a
                # mean distance of 0, so it can never pass the "<" test, and
                # it would otherwise vanish from pts while its cluster still
                # counted it in population.
                kept.append(p)
            else:
                print(f"Evicted DataPoint({p.x}, {p.y}) from Cluster {c.index}")
                c.population -= 1
                converged = False
        pts[:] = kept

    return converged

**Define a function to iterate the k-means algorithm a given number of times**\
This function takes two required and two optional inbound parameters:
1. The `num_clusters` represents how many k-clusters to populate
2. The `max_iter` is the maximum # of iterations you want to call the **reassign()** function
3. Set `include_outliers` = **True** to include the datapoint that is far away from the others
4. Set `mean_multiple` $>0$ to include the Phase III cluster eviction code in **reassign()**

In [None]:
# Cell 9
def kmeans(num_clusters, max_iter, include_outliers=False, mean_multiple=0):
    """Run k-means on the sample data, plot the result and report convergence.

    Loads the points, creates ``num_clusters`` clusters, assigns the points
    round-robin, then calls ``reassign()`` until it reports convergence or
    ``max_iter`` passes have been made. The final point assignments and
    cluster centers are shown in a matplotlib scatter plot.

    Args:
        num_clusters: Number of clusters (k) to create.
        max_iter: Maximum number of ``reassign()`` passes; 0 plots the
            initial assignment only.
        include_outliers: Passed to ``init_points()``; when False the last
            point read from the data file is discarded.
        mean_multiple: Passed to ``reassign()``; a value > 0 enables its
            Phase III eviction step.

    Returns:
        None. A message is printed if the clusters converged.
    """
    points = init_points(include_outliers)
    clusters = init_clusters(num_clusters)
    init_assign(points, clusters)

    converged = False
    iterations = 0
    for _ in range(max_iter):
        iterations += 1
        converged = reassign(points, clusters, mean_multiple)
        if converged:
            # Stop at the first pass that changes nothing so the count
            # reported below is the pass at which convergence was actually
            # detected, not merely max_iter.
            break

    for p in points:
        plt.scatter(p.x, p.y, color=p.cluster.color, alpha=0.5, edgecolor="black")
    # Cluster centers are drawn as squares in the cluster's color
    for c in clusters:
        plt.scatter(c.x, c.y, color=c.color, marker="s")
    plt.title(f"k-Means Clustering (k={num_clusters})")
    plt.xlim(-5, 45)
    plt.ylim(-5, 45)
    plt.gca().set_aspect("equal")
    plt.show()
    if converged:
        print(f"Clusters converged after {iterations} iterations!")

In [None]:
# Cell 10
# max_iter=0: reassign() is never called, so this plots the initial round-robin assignment
kmeans(num_clusters=3, max_iter=0)

In [None]:
# Cell 11
# Three clusters, allowing one reassign() pass
kmeans(num_clusters=3, max_iter=1)

In [None]:
# Cell 12
# Three clusters, allowing two reassign() passes
kmeans(num_clusters=3, max_iter=2)

In [None]:
# Cell 13
# Three clusters, allowing three reassign() passes
kmeans(num_clusters=3, max_iter=3)

In [None]:
# Cell 14
# Three clusters, allowing four reassign() passes
kmeans(num_clusters=3, max_iter=4)

In [None]:
# Cell 15
# Three clusters, allowing five reassign() passes
kmeans(num_clusters=3, max_iter=5)

In [None]:
# Cell 16
# include_outliers=True keeps the last point from the data file; max_iter=0 plots the initial assignment only
kmeans(num_clusters=3, max_iter=0, include_outliers=True)

In [None]:
# Cell 17
# Three clusters with include_outliers=True, allowing six reassign() passes
kmeans(num_clusters=3, max_iter=6, include_outliers=True)

In [None]:
# Cell 18
# Four clusters with include_outliers=True; max_iter=0 plots the initial assignment only
kmeans(num_clusters=4, max_iter=0, include_outliers=True)

In [None]:
# Cell 19
# Four clusters with include_outliers=True, allowing three reassign() passes
kmeans(num_clusters=4, max_iter=3, include_outliers=True)

In [None]:
# Cell 20
# Five clusters with include_outliers=True; max_iter=0 plots the initial assignment only
kmeans(num_clusters=5, max_iter=0, include_outliers=True)

In [None]:
# Cell 21
# Five clusters with include_outliers=True, allowing six reassign() passes
kmeans(num_clusters=5, max_iter=6, include_outliers=True)

In [None]:
# Cell 22
# mean_multiple=2 would enable Phase III eviction in reassign(), but with max_iter=0 reassign() is never called
kmeans(num_clusters=3, max_iter=0, include_outliers=True, mean_multiple=2)

In [None]:
# Cell 23
# Phase III eviction enabled (mean_multiple=2), allowing five reassign() passes
kmeans(num_clusters=3, max_iter=5, include_outliers=True, mean_multiple=2)

In [None]:
# Cell 24
# Phase III eviction enabled (mean_multiple=2), allowing six reassign() passes
kmeans(num_clusters=3, max_iter=6, include_outliers=True, mean_multiple=2)

In [None]:
# Cell 25
# Phase III eviction enabled (mean_multiple=2), allowing eight reassign() passes
kmeans(num_clusters=3, max_iter=8, include_outliers=True, mean_multiple=2)