### Implementing K-Means with numpy and pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from copy import deepcopy
from sklearn.cluster import KMeans

In [2]:
df = pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]

### Scikit-learn KMeans

In [4]:
%%time
sse_sk = []

for i in range(1, 12):
    kmeans_sk = KMeans(n_clusters=i, init='random', random_state=999).fit(data)
    sse_sk.append(kmeans_sk.inertia_)

ks_sk = np.arange(1, 12)

px.line(x=ks_sk, y=sse_sk)



Wall time: 1.15 s


### Implementation with numpy and pandas

In [5]:
def fit_k_means(df, n_clusters=3, tol=0.000001, max_iter=1000, random_state=7):
    
    k=n_clusters
    n=df.shape[0]
    
    # sample centers
    centroids_df = df.sample(n=n_clusters, random_state=random_state, axis=0)
    centroids = df.sample(n=n_clusters, random_state=random_state, axis=0).values

    data = df[df.columns].values

    old_centroids = np.zeros(centroids.shape) # store old centers
    new_centroids = deepcopy(centroids) # store new centers

    clusters = np.zeros(n)
    distances = np.zeros((n,k))
    mean_distances = np.zeros(k)
    inertias = np.zeros(k)

    error = np.linalg.norm(new_centroids - old_centroids)

    iterations = 0

    # creates dataframes to store history
    init_df = pd.DataFrame(data, index=df.index, columns=df.columns)
    init_df['cluster'] = 0
    init_df['iterations'] = iterations + 1
    init_df['size'] = 0.2

    centroids_df = pd.DataFrame(new_centroids, index=centroids_df.index, columns=df.columns)
    centroids_df['cluster'] = np.arange(0, k)
    centroids_df['iterations'] = iterations + 1
    centroids_df['size'] = 1

    history = pd.concat([init_df, centroids_df])

    history_list = []
    history_list.append(history)

    # loop until error < tool or reach max_iter
    while error > tol and iterations != max_iter:
        # measure the distance to every centroid
        for i in range(k):
            distances[:,i] = np.linalg.norm(data - new_centroids[i], axis=1)
        # assign data to closest centroid (cluster)
        clusters = np.argmin(distances, axis = 1)
    
        old_centroids = deepcopy(new_centroids)
        # calculate mean for every cluster and update the centroid
        for i in range(k):
            new_centroids[i] = np.mean(data[clusters == i], axis=0)
        error = np.linalg.norm(new_centroids - old_centroids)
        
        # calculate mean distance of points to the cluster centroid
        for i in range(k):
            mean_distances[i]= np.mean(distances[np.argwhere(clusters == i), i])
        
        # calculate within cluster sum of square distances-wcss (inertia)
        for i in range(k):
            inertias[i]= np.sum(distances[np.argwhere(clusters == i), i]**2)
        
        # updates history dataframe
        init_df = pd.DataFrame(data, index=df.index, columns=df.columns)
        init_df['cluster'] = clusters
        init_df['iterations'] = iterations + 1
        init_df['size'] = 0.2
        
        centroids_df = pd.DataFrame(new_centroids, index=centroids_df.index, columns=df.columns)
        centroids_df['cluster'] = np.arange(0, k)
        centroids_df['iterations'] = iterations + 1
        centroids_df['size'] = 1

        history = pd.concat([init_df, centroids_df])
        history_list.append(history)
        
        iterations +=1
    
    # final atributes
    fit_k_means.centroids = new_centroids
    fit_k_means.error = error
    fit_k_means.mean_distances = mean_distances
    fit_k_means.inertia = np.sum(inertias)
    fit_k_means.history = pd.concat(history_list)
    
    return clusters

In [6]:
%%time
clusters = fit_k_means(df=data, max_iter=20, random_state=999)
clusters

Wall time: 55 ms


array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 2], dtype=int64)

In [7]:
fit_k_means.error

0.0

In [8]:
fit_k_means.centroids

array([[6.85384615, 3.07692308, 5.71538462, 2.05384615],
       [5.006     , 3.418     , 1.464     , 0.244     ],
       [5.88360656, 2.74098361, 4.38852459, 1.43442623]])

In [9]:
fit_k_means.mean_distances

array([0.73184588, 0.48413225, 0.73110849])

In [10]:
fit_k_means.inertia

78.94506582597731

In [11]:
kmeans_history = fit_k_means.history

In [12]:
kmeans_history.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,cluster,iterations,size
0,5.1,3.5,1.4,0.2,0,1,0.2
1,4.9,3.0,1.4,0.2,0,1,0.2
2,4.7,3.2,1.3,0.2,0,1,0.2
3,4.6,3.1,1.5,0.2,0,1,0.2
4,5.0,3.6,1.4,0.2,0,1,0.2


In [13]:
data['clusters'] = clusters
px.scatter(data, x='SepalLengthCm', y='PetalLengthCm', color='clusters', title="Implementation Results - Clusters")



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [14]:
sse_sk = []

for i in range(1, 12):
    clusters = fit_k_means(df=data, n_clusters=i, max_iter=20, random_state=999)
    sse_sk.append(fit_k_means.inertia)

ks_sk = np.arange(1, 12)

px.line(x=ks_sk, y=sse_sk, title="Implementation Results - Inertia")

In [15]:
kmeans_history.tail()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,cluster,iterations,size
148,6.2,3.4,5.4,2.3,0,11,0.2
149,5.9,3.0,5.1,1.8,2,11,0.2
61,6.853846,3.076923,5.715385,2.053846,0,11,1.0
64,5.006,3.418,1.464,0.244,1,11,1.0
80,5.883607,2.740984,4.388525,1.434426,2,11,1.0
