In [1]:
import pandas as pd
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt

In [2]:
# Number of clusters (and therefore of centroids) used by the k-means cells below.
k = 4
# Seed NumPy's global random state so the random centroid initialisation
# draws the same values on every run.
np.random.seed(1)

In [3]:
# Load the data set; the first CSV column becomes the row index.
df_with_labels = pd.read_csv('data/iris_all.csv', index_col=0)
# Working frame without the 'Species' label column.
# NOTE(review): the cells below take min/max and distances over every remaining
# column, so they are presumably all numeric features - confirm against the CSV.
df = df_with_labels.drop(columns=['Species'])

In [4]:
# Row count of the working frame (not referenced by the other visible cells).
number_of_observations = len(df)
# Column count at this point, i.e. before any 'Centroid'/'Species' column is
# added to df later on; used as the length of every centroid vector.
number_of_dimensions = len(df.columns)
# Smallest and largest single value across ALL columns (a global range,
# not a per-column one); these bound the random centroid initialisation.
min_value = df.min().min()
max_value = df.max().max()

In [5]:
def initialize_centroids():
    """Draw the random starting centroids.

    Returns a list of ``k`` NumPy arrays. Each array holds
    ``number_of_dimensions`` values sampled uniformly between the global
    ``min_value`` and ``max_value``, so every coordinate shares the same
    sampling range regardless of which column it corresponds to.
    """
    # One np.random.uniform call per centroid, in order, so the sequence of
    # draws from the seeded global random state is the same on every run.
    return [
        np.random.uniform(min_value, max_value, number_of_dimensions)
        for _ in range(k)
    ]

In [6]:
def calculate_euclidean_distance(x1, x2):
    """Return the Euclidean distance between two coordinate sequences.

    Coordinates are paired positionally. If one sequence is longer than the
    other, only the shared leading coordinates are compared; this lets a data
    row that carries extra trailing columns be measured against a centroid
    that holds feature values only.

    Parameters
    ----------
    x1, x2 : sequences of numbers (lists, tuples or 1-D arrays).

    Returns
    -------
    float
        Square root of the summed squared coordinate differences
        (0.0 for empty input).
    """
    # Pairing with zip removes the dependency on a module-level dimension
    # count, so the function also works for vectors of any length.
    return sqrt(sum((a - b) ** 2 for a, b in zip(x1, x2)))

In [7]:
def get_errors(observation, centroids):
    """Return the distance from ``observation`` to each centroid.

    The result is a list with one entry per element of ``centroids``, in the
    same order, so position ``j`` is the distance to ``centroids[j]``.
    """
    return [calculate_euclidean_distance(observation, c) for c in centroids]

In [8]:
def assign_centroids(centroids):
    """Assign every row of the global ``df`` to its nearest centroid.

    Parameters
    ----------
    centroids : sequence of centroid vectors, each of length
        ``number_of_dimensions``.

    Returns
    -------
    (centroid_assign, centroid_errors) : tuple of two lists
        ``centroid_assign[r]`` is the position (within ``centroids``) of the
        centroid closest to row ``r``; ``centroid_errors[r]`` is the distance
        to that centroid. Ties go to the centroid that appears first.
    """
    # Use only the leading feature columns. Columns appended to df later
    # ('Centroid', 'Species') must not leak into the distance computation,
    # and a string column would otherwise turn the whole array into objects.
    data = df.iloc[:, :number_of_dimensions].to_numpy()

    centroid_assign = []
    centroid_errors = []

    for observation in data:
        errors = get_errors(observation, centroids)
        # Compute the minimum once and reuse it for both outputs.
        centroid_error = min(errors)
        centroid_assign.append(errors.index(centroid_error))
        centroid_errors.append(centroid_error)

    return centroid_assign, centroid_errors

In [9]:
def kmeans(max_iterations=100):
    """Cluster the rows of the global ``df`` with k-means.

    Starting from ``initialize_centroids()``, the function alternates between
    assigning each row to its nearest centroid (``assign_centroids``) and
    moving every centroid to the mean of the rows assigned to it. The current
    assignment is written to ``df['Centroid']`` on every iteration, and the
    iteration index and total error are printed, one per line.

    Parameters
    ----------
    max_iterations : int, optional
        Upper bound on the number of assign/update rounds, as a guard against
        a run that never settles. With 0 no round is performed and the
        initial centroids are returned unchanged.

    Returns
    -------
    numpy.ndarray
        The final centroids, one row per cluster; row ``j`` corresponds to
        the label ``j`` stored in ``df['Centroid']``.
    """
    # Only the leading feature columns are averaged. df may also carry a
    # 'Centroid' column (from a previous run) or other columns added later.
    feature_columns = list(df.columns[:number_of_dimensions])
    centroids = np.array(initialize_centroids(), dtype=float)
    previous_assignment = None
    i = 0

    while i < max_iterations:
        assignment, centroid_errors = assign_centroids(centroids)
        df['Centroid'] = assignment
        total_error = sum(centroid_errors)

        # groupby only yields clusters that received at least one row.
        # Writing the means back by label keeps an empty cluster's centroid
        # where it was, so the centroid count and the label numbering stay
        # stable instead of shrinking and shifting.
        cluster_means = df.groupby('Centroid')[feature_columns].mean()
        centroids[cluster_means.index.to_numpy()] = cluster_means.to_numpy()

        print(i)
        print(total_error)
        i += 1

        # Converged once no row changed cluster: the means just computed are
        # then a fixed point. The printed error is a sum of distances rather
        # than the quantity the mean update minimises, so it is reported but
        # not used as the stopping test.
        if assignment == previous_assignment:
            break
        previous_assignment = assignment

    return centroids

In [10]:
# Run the clustering on df; every iteration prints its index followed by the
# total error, and the assignment ends up in df['Centroid'].
kmeans()

0
732.4405188559748
1
116.48921684939155
2
104.31856782368888
3
99.69280588748012
4
98.36502981782091
5
98.05081457534052
6
97.79680272026668
7
97.46664149213983
8
97.2788014001845
9
97.20584742866642
10
97.12505123764717
11
97.06877456223792
12
97.2248690338732
13
97.2248690338732


In [11]:
# Re-attach the true species labels (aligned on the row index) so the cluster
# assignment can be compared with them. The next cell reads df['Species'].
labels = df_with_labels['Species']
df['Species'] = labels

# Print the members of each cluster label 0 .. k-1 with their species.
for i in range(k):
    data = df.loc[df['Centroid'] == i]
    print(data.loc[:, ['Centroid', 'Species']])

    Centroid Species
nr                  
4          0  setosa
5          0  setosa
10         0  setosa
11         0  setosa
16         0  setosa
17         0  setosa
22         0  setosa
23         0  setosa
28         0  setosa
29         0  setosa
34         0  setosa
35         0  setosa
40         0  setosa
41         0  setosa
46         0  setosa
47         0  setosa
1          0  setosa
2          0  setosa
3          0  setosa
6          0  setosa
7          0  setosa
8          0  setosa
9          0  setosa
12         0  setosa
13         0  setosa
14         0  setosa
15         0  setosa
18         0  setosa
19         0  setosa
20         0  setosa
21         0  setosa
24         0  setosa
25         0  setosa
26         0  setosa
27         0  setosa
30         0  setosa
31         0  setosa
32         0  setosa
33         0  setosa
36         0  setosa
37         0  setosa
38         0  setosa
39         0  setosa
42         0  setosa
43         0  setosa
44         0 

In [12]:
# Share of each species within every cluster, as a percentage string
# rounded to one decimal place.
(
    df.groupby('Centroid')['Species']
    .value_counts(normalize=True)
    .mul(100)
    .round(1)
    .astype(str)
    .sort_index()
    + '%'
)

Centroid  Species   
0         setosa        100.0%
1         versicolor     77.0%
          virginica      23.0%
2         versicolor      7.7%
          virginica      92.3%
Name: Species, dtype: object