In [1]:
'''
Author: Dhananjay Shettigar (Roll No. 8702)
Program written for TEIT DMBI practical on clustering algorithm (Simple K-means clustering)
'''

import pandas as pd
import numpy as np
import math

import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# Number of clusters (k) to look for.
num_clusters = 3

# Load the iris dataset through seaborn's dataset loader and display it.
df = sns.load_dataset("iris")
df


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [3]:
# Separate the target (label) column from the feature columns.
target_col = 'species'
target = df[target_col]

# Pick the feature columns by name instead of popping the last one, so the
# split does not depend on the target being the final column of the frame.
columns = [col for col in df.columns if col != target_col]

# Drop the same column that `target_col` names (no second hard-coded label).
data = df.drop(target_col, axis = 1)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [11]:
def euclideanDist(x, y, columns):
    """Return the Euclidean distance between two points.

    x, y    -- objects indexable by column name (e.g. dict or pandas Series)
    columns -- iterable of the column names that take part in the distance
    """
    squared_sum = 0
    for name in columns:
        delta = x[name] - y[name]
        squared_sum += delta * delta
    return math.sqrt(squared_sum)

def randomInit(data, num_clusters, random_state=None):
    """Seed K-means with `num_clusters` rows drawn at random from `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to draw the initial centroids from.
    num_clusters : int
        Number of clusters (and therefore seed rows) to create.
    random_state : optional
        Forwarded to `DataFrame.sample`. Pass a fixed value to get the same
        seeds on every run; the default (None) keeps the unseeded behaviour.

    Returns
    -------
    list of list of dict
        One single-element list per cluster; the element is the seed
        centroid as a {column: value} dict.
    """
    initial_centroids = data.sample(n = num_clusters, random_state = random_state)
    return [
        [initial_centroids.iloc[idx].to_dict()]
        for idx in range(num_clusters)
    ]

def getNewCentroids(cluster_arr, columns):
    """Compute the new centroid (mean point) of every cluster.

    Each cluster is a list whose first element is the centroid the cluster
    was built around and whose remaining elements are the member rows.
    Only the member rows are averaged: including the old centroid would pull
    the new mean towards it.

    Parameters
    ----------
    cluster_arr : list of list of dict
        Clusters laid out as [centroid, member, member, ...].
    columns : iterable of str
        Feature columns to average.

    Returns
    -------
    list of pandas.Series
        One Series per cluster, indexed by `columns`.
    """
    column_names = list(columns)
    centroids = []
    for cluster in cluster_arr:
        members = cluster[1:]
        # A cluster that attracted no rows keeps its current centroid rather
        # than averaging nothing.
        points = members if members else cluster
        centroids.append(pd.DataFrame(points, columns=column_names).mean(axis=0))
    return centroids

def areSameCentroids(cluster_arr, centroids, columns, min_dist):
    """Tell whether every centroid stayed within `min_dist` of its previous spot.

    cluster_arr -- clusters whose first element is the previous centroid
    centroids   -- newly computed centroids, in the same order as cluster_arr
    columns     -- feature columns used for the distance
    min_dist    -- largest movement still counted as "unchanged"

    Returns True when no centroid moved farther than `min_dist`.
    """
    moved_too_far = (
        euclideanDist(cluster_arr[pos][0], centroids[pos], columns) > min_dist
        for pos in range(len(centroids))
    )
    # any() stops at the first centroid that moved, like the early break would.
    return not any(moved_too_far)

def updateCentroids(cluster_arr, centroids, columns):
    """Build a fresh set of clusters, each holding only its centroid.

    The incoming `cluster_arr` is neither read nor modified; a new list is
    returned with one single-element cluster per centroid. Each element is a
    dict restricted to `columns`.
    """
    return [
        [{name: centroids[idx][name] for name in columns}]
        for idx in range(len(centroids))
    ]

def fillClusters(cluster_arr, dataset, columns):
    """Assign every row of `dataset` to the cluster with the nearest centroid.

    Parameters
    ----------
    cluster_arr : list of list of dict
        Clusters whose first element is the centroid. Modified in place:
        each row is appended (as a dict) to the cluster it is closest to.
    dataset : pandas.DataFrame
        Rows to assign.
    columns : iterable of str
        Feature columns used for the distance.

    Returns
    -------
    None
    """
    for position in range(len(dataset)):
        # Walk the rows by position. The index labels are not guaranteed to
        # be 0..n-1 (e.g. after filtering or a train/test split), and passing
        # labels to .iloc would pick the wrong row or raise IndexError.
        row = dataset.iloc[position]
        distances = [
            euclideanDist(cluster[0], row, columns) for cluster in cluster_arr
        ]
        # index(min(...)) resolves ties in favour of the first cluster.
        nearest = distances.index(min(distances))
        cluster_arr[nearest].append(row.to_dict())

def findCentroidsUsingKMeans(dataset, columns, num_clusters=2, min_dist=0):
    """Run K-means on `dataset` until the centroids stop moving.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows to cluster.
    columns : list of str
        Feature columns used for distances and centroid updates.
    num_clusters : int
        Currently unused: the seeds below are hard-coded, so the run always
        uses three clusters. Only the commented-out random initialisation
        would read it.
    min_dist : float
        A centroid counts as unchanged when it moved at most this far
        between two consecutive iterations.

    Returns
    -------
    tuple
        (centroids, clusters): the centroids computed in the last iteration,
        and the clusters rebuilt from them (one single-centroid list each).
    """
    # Initialize with random rows for centroids
    # clusters = randomInit(dataset, num_clusters)

    # Test centroids for cross validating against WEKA implementation
    clusters = [
        [
            {
                'sepal_length': 6.1,
                'sepal_width': 2.9,
                'petal_length': 4.7,
                'petal_width': 1.4
            }
        ],
        [
            {
                'sepal_length': 6.2,
                'sepal_width': 2.9,
                'petal_length': 4.3,
                'petal_width': 1.3
            }
        ],
        [
            {
                'sepal_length': 6.9,
                'sepal_width': 3.1,
                'petal_length': 5.1,
                'petal_width': 2.3
            }
        ]
    ]

    are_final_centroids = False
    centroids = []
    while not are_final_centroids:
        # Cluster the frame that was passed in, not the notebook-level `data`
        # global, so the function works on whatever the caller supplies.
        fillClusters(clusters, dataset, columns)
        centroids = getNewCentroids(clusters, columns)
        are_final_centroids = areSameCentroids(clusters, centroids, columns, min_dist)
        # Reset every cluster to just its new centroid for the next pass.
        clusters = updateCentroids(clusters, centroids, columns)
    return centroids, clusters


In [13]:
final_centroids, final_clusters = findCentroidsUsingKMeans(data, columns, 3, 0)

print("Final centroids: ")
for centroid in final_centroids:
    print(centroid)

# Label every row with the position of its nearest final centroid.
test = []
for idx in range(data.shape[0]):
    row = data.iloc[idx]
    distances = [euclideanDist(centre, row, columns) for centre in final_centroids]
    min_dist_idx = distances.index(min(distances))
    test.append(min_dist_idx)

# Count how many rows landed in each cluster.
arr = np.array(test)
print("Cluster sizes: ", np.bincount(arr))

Final centroids: 
sepal_length    5.901613
sepal_width     2.748387
petal_length    4.393548
petal_width     1.433871
dtype: float64
sepal_length    5.006
sepal_width     3.428
petal_length    1.462
petal_width     0.246
dtype: float64
sepal_length    6.850000
sepal_width     3.073684
petal_length    5.742105
petal_width     2.071053
dtype: float64
Cluster sizes:  [62 50 38]
