In [63]:
#k-means clustering using iris dataset
import random
import numpy as np
from scipy.spatial.distance import euclidean
from collections import defaultdict
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from itertools import combinations


def k_means(X, k=5, max_iter=1000):
    """Cluster the rows of X into k groups with the k-means algorithm.

    Initial centers are k distinct rows of X drawn with ``random.sample``;
    seed the ``random`` module beforehand if reproducible output is needed.

    Args:
    - X - feature matrix (iterable of equal-length numeric rows)
    - k - number of clusters (at least 1, at most the number of distinct rows)
    - max_iter - maximum number of assign/update iterations (at least 1)
    Returns:
    - clusters - defaultdict mapping each cluster center (a tuple of
      coordinates) to the list of observations assigned to it. The keys are
      the centers the observations were assigned against in the last
      iteration that ran. A center that attracted no observation is not
      listed.
    Raises:
    - ValueError - if k or max_iter is smaller than 1, or if k exceeds the
      number of distinct rows in X.
    """
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    if max_iter < 1:
        # Without this guard the loop never runs and there is nothing to return.
        raise ValueError("max_iter must be at least 1, got %r" % (max_iter,))

    # Sample the initial centers from the *distinct* rows: two identical
    # centers would be the same dictionary key and silently merge two clusters.
    distinct_rows = list(dict.fromkeys(tuple(pt) for pt in X))
    if k > len(distinct_rows):
        raise ValueError(
            "k=%r exceeds the number of distinct rows in X (%d)"
            % (k, len(distinct_rows))
        )
    c = random.sample(distinct_rows, k)

    for _ in range(max_iter):
        # Assignment step: members[i] collects the observations closest to c[i].
        # Tracking clusters by position keeps all k centers alive even when one
        # of them currently owns no observation.
        members = [[] for _ in c]
        for datapoint in X:
            distances = [euclidean(datapoint, center) for center in c]
            # np.argmin returns the first minimum, so ties go to the earlier center.
            members[int(np.argmin(distances))].append(datapoint)

        clusters = defaultdict(list)
        for center, pts in zip(c, members):
            if pts:
                # extend (not assign) so coinciding centers cannot drop points
                clusters[center].extend(pts)

        # Update step: each center moves to the mean of its observations; an
        # empty cluster keeps its previous center instead of disappearing.
        new_centers = [
            tuple(np.mean(pts, axis=0)) if pts else center
            for center, pts in zip(c, members)
        ]

        # Converged once the update step leaves the set of centers unchanged.
        if set(new_centers) == set(c):
            break

        c = new_centers

    return clusters

if __name__ == '__main__':
    # Load the iris dataset and keep its feature matrix as X.
    # The Bunch returned by load_iris exposes its fields by key as well as by attribute.
    iris = datasets.load_iris()
    X = iris['data']

In [64]:
# Cluster X with the default k=5; the returned mapping is displayed as the cell output.
# Initial centers come from random.sample and no seed is set in the visible cells,
# so the centers and cluster members shown below can differ from run to run.
k_means(X)

defaultdict(list,
            {(5.005999999999999,
              3.4180000000000006,
              1.464,
              0.2439999999999999): [array([5.1, 3.5, 1.4, 0.2]),
              array([4.9, 3. , 1.4, 0.2]),
              array([4.7, 3.2, 1.3, 0.2]),
              array([4.6, 3.1, 1.5, 0.2]),
              array([5. , 3.6, 1.4, 0.2]),
              array([5.4, 3.9, 1.7, 0.4]),
              array([4.6, 3.4, 1.4, 0.3]),
              array([5. , 3.4, 1.5, 0.2]),
              array([4.4, 2.9, 1.4, 0.2]),
              array([4.9, 3.1, 1.5, 0.1]),
              array([5.4, 3.7, 1.5, 0.2]),
              array([4.8, 3.4, 1.6, 0.2]),
              array([4.8, 3. , 1.4, 0.1]),
              array([4.3, 3. , 1.1, 0.1]),
              array([5.8, 4. , 1.2, 0.2]),
              array([5.7, 4.4, 1.5, 0.4]),
              array([5.4, 3.9, 1.3, 0.4]),
              array([5.1, 3.5, 1.4, 0.3]),
              array([5.7, 3.8, 1.7, 0.3]),
              array([5.1, 3.8, 1.5, 0.3]),
            

In [61]:
# Load the iris dataset and keep its feature matrix as X
# (the Bunch returned by load_iris exposes its fields by key as well as by attribute).
iris = datasets.load_iris()
X = iris['data']


In [62]:
# Illustration of random.sample (with X = iris.data): draw 5 rows of X
# without replacement and convert each row to a tuple.
centers = list(map(tuple, random.sample(list(X), 5)))
centers

[(5.7, 2.5, 5.0, 2.0),
 (5.0, 2.0, 3.5, 1.0),
 (4.6, 3.4, 1.4, 0.3),
 (5.1, 3.7, 1.5, 0.4),
 (6.7, 3.3, 5.7, 2.1)]