# Computing K-Means


In [12]:
import math

In [13]:
# Five 2-D sample points (x, y) used by the hand-worked k-means cells below.
data = [[2,0], [1,3], [3,5],[2,2], [4,6]]
print(data)

[[2, 0], [1, 3], [3, 5], [2, 2], [4, 6]]


In [3]:
def geo_dist(p1, p2):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    For p1 = (x1, y1) and p2 = (x2, y2) the result is
    sqrt((x1 - x2)^2 + (y1 - y2)^2). Only the first two coordinates of each
    point are read.
    """
    dx = p1[0] - p2[0]
    dy = p1[1] - p2[1]
    return math.sqrt(dx ** 2 + dy ** 2)

def centroids(data):
    """Return the centroid (per-coordinate mean) of a collection of points.

    Args:
        data: non-empty iterable of equal-length coordinate sequences,
            e.g. [[2, 0], [2, 2]]. Any number of dimensions is accepted.

    Returns:
        A tuple holding the mean of every coordinate; for 2-D input this is
        the (x_mean, y_mean) pair.

    Raises:
        ValueError: if ``data`` contains no points.
    """
    points = list(data)
    if not points:
        raise ValueError('cannot compute the centroid of an empty set of points')
    count = len(points)
    # zip(*points) regroups the rows into one tuple per coordinate axis, so the
    # same code serves 2-D points and higher-dimensional ones alike.
    return tuple(sum(axis) / count for axis in zip(*points))

In [4]:
# Round 1: points 2 and 4 act as the initial centroids; every other point is
# measured against both of them.
p1 = data[0]
p2 = data[1]
p3 = data[2]
p4 = data[3]
p5 = data[4]

# One [distance to p2, distance to p4] pair per non-centroid point.
geo_distances = [
    [geo_dist(point, p2), geo_dist(point, p4)]
    for point in (p1, p3, p5)
]

for label, (to_p2, to_p4) in zip(('p1:', 'p3:', 'p5:'), geo_distances):
    print(label, to_p2, to_p4)

geo_distances

p1: 3.1622776601683795 2.0
p3: 2.8284271247461903 3.1622776601683795
p5: 4.242640687119285 4.47213595499958


[[3.1622776601683795, 2.0],
 [2.8284271247461903, 3.1622776601683795],
 [4.242640687119285, 4.47213595499958]]

In [5]:
import numpy as np
# Group each remaining point with its nearest Round-1 centroid, following the
# distances printed above: p3 and p5 are closer to p2, p1 is closer to p4.
c1 = np.array([p2, p3, p5])
c2 = np.array([p4, p1])

c1, c2

(array([[1, 3],
        [3, 5],
        [4, 6]]),
 array([[2, 2],
        [2, 0]]))

## Coding K-Means

In [37]:
import pandas as pd

def geo_distance(centroid, data_point):
    """Compute the geometric (Euclidean) distance between a centroid and a data_point.

    Both arguments are indexable coordinate sequences. One squared difference
    is accumulated for every position of ``centroid``.
    """
    total = 0
    for axis in range(len(centroid)):
        gap = centroid[axis] - data_point[axis]
        total += gap ** 2
    return math.sqrt(total)

def get_geo_distances(centroids, data_points):
    """Compute all the distances of a data_points set against the centroids.

    Rows of both DataFrames are read by position. The result is a DataFrame
    with one row per data point and one column per centroid: the per-centroid
    distance lists are built first and then transposed.
    """
    per_centroid = [
        [
            geo_distance(centroids.iloc[c].tolist(), data_points.iloc[p].tolist())
            for p in range(len(data_points))
        ]
        for c in range(len(centroids))
    ]
    return pd.DataFrame(per_centroid).transpose()
    
def make_clusters_from_distances(distances, centroids, data_points,
                                 include_centroids=True, verbose=True):
    """Make a set of clusters based on the distances between data_points and centroids.

    Args:
        distances: DataFrame with one row per data point (same order as
            ``data_points``) and one column per centroid. The column labels
            must be the centroid positions 0..k-1, because the label of the
            smallest entry in a row is used directly as the cluster position.
        centroids: DataFrame with one centroid per row.
        data_points: DataFrame with one point per row.
        include_centroids: when True, every cluster starts with its own
            centroid row; when False, clusters hold assigned points only.
        verbose: when True (the default, matching the historical behaviour)
            the initial clusters are shown through ``display_clusters`` before
            any point is assigned. Pass False to silence that output.

    Returns:
        A list with one DataFrame per centroid. Rows keep the index labels
        they had in ``centroids`` / ``data_points``.
    """
    num_clusters = len(centroids)
    if include_centroids:
        seeds = [centroids.iloc[[k]] for k in range(num_clusters)]
    else:
        seeds = [pd.DataFrame() for _ in range(num_clusters)]

    if verbose:
        display_clusters(seeds)

    # Collect the row positions belonging to each cluster first, then slice
    # once per cluster. DataFrame.append was removed in pandas 2.0, and growing
    # a frame one row at a time re-copies it on every step.
    members = [[] for _ in range(num_clusters)]
    for position in range(len(distances)):
        nearest = distances.iloc[position].idxmin()
        members[nearest].append(position)

    clusters = []
    for seed, positions in zip(seeds, members):
        assigned = data_points.iloc[positions]
        if not include_centroids:
            clusters.append(assigned)
        elif positions:
            clusters.append(pd.concat([seed, assigned]))
        else:
            # Nothing was assigned: the cluster is just its centroid row.
            clusters.append(seed)
    return clusters

def display_clusters(clusters):
    """Displaying the clusters: print each cluster's size and 1-based number, then render it."""
    for cluster_no, cluster in enumerate(clusters, start=1):
        print(f'there are {len(cluster)} elements in the following cluster # {cluster_no}')
        display(cluster)
        
def get_centroids (clusters):
    """Computing centroids for a given set of clusters.

    Args:
        clusters: iterable of DataFrames, one per cluster, with one point per row.

    Returns:
        DataFrame with one row per cluster, in input order, holding the
        column-wise mean of that cluster. Rows are labelled 0..k-1.
    """
    # Build the frame once from the list of mean rows: DataFrame.append was
    # removed in pandas 2.0, and appending in a loop re-copies the frame each
    # time and leaves every row with the same index label.
    return pd.DataFrame([cluster.mean() for cluster in clusters])
    
def kmeans(data, num_clusters=2, iteration=10, random_state=None):
    """Create clusters using the k-means algorithm.

    Args:
        data: DataFrame with one point per row.
        num_clusters: number of clusters (and of initial centroids) to build.
        iteration: number of refinement passes run after the initial assignment.
        random_state: optional seed for choosing the initial centroids. With
            the default None the draw comes from NumPy's global random state,
            exactly as before.

    Returns:
        A list of DataFrames, one per cluster, or None (after printing a
        notice) when ``data`` has fewer rows than ``num_clusters``.
    """
    if len(data) < num_clusters:
        print('*** list is smaller than number of clusters ... ***')
        return

    ## First, select the initial centroids by randomly picking num_clusters
    ## distinct rows (sampling without replacement).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    centroid_indice = rng.choice(np.arange(len(data)), num_clusters, replace=False)
    centroids = data.iloc[centroid_indice]

    # centroid_indice holds row *positions*. Removing them with a positional
    # mask (rather than drop(), which works on labels) stays correct when the
    # index has gaps or is not 0..n-1, e.g. after dropna().
    is_remaining = np.ones(len(data), dtype=bool)
    is_remaining[centroid_indice] = False
    remaining_data = data.iloc[is_remaining]

    ### Initial clusters: each centroid plus the remaining points nearest to it.
    distances = get_geo_distances(centroids, remaining_data)
    clusters = make_clusters_from_distances(distances, centroids, remaining_data)

    ### Refinement: recompute centroids as cluster means and reassign every point.
    for _ in range(iteration):
        centroids = get_centroids(clusters)
        distances = get_geo_distances(centroids, data)
        clusters = make_clusters_from_distances(distances, centroids, data, include_centroids=False)

    return clusters

def get_cluster_ids_by_distances(distances, centroids):
    """Return, for every row of ``distances``, the label of the column holding the smallest value.

    ``centroids`` is part of the signature but is not read.
    """
    return [distances.iloc[row].idxmin() for row in range(len(distances))]
        
def kmeans_fit(clusters, test_data):
    """Label every row of ``test_data`` with its nearest cluster.

    The centroids are recomputed from ``clusters``, the distance from each
    row of ``test_data`` to each centroid is measured, and the column label of
    the smallest distance in each row is returned as a list.
    """
    cluster_centers = get_centroids(clusters)
    return get_cluster_ids_by_distances(
        get_geo_distances(cluster_centers, test_data),
        cluster_centers,
    )
        

In [38]:
# Test dataset: the same five 2-D points as the hand-worked example, stored as floats.
pd_data = pd.DataFrame([[2.0,0.0], [1.0,3.0], [3.0,5.0],[2.0,2.0], [4.0,6.0]])
num_clusters = 2


# Cluster with 50 refinement iterations, show the final clusters, then label
# every point with the position of its nearest cluster centroid.
clusters = kmeans(pd_data, num_clusters, 50)
display_clusters(clusters)
res = kmeans_fit(clusters, pd_data)
res

there are 1 elements in the following cluster # 1


Unnamed: 0,0,1
3,2.0,2.0


there are 1 elements in the following cluster # 2


Unnamed: 0,0,1
2,3.0,5.0


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 3 elements in the following cluster # 1


Unnamed: 0,0,1
0,2.0,0.0
1,1.0,3.0
3,2.0,2.0


there are 2 elements in the following cluster # 2


Unnamed: 0,0,1
2,3.0,5.0
4,4.0,6.0


[0, 0, 1, 0, 1]

In [39]:
## Testing the kmeans with iris dataset
# The file is read without a header row, so the column names are supplied here.
filename = '../data/iris.data'
pd_iris = pd.read_csv(
    filename,
    sep=",",
    header=None,
    names=[
        "sepal.length",
        "sepal.width",
        "petal.length",
        "petal.width",
        "species",
    ],
)
pd_iris

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [40]:
# Remove the species label column, then discard any row containing NaN.
data = pd_iris.drop(columns='species').dropna()
data

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [41]:
# Cluster the iris measurements into 3 groups (default iteration count), then show each cluster.
clusters = kmeans(data, 3)
display_clusters(clusters)

there are 1 elements in the following cluster # 1


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
126,6.2,2.8,4.8,1.8


there are 1 elements in the following cluster # 2


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
34,4.9,3.1,1.5,0.1


there are 1 elements in the following cluster # 3


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
149,5.9,3.0,5.1,1.8


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 0 elements in the following cluster # 1


there are 0 elements in the following cluster # 2


there are 0 elements in the following cluster # 3


there are 62 elements in the following cluster # 1


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
50,7.0,3.2,4.7,1.4
51,6.4,3.2,4.5,1.5
53,5.5,2.3,4.0,1.3
54,6.5,2.8,4.6,1.5
55,5.7,2.8,4.5,1.3
...,...,...,...,...
133,6.3,2.8,5.1,1.5
138,6.0,3.0,4.8,1.8
142,5.8,2.7,5.1,1.9
146,6.3,2.5,5.0,1.9


there are 50 elements in the following cluster # 2


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
5,5.4,3.9,1.7,0.4
6,4.6,3.4,1.4,0.3
7,5.0,3.4,1.5,0.2
8,4.4,2.9,1.4,0.2
9,4.9,3.1,1.5,0.1


there are 38 elements in the following cluster # 3


Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width
52,6.9,3.1,4.9,1.5
77,6.7,3.0,5.0,1.7
100,6.3,3.3,6.0,2.5
102,7.1,3.0,5.9,2.1
103,6.3,2.9,5.6,1.8
104,6.5,3.0,5.8,2.2
105,7.6,3.0,6.6,2.1
107,7.3,2.9,6.3,1.8
108,6.7,2.5,5.8,1.8
109,7.2,3.6,6.1,2.5


In [42]:
# Label every iris row with the position of its nearest cluster centroid and
# print one "row-number label" pair per line.
species = kmeans_fit(clusters, data)
for i, cluster_id in enumerate(species):
    print (i, cluster_id)

0 1
1 1
2 1
3 1
4 1
5 1
6 1
7 1
8 1
9 1
10 1
11 1
12 1
13 1
14 1
15 1
16 1
17 1
18 1
19 1
20 1
21 1
22 1
23 1
24 1
25 1
26 1
27 1
28 1
29 1
30 1
31 1
32 1
33 1
34 1
35 1
36 1
37 1
38 1
39 1
40 1
41 1
42 1
43 1
44 1
45 1
46 1
47 1
48 1
49 1
50 0
51 0
52 2
53 0
54 0
55 0
56 0
57 0
58 0
59 0
60 0
61 0
62 0
63 0
64 0
65 0
66 0
67 0
68 0
69 0
70 0
71 0
72 0
73 0
74 0
75 0
76 0
77 2
78 0
79 0
80 0
81 0
82 0
83 0
84 0
85 0
86 0
87 0
88 0
89 0
90 0
91 0
92 0
93 0
94 0
95 0
96 0
97 0
98 0
99 0
100 2
101 0
102 2
103 2
104 2
105 2
106 0
107 2
108 2
109 2
110 2
111 2
112 2
113 0
114 0
115 2
116 2
117 2
118 2
119 0
120 2
121 0
122 2
123 0
124 2
125 2
126 0
127 0
128 2
129 2
130 2
131 2
132 2
133 0
134 2
135 2
136 2
137 2
138 0
139 2
140 2
141 2
142 0
143 2
144 2
145 2
146 0
147 2
148 2
149 0


# Images data

In [None]:
# Load the image test set (comma-separated, read without a header row) and
# remove rows containing NaN in one chained step.
filename = '2_Image_Test.txt'
pd_image = pd.read_csv(filename, sep=",", header=None).dropna()
pd_image

In [None]:
# Cluster the image rows into 10 groups using a single refinement iteration.
clusters = kmeans(pd_image, num_clusters=10, iteration=1)
# The per-row predictions are computed separately with kmeans_fit.

In [None]:
# Label every image row with the position of its nearest cluster centroid.
predicted_numbers = kmeans_fit(clusters, pd_image)
predicted_numbers

In [None]:
# Build a per-row table of cluster numbers from cluster membership.
# `answers` is positional (one slot per row of pd_image), whereas each cluster
# carries pd_image's row *labels*. dropna() can leave gaps in those labels, so
# they are translated to positions before being used to index the array.
answers = np.empty(len(pd_image), dtype=object)
for i, cluster in enumerate(clusters):
    answers[pd_image.index.get_indexer(cluster.index)] = i

pd_answers = pd.DataFrame(answers)
display(pd_answers)

In [None]:
# Cross-check: Euclidean distance between two Series via the norm of their difference.
x = pd.Series([1, 2, 3, 4, 5])
y = pd.Series([6, 7, 8, 9, 10])
dist = np.linalg.norm(x - y)

for heading, series in (("Series 1:", x), ("Series 2:", y)):
    print(heading)
    print(series)

print("Euclidean distance between two series is:", dist)

In [None]:
import math

# One-dimensional example: P and Q are single-coordinate points, so each one
# is wrapped in a one-element sequence before being handed to math.dist.
P, Q = 3, -8

# Euclidean distance between points P and Q
eDistance = math.dist((P,), (Q,))
print(eDistance)