In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be opened by path further down in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

In [None]:
def euclidean_distance(x1,x2):
  """Return the Euclidean (L2) distance between `x1` and `x2`.

  The element-wise difference is squared, summed over all elements and the
  square root of that total is returned.
  """
  delta = x1 - x2
  squared_total = np.sum(delta ** 2)
  return np.sqrt(squared_total)

In [None]:
class KMeans:
  """K-means clustering seeded with randomly chosen samples.

  Parameters
  ----------
  K : int
    Number of clusters.
  max_iters : int
    Upper bound on the number of assign/update iterations run by `train`.
  plot_steps : bool
    When True, `train` calls `plot` after every assignment step and after
    every centroid update.

  Attributes
  ----------
  clusters : list of K lists with the sample indices assigned to each cluster.
  centroids : the current centroid of each cluster, one feature vector per cluster.
  """

  def __init__(self, K=5, max_iters=100, plot_steps=False):
    self.K = K
    self.max_iters = max_iters
    self.plot_steps = plot_steps

    # sample indices for each cluster
    self.clusters = [[] for _ in range(self.K)]
    # one centroid (feature vector) per cluster; filled in by train()
    self.centroids = []

  def train(self, X):
    """Cluster the rows of `X` and return one cluster label per row.

    `X` must be a 2-D array of shape (n_samples, n_features). The initial
    centroids are K distinct rows drawn with `np.random.choice`, so the
    result depends on NumPy's global random state. Iteration stops as soon
    as the centroids no longer move, or after `max_iters` rounds.
    """
    self.X = X
    self.n_samples, self.n_features = X.shape

    # initialize centroids with K distinct samples
    random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
    self.centroids = [self.X[idx] for idx in random_sample_idxs]

    # optimize
    for _ in range(self.max_iters):
      # assignment step: attach every sample to its nearest centroid
      self.clusters = self.create_clusters(self.centroids)
      if self.plot_steps:
        self.plot()

      # update step: move every centroid to the mean of its cluster
      centroids_old = self.centroids
      self.centroids = self.get_centroids(self.clusters)
      if self.plot_steps:
        self.plot()

      # stop once an update leaves every centroid where it was
      if self.is_converged(centroids_old, self.centroids):
        break

    # return cluster labels
    return self.get_cluster_labels(self.clusters)

  def get_cluster_labels(self, clusters):
    """Turn per-cluster index lists into a float array of labels, one per sample."""
    labels = np.empty(self.n_samples)
    for cluster_idx, cluster in enumerate(clusters):
      for sample_idx in cluster:
        labels[sample_idx] = cluster_idx
    return labels

  def create_clusters(self, centroids):
    """Assign every sample index to the cluster of its nearest centroid."""
    clusters = [[] for _ in range(self.K)]
    for idx, sample in enumerate(self.X):
      centroid_idx = self.closest_centroid(sample, centroids)
      clusters[centroid_idx].append(idx)
    return clusters

  def closest_centroid(self, sample, centroids):
    """Return the index of the centroid with the smallest distance to `sample`."""
    dist = [euclidean_distance(sample, point) for point in centroids]
    return np.argmin(dist)

  def get_centroids(self, clusters):
    """Return a (K, n_features) array holding the mean of every cluster.

    A cluster that received no samples keeps its previous centroid (read from
    `self.centroids`). Averaging an empty selection would produce a NaN
    centroid; `np.argmin` then selects that NaN distance for every sample,
    which collapses all samples into one cluster and prevents convergence.
    """
    centroids = np.zeros((self.K, self.n_features))
    for cluster_idx, cluster in enumerate(clusters):
      if len(cluster) == 0:
        # empty cluster: leave the centroid where it was
        centroids[cluster_idx] = self.centroids[cluster_idx]
        continue
      centroids[cluster_idx] = np.mean(self.X[cluster], axis=0)
    return centroids

  def is_converged(self, centroids_old, centroids):
    """Return True when no centroid moved between two consecutive updates."""
    distances = [euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K)]
    return sum(distances) == 0

  def plot(self):
    """Scatter-plot the current clusters and mark the centroids with black crosses.

    Only the first two features are drawn, so the data needs at least two
    features; any further features are ignored in the figure.
    """
    fig, ax = plt.subplots(figsize=(12, 8))

    for cluster in self.clusters:
      members = self.X[cluster]
      ax.scatter(members[:, 0], members[:, 1])

    for centroid in self.centroids:
      ax.scatter(centroid[0], centroid[1], marker="x", color="black", linewidth=2)
    plt.show()
  


In [None]:
# Load the EMNIST letters training CSV from the mounted Drive into a DataFrame.
# NOTE(review): read_csv treats the first line of the file as column names by
# default. If this CSV has no header row, the first sample is silently consumed
# as the header -- confirm against the file and pass header=None if so.
data = pd.read_csv("/content/drive/MyDrive/cyberlabs/emnist-letters-train.csv")

In [None]:
# Split the loaded table into labels (y) and features (X).
data = np.array(data)
m , n = data.shape  ## m is the number of examples, n the total number of columns
n = n - 2           ## after this line the table has n + 2 columns (label + n + 1 pixels)
print(m,n)
# Assumes the EMNIST CSV layout: class label in column 0, pixel values in the
# remaining columns -- verify against the file. The label must stay out of X,
# otherwise the clustering is partly driven by the very value it is meant to
# discover, and y would hold a pixel column instead of the class.
# Both parts are scaled by 1/100.
# NOTE(review): the labels are scaled as well; confirm downstream code expects that.
y = (data[:,:1])/100
X = (data[:,1:])/100
# Prepend a constant column of ones. It contributes nothing to the distance
# between two samples, so it does not affect clustering; it only keeps the
# shape of X unchanged.
z = np.ones((m,1))
X = np.concatenate((z,X),axis = 1)
print(X.shape,y.shape)

88799 783
(88799, 785) (88799, 1)


In [None]:
# Seed NumPy's global random state: KMeans.train picks its initial centroids
# with np.random.choice, so without a fixed seed every run of this cell yields
# different clusters and the notebook is not reproducible.
np.random.seed(42)
# K = 26 clusters -- presumably one per letter class; confirm.
k = KMeans(K = 26, max_iters=100, plot_steps = False)
y_pred = k.train(X)
print(k.centroids)
#k.plot()

[[1.         0.12651083 0.         ... 0.         0.         0.        ]
 [1.         0.21907504 0.         ... 0.         0.         0.        ]
 [1.         0.0973184  0.         ... 0.         0.         0.        ]
 ...
 [1.         0.09329488 0.         ... 0.         0.         0.        ]
 [1.         0.16435976 0.         ... 0.         0.         0.        ]
 [1.         0.15161214 0.         ... 0.         0.         0.        ]]
