In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Import necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### K-means class

In [3]:
class K_clust:

    def set(self,Xin,noc):
        self.X=Xin
        self.m=self.X.shape[0] # number of training examples
        self.k=noc      # number of clusters user requires
        ind=np.random.choice(self.m,self.k) # selects k random indices from our dataset
        self.centroids=self.X[ind,:] # initialise centroids to the values at those indices

    def set_k(self,noc):
        self.k=noc;   # number of clusters       

    def sq_dist(self,X1,X2,ax=1):
        
        return np.linalg.norm(X1-X2,axis=ax)

    def assign_clust(self):
        """ 
        This function will assign cluster to each training example, such that it gets assigned to that
        cluster whose centroid is nearest to it
        """
        clust=np.zeros(self.m,dtype=np.int64)
        for i in range(self.m):
            clust[i]=np.argmin(self.sq_dist(self.X[i],self.centroids))
        return clust

    def move_cent(self,clust):
        """ This function will move the centroids to the mean positions of their respective clusters"""
        for i in range(self.k):
            ind_match=np.argwhere(i==(clust)) # To store indices that match with a particular cluster
            self.centroids[i,:]=np.mean(self.X[ind_match,:],axis=0)

    def fit(self,noi=100):
        
        for i in range(noi):
            clust=self.assign_clust()
            self.move_cent(clust)
        return clust

    def predict(self,Xtest):
        
        m = Xtest.shape[0]
        clust=np.zeros(m,dtype=np.int64)
        for i in range(m):
            clust[i]=np.argmin(self.sq_dist(Xtest[i],self.centroids))
        return clust


### Loading the datasets

In [4]:
train_df = pd.read_csv("/content/drive/MyDrive/logistic_data/emnist-letters-train.csv")

In [11]:
test_df = pd.read_csv("/content/drive/MyDrive/logistic_data/emnist-letters-test.csv")

In [6]:
y_train = train_df['23'].values
x_train = train_df[train_df.columns[1:]]
x_train = x_train/256.0
x_train = x_train.values

In [8]:
tr = K_clust()
tr.set(x_train, 70)
prediction = tr.fit(noi = 100)
cl = prediction

#### Retrieving id's wrt cluster labels

In [9]:
def retrieve_info(cluster_labels,y_train):

# Initializing
  reference_labels = {}
# For loop to run through each label of cluster label
  for i in range(len(np.unique(cl))):
    index = np.where(cluster_labels == i,1,0)
    num = np.bincount(y_train[index==1]).argmax()
    reference_labels[i] = num
  return reference_labels
reference_labels = retrieve_info(cl,y_train)
number_labels = np.random.rand(len(cl))
for i in range(len(cl)):
  number_labels[i] = reference_labels[cl[i]]
print(number_labels[:20].astype('int32'))
print(y_train[:20])

[15 16 15 14 20 13 11 22 24 20 11 13 22 25 21 21 11 19  7 26]
[ 7 16 15 23 17 13 11 22 24 10 14 18 21 26 21 21 24 19  5  2]


### Calculating accuracy for train set

In [10]:
from sklearn.metrics import accuracy_score
print(accuracy_score(number_labels,y_train)*100)

51.03886305025958


### Calculating model's accuracy on test dataset

In [12]:
y_test = test_df["1"].values
x_test = test_df[test_df.columns[1:]]
x_test = x_test/256.0
x_test = x_test.values
tr1 = K_clust()
tr1.set(x_test, 70)
prediction1 = tr1.fit(noi = 100)
cl1 = prediction1
reference_labels1 = retrieve_info(cl1,y_test)
number_labels1 = np.random.rand(len(cl1))
for i in range(len(cl1)):
  number_labels1[i] = reference_labels1[cl1[i]]
from sklearn.metrics import accuracy_score
print(accuracy_score(number_labels1,y_test)*100)

55.753767146428814
