In [22]:
import numpy as np
import pandas as pd

In [23]:
class HierarchicalClustering:
    
    def __init__(self,n_clusters=2,linkage="single"):
    
        self.n_clusters = n_clusters
        self.linkage = linkage

    def fit_predict(self,X):
    
        n=X.shape[0] 
        d=self.d_matrix(X)   
        cluster=self.get_initial_cluster(n)
       
        s=set(range(n))     
        for _ in range(n-self.n_clusters):
            p,q=np.unravel_index(np.argmin(d, axis=None), d.shape)
            t_set=s-{p,q} 
            d=self.update_d(d,p,q,t_set,self.linkage)
            cluster=self.update_cluster(cluster,p,q) 
            s=s-{max(p,q)}
        decor_l=[]
        for v in cluster.values():
            decor_l.append(v)
        
        self.labels_= self.clustertolabels(decor_l)
        return self.labels_

    def clustertolabels(self,clusters):
    
        ln = sum([len(c) for c in clusters])
        labels = np.zeros(ln,dtype = np.int64)
        ind = -1
        for c in clusters:
            ind+=1
            for i in c:
                labels[i] = ind
        return labels


    def d_matrix(self,data):
    
        n=data.shape[0] 
        d=np.empty(shape=[n,n])
        d.fill(np.inf)  
        
        for i in range(n-1):
            for j in range(i+1,n):
                d[i,j]=distance(data[i],data[j]) 
        return d

    
    def get_initial_cluster(self,n):
    
        c={}
        for i in range(n):
            c[i]={i}   
        return c

   
    def update_d(self,d,p,q,t_set,linkage):
    
        for i in t_set:
            
            u,v=min(i,p),max(i,p) 
            w,x=min(i,q),max(i,q)
            if(linkage=="complete"):
                t=max(d[u,v],d[w,x])
            elif(linkage=="average"):
                t=(d[u,v]+d[w,x])/2
            else:     
                t=min(d[u,v],d[w,x])
            
            d[u,v]=t
            d[w,x]=t
        m_pq=max(p,q)
        d[m_pq,:]=np.inf
        d[:,m_pq]=np.inf
        return d


    def update_cluster(self,c,p,q):
    
        i=c.pop(max(p,q))
        m=min(p,q)
        c[m]=c[m].union(i) 
        return c


def distance(pt1,pt2):
    
    if(len(pt1)!=len(pt2)):
        print("Error distance(): The dimensions of two points are not equal")
        return  
    dim=len(pt1)
    s=0
    for i in range(dim):
        s+=(pt1[i]-pt2[i])**2
    dist=np.sqrt(s)  
    return dist

In [24]:
df = pd.read_csv("2c.csv")
df['g'].replace(['M', 'W'], [1,0], inplace=True)
labels = df['g'].values
df = df.iloc[:,:-1]
data = df.values

def dataset_minmax(dataset):
	minmax = list()
	for i in range(len(dataset[0])):
		col_values = [row[i] for row in dataset]
		value_min = min(col_values)
		value_max = max(col_values)
		minmax.append([value_min, value_max])
	return minmax

def normalize_dataset(dataset, minmax):
	for row in dataset:
		for i in range(len(row)):
			row[i] = (row[i] - minmax[i][0]) / (minmax[i][1] - minmax[i][0])

minmax = dataset_minmax(data)
normalize_dataset(data, minmax)

In [25]:
n_c = [2,4,6,8]

# Single Linkage

In [26]:
print("Single Linkage")
for j in n_c:
    clustering = HierarchicalClustering(n_clusters=j,linkage="single")
    cls_pred = clustering.fit_predict(data)
    
    wt_acc = []
    for i in np.unique(cls_pred):
        indices = np.where(cls_pred == i)
        arr = [labels[i] for i in indices]
        max(np.unique(arr, return_counts=True)[1])
        accuracy_cluster = max(np.unique(arr, return_counts=True)[1])/len(arr[0])
        wt_cl_acc = (accuracy_cluster*len(arr[0]))/(len(cls_pred))
        wt_acc.append(wt_cl_acc)
    print(f"Clusters = {j}, Accuracy = {sum(wt_acc)*100  :.4f}")

Single Linkage
Clusters = 2, Accuracy = 53.3333
Clusters = 4, Accuracy = 54.1667
Clusters = 6, Accuracy = 54.1667
Clusters = 8, Accuracy = 55.0000


# Complete Linkage

In [27]:
print("Complete Linkage")
for j in n_c:
    clustering = HierarchicalClustering(n_clusters=j,linkage="complete")
    cls_pred = clustering.fit_predict(data)
    
    wt_acc = []
    for i in np.unique(cls_pred):
        indices = np.where(cls_pred == i)
        arr = [labels[i] for i in indices]
        max(np.unique(arr, return_counts=True)[1])
        accuracy_cluster = max(np.unique(arr, return_counts=True)[1])/len(arr[0])
        wt_cl_acc = (accuracy_cluster*len(arr[0]))/(len(cls_pred))
        wt_acc.append(wt_cl_acc)
    print(f"Clusters = {j}, Accuracy = {sum(wt_acc)*100  :.4f}")

Complete Linkage
Clusters = 2, Accuracy = 62.5000
Clusters = 4, Accuracy = 62.5000
Clusters = 6, Accuracy = 70.0000
Clusters = 8, Accuracy = 70.8333


# Average Linkage

In [28]:
print("Average Linkage")
for j in n_c:
    clustering = HierarchicalClustering(n_clusters=j,linkage="average")
    cls_pred = clustering.fit_predict(data)
    
    wt_acc = []
    for i in np.unique(cls_pred):
        indices = np.where(cls_pred == i)
        arr = [labels[i] for i in indices]
        max(np.unique(arr, return_counts=True)[1])
        accuracy_cluster = max(np.unique(arr, return_counts=True)[1])/len(arr[0])
        wt_cl_acc = (accuracy_cluster*len(arr[0]))/(len(cls_pred))
        wt_acc.append(wt_cl_acc)
    print(f"Clusters = {j}, Accuracy = {sum(wt_acc)*100 :.4f}")

Average Linkage
Clusters = 2, Accuracy = 59.1667
Clusters = 4, Accuracy = 62.5000
Clusters = 6, Accuracy = 62.5000
Clusters = 8, Accuracy = 67.5000
