# Code

In [1]:
import numpy as np
import math

In [2]:
class Agglomerative:
    """Bottom-up hierarchical clustering using Euclidean distance.

    Every data point starts in its own cluster; the two closest clusters
    are merged repeatedly until only ``n_clusters`` clusters remain.

    Supported ``linkage`` values and the cluster distance each one uses:

    * ``'single'``        -- smallest distance between any two members.
    * ``'complete'``      -- largest distance between any two members.
    * ``'average'``       -- distance between the two cluster centroids.
    * ``'group average'`` -- mean of all pairwise member distances.

    After ``fit`` the attribute ``labels_`` holds one cluster index per
    data point.
    """
    __valid_linkage = ['single', 'complete', 'average', 'group average']

    def __init__(self, n_clusters = 2, linkage = 'single'):
        """Validate and store the clustering configuration.

        Raises:
            ValueError: if ``linkage`` is not a supported name or
                ``n_clusters`` is smaller than 1.
        """
        if linkage not in self.__valid_linkage:
            raise ValueError("Unknown linkage type %s. "
                             "Valid options are %s" % (linkage,
                                                       self.__valid_linkage))
        if n_clusters < 1:
            raise ValueError("Invalid value of n_clusters, got %d. Must be at least 1" % n_clusters)
        self.__n_clusters = n_clusters
        self.__linkage = linkage
        self.__dist_mat = None
        self.__n_data = 0
        self.__n_attr = 0
        self.__data = None
        self.__clusters = []

    def fit(self, data):
        """Cluster ``data`` and store the result in ``labels_``.

        Args:
            data: indexable collection of rows; each row is an indexable
                collection of numeric attributes. The attribute count is
                taken from the first row.

        Returns:
            The fitted instance itself, so calls can be chained.

        Raises:
            ValueError: if fewer than two rows are supplied.
        """
        if len(data) < 2:
            raise ValueError("Need at least 2 instances of data")
        self.__data = data
        self.__n_data = len(data)
        self.__n_attr = len(data[0])
        self.__dist_mat = self.__pairwise_distances()
        self.__clusters = [[i] for i in range(self.__n_data)]
        while len(self.__clusters) > self.__n_clusters:
            # Start from the first pair; the strict '<' below means the
            # earliest pair wins whenever several pairs are equally close.
            min_dist = self.dist(0, 1)
            min_index = (0, 1)
            for i in range(len(self.__clusters) - 1):
                for j in range(i + 1, len(self.__clusters)):
                    temp = self.dist(i, j)
                    if temp < min_dist:
                        min_dist = temp
                        min_index = (i, j)
            self.merge(min_index[0], min_index[1])
        self.labels_ = self.getLabels()
        return self

    def __pairwise_distances(self):
        """Return the full point-to-point distance table for the fitted data."""
        n = self.__n_data
        table = [[0.0] * n for _ in range(n)]
        for a in range(n):
            # The diagonal is computed as well so dist_idx(i, i) returns
            # exactly what dist_data would return for that row.
            for b in range(a, n):
                d = self.dist_data(self.__data[a], self.__data[b])
                table[a][b] = d
                table[b][a] = d
        return table

    def merge(self, i, j):
        """Move every member of cluster ``j`` into cluster ``i`` and drop ``j``."""
        self.__clusters[i].extend(self.__clusters[j])
        del self.__clusters[j]

    def dist(self, i, j):
        """Return the distance between clusters ``i`` and ``j`` for the configured linkage."""
        if self.__linkage == 'single':
            return self.mindist(i, j)
        elif self.__linkage == 'complete':
            return self.maxdist(i, j)
        elif self.__linkage == 'average':
            return self.cendist(i, j)
        elif self.__linkage == 'group average':
            return self.avgdist(i, j)

    def mindist(self, i, j):
        """Return the smallest member-to-member distance between clusters ``i`` and ``j``."""
        result = self.dist_idx(self.__clusters[i][0], self.__clusters[j][0])
        for itemi in self.__clusters[i]:
            for itemj in self.__clusters[j]:
                temp = self.dist_idx(itemi, itemj)
                if temp < result:
                    result = temp
        return result

    def maxdist(self, i, j):
        """Return the largest member-to-member distance between clusters ``i`` and ``j``."""
        result = self.dist_idx(self.__clusters[i][0], self.__clusters[j][0])
        for itemi in self.__clusters[i]:
            for itemj in self.__clusters[j]:
                temp = self.dist_idx(itemi, itemj)
                if temp > result:
                    result = temp
        return result

    def avgdist(self, i, j):
        """Return the mean of all member-to-member distances between clusters ``i`` and ``j``."""
        result = 0
        for itemi in self.__clusters[i]:
            for itemj in self.__clusters[j]:
                result += self.dist_idx(itemi, itemj)
        return float(result) / float(len(self.__clusters[i]) * len(self.__clusters[j]))

    def __centroid(self, members):
        """Return the attribute-wise mean of the data rows listed in ``members``."""
        totals = [0] * self.__n_attr
        for idx in members:
            for p in range(self.__n_attr):
                totals[p] += self.__data[idx][p]
        size = float(len(members))
        return [float(total) / size for total in totals]

    def cendist(self, i, j):
        """Return the distance between the centroids of clusters ``i`` and ``j``."""
        ceni = self.__centroid(self.__clusters[i])
        cenj = self.__centroid(self.__clusters[j])
        return self.dist_data(ceni, cenj)

    def dist_idx(self, i, j):
        """Return the distance between data rows ``i`` and ``j``.

        The value is read from the table built once in ``fit`` instead of
        being recomputed on every merge iteration.
        """
        return self.__dist_mat[i][j]

    def dist_data(self, item1, item2):
        """Return the Euclidean distance between two rows over the first ``n_attr`` attributes."""
        result = 0
        for i in range(self.__n_attr):
            result += pow((item1[i] - item2[i]), 2)
        return math.sqrt(result)

    def getLabels(self):
        """Return an array giving, for each data row, the index of its cluster.

        Iterates over the clusters that actually exist, so requesting more
        clusters than there are rows yields one cluster per row instead of
        an IndexError.
        """
        result = np.repeat(-1, self.__n_data)
        for label, members in enumerate(self.__clusters):
            for idx in members:
                result[int(idx)] = label
        return result

# Contoh pemakaian

In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

# Load the Iris table and discard the 'Id' column.
iris = pd.read_csv('Iris.csv').drop(labels='Id', axis=1)
# Every column except the last is a feature; the last column is the label.
iris_data, iris_label = iris.values[:, :-1], iris.values[:, -1]
# Map the label values to integer codes so they can be compared with
# the cluster indices produced by the model.
species_encoder = LabelEncoder().fit(iris_label)
iris_label = species_encoder.transform(iris_label)

## Single Linkage Purity

In [4]:
# Cluster with single linkage, then cross-tabulate cluster index (rows)
# against the encoded label (columns).
model = Agglomerative(n_clusters=3, linkage='single')
model = model.fit(iris_data)
mat = confusion_matrix(model.labels_, iris_label)
# Purity: sum of each row's largest count, divided by the total count.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())
purity

0.68

## Complete Linkage Purity

In [5]:
# Cluster with complete linkage, then cross-tabulate cluster index (rows)
# against the encoded label (columns).
model = Agglomerative(n_clusters=3, linkage='complete')
model = model.fit(iris_data)
mat = confusion_matrix(model.labels_, iris_label)
# Purity: sum of each row's largest count, divided by the total count.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())
purity

0.84

## Average Linkage Purity

In [6]:
# Cluster with the 'average' linkage option, then cross-tabulate cluster
# index (rows) against the encoded label (columns).
model = Agglomerative(n_clusters=3, linkage='average')
model = model.fit(iris_data)
mat = confusion_matrix(model.labels_, iris_label)
# Purity: sum of each row's largest count, divided by the total count.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())
purity

0.9066666666666666

## Group Average Linkage Purity

In [7]:
# Cluster with the 'group average' linkage option, then cross-tabulate
# cluster index (rows) against the encoded label (columns).
model = Agglomerative(n_clusters=3, linkage='group average')
model = model.fit(iris_data)
mat = confusion_matrix(model.labels_, iris_label)
# Purity: sum of each row's largest count, divided by the total count.
purity = float(mat.max(axis=1).sum()) / float(mat.sum())
purity

0.9066666666666666