In [1]:
from math import *


class Node:
    def __init__(self, name):
        self.name = str(name)
        self.next = None

    def __str__(self):
        nextnode = 'NULL' if not self.next else self.next.name
        return str(self.name) + '->' + nextnode


def get_distance_matrix(data):
    matrix = []
    for i in range(len(data)):
        tmp = []
        for j in range(len(data)):
            tmp.append(get_dist(data[i], data[j]))
        matrix.append(tmp)
    
    return matrix


def get_dist(p1, p2):
    dist = 0
    for i in range(len(p1)):
        dist = dist + (p1[i]-p2[i])**2

    return sqrt(dist)


def agglomerative(data, min_dist):
    clusters = [Node(_) for _ in range(len(data))]
    matrix = get_distance_matrix(data)

    for _ in range(len(data) - 1):
        matrix, i, j, dist = get_closest_point(matrix)
        if dist <= min_dist:
            clusters[i].next = clusters[j]
        else:
            break

    classes = []
    for i in range(len(data)):
        clas = clusters[i]
        while clas.next != None:
            clas = clas.next
        classes.append(clas.name)

    uniq = list(set(classes))
    mapped = {}
    for i in range(len(uniq)):
        mapped[uniq[i]] = i

    for i in range(len(classes)):
        classes[i] = mapped[classes[i]]
    return classes
    

def get_closest_point(matrix):
    INF = 99999999
    min_dist = INF
    ii = 0
    jj = 1
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i != j and matrix[i][j] < min_dist:
                min_dist = matrix[i][j]
                ii = i
                jj = j
                matrix[i][j] = INF
                matrix[j][i] = INF

    return matrix, ii, jj, min_dist

In [2]:
class K_Means:
    def __init__(self, clusters):
        self.clusters = clusters
        self.max_iter = 100
    
    def fit(self, data):        
        # Init centroids
        self.centroids = data.copy()
        np.random.shuffle(self.centroids)
        self.centroids = self.centroids[:self.clusters]
        
        for i in range(self.max_iter):
            distance = np.sqrt(((data - self.centroids[:, np.newaxis])**2).sum(axis=2))
            classes = np.argmin(distance, axis=0)
            self.centroids = np.array([data[classes==k].mean(axis=0) for k in range(self.centroids.shape[0])])
        
        print(self.centroids)
            
    def predict(self, data):
        distance = np.sqrt(((data - self.centroids[:, np.newaxis])**2).sum(axis=2))
        classes = np.argmin(distance, axis=0)
        return classes

In [3]:
def kmeans(input_dataset, number_of_groups):

    # Init centroids
    centroids = input_dataset.copy()
    np.random.shuffle(centroids)
    centroids = centroids[:number_of_groups]
    
    max_iter = 100
    for i in range(max_iter):
        distance = np.sqrt(((input_dataset - centroids[:, np.newaxis])**2).sum(axis=2))
        classes = np.argmin(distance, axis=0)
        centroids = np.array([input_dataset[classes==k].mean(axis=0) for k in range(centroids.shape[0])])
        
    return centroids

In [4]:
def pre_process(path_file):
    df = pd.read_csv(path_file, sep=',', names=['A1', 'A2', 'A3', 'A4', 'kelas'])
    
    obj_df = df.select_dtypes(include=['object']).copy()
    obj_df['kelas'] = obj_df['kelas'].astype('category')
    obj_df['kelas'] = obj_df['kelas'].cat.codes
    
    df['kelas'] = obj_df['kelas']
    
    return df

In [5]:
import pandas as pd
import numpy as np

df = pre_process('iris.data')
df

Unnamed: 0,A1,A2,A3,A4,kelas
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
5,5.4,3.9,1.7,0.4,0
6,4.6,3.4,1.4,0.3,0
7,5.0,3.4,1.5,0.2,0
8,4.4,2.9,1.4,0.2,0
9,4.9,3.1,1.5,0.1,0


In [6]:
# Split into train and test data

X = df.drop(['kelas'], axis = 1)
y = df['kelas']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

X_train.shape

(120, 4)

In [7]:
res_fun = kmeans(X_train, 3)
print(res_fun)
print(len(res_fun))

[[5.82888889 2.72222222 4.35333333 1.42222222]
 [5.02051282 3.4025641  1.46153846 0.23846154]
 [6.87777778 3.08888889 5.69444444 2.06666667]]
3


In [8]:
clf = K_Means(3)
clf.fit(X_train)

[[5.02051282 3.4025641  1.46153846 0.23846154]
 [6.87777778 3.08888889 5.69444444 2.06666667]
 [5.82888889 2.72222222 4.35333333 1.42222222]]


In [9]:
y_pred = clf.predict(X_test)

y_pred

array([2, 2, 0, 1, 0, 1, 0, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 0, 0, 2, 2,
       0, 0, 2, 0, 0, 2, 2, 0], dtype=int64)

In [10]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

print(f"accuracy_score scratch:  {accuracy_score(y_test, y_pred)}")
print(f"f1_score scratch: {f1_score(y_test, y_pred, average=None)}")
print(f"recall_score scratch: {recall_score(y_test, y_pred, average=None)}")

accuracy_score scratch:  0.4666666666666667
f1_score scratch: [1.         0.         0.27272727]
recall_score scratch: [1.  0.  0.5]


In [11]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 3)
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [12]:
kmeans_pred = kmeans.predict(X_test)

kmeans_pred

array([0, 0, 2, 1, 2, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 0,
       2, 2, 0, 2, 2, 0, 0, 2])

In [13]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

print(f"accuracy_score library:  {accuracy_score(y_test, kmeans_pred)}")
print(f"f1_score library: {f1_score(y_test, kmeans_pred, average=None)}")
print(f"recall_score library: {recall_score(y_test, kmeans_pred, average=None)}")

accuracy_score library:  0.0
f1_score library: [0. 0. 0.]
recall_score library: [0. 0. 0.]
