In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [37]:
iris_df = pd.read_csv('../dataset/iris.data', names=['feature_1', 'feature_2', 'feature_3', 'feature_4', 'label'], index_col=False)

In [38]:
iris_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [39]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
le = LabelEncoder()
iris_df['label'] = le.fit_transform(iris_df['label'])

# Label Encoding Guidelines
* setosa = 0
* versicolor = 1
* virginica = 2

In [40]:
X = iris_df.drop(['label'], axis=1).values
y = iris_df['label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [41]:
def initialize_centroids(features, k):
    return features[np.random.choice(features.shape[0], k), :]

In [42]:
def assign_cluster(features, centroids):
    return np.array([np.argmin([np.dot(x_i-y_k, x_i-y_k) for y_k in centroids]) for x_i in features])

In [43]:
def kmeans(features, k, max_iter=10):
    np.random.seed(1)
    centroids = initialize_centroids(features, k)
    centroids_unchanged = True
    for i in range(0, max_iter):
        C = assign_cluster(features, centroids)
        old_centroids = centroids
        centroids = [features[C == k_].mean(axis = 0) for k_ in range(k)]
        centroids_unchanged = np.array_equal(old_centroids, centroids)
        if centroids_unchanged:
            print("Cluster already converged")
            break
    return np.array(C), np.array(centroids)

In [44]:
C = initialize_centroids(X_train, 3)

In [45]:
clustered = kmeans(X_train, 3, 300)
clustered[1]

Cluster already converged


array([[4.96585366, 3.36097561, 1.46341463, 0.23170732],
       [6.82941176, 3.07941176, 5.65588235, 2.04705882],
       [5.90444444, 2.73333333, 4.42444444, 1.44222222]])

In [46]:
predicted = assign_cluster(X_test, clustered[1])

In [47]:
from sklearn.cluster import KMeans

In [48]:
kmeans = KMeans(3, random_state=1)

In [49]:
kmeans.fit(X_train)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=1, tol=0.0001, verbose=0)

In [50]:
predict_sklearn = kmeans.predict(X_test)

In [51]:
kmeans.labels_

array([1, 1, 0, 2, 2, 2, 0, 0, 0, 1, 0, 2, 0, 1, 0, 0, 2, 0, 1, 1, 1, 1,
       0, 1, 1, 2, 2, 2, 1, 2, 0, 0, 2, 1, 0, 1, 2, 1, 1, 2, 0, 0, 2, 2,
       1, 0, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 0, 2, 1, 1, 1, 0, 1, 2, 0, 2,
       0, 0, 2, 2, 0, 1, 0, 1, 0, 2, 1, 0, 1, 1, 2, 2, 2, 1, 1, 0, 1, 2,
       1, 0, 0, 1, 2, 1, 0, 1, 0, 0, 2, 1, 0, 1, 0, 0, 1, 0, 0, 0, 2, 0,
       1, 1, 2, 0, 2, 2, 0, 0, 2, 0], dtype=int32)

In [52]:
kmeans.cluster_centers_

array([[5.90444444, 2.73333333, 4.42444444, 1.44222222],
       [4.96585366, 3.36097561, 1.46341463, 0.23170732],
       [6.82941176, 3.07941176, 5.65588235, 2.04705882]])

In [53]:
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, accuracy_score

def calculate_accuracy(y_truth, y_predicted):
    labels = np.zeros_like(y_predicted)
    for i in range(3):
        mask = (y_predicted == i)
        labels[mask] = mode(y_truth[mask])[0]
    return accuracy_score(y_truth, labels)

In [54]:
calculate_accuracy(y_train, kmeans.labels_)

0.8833333333333333

In [55]:
calculate_accuracy(y_train, clustered[0])

0.8833333333333333

In [56]:
calculate_accuracy(y_test, predicted)

0.9

In [57]:
calculate_accuracy(y_test, predict_sklearn)

0.9