In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Load the Iris measurements. Passing `names=` supplies the column headers, so
# pandas reads the file's first line as data rather than as a header row.
# index_col=False stops pandas from using the first column as the index.
iris_df = pd.read_csv('../dataset/iris.data', names=['feature_1', 'feature_2', 'feature_3', 'feature_4', 'label'], index_col=False)

In [7]:
# Preview the first rows to check that the columns were parsed as expected.
iris_df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
from sklearn.preprocessing import LabelEncoder
# Replace the string class names in 'label' with integer codes.
# LabelEncoder numbers the classes in sorted order of their names.
le = LabelEncoder()
iris_df['label'] = le.fit_transform(iris_df['label'])

# Label Encoding
After encoding, the class names map to the following integer codes:
* Iris-setosa = 0
* Iris-versicolor = 1
* Iris-virginica = 2

In [9]:
def find_neighbors(data, core, eps):
    """Return the indices of every point in ``data`` within ``eps`` of ``core``.

    The neighbourhood is closed: a point whose Euclidean distance to ``core``
    is exactly ``eps`` counts as a neighbour (``distance <= eps``), which is
    the usual DBSCAN definition of the eps-neighbourhood. When ``core`` is
    itself one of the rows of ``data`` its own index is part of the result for
    any ``eps >= 0``, because its distance to itself is 0.

    Parameters
    ----------
    data : sequence of array-like
        The points to search; only ``len(data)`` and ``data[i]`` are used.
    core : array-like
        The query point, with the same dimensionality as each ``data[i]``.
    eps : float
        Neighbourhood radius.

    Returns
    -------
    list of int
        Indices into ``data`` in ascending order; empty if nothing is in range.
    """
    # Convert once so plain Python lists are accepted as well as NumPy arrays.
    centre = np.asarray(core, dtype=float)
    return [
        idx
        for idx in range(len(data))
        if np.linalg.norm(centre - np.asarray(data[idx], dtype=float)) <= eps
    ]

In [10]:
def iterate_neighbors(data, eps, min_pts, cluster, labels, core, neighbors):
    """Grow one cluster outward from a seed neighbourhood, labelling in place.

    Every index reachable from ``neighbors`` through a chain of core points is
    assigned ``cluster`` in ``labels``. Label conventions: ``-2`` means "not
    visited yet", ``-1`` means "marked as noise", and any other value means
    the point already belongs to a cluster and is left untouched.

    Parameters
    ----------
    data : sequence of array-like
        All points; passed through to ``find_neighbors``.
    eps : float
        Neighbourhood radius passed to ``find_neighbors``.
    min_pts : int
        Minimum neighbourhood size for a point to be expanded as a core point.
    cluster : int
        Identifier written into ``labels`` for every point that joins.
    labels : mutable sequence of int
        Per-point labels, modified in place.
    core : array-like
        Unused; kept so existing callers keep working.
    neighbors : list of int
        Indices of the seed neighbourhood. This list is not modified.

    Returns
    -------
    None
    """
    # Work on a private copy so the caller's list is never mutated.
    queue = list(neighbors)
    position = 0
    while position < len(queue):
        point = queue[position]
        position += 1

        if labels[point] == -1:
            # Earlier marked as noise, but it lies in a core point's
            # neighbourhood: it joins the cluster and is not expanded.
            labels[point] = cluster
        elif labels[point] == -2:
            labels[point] = cluster
            reachable = find_neighbors(data, data[point], eps)
            if len(reachable) >= min_pts:
                # Core point: queue only neighbours that still lack a cluster.
                # Points already assigned would be skipped when dequeued, so
                # leaving them out changes nothing except the queue length.
                queue.extend(j for j in reachable if labels[j] in (-1, -2))

In [11]:
def dbscan(data, eps, min_pts):
    """Cluster ``data`` with DBSCAN and return one label per point.

    Points are scanned in index order. An unvisited point whose
    eps-neighbourhood has fewer than ``min_pts`` members is marked as noise;
    otherwise it seeds a new cluster, which ``iterate_neighbors`` grows.

    Parameters
    ----------
    data : sequence of array-like
        The points to cluster; only ``len(data)`` and ``data[i]`` are used here.
    eps : float
        Neighbourhood radius.
    min_pts : int
        Minimum neighbourhood size for a point to start or extend a cluster.

    Returns
    -------
    numpy.ndarray
        Cluster ids numbered from 0 in order of discovery, with -1 for noise.
    """
    unvisited, noise = -2, -1

    n_points = len(data)
    labels = [unvisited] * n_points
    next_cluster = 0

    for idx in range(n_points):
        # Already claimed by a cluster or marked as noise: nothing to do.
        if labels[idx] != unvisited:
            continue

        seeds = find_neighbors(data, data[idx], eps)
        if len(seeds) >= min_pts:
            iterate_neighbors(data, eps, min_pts, next_cluster, labels, data[idx], seeds)
            next_cluster += 1
        else:
            labels[idx] = noise

    return np.array(labels)

In [12]:
# Run the from-scratch dbscan() on the feature columns only (the 'label'
# column is dropped), with eps=0.5 and min_pts=5.
clustered = dbscan(iris_df.drop(['label'], axis=1).values, 0.5, 5)

clustered

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [13]:
from sklearn.cluster import DBSCAN

In [14]:
features = iris_df.drop(['label'], axis=1).values
# NOTE(review): this assignment rebinds the name `dbscan`, shadowing the
# dbscan() function defined earlier in the notebook. Re-running the cell that
# calls dbscan(...) after this one fails until the function's cell is executed
# again; a distinct name for the estimator would avoid that.
dbscan = DBSCAN(min_samples=5, eps=0.5)
dbscan.fit(features)

DBSCAN(algorithm='auto', eps=0.5, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=5, n_jobs=None, p=None)

In [15]:
# Per-sample cluster ids from the fitted scikit-learn estimator (-1 marks noise).
dbscan.labels_

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1,  1,  1,  1,
        1,  1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1, -1, -1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1, -1, -1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
      dtype=int64)

In [16]:
from scipy.stats import mode
from sklearn.metrics import confusion_matrix, accuracy_score

# Ground-truth classes from the 'label' column (integer codes once the
# label-encoding cell has run), used as the reference for the accuracy score.
y = iris_df['label'].values
def calculate_accuracy(y_truth, y_predicted):
    """Score a clustering against true classes using a majority-vote mapping.

    Each cluster id present in ``y_predicted`` is mapped to the most frequent
    true class among its members, and the mapped labels are scored with
    ``accuracy_score``. Every non-negative cluster id that occurs is handled,
    however many clusters there are, and ids that do not occur are skipped.

    Samples with a negative cluster id (noise, ``-1``) are never remapped, so
    they keep the initial value 0 and are scored as if predicted to be class 0.

    Parameters
    ----------
    y_truth : array-like of int
        True class for each sample.
    y_predicted : array-like of int
        Cluster id for each sample; negative values denote noise.

    Returns
    -------
    float
        The value returned by ``accuracy_score(y_truth, mapped_labels)``.
    """
    y_truth = np.asarray(y_truth)
    y_predicted = np.asarray(y_predicted)

    # Start from zeros: any sample not remapped below is scored as class 0.
    labels = np.zeros_like(y_predicted)
    for cluster_id in np.unique(y_predicted):
        if cluster_id < 0:
            continue  # noise has no majority class
        mask = y_predicted == cluster_id
        # The mask is never empty here, because cluster_id comes from y_predicted.
        labels[mask] = mode(y_truth[mask])[0]
    return accuracy_score(y_truth, labels)

In [17]:
# Majority-vote accuracy of the scikit-learn DBSCAN labels against the true classes.
calculate_accuracy(y, dbscan.labels_)

0.6266666666666667

In [18]:
# Same metric for the labels produced by the from-scratch implementation.
calculate_accuracy(y, clustered)

0.6266666666666667