In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import silhouette_score

In [2]:
def get_nearest_neighbor(core_vector, X, eps, core_vector_index):
    nearest_neighbor = []
    for i in range(len(X)):
        if i != core_vector_index:
            neighbor = X[i]
            euclidean_distance = np.linalg.norm(core_vector - neighbor)
            if(euclidean_distance <= eps):
                nearest_neighbor.append(i)
    
    return nearest_neighbor


def check_core_point(eps,minPts, X, index):

    nearest_neighbor = list(get_nearest_neighbor(X[index], X, eps, index))
    
    if len(nearest_neighbor) >= minPts:
        return (nearest_neighbor, 1)
    
    elif (len(nearest_neighbor) < minPts) and len(nearest_neighbor) > 0:
        return (nearest_neighbor, 2)
    
    elif len(nearest_neighbor) == 0:
        return (nearest_neighbor, 3)

In [3]:
def _dbscan(eps, minPts, X):
    
    #initiating cluster number
    cluster_num = 1

    q = set()
    unvisited = [i for i in range(0, len(X))]
    clusters = []
    
    while (len(unvisited) > 0): #run until all points have been visited

        #identifier for first point of a cluster
        first_point = True
        
        #choose a random unvisited point
        q.add(random.choice(unvisited))
        
        while len(q) > 0:
            pop = q.pop()
            unvisited.remove(pop)
            
            neighbor_ind, point_type = check_core_point(eps, minPts, X, pop)
            
            #dealing with an edge case
            if point_type == 2 and first_point:
                
                clusters.append((pop, 0))
                for ind in neighbor_ind:
                    clusters.append((ind, 0))

                unvisited = [element for element in unvisited if element not in neighbor_ind]
                continue

            first_point = False
            
            #CORE POINT
            if point_type == 1:
                clusters.append((pop,cluster_num))
                neighbor_ind = set(neighbor_ind) & set(unvisited)
                q.update(neighbor_ind)

            #BORDER POINT
            elif point_type == 2:
                clusters.append((pop,cluster_num))
                continue
            
            #OUTLIER
            elif point_type == 3:
                clusters.append((pop, 0))
                continue
                
        if not first_point:
            cluster_num += 1
        
    return clusters

In [4]:


household_data = pd.read_csv('household_power_consumption.txt', sep=';')
df_moons = pd.DataFrame(household_data, columns=['Global_active_power', 'Global_active_power', 'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2', 'Sub_metering_3'])
df_moons = df_moons.apply (pd.to_numeric, errors='coerce')

df_moons_norm = df_moons.apply(lambda iterator: ((iterator.max() - iterator)/(iterator.max() - iterator.min())).round(2))
df_moons_cleaned = df_moons_norm.dropna()
train_data, test_data, train_labels, test_labels = train_test_split(df_moons_cleaned, df_moons_cleaned, test_size=0.999, random_state=42)
raw_data_household = train_data.to_numpy()

  household_data = pd.read_csv('household_power_consumption.txt', sep=';')


In [5]:
train_data.shape

(2049, 7)

In [23]:
clustered = _dbscan(0.04, 3, raw_data_household)
ind , cluster = list(zip(*clustered))
cluster_df = pd.DataFrame(clustered, columns = ["ind", "cluster"])
labels = list(cluster)
print(np.unique(cluster))

[ 0  1  2  4  9 24 30 51 57]


In [24]:
cluster = list(cluster)
while len(cluster) > len(train_data):
    cluster.pop()

In [8]:
len(cluster)

2049

In [9]:
def calculate_gini(labels):
    _, counts = np.unique(labels, return_counts=True)
    probabilities = counts / len(labels)
    gini_index = 1 - np.sum(probabilities ** 2)
    return gini_index

In [10]:
gini_index = calculate_gini(cluster)
print("Gini Index with eps=5: ", gini_index)

Gini Index with eps=5:  0.3013904579386295


In [25]:

score = silhouette_score(train_data, cluster)
print(score)

0.5772779134704406
