In [79]:
import os
import numpy as np
import codecs
from dotenv import load_dotenv
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import pdist, squareform
from collections import deque
import matplotlib.pyplot as plt

# Load variables from a .env file into the process environment; later cells read
# FASHION_MNIST_DATA_PATH and DATASET_PATH through os.environ.
load_dotenv()

True

# 20 Newsgroups dataset

In [None]:
# Restrict the corpus to six newsgroups.
categories = ["alt.atheism", "sci.med", "sci.electronics", "comp.graphics", "talk.politics.guns", "sci.crypt"]

# Fetch the subset once and read both fields from the same result, instead of
# loading the dataset a second time just to obtain the targets.
newsgroups = fetch_20newsgroups(categories=categories)
documents = newsgroups.data
labels_orig = newsgroups.target

# TF-IDF features with the vocabulary capped at 1000 terms. The matrix is
# densified because the DBSCAN class indexes and reshapes individual rows.
vectorizer = TfidfVectorizer(max_features=1000)
vectors = vectorizer.fit_transform(documents).toarray()

class DBSCAN:
    """Density-based clustering driven by cosine similarity.

    Two points are neighbours when their cosine similarity is strictly greater
    than ``epsilon``. ``epsilon`` is therefore a *similarity* threshold here
    (a larger value is stricter), not a distance radius.

    Values stored in ``self.labels``:
        -1   not visited yet (only while ``fit`` is running)
        -2   noise
        >=0  cluster id, numbered in order of discovery
    """

    # Sentinel label values; kept identical to the original magic numbers so
    # downstream code that checks for -2 keeps working.
    UNVISITED = -1
    NOISE = -2

    def __init__(self, epsilon, min_pts):
        """Store the clustering parameters.

        Args:
            epsilon: cosine-similarity threshold (exclusive) above which two
                points count as neighbours.
            min_pts: minimum number of points in a neighbourhood for its
                centre to be treated as a core point. The count is taken over
                everything ``_region_query`` returns, which includes the query
                point itself whenever its self-similarity exceeds ``epsilon``.
        """
        self.epsilon = epsilon
        self.min_pts = min_pts
        self.labels = None

    def _region_query(self, dataset, i):
        """Return the indices of all rows whose cosine similarity to row ``i``
        is strictly greater than ``epsilon``.

        ``dataset`` must support ``dataset[i].reshape(1, -1)``, i.e. be a dense
        2-D NumPy array.
        """
        similarities = cosine_similarity(dataset[i].reshape(1, -1), dataset)[0]
        return np.where(similarities > self.epsilon)[0]

    def _expand_cluster(self, dataset, i, neighbors, cluster_id):
        """Grow cluster ``cluster_id`` outwards from core point ``i``.

        Args:
            dataset: the data passed to ``fit``.
            i: index of the core point that seeds the cluster.
            neighbors: indices in the neighbourhood of ``i``.
            cluster_id: id written into ``self.labels`` for every member.
        """
        self.labels[i] = cluster_id
        queue = deque(neighbors)

        while queue:
            index = queue.popleft()
            state = self.labels[index]
            if state == self.NOISE:
                # Previously judged noise, but reachable from a core point:
                # it becomes a border point and is not expanded further.
                self.labels[index] = cluster_id
            elif state == self.UNVISITED:
                self.labels[index] = cluster_id
                current_neighbors = np.asarray(
                    self._region_query(dataset, index), dtype=np.intp
                )
                if len(current_neighbors) >= self.min_pts:
                    # Labels never return to a negative value, so points that
                    # already belong to a cluster would be popped and ignored.
                    # Leaving them out keeps the queue small without changing
                    # the resulting labels.
                    pending = current_neighbors[self.labels[current_neighbors] < 0]
                    queue.extend(pending)

    def fit(self, dataset):
        """Cluster ``dataset`` and return the label array.

        Args:
            dataset: 2-D array with one sample per row.

        Returns:
            ``self.labels``: an integer array with one entry per row, holding
            a cluster id (>= 0) or -2 for noise.
        """
        n = dataset.shape[0]
        self.labels = np.full(n, self.UNVISITED)
        cluster_id = 0

        for i in range(n):
            if self.labels[i] != self.UNVISITED:
                continue
            neighbours = self._region_query(dataset, i)
            if len(neighbours) < self.min_pts:
                # Provisional: may still be absorbed as a border point later.
                self.labels[i] = self.NOISE
            else:
                self._expand_cluster(dataset, i, neighbours, cluster_id)
                cluster_id += 1

        return self.labels

In [179]:
# For this class epsilon is a cosine-similarity threshold: documents are
# neighbours only when their similarity is above 0.85.
dbscan_20ng = DBSCAN(epsilon=0.85, min_pts=3)
labels = dbscan_20ng.fit(vectors)

In [180]:
# Distinct label values and how many documents carry each one (-2 marks noise).
label, count = np.unique(labels, return_counts=True)
print(label)
print(count)

[-2  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27]
[3290    3    3    4    3    3    7    5    6    4    3    3    4    4
    4    3    3    3    3    3    4    3    4    3    3    3    3    3
    3]


In [181]:
# Silhouette score under cosine distance. The -2 noise label is still present in
# `labels`, so it is passed to silhouette_score like any other label value.
score = silhouette_score(vectors, labels, metric='cosine')
print("Silhouette Score:", score)

Silhouette Score: -0.18976953977916106


# Fashion MNIST dataset

In [80]:
# Directory holding the raw Fashion-MNIST files, taken from the environment.
path_to_fmnist_dataset = os.environ.get("FASHION_MNIST_DATA_PATH")
if not path_to_fmnist_dataset:
    # os.listdir(None) would silently list the current working directory, so
    # fail early with an actionable message instead.
    raise RuntimeError(
        "FASHION_MNIST_DATA_PATH is not set; point it at the directory "
        "containing the Fashion-MNIST *ubyte files"
    )

# Sorted so the files are always parsed in the same order.
files = sorted(os.listdir(path_to_fmnist_dataset))
fmnist_files = [x for x in files if x.endswith("ubyte")]

# Parsed arrays, keyed as "<train|test>_<images|labels>".
fmnist_dataset = {}

def convert_to_int(byts):
    """Interpret ``byts`` as an unsigned big-endian integer.

    Args:
        byts: a non-empty bytes-like object, e.g. a 4-byte header field.

    Returns:
        The integer value of the bytes, most significant byte first.

    Raises:
        ValueError: if ``byts`` is empty (for instance when slicing a
            truncated file), rather than silently returning 0.
    """
    if len(byts) == 0:
        raise ValueError("cannot convert an empty byte string to an integer")
    # Standard-library equivalent of hex-encoding the bytes and parsing base 16.
    return int.from_bytes(byts, byteorder="big")

# Parse every file: bytes 0-3 hold a magic number that distinguishes images
# from labels, bytes 4-7 hold the item count, and the payload is raw uint8 data.
for file in fmnist_files:
    # os.path.join works whether or not the configured directory ends with a
    # path separator (plain string concatenation required one).
    with open(os.path.join(path_to_fmnist_dataset, file), 'rb') as fd:
        fmnist_data = fd.read()

    magic = convert_to_int(fmnist_data[:4])
    length = convert_to_int(fmnist_data[4:8])

    if magic == 2051:
        # Image file: two more header fields give the per-image dimensions.
        category = "images"
        no_of_rows = convert_to_int(fmnist_data[8: 12])
        no_of_cols = convert_to_int(fmnist_data[12: 16])
        parsed = np.frombuffer(fmnist_data, dtype=np.uint8, offset=16)
        parsed = parsed.reshape(length, no_of_rows, no_of_cols)
    elif magic == 2049:
        # Label file: one byte per item directly after the 8-byte header.
        category = "labels"
        parsed = np.frombuffer(fmnist_data, dtype=np.uint8, offset=8)
        parsed = parsed.reshape(length)
    else:
        raise ValueError(f"{file}: unrecognised magic number {magic}")

    # The split is inferred from the item count. Anything else is rejected so a
    # value left over from the previous file can never be reused by accident.
    if length == 60000:
        set_type = "train"
    elif length == 10000:
        set_type = "test"
    else:
        raise ValueError(f"{file}: unexpected item count {length}")

    fmnist_dataset[set_type + "_" + category] = parsed

print(fmnist_dataset['train_images'][:5])
print(fmnist_dataset.keys())

[[[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]

 [[0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  ...
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]
  [0 0 0 ... 0 0 0]]]
dict_keys(['test_images', 'test_labels', 'train_images', 'train_labels'])


In [162]:
# Flatten each image into a single feature row (one row per image) so that
# pairwise distances can be computed between images.
fmnist_train_images = fmnist_dataset['train_images']
fmnist_train_images_flattened = fmnist_train_images.reshape(fmnist_train_images.shape[0], -1)

In [163]:
class DBSCANImages:
    """DBSCAN over a precomputed pairwise distance matrix.

    ``fit`` builds the full n-by-n distance matrix once, so memory grows
    quadratically with the number of samples. Two points are neighbours when
    their distance is strictly less than ``epsilon``.

    Values stored in ``self.labels``:
        -1   not visited yet (only while ``fit`` is running)
        -2   noise
        >=0  cluster id, numbered in order of discovery
    """

    # Sentinel label values; kept identical to the original magic numbers.
    UNVISITED = -1
    NOISE = -2

    def __init__(self, epsilon, min_pts, metric='euclidean'):
        """Store the clustering parameters.

        Args:
            epsilon: neighbourhood radius (exclusive upper bound on distance).
            min_pts: minimum number of points within ``epsilon`` of a point
                for it to be a core point. The distance from a point to itself
                is 0, so the point counts towards its own neighbourhood
                whenever ``epsilon`` is positive.
            metric: distance metric name forwarded to
                ``scipy.spatial.distance.pdist``; defaults to the previously
                hard-coded ``'euclidean'``.
        """
        self.epsilon = epsilon
        self.min_pts = min_pts
        self.metric = metric
        self.labels = None
        # Filled by fit(); defined here so the attribute always exists.
        self.dist_matrix = None

    def _region_query(self, dataset, i):
        """Return the indices of all points closer than ``epsilon`` to point ``i``.

        ``dataset`` is unused: distances are read from ``self.dist_matrix``.
        """
        return np.where(self.dist_matrix[i] < self.epsilon)[0]

    def _expand_cluster(self, dataset, i, neighbors, cluster_id):
        """Grow cluster ``cluster_id`` outwards from core point ``i``.

        Args:
            dataset: the data passed to ``fit``.
            i: index of the core point that seeds the cluster.
            neighbors: indices in the neighbourhood of ``i``.
            cluster_id: id written into ``self.labels`` for every member.
        """
        self.labels[i] = cluster_id
        queue = deque(neighbors)

        while queue:
            index = queue.popleft()
            state = self.labels[index]
            if state == self.NOISE:
                # Previously judged noise, but reachable from a core point:
                # it becomes a border point and is not expanded further.
                self.labels[index] = cluster_id
            elif state == self.UNVISITED:
                self.labels[index] = cluster_id
                current_neighbors = np.asarray(
                    self._region_query(dataset, index), dtype=np.intp
                )
                if len(current_neighbors) >= self.min_pts:
                    # Points that already carry a cluster id would be popped
                    # and ignored; leaving them out keeps the queue small
                    # without changing the resulting labels.
                    pending = current_neighbors[self.labels[current_neighbors] < 0]
                    queue.extend(pending)

    def fit(self, dataset):
        """Cluster ``dataset`` and return the label array.

        Args:
            dataset: 2-D array-like with one sample per row (a NumPy array or
                anything ``np.asarray`` converts to one).

        Returns:
            ``self.labels``: an integer array with one entry per row, holding
            a cluster id (>= 0) or -2 for noise.
        """
        points = np.asarray(dataset)
        n = points.shape[0]
        self.labels = np.full(n, self.UNVISITED)
        if n == 0:
            # Nothing to cluster; keep the attributes in a consistent state.
            self.dist_matrix = np.zeros((0, 0))
            return self.labels

        self.dist_matrix = squareform(pdist(points, metric=self.metric))
        cluster_id = 0

        for i in range(n):
            if self.labels[i] != self.UNVISITED:
                continue
            neighbours = self._region_query(points, i)
            if len(neighbours) < self.min_pts:
                # Provisional: may still be absorbed as a border point later.
                self.labels[i] = self.NOISE
            else:
                self._expand_cluster(points, i, neighbours, cluster_id)
                cluster_id += 1

        return self.labels

In [176]:
# Cluster a random subset only: DBSCANImages builds a full pairwise distance
# matrix, so memory grows quadratically with the sample size.
fmnist_rng = np.random.default_rng(42)  # fixed seed so the sample is reproducible
n_fmnist_available = fmnist_train_images_flattened.shape[0]
fmnist_sample_indices = fmnist_rng.choice(
    n_fmnist_available,
    size=min(20000, n_fmnist_available),  # never ask for more rows than exist
    replace=False,
)
sample_fmnist_data = fmnist_train_images_flattened[fmnist_sample_indices]
# Scale the uint8 pixel values from [0, 255] to [0, 1].
sample_fmnist_data = sample_fmnist_data/255
dbscan_fmnist = DBSCANImages(7, 2)
labels_fmnist = dbscan_fmnist.fit(sample_fmnist_data)

In [177]:
# Distinct label values (l1) and the number of samples carrying each one (c1).
l1, c1 = np.unique(labels_fmnist, return_counts=True)

In [178]:
# Label values and their sample counts, followed by the silhouette score.
print(l1)
print(c1)
# labels_fmnist still contains the -2 noise label, so it is passed to
# silhouette_score like any other label value.
score1 = silhouette_score(sample_fmnist_data, labels_fmnist, metric='euclidean')
print("Silhouette Score:", score1)

[-2  0  1  2  3  4  5  6]
[  293 19694     2     3     2     2     2     2]
Silhouette Score: -0.027808517073190483


# Household dataset

In [149]:
import pandas as pd

household_dir = os.environ.get("DATASET_PATH")
if not household_dir:
    # Without this check the failure is an opaque TypeError from None + str.
    raise RuntimeError(
        "DATASET_PATH is not set; point it at the directory containing "
        "household_power_consumption.txt"
    )
# os.path.join works whether or not the directory ends with a path separator.
file_path = os.path.join(household_dir, "household_power_consumption.txt")

# Read one row only to obtain the column names from the header line.
df = pd.read_csv(file_path, sep=";", nrows=1, low_memory=False)

# Keep roughly 20000 rows by sampling while reading.
# NOTE(review): 2075259 is presumably the number of data rows in the file - confirm.
household_keep_probability = 20000 / 2075259
household_rng = np.random.default_rng(42)  # fixed seed so the sample is reproducible

sample_data = pd.read_csv(
    file_path,
    sep=";",
    # header=0 makes pandas consume the header line and replace it with `names`.
    # With `names` alone the header line is read as a data row.
    header=0,
    names=df.columns,
    # Row 0 (the header) is always kept; each later row is kept with
    # probability `household_keep_probability`.
    skiprows=lambda x: x > 0 and household_rng.random() > household_keep_probability,
    low_memory=False,
    na_values=["?"],
)

In [152]:
# Show the column names of the sampled frame.
print(sample_data.columns)

Index(['Date', 'Time', 'Global_active_power', 'Global_reactive_power',
       'Voltage', 'Global_intensity', 'Sub_metering_1', 'Sub_metering_2',
       'Sub_metering_3'],
      dtype='object')


In [None]:
# Rebind instead of mutating in place so the cell can be re-run safely:
# errors="ignore" makes the drop a no-op once the columns are already gone,
# where the in-place version raised KeyError on a second run.
sample_data = sample_data.drop(columns=["Date", "Time"], errors="ignore")

# Force every remaining column to numeric; unparseable values become NaN.
sample_data = sample_data.apply(pd.to_numeric, errors="coerce")

# Discard rows with any missing measurement.
sample_data = sample_data.dropna()

In [None]:
# The distance-matrix DBSCAN class is reused here on the household measurements:
# epsilon=2.5 (distance radius), min_pts=13.
dbscan_uci = DBSCANImages(2.5, 13)
labels_uci = dbscan_uci.fit(sample_data)
# Distinct label values (l2) and the number of rows carrying each one (c2).
l2, c2 = np.unique(labels_uci, return_counts=True)

print(l2)
print(c2)
# Any -2 noise label in labels_uci is passed to silhouette_score like a regular label.
s2 = silhouette_score(sample_data, labels_uci, metric="euclidean")
print(f"Silhouette Score: {s2}")

Silhouette Score: 0.3125562015528818
