In [6]:
# Use the %pip magic (not a `!pip3` shell escape) so the package is installed
# into the environment of the running kernel. Pinned to the release that the
# recorded output of this cell resolved, so a re-run gets the same library.
%pip install scikit-learn==1.6.1

Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/25/92/ee1d7a00bb6b8c55755d4984fd82608603a3cc59959245068ce32e7fb808/scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata
  Downloading scikit_learn-1.6.1-cp311-cp311-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Obtaining dependency information for scipy>=1.6.0 from https://files.pythonhosted.org/packages/61/d8/84da3fffefb6c7d5a16968fe5b9f24c98606b165bb801bb0b8bc3985200f/scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl.metadata
  Downloading scipy-1.15.2-cp311-cp311-macosx_14_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.org/packages/91/29/df4b9b42f2be0b623cbd5e2140cafcaa2bef0759a00b7b701

## DBSCAN Algorithm Implementation

In [19]:
# Add Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN as SklearnDBSCAN

In [8]:
# Directory holding the input CSV files, relative to this notebook.
dataset_path = "./../datasets"

# Full path of each dataset, derived from the shared directory above.
iris_dataset_path = f"{dataset_path}/iris.csv"
ai_global_index_path = f"{dataset_path}/AI_index_db.csv"
global_earthquake_data_path = f"{dataset_path}/earthquakes.csv"

In [20]:
# Read the three CSV files, in the same order as their path variables, into
# one DataFrame each.
iris_df, ai_global_index_df, global_earthquake_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        iris_dataset_path,
        ai_global_index_path,
        global_earthquake_data_path,
    )
)

# Short dataset name -> DataFrame, so later cells can loop over all datasets.
datasets = dict(
    iris=iris_df,
    ai_global_index=ai_global_index_df,
    global_earthquake=global_earthquake_data_df,
)

In [21]:
class DBSCANFromScratch:
    """Minimal DBSCAN clustering built on NumPy with Euclidean distance.

    Parameters
    ----------
    eps : float, default 0.5
        Neighbourhood radius; a point q is a neighbour of p when
        ``||p - q|| <= eps`` (p counts as its own neighbour).
    min_pts : int, default 5
        Minimum neighbourhood size (the point itself included) for a point to
        start or extend a cluster.

    Attributes
    ----------
    labels_ : numpy.ndarray or None
        ``None`` until ``fit`` is called; afterwards one integer per sample:
        cluster ids 0, 1, 2, ... in order of discovery, and -1 for noise.
    """

    def __init__(self, eps=0.5, min_pts=5):
        self.eps = eps
        self.min_pts = min_pts
        self.labels_ = None

    def fit(self, X):
        """Cluster the rows of ``X`` and store the result in ``labels_``.

        Parameters
        ----------
        X : array-like, one row per sample
            Converted with ``np.asarray(X, dtype=float)``, so nested lists and
            DataFrames of numeric columns are accepted as well as arrays.

        Returns
        -------
        DBSCANFromScratch
            ``self``, so the call can be chained.
        """
        X = np.asarray(X, dtype=float)
        n_samples = len(X)
        # Every point starts as -1, which doubles as "noise" and "not yet
        # assigned"; expand_cluster overwrites it for clustered points.
        self.labels_ = np.full(n_samples, -1)
        visited = np.zeros(n_samples, dtype=bool)
        cluster_id = 0

        for i in range(n_samples):
            if visited[i]:
                continue
            visited[i] = True
            neighbors = self.region_query(X, i)

            # A point with too few neighbours simply keeps its -1 label; it
            # may still be claimed later as a border point of some cluster.
            if len(neighbors) >= self.min_pts:
                self.expand_cluster(X, i, neighbors, cluster_id, visited)
                cluster_id += 1

        return self

    def expand_cluster(self, X, point_idx, neighbors, cluster_id, visited):
        """Grow cluster ``cluster_id`` outward from ``point_idx``.

        ``neighbors`` is used as a work queue and is extended in place with
        the neighbourhoods of every dense point reached. ``visited`` and
        ``self.labels_`` are updated in place as well.
        """
        self.labels_[point_idx] = cluster_id
        # Mirror of the queue's contents for O(1) "already queued?" checks;
        # the list alone would need a linear scan for every candidate.
        queued = set(neighbors)
        position = 0
        while position < len(neighbors):
            neighbor_idx = neighbors[position]
            if not visited[neighbor_idx]:
                visited[neighbor_idx] = True
                neighbor_neighbors = self.region_query(X, neighbor_idx)
                # Only dense points pull their own neighbourhood into the
                # cluster; sparser ones join but do not spread it further.
                if len(neighbor_neighbors) >= self.min_pts:
                    for candidate in neighbor_neighbors:
                        if candidate not in queued:
                            queued.add(candidate)
                            neighbors.append(candidate)
            # Claim the point unless an earlier cluster already owns it.
            if self.labels_[neighbor_idx] == -1:
                self.labels_[neighbor_idx] = cluster_id
            position += 1

    def region_query(self, X, idx):
        """Return, in ascending order, the indices of all rows of ``X`` whose
        Euclidean distance to row ``idx`` is at most ``eps`` (``idx`` itself
        is always included)."""
        distances = np.linalg.norm(X - X[idx], axis=1)
        return np.flatnonzero(distances <= self.eps).tolist()

In [22]:
def _same_clustering(labels_a, labels_b):
    """Return True when two label arrays describe the same clustering.

    Cluster ids are arbitrary, so the arrays are compared up to renumbering:
    both must mark exactly the same points as noise (-1) and must group the
    remaining points into the same sets.
    """
    labels_a = np.asarray(labels_a)
    labels_b = np.asarray(labels_b)
    if labels_a.shape != labels_b.shape:
        return False
    clustered = labels_a != -1
    # Noise is not an arbitrary id: both runs must flag the same points.
    if not np.array_equal(clustered, labels_b != -1):
        return False
    # factorize renumbers ids by order of first appearance, which cancels out
    # any difference in how the two implementations number their clusters.
    codes_a, _ = pd.factorize(labels_a[clustered])
    codes_b, _ = pd.factorize(labels_b[clustered])
    return bool(np.array_equal(codes_a, codes_b))


results_path = "./../results/01-dbscan.csv"
comparison_by_dataset = {}

for name, df in datasets.items():
    print(f"\nProcessing {name} dataset")

    # Handle missing values. This rebinds only the loop variable; the frames
    # stored in `datasets` are left untouched.
    df = df.dropna()  # or use df.fillna(df.mean(numeric_only=True))

    # Select only numeric columns
    X = df.select_dtypes(include=[np.number]).values

    # StandardScaler cannot be fitted on zero rows or zero columns, so record
    # "no result" for such a dataset instead of aborting the whole run.
    if X.shape[0] == 0 or X.shape[1] == 0:
        print("Skipping: no complete numeric rows left after dropna()")
        comparison_by_dataset[name] = None
        continue

    # Normalize
    X = StandardScaler().fit_transform(X)

    # Run custom DBSCAN
    dbscan_custom = DBSCANFromScratch(eps=0.5, min_pts=5)
    dbscan_custom.fit(X)
    print("Custom DBSCAN Labels:", dbscan_custom.labels_)

    # Run sklearn DBSCAN
    dbscan_sklearn = SklearnDBSCAN(eps=0.5, min_samples=5)
    dbscan_sklearn.fit(X)
    print("Sklearn DBSCAN Labels:", dbscan_sklearn.labels_)

    # Compare clustering results. Cluster numbers might not match between the
    # two implementations, so only the label structure is checked.
    same_labels = _same_clustering(dbscan_custom.labels_, dbscan_sklearn.labels_)
    comparison_by_dataset[name] = same_labels

# Save results (one "dataset,matches" row per dataset); create the output
# directory first so a fresh checkout does not fail on the write.
results = pd.Series(comparison_by_dataset)
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results.to_csv(results_path, header=False)


Processing iris dataset
Custom DBSCAN Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1  0  0  0  0  0  0 -1  0
  0  0  0  0  0  0  0  0 -1 -1  0  0  0  0  0  0  0 -1  0  0  0  0  0  0
  0  0  1  1  1  1  1  1 -1 -1  1 -1 -1  1 -1  1  1  1  1  1 -1  1  1  1
 -1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1  1  1  1 -1  1  1
  1  1 -1  1 -1  1  1  1  1 -1 -1 -1 -1 -1  1  1  1  1 -1  1  1 -1 -1 -1
  1  1 -1  1  1 -1  1  1  1 -1 -1 -1  1  1  1 -1 -1  1  1  1  1  1  1  1
  1  1  1  1 -1  1]
Sklearn DBSCAN Labels: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 -1 -1  0  0  0  0  0  0 -1  0
  0  0  0  0  0  0  0  0 -1 -1  0  0  0  0  0  0  0 -1  0  0  0  0  0  0
  0  0  1  1  1  1  1  1 -1 -1  1 -1 -1  1 -1  1  1  1  1  1 -1  1  1  1
 -1  1  1  1  1  1  1  1  1  1  1  1  1 -1  1 -1  1  1  1  1  1 -1  1  1
  1  1 -1  1 -1  1  1  1  1 -1 -1 -1 -1 -1  1  1  1  1 -1  1  1 -1 -1 -1
  1  1 -1  1  1 -1  1  1  1 -1 -1 -1  1  1  1 -1 -1  1  1  1  1  1  1  1
  1  1  1  1 -1  1]

Processing ai