import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Read the housing table from disk and keep only the two coordinate columns.
california_housing = pd.read_csv('housing.csv')
features = california_housing.loc[:, ['longitude', 'latitude']]

# NumPy view of the selection: column 0 is longitude, column 1 is latitude.
data = np.asarray(features)


def dbscan(data, eps, min_samples):
    """Cluster points with DBSCAN and return ``(clusters, noise)``.

    A point is a *core* point when at least ``min_samples`` points (itself
    included) lie within Euclidean distance ``eps`` of it.  A cluster is grown
    from an unclaimed core point: every point within ``eps`` of a core point
    joins that core point's cluster, and the search continues from every newly
    reached core point.  Points that no core point reaches are noise.

    Args:
        data: 2-D array-like of shape ``(n_points, n_dims)``.
        eps: Neighbourhood radius (inclusive).
        min_samples: Minimum neighbourhood size, counting the point itself,
            for a point to qualify as a core point.

    Returns:
        A tuple ``(clusters, noise)``.  ``clusters`` is a list with one list of
        row indices into ``data`` per cluster; every index appears in at most
        one cluster.  ``noise`` is the ascending list of row indices that
        belong to no cluster.
    """
    points = np.asarray(data)
    n = len(points)

    # Label convention: >= 0 is a cluster id, negative means "in no cluster".
    unvisited, noise_label = -2, -1
    labels = np.full(n, unvisited, dtype=int)

    def range_query(index):
        """Return the indices of all points within ``eps`` of point ``index``."""
        distances = np.linalg.norm(points - points[index], axis=1)
        return np.where(distances <= eps)[0]

    clusters = []
    for seed in range(n):
        if labels[seed] != unvisited:
            continue

        seed_neighbors = range_query(seed)
        if len(seed_neighbors) < min_samples:
            # Provisional verdict: a core point found later may still claim
            # this point as a border point of its cluster.
            labels[seed] = noise_label
            continue

        cluster_id = len(clusters)
        labels[seed] = cluster_id
        members = [seed]

        # Explicit work list so that neighbours discovered during expansion
        # are actually processed (rebinding a name inside a ``for`` loop does
        # not extend the sequence being iterated).
        frontier = seed_neighbors[labels[seed_neighbors] < 0].tolist()
        while frontier:
            current = frontier.pop()
            if labels[current] >= 0:
                # Already claimed by this or an earlier cluster.
                continue

            was_noise = labels[current] == noise_label
            labels[current] = cluster_id
            members.append(current)
            if was_noise:
                # Already known not to be a core point: it joins as a border
                # point and must not be expanded further.
                continue

            current_neighbors = range_query(current)
            if len(current_neighbors) >= min_samples:
                unclaimed = current_neighbors[labels[current_neighbors] < 0]
                frontier.extend(unclaimed.tolist())

        clusters.append(members)

    # Derive noise from the final labels so that points absorbed as border
    # points after being provisionally marked noise are not reported twice.
    noise = np.where(labels == noise_label)[0].tolist()
    return clusters, noise


# Clustering parameters, kept as module-level names.
epsilon, min_samples, max_iterations = 0.2, 5, 1000

clusters, noise = dbscan(data, epsilon, min_samples)

# One scatter series per cluster, numbered from 1 in the legend.
for i, cluster in enumerate(clusters):
    members = data[cluster]
    plt.scatter(members[:, 0], members[:, 1], label=f'Cluster {i + 1}')

# Points outside every cluster are drawn as gray crosses.
noise_points = data[noise]
plt.scatter(noise_points[:, 0], noise_points[:, 1], c='gray', marker='x', label='Noise')

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title(f'DBSCAN Clustering (epsilon={epsilon}, min_samples={min_samples})')
plt.legend()
plt.show()

print(f'Number of clusters: {len(clusters)}')

# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score

# Read the housing table from disk and keep only the two coordinate columns.
california_housing = pd.read_csv('housing.csv')
features = california_housing.loc[:, ['longitude', 'latitude']]
# NumPy view of the selection: column 0 is longitude, column 1 is latitude.
data = np.asarray(features)

def dbscan(data, eps, min_samples):
    """Cluster points with DBSCAN and return ``(clusters, noise)``.

    A point is a *core* point when at least ``min_samples`` points (itself
    included) lie within Euclidean distance ``eps`` of it.  A cluster is grown
    from an unclaimed core point: every point within ``eps`` of a core point
    joins that core point's cluster, and the search continues from every newly
    reached core point.  Points that no core point reaches are noise.

    Args:
        data: 2-D array-like of shape ``(n_points, n_dims)``.
        eps: Neighbourhood radius (inclusive).
        min_samples: Minimum neighbourhood size, counting the point itself,
            for a point to qualify as a core point.

    Returns:
        A tuple ``(clusters, noise)``.  ``clusters`` is a list with one list of
        row indices into ``data`` per cluster; every index appears in at most
        one cluster.  ``noise`` is the ascending list of row indices that
        belong to no cluster.
    """
    points = np.asarray(data)
    n = len(points)

    # Label convention: >= 0 is a cluster id, negative means "in no cluster".
    unvisited, noise_label = -2, -1
    labels = np.full(n, unvisited, dtype=int)

    def range_query(index):
        """Return the indices of all points within ``eps`` of point ``index``."""
        distances = np.linalg.norm(points - points[index], axis=1)
        return np.where(distances <= eps)[0]

    clusters = []
    for seed in range(n):
        if labels[seed] != unvisited:
            continue

        seed_neighbors = range_query(seed)
        if len(seed_neighbors) < min_samples:
            # Provisional verdict: a core point found later may still claim
            # this point as a border point of its cluster.
            labels[seed] = noise_label
            continue

        cluster_id = len(clusters)
        labels[seed] = cluster_id
        members = [seed]

        # Explicit work list so that neighbours discovered during expansion
        # are actually processed (rebinding a name inside a ``for`` loop does
        # not extend the sequence being iterated).
        frontier = seed_neighbors[labels[seed_neighbors] < 0].tolist()
        while frontier:
            current = frontier.pop()
            if labels[current] >= 0:
                # Already claimed by this or an earlier cluster.
                continue

            was_noise = labels[current] == noise_label
            labels[current] = cluster_id
            members.append(current)
            if was_noise:
                # Already known not to be a core point: it joins as a border
                # point and must not be expanded further.
                continue

            current_neighbors = range_query(current)
            if len(current_neighbors) >= min_samples:
                unclaimed = current_neighbors[labels[current_neighbors] < 0]
                frontier.extend(unclaimed.tolist())

        clusters.append(members)

    # Derive noise from the final labels so that points absorbed as border
    # points after being provisionally marked noise are not reported twice.
    noise = np.where(labels == noise_label)[0].tolist()
    return clusters, noise

# Clustering parameters, kept as module-level names.
epsilon, min_samples, max_iterations = 0.2, 5, 1000

# Cluster the coordinate array; keep both the clusters and the noise indices.
clusters, noise = dbscan(data, epsilon, min_samples)

# Calculate Silhouette Score
def calculate_silhouette_score(data, clusters):
    """Return the mean silhouette score of the clustered points.

    Points that appear in no cluster (noise) are left out of the score.
    Labelling them 0 by default would silently count them as members of the
    first cluster and distort the result.

    Args:
        data: 2-D array-like of shape ``(n_points, n_dims)``.
        clusters: Iterable of index collections; ``clusters[i]`` holds the row
            indices of ``data`` assigned to cluster ``i``.  If an index occurs
            in several clusters, the last one wins.

    Returns:
        The value returned by ``silhouette_score`` for the clustered points.

    Raises:
        ValueError: If fewer than two non-empty clusters are present, because
            the silhouette score is undefined in that case.
    """
    points = np.asarray(data)
    unassigned = -1
    cluster_assignments = np.full(len(points), unassigned, dtype=int)

    for cluster_id, cluster in enumerate(clusters):
        cluster_assignments[np.asarray(cluster, dtype=int)] = cluster_id

    clustered = cluster_assignments != unassigned
    n_clusters = len(np.unique(cluster_assignments[clustered]))
    if n_clusters < 2:
        raise ValueError(
            f'Silhouette score needs at least 2 clusters, got {n_clusters}'
        )

    return silhouette_score(points[clustered], cluster_assignments[clustered])

# Score the clustering and report the result.
silhouette = calculate_silhouette_score(data, clusters)
print(f'Silhouette Score: {silhouette}')

# Visualize the clusters: one scatter series per cluster, numbered from 1.
for i, cluster in enumerate(clusters):
    members = data[cluster]
    plt.scatter(members[:, 0], members[:, 1], label=f'Cluster {i + 1}')

# Points outside every cluster are drawn as gray crosses.
noise_points = data[noise]
plt.scatter(noise_points[:, 0], noise_points[:, 1], c='gray', marker='x', label='Noise')

plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title(f'DBSCAN Clustering (epsilon={epsilon}, min_samples={min_samples})')
plt.legend()
plt.show()

print(f'Number of clusters: {len(clusters)}')