### Investigate Clustering Algorithms

To investigate

- k-means
- DBSCAN
- Weighted versions of above


Process

- Get POIs from area of study
- Randomly sample some POIs
- Cluster them with different parameters and observe the results
- Translate findings to sparsification techniques

In [46]:
from pathlib import Path

import geopandas as gpd
import pandas as pd
from shapely import Point



In [2]:
# Load the West Midlands Output Area (OA) boundaries and keep only the OAs
# whose local-authority district code (LAD11CD) is E08000026.
wm_oas = gpd.read_file('data/west_midlands_OAs/west_midlands_OAs.shp')
in_study_area = wm_oas['LAD11CD'] == 'E08000026'
wm_oas = wm_oas[in_study_area]

# Load the per-OA attribute table and restrict it to the study-area OAs with an
# inner join on the OA code (the join also appends an 'OA11CD' column).
oa_info = pd.read_csv('data/oa_info.csv')
oa_info = oa_info.merge(
    wm_oas[['OA11CD']],
    how='inner',
    left_on='oa_id',
    right_on='OA11CD',
)

# OA identifiers, and a two-column coordinate array in [lon, lat] order.
oaIndex = oa_info['oa_id'].tolist()
oaLatLon = oa_info[['oa_lon', 'oa_lat']].to_numpy()

In [3]:
# Load the full POI table (the first CSV column becomes the index).
pois = pd.read_csv('data/POIs/pois.csv', index_col=0)

# Select local POIs: keep every POI whose point location intersects at least
# one study-area OA polygon.
# A single spatial join replaces the original row-by-row double loop, which
# tested every POI against every OA polygon (quadratic), never stopped at the
# first match, and appended a POI once per matching polygon.
# The points are tagged with wm_oas's CRS because their coordinates are compared
# directly against those polygons -- assumes poi_lon/poi_lat are already
# expressed in that coordinate system (TODO confirm).
poiPoints = gpd.GeoDataFrame(
    pois[['poi_id']],
    geometry=gpd.points_from_xy(pois['poi_lon'], pois['poi_lat']),
    crs=wm_oas.crs,
)
poisMatched = gpd.sjoin(
    poiPoints,
    wm_oas[['geometry']],
    how='inner',
    predicate='intersects',
)
# De-duplicate: a point on a shared boundary can match more than one OA.
poisInRegion = poisMatched['poi_id'].unique().tolist()

pois = pois[pois['poi_id'].isin(poisInRegion)]
poiIndex = list(pois['poi_id'])
# NOTE: column order here is [lat, lon], the reverse of oaLatLon ([lon, lat]).
poisLatLon = pois[['poi_lat', 'poi_lon']].values


In [22]:

# Draw a reproducible random subset of POIs to experiment on.
# - random_state pins the draw so a re-run selects the same POIs;
# - the size is capped at len(pois) because DataFrame.sample raises when asked
#   for more rows than exist (sampling without replacement);
# - .copy() makes poiRand an independent frame, since later cells add columns to it.
poiRand = pois.sample(n=min(50, len(pois)), random_state=0).copy()
poiIndex = list(poiRand['poi_id'])
# Column order is [lat, lon]; rows are in the same order as poiRand, which the
# clustering cells rely on when they refer to points by row position.
poisLatLon = poiRand[['poi_lat', 'poi_lon']].values

In [4]:
# k-means clustering

from sklearn.cluster import KMeans
import numpy as np

In [23]:
# Fit k-means on the sampled POI coordinates. random_state fixes the centroid
# initialisation so repeated fits on the same sample give the same clusters.
num_clusters = 5
kmeans_params = dict(n_clusters=num_clusters, random_state=0, n_init="auto")
kmeans = KMeans(**kmeans_params).fit(poisLatLon)

In [24]:
# Fitted assignment (one label per row of poisLatLon) and the cluster centres.
cluster_labels = kmeans.labels_
centroids = kmeans.cluster_centers_


def _closest_member(cluster_id):
    """Return the row position in poisLatLon of the cluster member nearest its centroid."""
    member_rows = np.flatnonzero(cluster_labels == cluster_id)
    offsets = poisLatLon[member_rows] - centroids[cluster_id]
    # Euclidean distance in raw coordinate units, one value per cluster member.
    gaps = np.linalg.norm(offsets, axis=1)
    return member_rows[np.argmin(gaps)]


# One representative point (by row position) per cluster, ordered by cluster id.
closest_points_indices = [_closest_member(cluster_id) for cluster_id in range(num_clusters)]

# Print the indices of points closest to the centroids
print("Indices of points closest to centroids:", closest_points_indices)

Indices of points closest to centroids: [10, 25, 28, 43, 18]


In [25]:
# Append the k-means cluster label to each sampled POI.
# The labels are read straight from the fitted model rather than from the
# shared `cluster_labels` variable: the DBSCAN cell reassigns that name, so
# re-running this cell afterwards would otherwise store DBSCAN labels under
# 'kmcluster'.
poiRand['kmcluster'] = kmeans.labels_

In [26]:
# Flag each sampled POI: 'Y' if it is the member closest to its cluster
# centroid, otherwise 'N'. closest_points_indices holds row positions, so
# membership is tested by position rather than by index label.
centroid_positions = set(closest_points_indices)
poi_centroid = [
    'Y' if position in centroid_positions else 'N'
    for position in range(len(poiRand))
]
poiRand['kmpoi_centroid'] = poi_centroid

In [27]:
# Build a shapely Point (x = poi_lon, y = poi_lat) for every sampled POI and
# store it in a 'geometry' column, ready for conversion to a GeoDataFrame.
geometries = [
    Point(lon, lat)
    for lon, lat in zip(poiRand['poi_lon'], poiRand['poi_lat'])
]
poiRand['geometry'] = geometries

In [None]:
# TODO: weighted k-means clustering (not yet implemented)

In [15]:
#DBSCAN

from sklearn.cluster import DBSCAN

In [44]:
# Density-based clustering of the sampled POIs.
# eps is expressed in the same units as poisLatLon (raw coordinate values) and
# min_samples=2 lets a pair of nearby points form a cluster.
dbscan_clustering = DBSCAN(eps=0.01, min_samples=2)
dbscan_clustering.fit(poisLatLon)

# One label per sampled POI. NOTE: this rebinds `cluster_labels`, which
# previously held the k-means labels.
cluster_labels = dbscan_clustering.labels_
poiRand['dbcluster'] = cluster_labels

# Show how many POIs landed in each DBSCAN cluster label.
cluster_sizes = poiRand['dbcluster'].value_counts()
print(cluster_sizes)

dbcluster
-1    14
 2    12
 0     4
 3     4
 4     4
 1     3
 8     3
 5     2
 6     2
 7     2
Name: count, dtype: int64


In [45]:
# Export the clustered sample (geometry plus k-means / DBSCAN labels) as GeoJSON.
output_path = Path('data/outputs/poi-kmeans.geojson')
# Create the output folder if it is missing so the export works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Record a CRS on the exported layer instead of leaving it undefined. The OA
# layer's CRS is reused because the POI coordinates were intersected directly
# with those polygons -- assumes both share one coordinate system (TODO confirm).
poi_gdf = gpd.GeoDataFrame(
    poiRand[['geometry', 'kmcluster', 'kmpoi_centroid', 'dbcluster']],
    geometry=poiRand['geometry'],
    crs=wm_oas.crs,
)
poi_gdf.to_file(str(output_path), driver='GeoJSON')