<a href="https://colab.research.google.com/github/fcm1006/CUHK/blob/GISM/GeoSpatialBigData%20/%20Tutorial_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 Clustering on spatial data


Cluster algorithms are useful for spatial data because they allow us to **group together similar data points based on their proximity to one another.** This can help us identify patterns and relationships in the data that might not be immediately apparent when looking at individual data points.

For example, if we have a dataset of customer locations, we might use a cluster algorithm to group together customers who are located near one another. This could help us identify areas where we might want to open a new store or target our marketing efforts.



<img src="https://asmaloney.com/images/2015/06/Leaflet_Clusters_Too_Many_Markers.jpg"/>

<img src="https://media.geeksforgeeks.org/wp-content/uploads/merge3cluster.jpg" alt="Drawing" style="width: 1000px;"/>

## 1.1 Clustering algorithm

Different clustering algorithms:

https://scikit-learn.org/stable/modules/clustering.html

<img src="https://miro.medium.com/max/700/1*oNt9G9UpVhtyFLDBwEMf8Q.png" alt="Drawing" style="width: 1000px;"/>



How DBSCAN works:

https://www.kdnuggets.com/2020/04/dbscan-clustering-algorithm-machine-learning.html




<img src="https://miro.medium.com/proxy/1*tc8UF-h0nQqUfLC8-0uInQ.gif"/>

## 1.2 Try DBSCAN on test data

Let's try it

<img src="https://i.redd.it/142fd50lrqu21.jpg"/>


In [None]:
!pip install sklearn # an important machine learning package

In [None]:
# Generate some random coordinates at Shanghai
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs

# Reference point in Shanghai (latitude, longitude in decimal degrees).
sh_lat = 31.22
sh_lng = 121.46

# Five random 2-D blobs; a fixed random_state makes every run produce the same points.
data, label = make_blobs(n_samples=100, n_features=2, centers=5, random_state=42)

# Shrink the blob coordinates (x 0.01) and shift them onto the reference point.
# Column 0 is offset by the latitude -> "y", column 1 by the longitude -> "x".
sample_data = pd.DataFrame(
    data=data * 0.01 + np.array([sh_lat, sh_lng]),
    columns=["y", "x"],
)

sample_data.head(10)  # show the top rows

In [None]:
# plot in map
!pip install folium
import folium

gaode = 'http://wprd03.is.autonavi.com/appmaptile?style=7&x={x}&y={y}&z={z}' 
map_sh_random_pt = folium.Map(location=[sh_lat,sh_lng],tiles=gaode, attr = 'test')

for index, row in sample_data.iterrows():
    folium.CircleMarker(
        [row['y'],row['x']],
        radius=3).add_to(map_sh_random_pt)
map_sh_random_pt

In [None]:
# create a dbscan cluster

from sklearn.cluster import DBSCAN

# Fitting once is enough: the fitted model stores one cluster id per input row in
# `labels_` (-1 marks noise), which is exactly what fit_predict() would return
# after fitting the same data a second time.
db_default = DBSCAN(eps=0.01,  # ~ 1000 meter
                    min_samples=3).fit(sample_data[['x', 'y']])  # train the model

sample_data['label'] = db_default.labels_  # assign the clustering result

sample_data.head(10)

<img src="http://wondernote.org/wp-content/uploads/2019/03/Bright-Color-Palettes-Wondernote.jpg" height="500">

In [None]:
map_sh_random_pt_cluster = folium.Map(location=[sh_lat, sh_lng], tiles=gaode, attr='test')

# Colour palette: one entry per cluster, plus a separate grey for noise.
# (The two last colours used to be fused into one invalid string by a missing comma.)
colors = ['#c05780', '#ff828b', '#e7c582', '#00b0ba', '#0065a2', '#ffec59']
noise_color = '#555555'  # DBSCAN gives noise points the label -1

for index, row in sample_data.iterrows():
    cluster_id = int(row['label'])
    if cluster_id < 0:
        point_color = noise_color
    else:
        # wrap around the palette so extra clusters never raise IndexError
        point_color = colors[cluster_id % len(colors)]

    folium.CircleMarker(
        [row['y'], row['x']],
        color=point_color,
        radius=3,
        fill=True
    ).add_to(map_sh_random_pt_cluster)

map_sh_random_pt_cluster

## 1.3 Shared bike O-D

In [None]:
# Load the Mobike (shared bike) trip records for Shanghai from the tutorial repository.
mobike_csv_url = 'https://github.com/gyshion/tutorial/raw/main/mobike_shanghai1.csv'
mobike_df = pd.read_csv(mobike_csv_url)
mobike_df

In [None]:
# Draw every 100th trip (about 1% of the data) as a red origin-destination line.

map_sh = folium.Map(location=[sh_lat, sh_lng], tiles=gaode, attr='shared bike')

# Rows whose index is a multiple of 100; use `% 10` instead to plot about 10%.
every_100th_trip = mobike_df[mobike_df.index % 100 == 0]

for _, trip in every_100th_trip.iterrows():
    origin = [trip['start_location_y'], trip['start_location_x']]
    destination = [trip['end_location_y'], trip['end_location_x']]
    folium.PolyLine([origin, destination], color='red').add_to(map_sh)
map_sh

## 1.4 Try DBSCAN on Mobike dataset

In [None]:
# stack the origin points with destination points
# Each trip row holds (start_y, start_x, end_y, end_x). Reshaping that (n, 4) block
# to (2n, 2) yields the points interleaved as origin, destination, origin, ... --
# the same order a row-by-row loop would append them in, without a Python-level loop.
od_columns = ['start_location_y', 'start_location_x',
              'end_location_y', 'end_location_x']
new_ll = mobike_df[od_columns].to_numpy().reshape(-1, 2)

all_pt = pd.DataFrame(data=new_ll, columns=["y", "x"])

all_pt.head(5)

In [None]:
# Cluster all stacked origin/destination points with DBSCAN.
# eps is expressed in the same units as the y/x columns; min_samples = 20 is the
# neighbourhood size a point needs to count as a core point.
db_mobike = DBSCAN(eps = 0.0005, # ~ 50 meter
                   min_samples = 20).fit(all_pt[['y','x']]) # train the model 

In [None]:
# `db_mobike` has already been fitted on `all_pt`; its `labels_` attribute holds one
# cluster id per row (-1 = noise). Reading it avoids running the expensive DBSCAN
# fit a second time on the same data, which is all fit_predict() would do.
predict_mobike = db_mobike.labels_
all_pt['label'] = predict_mobike

all_pt

In [None]:
# assign color for different labels
import random
def random_color(seed):
    """Return a "#RRGGBB" colour string that is always the same for the same seed.

    The hex digits are drawn from a private ``random.Random`` generator, so calling
    this function does not reseed (or otherwise disturb) the module-level
    ``random`` state that other code may be using, e.g. for sampling.

    Args:
        seed: numeric value (for example a cluster label); 10 is added to it
            before it is used to seed the generator.

    Returns:
        A 7-character string: "#" followed by six upper-case hexadecimal digits.
    """
    key = seed + 10
    if not isinstance(key, (int, float)):
        # e.g. NumPy integer labels: recent Python versions only accept
        # None/int/float/str/bytes/bytearray as seeds, so convert to a plain int.
        key = int(key)
    rng = random.Random(key)
    return "#" + ''.join(rng.choice('0123456789ABCDEF') for _ in range(6))

print(random_color(4435)) # return the same color code for same seed

In [None]:
# check how many clusters we have
# (each distinct label is listed once; DBSCAN uses -1 for noise, which is not a cluster)
np.unique(predict_mobike)

In [None]:
map_sh_4 = folium.Map(location=[sh_lat, sh_lng], tiles=gaode, attr='test')

# Show about 2% of the clustered points (label >= 0; -1 is DBSCAN noise).
# The sample is drawn once, up front, with its own random_state, so the selection
# is reproducible and does not depend on the state of the global `random`
# generator (which seeding elsewhere in the notebook can reset).
clustered_sample = all_pt[all_pt['label'] >= 0].sample(frac=0.02, random_state=0)

for point in clustered_sample.itertuples(index=False):
    folium.CircleMarker(
        [point.y, point.x],
        color=random_color(int(point.label)),  # same colour for the same cluster
        radius=3,
        fill=True
    ).add_to(map_sh_4)
map_sh_4

In [None]:
# assign the cluster to OD points
# DBSCAN cannot label new points, and calling fit_predict() here would re-cluster the
# origins and the destinations separately, producing ids that are unrelated to each
# other and to the clusters found on `all_pt`. Instead reuse the labels already
# stored in `all_pt`, which stacks the trips as origin, destination, origin, ...
# (even rows = origins, odd rows = destinations, in trip order).
stacked_labels = all_pt['label'].to_numpy()
assert len(stacked_labels) == 2 * len(mobike_df), \
    "all_pt must hold exactly one origin and one destination per trip"

mobike_df['start_cluster'] = stacked_labels[0::2]
mobike_df['end_cluster'] = stacked_labels[1::2]
mobike_df

In [None]:
# Keep only the trips whose origin AND destination both carry a cluster label (>= 0).

starts_in_cluster = mobike_df['start_cluster'] >= 0
ends_in_cluster = mobike_df['end_cluster'] >= 0
mobike_df_cluster = mobike_df[starts_in_cluster & ends_in_cluster]
mobike_df_cluster


In [None]:
# Draw every cluster-to-cluster trip: a thin black origin-destination line plus
# one marker per endpoint, coloured by the cluster that endpoint belongs to.

map_sh_4 = folium.Map(location=[sh_lat, sh_lng], tiles=gaode, attr='test')

for _, trip in mobike_df_cluster.iterrows():
    origin = [trip['start_location_y'], trip['start_location_x']]
    destination = [trip['end_location_y'], trip['end_location_x']]

    folium.PolyLine([origin, destination], weight=0.5, color='black').add_to(map_sh_4)

    # origin marker first, then destination marker
    endpoints = ((origin, trip['start_cluster']), (destination, trip['end_cluster']))
    for location, cluster in endpoints:
        folium.CircleMarker(
            location,
            color=random_color(cluster),
            radius=3,
            fill=True
        ).add_to(map_sh_4)

map_sh_4

In [None]:
# todo: extract the network from shared bike data

# 2 Network

## 2.1 Network representation

In [None]:
!pip install networkx --user # most important network module
!pip install matplotlib --user
import networkx as nx
import matplotlib

In [None]:
# Start from an empty undirected graph and add three nodes (no edges yet).
G = nx.Graph()

G.add_node("a")                # a single node
G.add_nodes_from(["b", "c"])   # several nodes at once

print("Nodes of graph: ", G.nodes(), sep="\n")
print("Edges of graph: ", G.edges(), sep="\n")

In [None]:
# Add several edges at once; endpoints that are not in the graph yet
# ("d", 1 and 2) show up in the node list printed below.
more_edges = [("a", "c"), ("c", "d"), ("a", 1), (1, "d"), ("a", 2)]
G.add_edges_from(more_edges)

print("Nodes of graph: ", G.nodes(), sep="\n")
print("Edges of graph: ", G.edges(), sep="\n")

In [None]:
# Draw the graph with the node names shown on the nodes.
nx.draw(G,with_labels=True)


In [None]:
# Can you make this network connected as a whole?
# Hint: node "b" was added earlier, but none of the edges added so far touches it.

nx.draw(G,with_labels=True)

## 2.2 Calculate node centrality index
https://networkx.org/documentation/stable/reference/algorithms/centrality.html

In [None]:
# Generate a random network: 8 nodes located within x:[0 to 1], y:[0 to 1] by default.
# Two nodes are joined by an edge if the distance between the nodes <= 0.4.
# The fixed seed makes the generated graph (and the centrality values computed from it)
# identical on every run.
G2 = nx.random_geometric_graph(8, 0.4, seed=42)
nx.draw(G2,with_labels=True)

In [None]:
# Closeness centrality of every node in G2, printed as "node score" lines.
closeness_centrality_map = nx.algorithms.centrality.closeness_centrality(G2)

for node, score in closeness_centrality_map.items():
  print(node, score)

In [None]:
# Betweenness centrality of every node in G2, printed as "node score" lines.
betweenness_centrality_map = nx.algorithms.centrality.betweenness_centrality(G2)

for node, score in betweenness_centrality_map.items():
  print(node, score)

In [None]:
# Read the 'pos' attribute of every node of G2 as a {node: position} dict;
# it is passed to the drawing calls further down so nodes appear at these positions.
pos=nx.get_node_attributes(G2,'pos')
pos

In [None]:
# Draw G2 again with node labels, for comparison with the centrality plot.
nx.draw(G2,with_labels=True)


In [None]:
# Betweenness centrality of every node in G2 as a {node: score} dict.
# NOTE(review): this repeats the betweenness call made a few cells earlier on the same graph.
betweenness_centrality_map = nx.algorithms.centrality.betweenness_centrality(G2)
# print(betweenness_centrality_map)

In [None]:
# Plot the network with nodes coloured by centrality:
# edges first (semi-transparent), then the nodes on top.
nx.draw_networkx_edges(G2, pos, alpha=0.4)

# Keys and values come from the same dict, so node i is coloured by score i.
scored_nodes = list(betweenness_centrality_map)
node_scores = list(betweenness_centrality_map.values())
nx.draw_networkx_nodes(G2, pos, nodelist=scored_nodes, node_size=80, node_color=node_scores)


## 2.3 Try on real data

In [None]:
# read the flight data

import pandas as pd

# The flights file is semicolon-separated.
flights_csv_url = 'https://github.com/gyshion/tutorial/raw/main/flights.csv'
flight_df = pd.read_csv(flights_csv_url, sep=';')
flight_df.head()

In [None]:
# Build an undirected flight network from the routes with more than 4 flights.
# Each airport is a node keyed by the string "lat,lon"; the edge weight is NbFlights.

G_flight = nx.Graph()

pos_flight = {}  # node key -> [lon, lat], used later as drawing positions

busy_routes = flight_df[flight_df['NbFlights'] > 4]

for _, route in busy_routes.iterrows():
    dep_lat, dep_lon = route['DepLat'], route['DepLon']
    arr_lat, arr_lon = route['ArrLat'], route['ArrLon']

    dep = str(dep_lat) + ',' + str(dep_lon)
    arr = str(arr_lat) + ',' + str(arr_lon)

    G_flight.add_edge(dep, arr, weight=route['NbFlights'])
    pos_flight[dep] = [dep_lon, dep_lat]
    pos_flight[arr] = [arr_lon, arr_lat]

In [None]:
# Draw the flight network with every airport placed at its [lon, lat] position.
nx.draw(G_flight,pos=pos_flight,node_size=40)

In [None]:
# calculate the eigenvector_centrality
# The 'weight' edge attribute (NbFlights, set when the graph was built) is used as edge weight.
# The result is a {node key: score} dict; printing it shows every airport's score.

eigenvector_centrality_map = nx.algorithms.centrality.eigenvector_centrality(G_flight,weight='weight')
print(eigenvector_centrality_map)

In [None]:
# Draw the flight network again, colouring each airport by its eigenvector centrality.
# NOTE(review): assumes the centrality dict lists nodes in the same order as G_flight's nodes -- confirm.
nx.draw(G_flight,pos=pos_flight,node_size=30,node_color=list(eigenvector_centrality_map.values()))

## 2.4 Plot it on folium


Suppose we want to plot only the airports with high importance, e.g. those with an eigenvector centrality of at least 0.1.

Can you try it yourself?

In [None]:
# Empty base map for the exercise; it reuses the sh_lat/sh_lng centre and the gaode tiles defined earlier.
map_sh_5 = folium.Map(location=[sh_lat,sh_lng],tiles=gaode,attr='test')


In [None]:
# all links (edges) of the graph
# Iterating over a graph object directly yields its nodes, so the edges are requested
# explicitly; each item is a (departure node key, arrival node key) pair.
for dep_node, arr_node in G_flight.edges():
  #print(dep_node, arr_node)
  pass

In [None]:
# Print the airports whose eigenvector centrality is at least 0.1.
for coord, score in eigenvector_centrality_map.items():
  if score >= 0.1:
      print('coor:', coord, ', eigenvector:', score)

In [None]:
# todo: plot the data on the folium map