## 1. Import libraries

In [2]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


## 2. Scrape data from Wikipedia with BeautifulSoup

In [9]:
# Download the Wikipedia page that lists Shanghai's administrative divisions.
# A timeout keeps the cell from hanging forever, and raise_for_status() stops
# the notebook on an HTTP error instead of silently parsing an error page.
wiki_response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_administrative_divisions_of_Shanghai",
    timeout=30,
)
wiki_response.raise_for_status()
shdata = wiki_response.text

# Parse the HTML and start with an empty list of district names
soup = BeautifulSoup(shdata, 'html.parser')
shdistList = []

In [10]:
# Collect the text of every link in the first "wikitable" on the page.
# The list is rebuilt from scratch (rather than appended to) so that
# re-running this cell does not duplicate entries.
shdistList = [
    link.text
    for link in soup.find_all("table", class_="wikitable")[0].find_all("a")
]

In [17]:
# Build a one-column frame from the scraped link texts and remove row 16.
# NOTE(review): row 16 is presumably a scraped link that is not a district
# name — confirm against the table, since a hard-coded position is fragile.
df = pd.DataFrame({"Districts": shdistList}).drop([16])
df

Unnamed: 0,Districts
0,Huangpu
1,Xuhui
2,Changning
3,Jing'an
4,Putuo
5,Hongkou
6,Yangpu
7,Pudong
8,Baoshan
9,Minhang


## 3. Get the geographical coordinates

In [23]:
pip install geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [24]:
import geocoder

In [25]:
def get_latlng(districts, max_attempts=5, retry_delay=1.0):
    """Geocode a Shanghai district name with the ArcGIS geocoder.

    Parameters
    ----------
    districts : str
        District name; it is inserted into the query
        ``'<districts>, Shanghai, China'``.
    max_attempts : int, optional
        Maximum number of geocoding requests before giving up.
    retry_delay : float, optional
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    The ``latlng`` value reported by the geocoder (a ``[lat, lng]`` pair).

    Raises
    ------
    RuntimeError
        If no attempt yields coordinates. The previous implementation looped
        forever in that situation.
    """
    import time  # local import: only needed for the pause between retries

    query = '{}, Shanghai, China'.format(districts)
    for attempt in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # pause before asking again, but not after the final attempt
        if attempt + 1 < max_attempts:
            time.sleep(retry_delay)
    raise RuntimeError(
        "Could not geocode {!r} after {} attempts".format(query, max_attempts)
    )

In [26]:
# Geocode each district name in turn, keeping the results in row order
coords = [get_latlng(name) for name in df["Districts"]]

In [27]:
# Show the geocoded [latitude, longitude] pairs, one per district
coords

[[31.23780000000005, 121.47810000000004],
 [31.195940000000064, 121.44709000000012],
 [31.217390000000023, 121.42105000000004],
 [31.220000000000027, 121.41583000000003],
 [31.251000000000033, 121.38970000000006],
 [31.250000000000057, 121.48917000000006],
 [31.261930000000064, 121.51904000000002],
 [31.235130000000026, 121.52759000000003],
 [31.416390000000035, 121.48000000000002],
 [31.10880000000003, 121.37472000000002],
 [31.366370000000074, 121.22153000000003],
 [30.920250000000067, 121.25199000000009],
 [31.03595000000007, 121.21460000000002],
 [31.153940000000034, 121.11408000000006],
 [30.833810000000028, 121.52128000000005],
 [31.61833000000007, 121.55333000000007]]

In [29]:
# Put the geocoded pairs in a frame that shares df's index. Column assignment
# aligns on index labels, and df no longer has a contiguous 0..n-1 index after
# a row was dropped, so a fresh RangeIndex could misalign rows or yield NaN.
# Passing index=df.index also fails loudly if the two lengths ever differ.
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'], index=df.index)
df['Latitude'] = df_coords['Latitude']
df['Longitude'] = df_coords['Longitude']
df

Unnamed: 0,Districts,Latitude,Longitude
0,Huangpu,31.2378,121.4781
1,Xuhui,31.19594,121.44709
2,Changning,31.21739,121.42105
3,Jing'an,31.22,121.41583
4,Putuo,31.251,121.3897
5,Hongkou,31.25,121.48917
6,Yangpu,31.26193,121.51904
7,Pudong,31.23513,121.52759
8,Baoshan,31.41639,121.48
9,Minhang,31.1088,121.37472


## 4. Create a map of Shanghai with districts

In [30]:
address = 'Shanghai, China'

# Look up the city centre; it is used as the centre point of the maps
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below
if location is None:
    raise ValueError("Nominatim could not geocode {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Shanghai, China {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Shanghai, China 31.2322735, 121.4691749.


In [36]:
# Base map centred on the coordinates geocoded for Shanghai
map_sh = folium.Map(location=[latitude, longitude], zoom_start=11)

# Appearance shared by every district marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# One circle marker per district; its popup shows the district name
for lat, lng, districts in zip(df['Latitude'], df['Longitude'], df['Districts']):
    label = folium.Popup('{}'.format(districts), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_sh)

map_sh

## 5. Use the Foursquare API to explore the districts

In [37]:
import os  # local to this cell: only needed to read the credentials

# Foursquare credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises KeyError here rather than failing later inside the API calls.
CLIENT_ID = os.environ["FOURSQUARE_CLIENT_ID"] # your Foursquare ID
CLIENT_SECRET = os.environ["FOURSQUARE_CLIENT_SECRET"] # your Foursquare Secret
VERSION = '20191230' # Foursquare API version


Your credentials:
CLIENT_ID: <redacted>
CLIENT_SECRET: <redacted>


#### Get the top 200 venues within a radius of 5000 meters of each district

In [43]:
radius = 5000
LIMIT = 200

# one tuple per venue:
# (district, district lat, district lng, venue name, venue lat, venue lng, category)
venues = []

for lat, long, neighborhood in zip(df['Latitude'], df['Longitude'], df['Districts']):

    # let requests build and URL-encode the query string, bound the wait with
    # a timeout, and stop on an HTTP error instead of failing later on a
    # missing key in the error payload
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(lat, long),
            "radius": radius,
            "limit": LIMIT,
        },
        timeout=30,
    )
    response.raise_for_status()
    results = response.json()["response"]['groups'][0]['items']

    # keep only the relevant information for each nearby venue
    for venue in results:
        details = venue['venue']
        categories = details['categories']
        # a venue with no category cannot take part in the category analysis
        # and would otherwise raise IndexError on categories[0]
        if not categories:
            continue
        venues.append((
            neighborhood,
            lat,
            long,
            details['name'],
            details['location']['lat'],
            details['location']['lng'],
            categories[0]['name']))

In [49]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['Districts', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(921, 7)


Unnamed: 0,Districts,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Huangpu,31.2378,121.4781,Grand Central Hotel Shanghai (上海大酒店),31.237379,121.476754,Hotel
1,Huangpu,31.2378,121.4781,The Shanghai EDITION (上海爱迪逊酒店),31.240001,121.481678,Hotel
2,Huangpu,31.2378,121.4781,Épices & Foie-gras,31.237557,121.47958,French Restaurant
3,Huangpu,31.2378,121.4781,Bund Plaza,31.239211,121.479741,Department Store
4,Huangpu,31.2378,121.4781,Campanile Hotel and Restaurant,31.232123,121.479144,Hotel


In [51]:
# Number of non-null entries in every column, per district
venues_df.groupby("Districts").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Districts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Baoshan,8,8,8,8,8,8
Changning,100,100,100,100,100,100
Chongming,2,2,2,2,2,2
Fengxian,4,4,4,4,4,4
Hongkou,100,100,100,100,100,100
Huangpu,100,100,100,100,100,100
Jiading,21,21,21,21,21,21
Jing'an,100,100,100,100,100,100
Jinshan,1,1,1,1,1,1
Minhang,53,53,53,53,53,53


In [47]:
# Distinct venue categories, in order of first appearance
pd.unique(venues_df['VenueCategory'])

array(['Hotel', 'French Restaurant', 'Department Store', 'Waterfront',
       'Cantonese Restaurant', 'Italian Restaurant', 'Hotpot Restaurant',
       'Lounge', 'Yunnan Restaurant', 'Pedestrian Plaza',
       'Chinese Restaurant', 'Restaurant', 'Café', 'Roof Deck',
       'BBQ Joint', 'Theater', 'Shopping Mall', 'Bar', 'Hotel Bar',
       'Seafood Restaurant', 'Cocktail Bar', 'Sculpture Garden',
       'American Restaurant', 'Breakfast Spot', 'Spanish Restaurant',
       'Dumpling Restaurant', 'Spa', 'Shanghai Restaurant', 'Park',
       'Vegetarian / Vegan Restaurant', 'Fast Food Restaurant', 'Garden',
       'Scenic Lookout', 'Coffee Shop', 'Gym', 'Pizza Place',
       'Massage Studio', 'Monument / Landmark', 'Art Gallery',
       'Salad Place', 'History Museum', 'Electronics Store',
       'Other Nightlife', 'Peruvian Restaurant', 'Yoga Studio', 'Bakery',
       'Turkish Restaurant', 'Xinjiang Restaurant', 'Pet Store',
       'Art Museum', 'Taco Place', 'Neighborhood', 'Food Court'

## 6. Analyze the districts

In [52]:
# One indicator column per venue category; the empty prefix keeps the bare
# category name as the column name
sh_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# Attach the district of each venue (row-aligned with venues_df)
sh_onehot['Districts'] = venues_df['Districts']

# Rotate the last column to the front so the district name leads
fixed_columns = list(sh_onehot.columns)
fixed_columns.insert(0, fixed_columns.pop())
sh_onehot = sh_onehot[fixed_columns]

print(sh_onehot.shape)
sh_onehot.head()

(921, 136)


Unnamed: 0,Districts,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Video Store,Water Park,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Huangpu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Huangpu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Huangpu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Huangpu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Huangpu,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# Mean of each indicator column per district, i.e. the share of a district's
# venues that fall in each category; Districts is kept as an ordinary column
sh_grouped = sh_onehot.groupby("Districts", as_index=False).mean()

print(sh_grouped.shape)
sh_grouped

(16, 136)


Unnamed: 0,Districts,American Restaurant,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Video Store,Water Park,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Xinjiang Restaurant,Yoga Studio,Yunnan Restaurant,Zhejiang Restaurant
0,Baoshan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Changning,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.04,...,0.01,0.0,0.0,0.01,0.02,0.01,0.01,0.02,0.02,0.0
2,Chongming,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Fengxian,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Hongkou,0.01,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0
5,Huangpu,0.02,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.01,0.0
6,Jiading,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Jing'an,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.04,...,0.01,0.0,0.0,0.01,0.02,0.01,0.01,0.01,0.02,0.0
8,Jinshan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Minhang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.018868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018868


#### Create a new dataframe with the stadium frequency of each district

In [55]:
# Keep only the district name and its share of "Stadium" venues
sh_sta = sh_grouped.loc[:, ["Districts", "Stadium"]]
sh_sta.head()

Unnamed: 0,Districts,Stadium
0,Baoshan,0.125
1,Changning,0.0
2,Chongming,0.0
3,Fengxian,0.0
4,Hongkou,0.0


## 7. Cluster Neighborhoods

In [56]:
kclusters = 3

# Cluster on the numeric feature only. The keyword form of drop() replaces
# the positional axis argument, which recent pandas versions no longer accept.
sh_clustering = sh_sta.drop(columns=["Districts"])

# run k-means clustering; n_init is pinned so the result does not depend on
# the scikit-learn version's default number of initialisations
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(sh_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 0, 0, 0, 0, 2, 0, 0, 0])

In [66]:
# Work on a copy so sh_sta itself stays unchanged
sh_merged = sh_sta.copy()

# add clustering labels (kmeans.labels_ is in the same row order as sh_sta)
sh_merged["Cluster Labels"] = kmeans.labels_
sh_merged.head()

Unnamed: 0,Districts,Stadium,Cluster Labels
0,Baoshan,0.125,1
1,Changning,0.0,0
2,Chongming,0.0,0
3,Fengxian,0.0,0
4,Hongkou,0.0,0


In [67]:
# Attach each district's coordinates from df. Any coordinate columns left by
# an earlier run of this cell are dropped first, so re-running the cell does
# not fail on overlapping column names.
sh_merged = (
    sh_merged
    .drop(columns=["Latitude", "Longitude"], errors="ignore")
    .join(df.set_index("Districts"), on="Districts")
)

print(sh_merged.shape)
sh_merged

(16, 5)


Unnamed: 0,Districts,Stadium,Cluster Labels,Latitude,Longitude
0,Baoshan,0.125,1,31.41639,121.48
1,Changning,0.0,0,31.21739,121.42105
2,Chongming,0.0,0,31.61833,121.55333
3,Fengxian,0.0,0,30.83381,121.52128
4,Hongkou,0.0,0,31.25,121.48917
5,Huangpu,0.0,0,31.2378,121.4781
6,Jiading,0.047619,2,31.36637,121.22153
7,Jing'an,0.0,0,31.22,121.41583
8,Jinshan,0.0,0,30.92025,121.25199
9,Minhang,0.0,0,31.1088,121.37472


#### Let's visualize the resulting clusters

In [69]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, sampled
# evenly from the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(sh_merged['Latitude'], sh_merged['Longitude'], sh_merged['Districts'], sh_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels run from 0 to kclusters-1, so they index the palette directly
    # (the former cluster-1 only worked through negative-index wraparound)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## 8. Examine the Clusters

In [72]:
# Districts assigned to cluster 0
sh_merged[sh_merged['Cluster Labels'].eq(0)]


Unnamed: 0,Districts,Stadium,Cluster Labels,Latitude,Longitude
1,Changning,0.0,0,31.21739,121.42105
2,Chongming,0.0,0,31.61833,121.55333
3,Fengxian,0.0,0,30.83381,121.52128
4,Hongkou,0.0,0,31.25,121.48917
5,Huangpu,0.0,0,31.2378,121.4781
7,Jing'an,0.0,0,31.22,121.41583
8,Jinshan,0.0,0,30.92025,121.25199
9,Minhang,0.0,0,31.1088,121.37472
10,Pudong,0.0,0,31.23513,121.52759
11,Putuo,0.0,0,31.251,121.3897


In [73]:
# Districts assigned to cluster 1
sh_merged[sh_merged['Cluster Labels'].eq(1)]


Unnamed: 0,Districts,Stadium,Cluster Labels,Latitude,Longitude
0,Baoshan,0.125,1,31.41639,121.48


In [74]:
# Districts assigned to cluster 2
sh_merged[sh_merged['Cluster Labels'].eq(2)]

Unnamed: 0,Districts,Stadium,Cluster Labels,Latitude,Longitude
6,Jiading,0.047619,2,31.36637,121.22153
