In [1]:
from requests import get
from bs4 import BeautifulSoup
from parsel import Selector
import pandas as pd
import numpy as np
import folium
import json
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [2]:
# Load the per-district table from a path relative to the working directory.
# The later cells read its 'District', 'Latitude' and 'Longitude' columns.
df = pd.read_csv("data/hanoi_urban_districts.csv") 
df

Unnamed: 0,District,Area,Population,Latitude,Longitude,Price,Count
0,Ba Đình,9.224,247100,21.036667,105.836111,252132.67125,1624
1,Bắc Từ Liêm,43.35,333300,21.074832,105.770597,91049.913941,812
2,Cầu Giấy,12.04,266800,21.018907,105.797624,322580.645161,406
3,Đống Đa,9.96,420900,21.012862,105.829642,176190.47619,1218
4,Hai Bà Trưng,10.09,318000,21.006483,105.853338,151364.555256,812
5,Hà Đông,47.917,319800,20.959251,105.765959,115615.296807,620
6,Hoàn Kiếm,5.29,160600,21.028408,105.854064,189143.598834,72
7,Hoàng Mai,41.04,411500,20.978733,105.8634,65789.473684,406
8,Long Biên,60.38,291900,21.026478,105.896822,102968.115281,162
9,Nam Từ Liêm,32.27,236700,21.014968,105.768715,98611.111111,812


In [3]:
# Base map centred on the fixed coordinates below
map_hanoi = folium.Map(location=[21.029027, 105.834089], zoom_start=12)

# Appearance shared by every district marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7)

# One circle marker per district; the popup shows the district name
for lat, lng, district in zip(df['Latitude'], df['Longitude'], df['District']):
    marker = folium.CircleMarker(
        [lat, lng],
        popup=folium.Popup(district, parse_html=True),
        **marker_style)
    marker.add_to(map_hanoi)

map_hanoi

In [4]:
# Read the Foursquare credentials from config.json so they are not hard-coded
# in the notebook. The context manager closes the file handle deterministically.
with open('config.json') as config_file:
    config = json.load(config_file)

CLIENT_ID = config['client_id']
CLIENT_SECRET = config['client_secret']
VERSION = config['version']

# Sent as the `limit` query parameter of every venue request
LIMIT = 200

In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    """Collect the venues around each location from the Foursquare explore endpoint.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one API request is made per (name, lat, lng) triple.
    radius : int, default 1500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'District',
        'District Latitude', 'District Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame is empty (but keeps
        these columns) when no venue is returned. 'Venue Category' is None for
        a venue that comes back without any category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['District',
               'District Latitude',
               'District Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status surfaces API errors instead of a confusing KeyError.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # Some venues may carry no category; do not crash on them.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # Passing the column names to the constructor also works for zero rows.
    return pd.DataFrame(rows, columns=columns)

In [6]:
# Query the venues around every district centre listed in df
# (radius is left at the function's default)
hanoi_venues = getNearbyVenues(df['District'], df['Latitude'], df['Longitude'])

Ba Đình
Bắc Từ Liêm
Cầu Giấy
Đống Đa
Hai Bà Trưng
Hà Đông
Hoàn Kiếm
Hoàng Mai
Long Biên
Nam Từ Liêm
Tây Hồ
Thanh Xuân


In [7]:
# Keep an independent copy of the venue table, then preview the first rows
temp = hanoi_venues.copy()
hanoi_venues.head()


Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Ba Đình,21.036667,105.836111,Lăng Chủ Tịch Hồ Chí Minh (Ho Chi Minh Mausoleum),21.035525,105.83472,Monument / Landmark
1,Ba Đình,21.036667,105.836111,Cộng Càphê,21.033504,105.838189,Coffee Shop
2,Ba Đình,21.036667,105.836111,Đền Quán Thánh,21.043024,105.836395,Temple
3,Ba Đình,21.036667,105.836111,Văn Miếu Quốc Tử Giám (Temple of Literature) (...,21.028707,105.836005,Confucian Temple
4,Ba Đình,21.036667,105.836111,Hotel La Siesta Trendy,21.032244,105.845727,Hotel


In [8]:
# Number of non-null entries per column for each district
temp.groupby('District').count()

Unnamed: 0_level_0,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ba Đình,100,100,100,100,100,100
Bắc Từ Liêm,5,5,5,5,5,5
Cầu Giấy,71,71,71,71,71,71
Hai Bà Trưng,95,95,95,95,95,95
Hoàn Kiếm,100,100,100,100,100,100
Hoàng Mai,12,12,12,12,12,12
Hà Đông,7,7,7,7,7,7
Long Biên,10,10,10,10,10,10
Nam Từ Liêm,20,20,20,20,20,20
Thanh Xuân,39,39,39,39,39,39


In [9]:
# Distinct venue categories present in the venue table
hanoi_venues['Venue Category'].unique()

array(['Monument / Landmark', 'Coffee Shop', 'Temple', 'Confucian Temple',
       'Hotel', 'Beer Garden', 'Spa', 'Vietnamese Restaurant',
       'Noodle House', 'Fried Chicken Joint', 'Hotpot Restaurant', 'Bar',
       'Beer Bar', 'Church', 'Wings Joint', 'Bookstore', 'Cocktail Bar',
       'Sandwich Place', 'Palace', 'Café', 'Historic Site', 'Hostel',
       'Buddhist Temple', 'BBQ Joint', 'Food', 'Brazilian Restaurant',
       'Dessert Shop', 'Bed & Breakfast', 'French Restaurant', 'Tea Room',
       'Ice Cream Shop', 'Vegetarian / Vegan Restaurant', 'Art Museum',
       'Indian Restaurant', 'Massage Studio', 'Steakhouse',
       'Tennis Court', 'Mobile Phone Shop', 'Park', 'Asian Restaurant',
       'Multiplex', 'Gym / Fitness Center', 'Japanese Restaurant',
       'Korean Restaurant', 'Bubble Tea Shop', 'Szechuan Restaurant',
       'Scenic Lookout', 'Halal Restaurant', 'Shopping Mall',
       'Fast Food Restaurant', 'Peruvian Restaurant', 'Bakery',
       'Pizza Place', 'Buffet', 

In [10]:
# Turn each venue's category into indicator columns named after the category
hanoi_onehot = pd.get_dummies(
    hanoi_venues[['Venue Category']],
    prefix="",
    prefix_sep="",
)

# Attach the district each venue belongs to
hanoi_onehot['District'] = hanoi_venues['District']

# Rotate the trailing column to the front, keeping the others in order
all_columns = hanoi_onehot.columns.tolist()
fixed_columns = all_columns[-1:] + all_columns[:-1]
hanoi_onehot = hanoi_onehot[fixed_columns]

hanoi_onehot.head()

Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bed & Breakfast,...,Thai Restaurant,Theater,Tourist Information Center,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wedding Hall,Wine Bar,Wings Joint
0,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# (rows, columns) of the one-hot encoded venue table
hanoi_onehot.shape

(577, 123)

In [12]:
# Average the indicator columns within each district, which gives the share of
# that district's venues falling in each category, then restore 'District'
# as a regular column.
per_district = hanoi_onehot.groupby('District')
hanoi_grouped = per_district.mean().reset_index()
hanoi_grouped

Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bed & Breakfast,...,Thai Restaurant,Theater,Tourist Information Center,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wedding Hall,Wine Bar,Wings Joint
0,Ba Đình,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.01,0.01,...,0.0,0.0,0.0,0.0,0.01,0.15,0.0,0.0,0.0,0.01
1,Bắc Từ Liêm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.0,0.0
2,Cầu Giấy,0.0,0.0,0.014085,0.014085,0.0,0.014085,0.042254,0.014085,0.0,...,0.0,0.0,0.0,0.0,0.0,0.098592,0.0,0.0,0.0,0.0
3,Hai Bà Trưng,0.0,0.0,0.0,0.0,0.0,0.031579,0.021053,0.0,0.0,...,0.010526,0.010526,0.0,0.0,0.0,0.252632,0.0,0.0,0.0,0.0
4,Hoàn Kiếm,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.02,0.15,0.0,0.0,0.01,0.0
5,Hoàng Mai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.083333,0.0,0.0
6,Hà Đông,0.0,0.0,0.0,0.142857,0.0,0.0,0.142857,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0
7,Long Biên,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Nam Từ Liêm,0.0,0.0,0.0,0.0,0.05,0.05,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Thanh Xuân,0.0,0.0,0.0,0.0,0.0,0.0,0.128205,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,0.0


In [13]:
# (rows, columns) of the per-district category frequency table
hanoi_grouped.shape

(12, 123)

In [14]:
num_top_venues = 5

# Print the most frequent venue categories of every district.
# The working frame has its own name so that this loop does not overwrite
# `temp`, which an earlier cell creates and reads.
for hood in hanoi_grouped['District']:
    print("----"+hood+"----")

    # Transpose the district's single row into (venue, freq) pairs
    district_freq = hanoi_grouped[hanoi_grouped['District'] == hood].T.reset_index()
    district_freq.columns = ['venue','freq']

    # Drop the first row (the 'District' label itself), then build the ranking
    # with chained, copy-returning calls so no column is assigned on a slice.
    district_freq = (
        district_freq.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
        .head(num_top_venues)
    )
    print(district_freq)
    print('\n')

----Ba Đình----
                   venue  freq
0  Vietnamese Restaurant  0.15
1           Noodle House  0.15
2            Coffee Shop  0.11
3                  Hotel  0.10
4                   Café  0.06


----Bắc Từ Liêm----
                   venue  freq
0  Vietnamese Restaurant   0.4
1                   Café   0.2
2            Coffee Shop   0.2
3           Tennis Court   0.2
4       Arepa Restaurant   0.0


----Cầu Giấy----
                   venue  freq
0            Coffee Shop  0.13
1  Vietnamese Restaurant  0.10
2                   Café  0.08
3   Fast Food Restaurant  0.08
4      Korean Restaurant  0.07


----Hai Bà Trưng----
                   venue  freq
0  Vietnamese Restaurant  0.25
1            Coffee Shop  0.13
2                   Café  0.07
3           Noodle House  0.06
4                  Hotel  0.05


----Hoàn Kiếm----
                   venue  freq
0                  Hotel  0.18
1  Vietnamese Restaurant  0.15
2            Coffee Shop  0.12
3           Noodle House  0.08
4

In [15]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped; the remaining entries are ranked
    in descending order of value and the index labels of the leading
    ``num_top_venues`` entries are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

In [24]:
num_top_venues = 10

# Ordinal suffixes for the first three ranks; every later rank uses 'th'
indicators = ['st', 'nd', 'rd']

# Create one column per rank. The suffix is chosen with an explicit bounds
# check rather than a bare `except`, which would also hide unrelated errors.
columns = ['District']
for ind in np.arange(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# Create a new dataframe with one row per district
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['District'] = hanoi_grouped['District']

# Fill every rank column from the district's category frequencies
for ind in np.arange(hanoi_grouped.shape[0]):
    districts_venues_sorted.iloc[ind, 1:] = return_most_common_venues(hanoi_grouped.iloc[ind, :], num_top_venues)

districts_venues_sorted.head()

Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ba Đình,Vietnamese Restaurant,Noodle House,Coffee Shop,Hotel,Café,Dessert Shop,Cocktail Bar,Beer Garden,Hotpot Restaurant,Monument / Landmark
1,Bắc Từ Liêm,Vietnamese Restaurant,Tennis Court,Coffee Shop,Café,Wings Joint,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Truck
2,Cầu Giấy,Coffee Shop,Vietnamese Restaurant,Café,Fast Food Restaurant,Korean Restaurant,Bubble Tea Shop,Japanese Restaurant,Bakery,Hotel,Multiplex
3,Hai Bà Trưng,Vietnamese Restaurant,Coffee Shop,Café,Noodle House,Hotel,Japanese Restaurant,Hotpot Restaurant,BBQ Joint,Dessert Shop,Sushi Restaurant
4,Hoàn Kiếm,Hotel,Vietnamese Restaurant,Coffee Shop,Noodle House,Café,Spa,Italian Restaurant,Lounge,Sandwich Place,Hotel Bar


In [25]:
# set number of clusters
kclusters = 3

# Cluster on the category frequencies only. `columns=` replaces the positional
# axis argument, which pandas 2.0 no longer accepts.
hanoi_grouped_clustering = hanoi_grouped.drop(columns='District')

# run k-means clustering; n_init is set explicitly so the number of restarts
# does not depend on the installed scikit-learn version's default
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(hanoi_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 

array([1, 1, 0, 1, 1, 1, 2, 0, 0, 0, 1, 1], dtype=int32)

In [26]:
# add clustering labels
# DataFrame.insert raises if the column already exists, so drop a label column
# left over from a previous run of this cell to keep the cell re-runnable.
if 'Cluster Labels' in districts_venues_sorted.columns:
    districts_venues_sorted = districts_venues_sorted.drop(columns='Cluster Labels')
districts_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# attach the cluster label and ranked venue categories to the district table
# (left join on 'District', so every row of df is kept)
hanoi_merged = df.join(districts_venues_sorted.set_index('District'), on='District')

# NOTE(review): districts without a match in districts_venues_sorted get
# label 0 here, which makes them indistinguishable from real cluster-0 members.
hanoi_merged = hanoi_merged.fillna(value={'Cluster Labels': 0.0})

hanoi_merged.head() # check the last columns!

Unnamed: 0,District,Area,Population,Latitude,Longitude,Price,Count,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ba Đình,9.224,247100,21.036667,105.836111,252132.67125,1624,1,Vietnamese Restaurant,Noodle House,Coffee Shop,Hotel,Café,Dessert Shop,Cocktail Bar,Beer Garden,Hotpot Restaurant,Monument / Landmark
1,Bắc Từ Liêm,43.35,333300,21.074832,105.770597,91049.913941,812,1,Vietnamese Restaurant,Tennis Court,Coffee Shop,Café,Wings Joint,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant,Food Truck
2,Cầu Giấy,12.04,266800,21.018907,105.797624,322580.645161,406,0,Coffee Shop,Vietnamese Restaurant,Café,Fast Food Restaurant,Korean Restaurant,Bubble Tea Shop,Japanese Restaurant,Bakery,Hotel,Multiplex
3,Đống Đa,9.96,420900,21.012862,105.829642,176190.47619,1218,1,Vietnamese Restaurant,Coffee Shop,Café,Fast Food Restaurant,Supermarket,Seafood Restaurant,Karaoke Bar,Noodle House,Multiplex,Movie Theater
4,Hai Bà Trưng,10.09,318000,21.006483,105.853338,151364.555256,812,1,Vietnamese Restaurant,Coffee Shop,Café,Noodle House,Hotel,Japanese Restaurant,Hotpot Restaurant,BBQ Joint,Dessert Shop,Sushi Restaurant


In [29]:
# create map
map_clusters = folium.Map(location=[21.029027, 105.834089], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(hanoi_merged['Latitude'], hanoi_merged['Longitude'], hanoi_merged['District'], hanoi_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)

    # Labels used as indices here are expected to run from 0 to kclusters-1,
    # so index the palette directly. The previous `cluster-1` only worked
    # because index -1 wraps around to the last colour.
    cluster_color = rainbow[int(cluster)]

    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters