# Load libraries

In [1]:
from requests import get
from bs4 import BeautifulSoup
from parsel import Selector
import pandas as pd
import numpy as np
import folium
import json
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import plotly.plotly as py
from sklearn import metrics
import plotly.graph_objs as go
import plotly

# Load Hanoi's urban districts data from csv

In [2]:
df = pd.read_csv("data/hanoi_urban_districts.csv") 
df

Unnamed: 0,District,Area,Population,Latitude,Longitude,Price,Count
0,Ba Đình,9.224,247100,21.036667,105.836111,252132.67125,1624
1,Bắc Từ Liêm,43.35,333300,21.074832,105.770597,91049.913941,812
2,Cầu Giấy,12.04,266800,21.018907,105.797624,212135.645161,406
3,Đống Đa,9.96,420900,21.012862,105.829642,176190.47619,1218
4,Hai Bà Trưng,10.09,318000,21.006483,105.853338,151364.555256,812
5,Hà Đông,47.917,319800,20.959251,105.765959,115615.296807,620
6,Hoàn Kiếm,5.29,160600,21.024443,105.849847,147143.598834,72
7,Hoàng Mai,41.04,411500,20.978733,105.8634,65789.473684,406
8,Long Biên,60.38,291900,21.026478,105.896822,102968.115281,162
9,Nam Từ Liêm,32.27,236700,21.014968,105.768715,98611.111111,812


# Create a map of Hanoi to see where the districts locate

In [3]:
map_hanoi = folium.Map(location=[21.029027, 105.834089], zoom_start=12)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['District']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_hanoi)
    
map_hanoi

# Foursquare credentials

In [4]:
config = json.load(open('config.json'))
CLIENT_ID = config['client_id']
CLIENT_SECRET = config['client_secret']
VERSION = config['version']
plotly.tools.set_credentials_file(username=config['plotly_username'], api_key=config['plotly_key'])
LIMIT=200

# Define a function to get nearby venues of a specific coordinate

In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=1500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District', 
                  'District Latitude', 
                  'District Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

# Get each district nearby venues and store them in hanoi_venues

In [6]:
hanoi_venues = getNearbyVenues(names=df['District'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude'])
len(hanoi_venues.index)

Ba Đình
Bắc Từ Liêm
Cầu Giấy
Đống Đa
Hai Bà Trưng
Hà Đông
Hoàn Kiếm
Hoàng Mai
Long Biên
Nam Từ Liêm
Tây Hồ
Thanh Xuân


571

In [7]:
venue_count = hanoi_venues.groupby('District').count().reset_index().drop(['District Latitude', 'District Longitude', 'Venue Latitude', 'Venue Longitude', 'Venue Category'], axis=1)
df['Venue Count'] = venue_count['Venue']

# Let's see how the venues we got look like

In [8]:
categories = hanoi_venues.groupby('Venue Category').count().sort_values(by=['District'])
data = [go.Bar(
            x=categories.index,
            y=categories['District']
    )]


layout = go.Layout(
    xaxis=dict(
        title='Venue categories',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Number of venues',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)

py.iplot(go.Figure(data=data,layout=layout), filename='venues-per-district')


Consider using IPython.display.IFrame instead



# Let's visualize number of venues in each district

In [9]:
data = [go.Bar(
            x=hanoi_venues['District'].unique(),
            y=hanoi_venues.groupby(['District']).count()['Venue']
    )]


layout = go.Layout(
    xaxis=dict(
        title='Districts',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Number of venues',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)

py.iplot(go.Figure(data=data,layout=layout), filename='venues-per-district')

# Visualize number of apartments for rent in each district

In [10]:
data = [go.Bar(
            x=df['District'],
            y=df['Count']
    )]

layout = go.Layout(
    xaxis=dict(
        title='Districts',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Number of apartments for rent',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)
py.iplot(go.Figure(data=data, layout=layout), filename='district-apartment-rent-price')

# Bar plot for district's apartment rent price

In [11]:
data = [go.Bar(
            x=df['District'],
            y=df['Price']
    )]

layout = go.Layout(
    xaxis=dict(
        title='Districts',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Average apartment rent price',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)
py.iplot(go.Figure(data=data, layout=layout), filename='district-apartment-price')

# Scatter plot. Each point is a district where its x-value is number of venues and its y-value is average apartment rent

In [12]:
data = [go.Scatter(
    x = hanoi_venues.groupby(['District']).count()['Venue'],
    y = df['Price'],
    mode = 'markers',
    marker=dict(
        size=12,
    )
)]

layout = go.Layout(
    xaxis=dict(
        title='Number of venues',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Average apartment rent price',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)

py.iplot(go.Figure(data=data,layout=layout), filename='venues-price-scatter')


In [13]:
data = [go.Scatter3d(
    x = hanoi_venues.groupby(['District']).count()['Venue'],
    y = df['Count'],
    z = df['Price'],
    mode = 'markers',
    marker=dict(
        size=12,
        color=np.array(df['Price']),
        colorscale='Viridis',   # choose a colorscale
        opacity=0.8
    )
)]

py.iplot(data, filename='3d-scatter')


In [14]:
# one hot encoding
hanoi_onehot = pd.get_dummies(hanoi_venues[['Venue Category']], prefix="", prefix_sep="")

# add district column back to dataframe
hanoi_onehot['District'] = hanoi_venues['District'] 

# move district column to the first column
fixed_columns = [hanoi_onehot.columns[-1]] + list(hanoi_onehot.columns[:-1])
hanoi_onehot = hanoi_onehot[fixed_columns]

hanoi_onehot.head()

Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bed & Breakfast,...,Temple,Thai Restaurant,Theater,Tourist Information Center,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wine Bar,Wings Joint
0,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Ba Đình,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Ba Đình,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
hanoi_onehot.shape

(571, 116)

In [16]:
hanoi_grouped = hanoi_onehot.groupby('District').mean().reset_index()
# hanoi_grouped['Price'] = df['Price']
# hanoi_grouped['Nums'] = df['Count']
hanoi_grouped['Count'] = hanoi_venues.groupby(['District']).count()['Venue'].reset_index(drop=True)
hanoi_grouped

Unnamed: 0,District,Arepa Restaurant,Art Gallery,Art Museum,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Bed & Breakfast,...,Thai Restaurant,Theater,Tourist Information Center,Ukrainian Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Water Park,Wine Bar,Wings Joint,Count
0,Ba Đình,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.01,0.01,...,0.0,0.0,0.0,0.0,0.01,0.15,0.0,0.0,0.01,100
1,Bắc Từ Liêm,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,Cầu Giấy,0.0,0.0,0.014085,0.014085,0.0,0.014085,0.042254,0.014085,0.0,...,0.0,0.0,0.0,0.0,0.0,0.098592,0.0,0.0,0.0,71
3,Hai Bà Trưng,0.0,0.0,0.0,0.010526,0.0,0.031579,0.010526,0.0,0.0,...,0.010526,0.010526,0.0,0.0,0.0,0.252632,0.0,0.0,0.0,95
4,Hoàn Kiếm,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.02,0.13,0.0,0.02,0.0,100
5,Hoàng Mai,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.0,10
6,Hà Đông,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
7,Long Biên,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10
8,Nam Từ Liêm,0.0,0.0,0.0,0.0,0.05,0.05,0.05,0.0,0.0,...,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,20
9,Thanh Xuân,0.0,0.0,0.0,0.0,0.0,0.0,0.135135,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.108108,0.0,0.0,0.0,37


In [17]:
hanoi_grouped.shape

(12, 117)

# Function to sort venues in descending order

In [18]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Create a DF to display the 10 most popular venues for each neighborhood

In [19]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['District'] = hanoi_grouped['District']

for ind in np.arange(hanoi_grouped.shape[0]):
    districts_venues_sorted.iloc[ind, 1:] = return_most_common_venues(hanoi_grouped.iloc[ind, :], num_top_venues)

districts_venues_sorted.head()

Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ba Đình,Count,Vietnamese Restaurant,Noodle House,Coffee Shop,Hotel,Café,Dessert Shop,Beer Garden,Hostel,Hotpot Restaurant
1,Bắc Từ Liêm,Count,Coffee Shop,Café,Dessert Shop,Electronics Store,Fast Food Restaurant,Food,Food Court,Food Truck,French Restaurant
2,Cầu Giấy,Count,Coffee Shop,Vietnamese Restaurant,Korean Restaurant,Café,Fast Food Restaurant,Bubble Tea Shop,Hotel,Bakery,Japanese Restaurant
3,Hai Bà Trưng,Count,Vietnamese Restaurant,Coffee Shop,Café,Noodle House,Hotel,Japanese Restaurant,BBQ Joint,Sushi Restaurant,Dessert Shop
4,Hoàn Kiếm,Count,Hotel,Coffee Shop,Vietnamese Restaurant,Noodle House,Café,Sandwich Place,Italian Restaurant,Lounge,French Restaurant


In [20]:
# set number of clusters
cluster = []
scores = []
for kclusters in range(2,12):
    hanoi_grouped_clustering = hanoi_grouped.drop('District', 1)

    # run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hanoi_grouped_clustering)

    scores.append(metrics.silhouette_score(hanoi_grouped_clustering,kmeans.labels_))
    cluster.append(kclusters)

data = [go.Scatter(
    x = cluster,
    y = scores,
    mode = 'lines+markers',
    name = 'lines+markers'
)]

layout = go.Layout(
    xaxis=dict(
        title='Number of clusters',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Average silhouette score',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)

py.iplot(go.Figure(data=data,layout=layout), name='choose-k')


Consider using IPython.display.IFrame instead



In [21]:
kclusters=4
hanoi_grouped_clustering = hanoi_grouped.drop('District', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hanoi_grouped_clustering)

kmeans.labels_

df['Label'] = pd.Series(kmeans.labels_)
final = df.groupby('Label').mean()
data = [go.Bar(
    x = final.index,
    y = final['Price']/final['Price'].max(),
    name = 'Cluster avg rent'
), go.Bar(
    x = final.index,
    y = final['Count']/final['Count'].max(),
    name = 'Cluster avg number apartments for rent'
), go.Bar(
    x = final.index,
    y = final['Venue Count']/final['Venue Count'].max(),
    name = 'Cluster avg number of venues'
)]

layout = go.Layout(
    barmode = 'group',
    xaxis=dict(
        title='Cluster',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    ),
    yaxis=dict(
        title='Average values',
        titlefont=dict(
            family='Courier New, monospace',
            size=18,
            color='#000000'
        )
    )
)

py.iplot(go.Figure(data=data,layout=layout), name='cluster-summary')


Consider using IPython.display.IFrame instead



In [22]:
# add clustering labels
districts_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

hanoi_merged = df

# merge hanoi_grouped with hanoi_data to add latitude/longitude for each district
hanoi_merged = hanoi_merged.join(districts_venues_sorted.set_index('District'), on='District')
hanoi_merged = hanoi_merged.fillna(value={'Cluster Labels': 0.0})

hanoi_merged

Unnamed: 0,District,Area,Population,Latitude,Longitude,Price,Count,Venue Count,Label,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Ba Đình,9.224,247100,21.036667,105.836111,252132.67125,1624,100,2,2,Count,Vietnamese Restaurant,Noodle House,Coffee Shop,Hotel,Café,Dessert Shop,Beer Garden,Hostel,Hotpot Restaurant
1,Bắc Từ Liêm,43.35,333300,21.074832,105.770597,91049.913941,812,2,0,0,Count,Coffee Shop,Café,Dessert Shop,Electronics Store,Fast Food Restaurant,Food,Food Court,Food Truck,French Restaurant
2,Cầu Giấy,12.04,266800,21.018907,105.797624,212135.645161,406,71,3,3,Count,Coffee Shop,Vietnamese Restaurant,Korean Restaurant,Café,Fast Food Restaurant,Bubble Tea Shop,Hotel,Bakery,Japanese Restaurant
3,Đống Đa,9.96,420900,21.012862,105.829642,176190.47619,1218,95,2,3,Count,Coffee Shop,Café,Vietnamese Restaurant,Fast Food Restaurant,Noodle House,Karaoke Bar,Seafood Restaurant,Asian Restaurant,Multiplex
4,Hai Bà Trưng,10.09,318000,21.006483,105.853338,151364.555256,812,100,2,2,Count,Vietnamese Restaurant,Coffee Shop,Café,Noodle House,Hotel,Japanese Restaurant,BBQ Joint,Sushi Restaurant,Dessert Shop
5,Hà Đông,47.917,319800,20.959251,105.765959,115615.296807,620,10,0,0,Count,Pizza Place,Café,Frozen Yogurt Shop,Multiplex,Bubble Tea Shop,Dim Sum Restaurant,Fast Food Restaurant,Food,Food Court
6,Hoàn Kiếm,5.29,160600,21.024443,105.849847,147143.598834,72,5,0,2,Count,Hotel,Coffee Shop,Vietnamese Restaurant,Noodle House,Café,Sandwich Place,Italian Restaurant,Lounge,French Restaurant
7,Hoàng Mai,41.04,411500,20.978733,105.8634,65789.473684,406,10,0,0,Count,Soccer Field,Lake,Noodle House,Café,Smoke Shop,Snack Place,Grocery Store,Creperie,Vietnamese Restaurant
8,Long Biên,60.38,291900,21.026478,105.896822,102968.115281,162,20,0,0,Count,Bowling Alley,Fast Food Restaurant,Ramen Restaurant,Convenience Store,Food Court,Shopping Mall,Multiplex,Golf Course,Bakery
9,Nam Từ Liêm,32.27,236700,21.014968,105.768715,98611.111111,812,37,1,0,Count,Café,Malay Restaurant,Coffee Shop,Shopping Mall,Furniture / Home Store,Frozen Yogurt Shop,Stadium,Gym Pool,Multiplex


In [23]:
# create map
map_clusters = folium.Map(location=[21.029027, 105.834089], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(hanoi_merged['Latitude'], hanoi_merged['Longitude'], hanoi_merged['District'], hanoi_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster-1)],
        fill=True,
        fill_color=rainbow[int(cluster-1)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters