# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

## By Francisco Vásquez Pozo.- Part 3.

### Importing libraries.

In [1]:
import os
from io import StringIO

import bs4 as bs
import lxml.html as lh
import numpy as np
import pandas as pd
import requests

### Obtaining and cleaning data.

In [2]:
# Scrape the Toronto ("M") postal-code table from Wikipedia and tidy it into df_T.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
NOT_ASSIGNED = 'Not assigned'  # single placeholder spelling, used for BOTH checks below

reqst = requests.get(url)
reqst.raise_for_status()  # fail loudly instead of trying to parse an error page
soup = bs.BeautifulSoup(reqst.content, 'lxml')
table = soup.find_all('table')[0]  # this notebook relies on the first <table> of the page

# read_html expects a file-like object; passing a literal HTML string is deprecated.
df = pd.read_html(StringIO(str(table)))
df_T = df[0]

# Drop rows without a borough, then collapse to one row per (borough, postal code),
# joining the neighbourhood names of each group with commas.
df_T = df_T[df_T['Borough'] != NOT_ASSIGNED]
df_T = df_T.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)

# A neighbourhood left unassigned falls back to its borough name.
df_T['Neighbourhood'] = np.where(df_T['Neighbourhood'] == NOT_ASSIGNED, df_T['Borough'], df_T['Neighbourhood'])
print('ready to go!')

ready to go!


In [3]:
# Preview the first rows of the cleaned postal-code table.
df_T.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"


### Obtaining Geospatial Data from the Geospatial_data CSV file.

In [5]:
# Download the coordinates table and attach it to the borough table,
# matching rows on their shared 'Postal Code' column.
geospatial_url = "https://cocl.us/Geospatial_data"
geospatial_data = pd.read_csv(geospatial_url)
new_df = df_T.merge(geospatial_data, on='Postal Code')
print('Ready for final part!')

Ready for final part!


In [6]:
# Preview the merged table: borough, postal code, neighbourhood and coordinates.
new_df.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


### Exploring and clustering the neighborhoods in Toronto.

In [7]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge geopy --yes
import geopy
from geopy.geocoders import Nominatim

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [9]:
# Geocode the city itself; the resulting coordinates are used to centre the maps.
address = 'Toronto, Ontario'
geolocator = geopy.Nominatim(user_agent="ny_explorer", timeout=30)
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when the service finds no match for the query;
    # stop here with a clear message instead of an AttributeError below.
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

#### Exploring Toronto using Foursquare.

In [10]:
from pandas.io.json import json_normalize
import json 

In [12]:
# Base map centred on the geocoded city coordinates, with one blue circle
# per row of new_df; clicking a circle shows the neighbourhood name.
map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_rows = zip(new_df['Latitude'], new_df['Longitude'], new_df['Neighbourhood'])
for lat, lng, neighbourhood in marker_rows:
    popup = folium.Popup('{}'.format(neighbourhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_Toronto)

map_Toronto

#### Link to screenshot of the map I got: https://www.google.com/search?hl=en-CL&tbs=simg:CAQSiQIJiZb3Ml3opcAa_1QELELCMpwgaYgpgCAMSKMkH_1BL9EsoHwwfVCMIHwQfGB9ECkTn1Pocr4z_1kP-ci5iKMK5QriisaMFeiHvTmQiT1xA7_1a19BY5McuAbIqEkaaLN8m5GiGxXiLSvI0K0VPOfE7GABsAeyGyAEDAsQjq7-CBoKCggIARIE_1VoDrwwLEJ3twQkadgoWCgNkb3TapYj2AwsKCS9tLzAyN2N0ZwomChJuYXZpZ2F0aW9uIGNoYW5uZWzapYj2AwwKCi9tLzBjbjlwMDIKFwoFYXRsYXPapYj2AwoKCC9tLzBjcm5mChsKCGxhbmd1YWdl2qWI9gMLCgkvai8yc2hfeTQM&sxsrf=ALeKk02TsFQKLgiPDEh-gaxkMyeivH7rAA:1607907101276&q=dot&tbm=isch&sa=X&ved=2ahUKEwinsOncoMztAhUMIbkGHXtcCMcQwg4oAHoECAoQMQ
Not the best resolution but you get the idea.

### Exploring 4 venues in each neighbourhood within a 500-meter radius.

##### I guess I gotta trust you here.

In [13]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in (or shared together with) the notebook.
# NOTE: a key pair that was previously published in this notebook should be
# regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20161225'  # value sent as the API's `v` parameter

if not (CLIENT_ID and CLIENT_SECRET):
    print('Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running the Foursquare cells.')

In [14]:
LIMIT = 4     # maximum number of venues requested per neighbourhood
radius = 500  # search radius passed to the API

# One entry per neighbourhood that returned a venue. Each entry is a
# one-element list holding a tuple; the next cell flattens this structure.
location_list = []

explore_url = 'https://api.foursquare.com/v2/venues/explore'

# The loop variables are deliberately NOT named `latitude`/`longitude`: those
# names hold the city's own coordinates and are needed again for the final map.
for neighbourhood, n_lat, n_lng in zip(new_df.Neighbourhood, new_df.Latitude, new_df.Longitude):

    response = requests.get(
        explore_url,
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(n_lat, n_lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Surface authentication/quota problems as an HTTP error instead of a
    # confusing KeyError while digging through the JSON below.
    response.raise_for_status()
    data = response.json()

    items = data['response']['groups'][0]['items']
    if not items:
        continue  # skip the neighbourhood if nothing is found

    # Only the first returned venue is recorded, even though up to LIMIT are requested.
    venue = items[0]['venue']
    name = venue['name']
    lat = venue['location']['lat']
    lon = venue['location']['lng']
    cat = venue['categories'][0]['name']

    location_list.append([(neighbourhood, n_lat, n_lng, name, lat, lon, cat)])

In [15]:
# Flatten the nested per-neighbourhood lists into one record per venue,
# build the frame, then name its seven columns.
venue_records = [record for batch in location_list for record in batch]
venues = pd.DataFrame(venue_records)
venues.columns = [
    'Neighbourhood',
    'N_Latitude',
    'N_Longitude',
    'Venue',
    'V_Latitude',
    'V_Longitude',
    'category',
]
venues.head()

Unnamed: 0,Neighbourhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot
2,"North Toronto West, Lawrence Park",43.715383,-79.405678,Barreworks,43.71407,-79.400109,Yoga Studio
3,Davisville,43.704324,-79.38879,Jules Cafe Patisserie,43.704138,-79.388413,Dessert Shop
4,"Moore Park, Summerhill East",43.689574,-79.38316,Ravine,43.690356,-79.386841,Trail


In [16]:
# Report how many venue rows were collected versus how many rows new_df holds.
venue_total = len(venues.Venue)
neighbourhood_total = len(new_df.Neighbourhood)
print("{} venues founder by {} neighbourhood.".format(venue_total, neighbourhood_total))

100 venues founder by 103 neighbourhood.


### Clustering using unsupervised K-means.

In [17]:
from sklearn.cluster import KMeans

In [21]:
# One-hot encode each venue's category ...
categories = pd.get_dummies(venues['category'])
# ... and place the neighbourhood name next to the indicator columns.
neighbourhood_col = venues[['Neighbourhood']]
df_nyc = pd.concat([neighbourhood_col, categories], axis=1)

In [22]:
# Preview the neighbourhood column next to the one-hot category columns.
df_nyc.head()

Unnamed: 0,Neighbourhood,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Brewery,...,Rental Car Location,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Sports Bar,Theme Restaurant,Trail,Warehouse Store,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Davisville North,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Moore Park, Summerhill East",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [24]:
n_group = 4 # 4 clusters

# Feature matrix: category indicator columns only. 'label' is dropped as well
# (if present) so that re-running this cell never feeds the previous cluster
# assignment back into the model.
df_nyc2 = df_nyc.drop(columns=['Neighbourhood', 'label'], errors='ignore')

# n_init is pinned because its default differs between scikit-learn releases;
# together with random_state this keeps the clustering reproducible.
kmeans = KMeans(n_clusters=n_group, random_state=0, n_init=10).fit(df_nyc2)

# Store the cluster label right after the neighbourhood name. Overwrite on a
# re-run instead of inserting a second time, which would raise.
if 'label' in df_nyc.columns:
    df_nyc['label'] = kmeans.labels_
else:
    df_nyc.insert(1, 'label', kmeans.labels_)

In [25]:
# Check that the frame now holds the neighbourhood, its cluster label and the category columns.
df_nyc.head()

Unnamed: 0,Neighbourhood,label,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,...,Rental Car Location,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Sports Bar,Theme Restaurant,Trail,Warehouse Store,Yoga Studio
0,Lawrence Park,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Davisville North,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Moore Park, Summerhill East",3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [26]:
# Join on neighbourhood name AND coordinates. A name alone may repeat across
# postal codes, which would turn the merge into a many-to-many join and
# duplicate rows (possibly with conflicting cluster labels).
# df_nyc was built column-wise from `venues`, so both share the same row index
# and the venue rows' neighbourhood coordinates can be attached by index.
venue_coords = venues[['N_Latitude', 'N_Longitude']].rename(
    columns={'N_Latitude': 'Latitude', 'N_Longitude': 'Longitude'})
clustered = df_nyc.join(venue_coords)

Toronto_df = pd.merge(
    new_df,
    clustered,
    on=['Neighbourhood', 'Latitude', 'Longitude'],
    how='right',  # keep exactly the rows that received a cluster label
)
Toronto_df.head()

Unnamed: 0,Borough,Postal Code,Neighbourhood,Latitude,Longitude,label,Airport,Arts & Crafts Store,Bakery,Bank,...,Rental Car Location,Restaurant,Sandwich Place,Shopping Mall,Skating Rink,Sports Bar,Theme Restaurant,Trail,Warehouse Store,Yoga Studio
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Central Toronto,M4S,Davisville,43.704324,-79.38879,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316,3,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [27]:
# And finally, the map.
# Centre on the geocoded city location directly rather than on the notebook-level
# `latitude`/`longitude` names, which other cells can overwrite.
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# colors: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, n_group))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# markers: one circle per row, coloured by its cluster label
for lat, lon, poi, cluster in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Neighbourhood'], Toronto_df['label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # KMeans labels start at 0, so they index the colour list directly.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### I've uploaded the screenshot to Google so you get a peek of my beautiful clustered map.
https://lh3.googleusercontent.com/9zRbchWfOeu66f08otFxGDGV8m77eb2nQiGLoIXJAp0p2Pzv3A9rrzYapz6fmGL65it_=s158

Again, the quality is not the best for some reason but, as I said, it's just a peek.