# Neighborhoods in Toronto

### 1 - Scrape Wikipedia page "Canada Postal Codes"

In [9]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [10]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(page.content, 'html.parser')

In [11]:
table = soup.find('tbody')
rows = table.select('tr')
row = [r.get_text() for r in rows]

### 2 - Create dataframe as shown in picture

In [12]:
df = pd.DataFrame(row)
df = df[0].str.split('\n', expand=True)
df = df.rename(columns=df.iloc[0])
df = df.drop(df.index[0])
df = df.rename(columns={'Postcode':'PostalCode'})
df.head()

Unnamed: 0,Unnamed: 1,PostalCode,Borough,Neighbourhood,Unnamed: 5
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,


### 3 - Clean dataframe

#### Ignore cells with "Not assigned" Borough

In [13]:
df_clean = df[df.Borough != 'Not assigned']
df_clean.reset_index(inplace = True)
df_clean.head()

Unnamed: 0,index,Unnamed: 2,PostalCode,Borough,Neighbourhood,Unnamed: 6
0,3,,M3A,North York,Parkwoods,
1,4,,M4A,North York,Victoria Village,
2,5,,M5A,Downtown Toronto,Harbourfront,
3,6,,M5A,Downtown Toronto,Regent Park,
4,7,,M6A,North York,Lawrence Heights,


#### Combine neighbourhoods with same PostalCode

In [14]:
df_clean = df_clean.groupby(['PostalCode', 'Borough'], sort = False).agg(','.join)
df_clean.reset_index(inplace = True)
df_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


#### Change the value of the Neighbourhood to be like the Borough "Queen's Park"

In [15]:
df_clean = df_clean.replace("Not assigned", "Queen's Park")
df_clean.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### 4 - Shape of the dataframe

In [16]:
df_clean.shape

(103, 3)

### 5 - Geospatial data

In [17]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')
df_geo.columns = ['PostalCode', 'Latitude', 'Longitude']

#### Merge and create as in picture

In [18]:
df_pos = pd.merge(df_clean, df_geo, on=['PostalCode'], how='inner')
df_pos = df_pos[['Borough', 'Neighbourhood', 'PostalCode', 'Latitude', 'Longitude']]

In [19]:
df_pos.head()

Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,North York,Parkwoods,M3A,43.753259,-79.329656
1,North York,Victoria Village,M4A,43.725882,-79.315572
2,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
3,North York,"Lawrence Heights,Lawrence Manor",M6A,43.718518,-79.464763
4,Queen's Park,Queen's Park,M7A,43.662301,-79.389494


### 6 - Folium map of Toronto

In [20]:
import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import xml

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          90 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.49   | 32 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving env

In [21]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of the City of Toronto are 43.653963, -79.387207.


  app.launch_new_instance()


In [24]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_pos['Latitude'], df_pos['Longitude'], df_pos['Borough'], df_pos['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

### 7 - Exploring Toronto

#### Foursquare Credentials and Version

In [48]:
CLIENT_ID = 'foo' # your Foursquare ID
CLIENT_SECRET = 'foo' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: YZZJBS3QHTVLBGKP4MRQROIPQKC2WZ0MBDPDED30RU4ZCMBG
CLIENT_SECRET:PO0CW2DCZEFRIZWHYQXZB1CDMYXS2OATITQNUQGYMP3CGM15


#### Selecting Boroughs containing Toronto

In [31]:
df_toronto = df_pos[df_pos['Borough'].str.contains('Toronto')]
df_toronto = df_toronto.reset_index(drop=True)
df_toronto

Unnamed: 0,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
1,Downtown Toronto,"Ryerson,Garden District",M5B,43.657162,-79.378937
2,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
3,East Toronto,The Beaches,M4E,43.676357,-79.293031
4,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
5,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
6,Downtown Toronto,Christie,M6G,43.669542,-79.422564
7,Downtown Toronto,"Adelaide,King,Richmond",M5H,43.650571,-79.384568
8,West Toronto,"Dovercourt Village,Dufferin",M6H,43.669005,-79.442259
9,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",M5J,43.640816,-79.381752


#### Map of Toronto Neighbourhoods only

In [29]:
# create map of Toronto using latitude and longitude values
map_tohood = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_tohood)  
    
map_tohood

#### Explore the first neighbourhood

In [30]:
df_toronto.loc[0, 'Neighbourhood']

'Harbourfront,Regent Park'

#### Extracting neighbourhood long and lat values

In [32]:
neighbourhood_latitude = df_toronto.loc[0, 'Latitude'] # neighbourhood latitude value
neighbourhood_longitude = df_toronto.loc[0, 'Longitude'] # neighbourhood longitude value
neighbourhood_name = df_toronto.loc[0, 'Neighbourhood'] # neighbourhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, neighbourhood_latitude, neighbourhood_longitude))

Latitude and longitude values of Harbourfront,Regent Park are 43.6542599, -79.3606359.


#### Top 100 venues within a radius from the centroid of 500 meters

In [33]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighbourhood_latitude, 
    neighbourhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=YZZJBS3QHTVLBGKP4MRQROIPQKC2WZ0MBDPDED30RU4ZCMBG&client_secret=PO0CW2DCZEFRIZWHYQXZB1CDMYXS2OATITQNUQGYMP3CGM15&v=20180605&ll=43.6542599,-79.3606359&radius=500&limit=100'

In [34]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5d1748a6f129b50025bf5d80'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-54ea41ad498e9a11e9e13308-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/bakery_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d16a941735',
         'name': 'Bakery',
         'pluralName': 'Bakeries',
         'primary': True,
         'shortName': 'Bakery'}],
       'id': '54ea41ad498e9a11e9e13308',
       'location': {'address': '362 King St E',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'Trinity St',
        'distance': 143,
        'formattedAddress': ['362 King St E (Trinity St)',
         'Toronto ON M5A 1K9',
         'Canada'],
        'labeledLatLngs': [{'label': 'display',
 

#### Extracting info from JSON and structuring into a pandas df

In [37]:
# extracting category of the venue form the JSON
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [39]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues = nearby_venues.rename(columns={
    'name':'VenueName',
    'categories':'Category',
    'lat':'Latitude',
    'lng':'Longitude'
})
nearby_venues.head()

Unnamed: 0,VenueName,Category,Latitude,Longitude
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Toronto Cooper Koo Family Cherry St YMCA Centre,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149


#### Venues in that neighourhood?

In [40]:
nearby_venues.shape[0]

48

#### How many venues for each category?

In [46]:
df_categories = nearby_venues.groupby(['Category'], sort = False).count()[['VenueName']]
df_categories = df_categories.rename(columns={'VenueName':'Count'})
df_categories

Unnamed: 0_level_0,Count
Category,Unnamed: 1_level_1
Bakery,3
Coffee Shop,8
Gym / Fitness Center,1
Spa,1
Breakfast Spot,2
Restaurant,2
Pub,3
Park,3
Historic Site,1
Chocolate Shop,1


### 8 - Clustering Toronto

In [66]:
df_toronto.Borough.value_counts()

Downtown Toronto    18
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [67]:
# set number of clusters
kclusters = 4

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_toronto[['Latitude','Longitude']])

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([0, 0, 0, 3, 0, 0, 2, 0, 2, 0, 2, 3, 0, 2, 3, 0, 3, 1, 1, 1, 1, 2, 1,
       0, 2, 1, 0, 2, 1, 0, 1, 0, 0, 0, 0, 0, 0, 3], dtype=int32)

#### Did clustering grouped Neighbourhoods respecting Boroughs?

In [68]:
# inserting labels
# df_toronto.insert(0, 'Cluster Labels', kmeans.labels_)
df_toronto

Unnamed: 0,Cluster Labels,Borough,Neighbourhood,PostalCode,Latitude,Longitude
0,0,Downtown Toronto,"Harbourfront,Regent Park",M5A,43.65426,-79.360636
1,0,Downtown Toronto,"Ryerson,Garden District",M5B,43.657162,-79.378937
2,0,Downtown Toronto,St. James Town,M5C,43.651494,-79.375418
3,4,East Toronto,The Beaches,M4E,43.676357,-79.293031
4,0,Downtown Toronto,Berczy Park,M5E,43.644771,-79.373306
5,0,Downtown Toronto,Central Bay Street,M5G,43.657952,-79.387383
6,1,Downtown Toronto,Christie,M6G,43.669542,-79.422564
7,0,Downtown Toronto,"Adelaide,King,Richmond",M5H,43.650571,-79.384568
8,2,West Toronto,"Dovercourt Village,Dufferin",M6H,43.669005,-79.442259
9,0,Downtown Toronto,"Harbourfront East,Toronto Islands,Union Station",M5J,43.640816,-79.381752


In [69]:
import numpy as np
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood'], df_toronto['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters