# Data preparation

In [2]:
#import libraries:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests
from bs4 import BeautifulSoup

In [3]:
#scrape the following Wikipedia page:
source=requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text
soup=BeautifulSoup(source,"lxml")
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":890001695,"wgRevisionId":890001695,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

In [4]:
#get the text of table:
table_scource=soup.table
text=table_scource.text
text_split=text.split("\n")
del(text_split[0])
del(text_split[0])
text_split
text_postcode=text_split[::5]
text_postcode
text_borough=text_split[1::5]
text_borough
text_neighborhood=text_split[2::5]
text_neighborhood


['Neighbourhood',
 'Not assigned',
 'Not assigned',
 'Parkwoods',
 'Victoria Village',
 'Harbourfront',
 'Regent Park',
 'Lawrence Heights',
 'Lawrence Manor',
 'Not assigned',
 'Not assigned',
 'Islington Avenue',
 'Rouge',
 'Malvern',
 'Not assigned',
 'Don Mills North',
 'Woodbine Gardens',
 'Parkview Hill',
 'Ryerson',
 'Garden District',
 'Glencairn',
 'Not assigned',
 'Not assigned',
 'Cloverdale',
 'Islington',
 'Martin Grove',
 'Princess Gardens',
 'West Deane Park',
 'Highland Creek',
 'Rouge Hill',
 'Port Union',
 'Not assigned',
 'Flemingdon Park',
 'Don Mills South',
 'Woodbine Heights',
 'St. James Town',
 'Humewood-Cedarvale',
 'Not assigned',
 'Not assigned',
 'Bloordale Gardens',
 'Eringate',
 'Markland Wood',
 'Old Burnhamthorpe',
 'Guildwood',
 'Morningside',
 'West Hill',
 'Not assigned',
 'Not assigned',
 'The Beaches',
 'Berczy Park',
 'Caledonia-Fairbanks',
 'Not assigned',
 'Not assigned',
 'Not assigned',
 'Woburn',
 'Not assigned',
 'Not assigned',
 'Leaside',


In [5]:
#put texts into dataframe:
neighborhood= {"Postcode":text_postcode,"Borough":text_borough,"Neighborhood":text_neighborhood}
df_neighborhood=pd.DataFrame(neighborhood)
df_neighborhood.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [6]:
#drop the first row(which is equal to column names)
df_neighborhood1=df_neighborhood.drop([0])
df_neighborhood1.head()

Unnamed: 0,Postcode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [7]:
#Ignore cells with a borough that is Not assigned
df_clean=df_neighborhood1[df_neighborhood1["Borough"]!="Not assigned"]
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights


In [8]:
#reset index
df_clean.reset_index(drop=True,inplace=True)

In [9]:
#combine rows with the same postcode into one row , the neighborhoods separated with a comma :
df_combine=df_clean.groupby("Postcode").agg(lambda x:','.join(set(x)))
df_combine.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [10]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough:
df_combine.loc[df_combine['Neighborhood']=="Not assigned",'Neighborhood']=df_combine.loc[df_combine['Neighborhood']=="Not assigned",'Borough']
df_combine.head()

Unnamed: 0_level_0,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern,Rouge"
M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
M1E,Scarborough,"Morningside,Guildwood,West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [11]:
#convert index to column
df_combine.reset_index(level=0,inplace=True)
df_combine.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Morningside,Guildwood,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
#for the Not assigned neighborhood, the neighborhood should be the same as the borough.a "not assigned Neighborhood" example is Queen's Park.
#check whether the value of the the Neighborhood columns has changed correctly
df_combine.loc[df_combine['Borough']=="Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighborhood
85,M7A,Queen's Park,Queen's Park


## show data shape scraped from wikipedia

In [13]:
df_combine.shape

(103, 3)

# Add latitude and the longitude coordinates of each neighborhood.

In [15]:
#read latitude and longtitude csv document:
path="https://public.boxcloud.com/d/1/b1!LLvoB77FuKHJoE23-Ny-ijQHSl0ZEWac0UMszkSba_XasIZiACMrOjm93lcIAPa8PN-jTOMdGyUlVcgXsJ3pd32Nj1XcBaOKxyJi3nU-Ka1fHNmOpFwZOp1xw7vjkt6nHblprVC6o7yQLe2FDUdwgVMH5fF0TBqUF1C5Pyz23qcV1aVrTIxqMq5NY0tGQjIHXRtM2bvJ4xhersJBvywWP5mhaOYJtTLTN08_Hh7IoBM-NTzzR6mXrb-Tj4aFiUCXaZfwbVhYtmJfh7E93EzyuDdxVje3fp18xY3uNk3O8jTRXFRX7_1ihiRIud_W_I7mHxgYHkvnRu4HsxEZCf_B4kiJgso291U5HDNlVEK0LVlkbV-T_RIcQ9ZWl2gPa_flUGs4a-LpSKZcoBOS_tbsTbnXpfk9LFfYbForPfle_ebry5tIraB8azr1OlUHyFcNMJuwWKJSkz6D6IKp-Q6F-w7roYPGxigoYvzSj1Hb3xbMRzsUzM4mLNyZZAMZuI2k7oSrubZK70dGUGD-AypCnAXxHkaZ3W1Bb7BKumf4KxlD8KG_lVawCv2BqbMoPXD-H_zh8B26xN6jUzFk1l06rbqvbr2BcpOfe2CMw-hxnwQzFUgkOA0UIQvzBstZK5KFvPR1l0_XB_y7KBhJXqgEqrfeG9As-XIlVXJ9tkQhveCz_hetsrXqqErTH8vfpH7ZexR3RIkESzMy_ckrwXBq83oFEuY9IZ6PyYCozhtlYZ53N7DScUr1fh80KDTq0xaXd_Di0cjBFPHuVUjinW8XDEzIrBQeIfvXDxvmyA_La6kGen2DARwarfSw53mdW1Vzz7b9TNN48t5V5ACQfbyYCA3PoHmCYY8JS4S061458037Y7P5tAZxko1U314NkYtbn_inohhBhK09GpSULWS2AsHHZ8p12wrU6jhm5ygTxh5YjmCs2v80LwBleucPrja7EN9Woh3IBvbtDjKyoaqJkbpy6hjKWB54KrfGg1ti9U6zYa21R0T99WYqlaq16Ryg9tQbzfZHeb2ked3sJm25qrLygrRryO4PLmFNBQ2cgBKGM4_jqQNsFItvu3tLfhHD_dqHh9Tia8ko1cYTxi46Wsm1QTe_GQhFcLJnbnBgK5jX23m_vRGFGx0gjZ9IPkFID2pQRti_If87ks3l8uylEmfi2lrXpAw4SeLNjvwrC1VQAt4-rXspTyvpHGTXR3NZmMY9gDLjP_BLCJFkubsfC_sBKBwVKCUqrV2dWoLRRwh64tWSQyKvYp2B-3K4FAN1Y7LxNp7XUqpdSj0QyJYqv0L_t9kTjsjLuebTdwpLpwS5Liu0exDDdKbtE8CYhCuLQS10OtVAUoyWh0C36fJ-2mOg_TKkHyWJF3sB2c9HHbLhbGOocxjtff06oRi7Mg../download"
df_geo=pd.read_csv(path)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [16]:
#merge neighborhood and geography data:
df_geo.rename(columns={"Postal Code":"PostalCode"},inplace=True)
df_combine.rename(columns={"Postcode":"PostalCode"},inplace=True)
df_merge=pd.merge(df_combine,df_geo,on="PostalCode")
df_merge.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,Guildwood,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Explore and cluster the neighborhoods in Toronto

In [17]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## the latitude and longitude values of Toronto

In [18]:
#get the latitude and longitude values of Toronto
address = 'Toronto'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


## Create a map of Toronto with neighborhoods superimposed on top

In [124]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_merge['Latitude'], df_merge['Longitude'], df_merge['Borough'], df_merge['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Define Foursquare Credentials and Version

In [20]:
CLIENT_ID = 'B2Q0PJUQSMBZQKZ2EOZ3DFC4WWYD5R0YQUBHSZ0HOS34ULRS' # your Foursquare ID
CLIENT_SECRET = 'KJZL0F1YSWCOG5ZBZDHWEIRGXTZYZQLCAMZW4YQWILBT1O4D' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: B2Q0PJUQSMBZQKZ2EOZ3DFC4WWYD5R0YQUBHSZ0HOS34ULRS
CLIENT_SECRET:KJZL0F1YSWCOG5ZBZDHWEIRGXTZYZQLCAMZW4YQWILBT1O4D


## explore the neighborhoods in our Toronto.

In [64]:
LIMIT = 100 # limit of number of venues returned by Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [65]:

toronto_venues = getNearbyVenues(names=df_merge['Neighborhood'],
                                   latitudes=df_merge['Latitude'],
                                   longitudes=df_merge['Longitude']
                                  )


Malvern,Rouge
Highland Creek,Rouge Hill,Port Union
Morningside,Guildwood,West Hill
Woburn
Cedarbrae
Scarborough Village
Ionview,Kennedy Park,East Birchmount Park
Golden Mile,Clairlea,Oakridge
Cliffside,Scarborough Village West,Cliffcrest
Cliffside West,Birch Cliff
Scarborough Town Centre,Wexford Heights,Dorset Park
Wexford,Maryvale
Agincourt
Sullivan,Clarks Corners,Tam O'Shanter
Agincourt North,Steeles East,L'Amoreaux East,Milliken
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview,Oriole,Henry Farm
Bayview Village
Silver Hills,York Mills
Willowdale,Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South,Flemingdon Park
Downsview North,Wilson Heights,Bathurst Manor
Northwood Park,York University
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale,The Danforth West
The Beaches West,Indi

In [66]:
print(toronto_venues.shape)
toronto_venues.head()

(2244, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern,Rouge",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"Morningside,Guildwood,West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Morningside,Guildwood,West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


### find out how many unique categories can be curated from all the returned venues

In [67]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 275 uniques categories.
