# Section 1 - Web Scraping to Get Toronto Postal Codes off of Wikipedia

In [1]:
# make the imports
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

Retrieve the wiki page with the requests module and import into beautiful soup for scraping

In [2]:
wiki = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(wiki.content,'html')
#soup



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Process the html for the table elements and insert into a pandas dataframe

In [3]:
# get the table reference
table = soup.find_all('table')[0]

# iterate through the table headers and assign them to columns
cols = []
for con in table.find_all('th'):
    # replace any new line characters
    c = con.contents[0].strip()
    # add to the column line
    cols.append(c)

# create an empty dataframe with the columns discovered
df = pd.DataFrame()

# for the length of the columns, add table elements in groups of 3, appending to the dataframe that exists
num_cols = len(cols)
i=0
row=[]

for td in table.find_all('td'):
    # if it's a name with a link, get the name out of the link otherwise just append it to the row
    if td.find('a')==None:
        row.append(td.contents[0].strip())
    else:
        row.append(td.find('a').contents[0].strip())
    i+=1
    # once the number of elements in the rows equals the number of columns, add it to the df and reset the row
    if i==num_cols:
        i=0
        df = df.append(pd.DataFrame(row).T)
        row=[]
        
df.columns = cols
df.reset_index(drop=True,inplace=True)
df.rename(columns={'Neighbourhood':'Neighborhood'},inplace=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Data is in the dataframe, now we need to clean it per the assignment instructions

First, remove the Not assigned Boroughs

In [4]:
df = df[ df['Borough']!='Not assigned' ]
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Next, if a neighborhood is not assigned, give it the boroughs name

In [5]:
df['Neighborhood']=df.apply(lambda x:  x['Borough'] if x['Neighborhood']=='Not assigned' else x['Neighborhood'], axis=1)
df.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Finally, combine neighborhoods of like postcode

In [6]:
grouped = df.groupby('Postcode').agg({'Postcode':'first', 'Borough':'first','Neighborhood':', '.join})
grouped.head()

Unnamed: 0_level_0,Postcode,Borough,Neighborhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M1B,M1B,Scarborough,"Rouge, Malvern"
M1C,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
M1E,M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,M1G,Scarborough,Woburn
M1H,M1H,Scarborough,Cedarbrae


In [7]:
df_tor = grouped.reset_index(drop=True)
df_tor.head()

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Get the number of rows of the final dataframe

In [8]:
df_tor.shape

(103, 3)

# Section 2 - Get the latitude and longitude for each postal code

In [9]:
# Install a pip package in the current Jupyter kernel
#import sys
#!{sys.executable} -m pip install geocoder
# additional imports for the geocoder
import geocoder

Not getting reliable results with geocoder, switching to csv

In [10]:
geo = pd.read_csv('Geospatial_Coordinates.csv')

# rename the geocode data postal code to postcode
geo.rename(columns={'Postal Code':'Postcode'}, inplace=True)
geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merge the data set with toronto df based on postal code

In [11]:
df_final = pd.merge(df_tor,geo,on='Postcode',how='inner')
df_final.head(10)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [12]:
df_final.shape

(103, 5)

# Section 3 - Use K-Means clustering to group the neighborhoods and plot on map

Import the modules required for this section of the course

In [14]:
from sklearn.cluster import KMeans
import sys
#!{sys.executable} -m pip install folium
!pip install folium
import folium
from pandas.io.json import json_normalize 

Collecting folium
  Using cached https://files.pythonhosted.org/packages/43/77/0287320dc4fd86ae8847bab6c34b5ec370e836a79c7b0c16680a3d9fd770/folium-0.8.3-py2.py3-none-any.whl
Collecting six (from folium)
  Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Collecting jinja2 (from folium)
  Using cached https://files.pythonhosted.org/packages/1d/e7/fd8b501e7a6dfe492a433deb7b9d833d39ca74916fa8bc63dd1a4947a671/Jinja2-2.10.1-py2.py3-none-any.whl
Collecting branca>=0.3.0 (from folium)
  Using cached https://files.pythonhosted.org/packages/a1/37/675c85871b923bb35ea9a5b516a1841428bd753d7f885d5921060dfd3c41/branca-0.3.1-py2-none-any.whl
Collecting requests (from folium)
  Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl
Collecting numpy (from folium)
  Using cached https://files.pythonhost

Let's first plot the map of the neighborhoods

In [15]:
# create map using latitude and longitude values of toronto
lat = 43.70011
long = -79.4163
map_tor = folium.Map(location=[lat, long], zoom_start=11)

In [16]:
# add markers to map
for lat, lng, borough, neighborhood in zip(df_final['Latitude'], df_final['Longitude'], df_final['Borough'], df_final['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_tor)  

map_tor

Connect to the foursqure API to get information about the neighborhoods

In [17]:
CLIENT_ID = 'X1RDABIJGUOLEDZZHTFOOIME4KPAMKJPOXRRZZLYWJD2NBHT' # your Foursquare ID
CLIENT_SECRET = 'JJ35MDOBWT40B44T4UD3WMMU4F0RSU5TFO2F5AYQMJCMLYG4' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('FourSquare credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

FourSquare credentails:
CLIENT_ID: X1RDABIJGUOLEDZZHTFOOIME4KPAMKJPOXRRZZLYWJD2NBHT
CLIENT_SECRET:JJ35MDOBWT40B44T4UD3WMMU4F0RSU5TFO2F5AYQMJCMLYG4


Define a module to get the category type out of Foursquare API

In [18]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Defining a module to get nearby venues within 500 meters for all of the neighborhoods in Toronto

In [19]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [20]:
# pull the toronto venues
toronto_venues = getNearbyVenues(df_final.Neighborhood,df_final.Latitude,df_final.Longitude)

toronto_venues.head()
toronto_venues.shape

Rouge, Malvern
Highland Creek, Rouge Hill, Port Union
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount Park, Ionview, Kennedy Park
Clairlea, Golden Mile, Oakridge
Cliffcrest, Cliffside, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Scarborough Town Centre, Wexford Heights
Maryvale, Wexford
Agincourt
Clarks Corners, Sullivan, Tam O'Shanter
Agincourt North, L'Amoreaux East, Milliken, Steeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
Silver Hills, York Mills
Newtonbrook, Willowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon Park, Don Mills South
Bathurst Manor, Downsview North, Wilson Heights
Northwood Park, York University
CFB Toronto, Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens, Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth West, 

(2255, 7)

One-hot encode the categories data for implementing the KMeans clustering

In [21]:
tor_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
tor_onehot.head()
tor_onehot.shape

(2255, 274)

Add the neighborhood column back into the dataframe

In [22]:
tor_onehot['Neighborhood']=toronto_venues['Neighborhood']
# rearrange the columns to get neighborhood back in front
new_cols = list(tor_onehot.columns[:])
new_cols.remove('Neighborhood')
new_cols.insert(0,'Neighborhood')
tor_onehot = tor_onehot[new_cols]
tor_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# get average type of venue for each neighborhood
tor_grouped = tor_onehot.groupby('Neighborhood').mean().reset_index()
tor_grouped.head()
tor_grouped.shape

(100, 274)

Now we want to filter the neighborhoods down to get the 8 most common venues in each 

In [24]:
# recording top 8 venues
num_venues = 8

# write a function to return the top 8 venues for a category, assumes neighborhood is the first element in the row
def top_venues(row,num_venues):
    categories = row.iloc[1:]
    categories_sorted = categories.sort_values(ascending=False)
    
    return categories_sorted.index.values[0:num_venues]


Create a new dataframe containing only the neighborhood and its top 8 venue types

In [25]:
columns = ['Neighborhood']
for i in range(num_venues):
    columns.append('Venue_'+str(i+1))
venues_sorted = pd.DataFrame(columns=columns)
venues_sorted['Neighborhood']=tor_grouped['Neighborhood']

# loop through the neighborhoods and add the top venue type information
for i in range(len(tor_grouped)):
    venues_sorted.iloc[i,1:] = top_venues(tor_grouped.iloc[i,:], num_venues)

venues_sorted
venues_sorted.head()

Unnamed: 0,Neighborhood,Venue_1,Venue_2,Venue_3,Venue_4,Venue_5,Venue_6,Venue_7,Venue_8
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,American Restaurant,Burger Joint,Thai Restaurant,Restaurant
1,Agincourt,Lounge,Sandwich Place,Breakfast Spot,Skating Rink,Chinese Restaurant,Drugstore,Discount Store,Dog Run
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Yoga Studio,Drugstore,Dim Sum Restaurant,Diner,Discount Store,Dog Run
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pharmacy,Fried Chicken Joint,Pizza Place,Coffee Shop,Sandwich Place,Beer Store,Fast Food Restaurant
4,"Alderwood, Long Branch",Pizza Place,Pharmacy,Skating Rink,Coffee Shop,Pool,Pub,Sandwich Place,Gym


Time to cluster the neighborhood, using 6 clusters in this case

In [26]:
# assuming 6 clusters
kclusters = 6

km = KMeans(n_clusters=6,random_state=5)
km

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=5, tol=0.0001, verbose=0)

Drop the neighborhood column for fitting the data then fit the data

In [27]:
k_venues_sorted = tor_grouped.drop('Neighborhood',1)
k_venues_sorted.head()

Unnamed: 0,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,...,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Fit the model to the data and return the labels

In [28]:
print(k_venues_sorted.shape)
km.fit(k_venues_sorted)
km.labels_

(100, 273)


array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 4, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 4, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0], dtype=int32)

With the labels defined, time to add them and the longitude/latitude information to the venues_sorted dataframe

In [29]:
# add the labels
venues_sorted['Labels'] = km.labels_

# merge with the df_final dataframe to get lat/longitude
tor_merged = df_final.join(venues_sorted.set_index('Neighborhood'),on='Neighborhood').dropna()
tor_merged.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Venue_1,Venue_2,Venue_3,Venue_4,Venue_5,Venue_6,Venue_7,Venue_8,Labels
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Fast Food Restaurant,Dumpling Restaurant,Diner,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,1.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Bar,Yoga Studio,Dumpling Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,3.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Mexican Restaurant,Intersection,Pizza Place,Breakfast Spot,Electronics Store,Medical Center,Rental Car Location,Yoga Studio,1.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,Coffee Shop,Korean Restaurant,Eastern European Restaurant,Discount Store,Dog Run,Doner Restaurant,Donut Shop,Drugstore,1.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,Hakka Restaurant,Bakery,Caribbean Restaurant,Athletics & Sports,Thai Restaurant,Bank,Fried Chicken Joint,Yoga Studio,1.0


Plot the final clustering

In [30]:
# create the map centered around toronto
lat = 43.70011
long = -79.4163
map_clusters = folium.Map(location=[lat, long], zoom_start=11)

In [31]:
# import the color scheme for the plot
import matplotlib.cm as cm
import matplotlib.colors as colors

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

In [32]:
# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(tor_merged['Latitude'], tor_merged['Longitude'], tor_merged['Neighborhood'], tor_merged['Labels'].astype('int')):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters