# Scraping Toronto Postal Codes From Wikipedia

In [1]:
import pandas as pd
import numpy as np
import requests

import json

# using BeautifulSoup for parsing the html
from bs4 import BeautifulSoup

## First, get the source

I use the requests library to get the page's text using the url. Then, using beautiful soup, I can begin to parse the data.

In [2]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

source = requests.get(wiki_url).text
soup = BeautifulSoup(source, 'lxml')

## Next, find the data

The data is in a table on the wikipedia page. So, going row by row through the data, I find what data to keep.

If the borough is 'Not assigned' then the entry is skipped.
if there is more than one neighborhood in a postal code, they should be in the same 'neighborhood' entry separated by commas.

If a cell has a borough, but the neighborhood is 'Not assigned' then it will be the same as the borough.

In [3]:
neighborhood_data = []
existingPostal = False # boolean to determine whether or not it is a new code

# get rows
rows = soup('tr')
for row in rows[1:290]:
    existingPostal = False
    
    row_vals = row('td') # list of data values within a row
    
    # if the borough is not assigned, skip this row!
    if row_vals[1].text == 'Not assigned':
        continue
    # if the neighborhood is not assigned, change it to the borough name!
    if row_vals[2].text[:-1] == 'Not assigned':
        row_vals[2] = row_vals[1].text
    else:
        row_vals[2] = row_vals[2].text[:-1] # slice to get rid of '\n'
    # check if the postal code already exists. If it does, then just add the neighborhood to existing entry
    for postal_code in neighborhood_data:
        if row_vals[0].text == postal_code['PostalCode']:
            postal_code['Neighborhood'] = postal_code['Neighborhood'] + ', ' + row_vals[2]
            existingPostal = True
            break
    
    # if it is a new code, add new row.
    if not existingPostal:        
        neighborhood_data.append({
            'PostalCode' : row_vals[0].text,
            'Borough' : row_vals[1].text,
            'Neighborhood' : row_vals[2]
        })

# create dataframe for neighborhoods
neighborhood_df = pd.DataFrame(neighborhood_data, columns=['PostalCode','Borough','Neighborhood'])

neighborhood_df.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [4]:
neighborhood_df.shape

(103, 3)

## Get geographical data

The geocoder api was being tricky, so I used the geospatial coordinates csv provided.

In [5]:
import io

# Get CSV
coords_url = 'http://cocl.us/Geospatial_data'
coords_content = requests.get(coords_url).content

# Convert to DataFrame
coords_df = pd.read_csv(io.StringIO(coords_content.decode('utf-8')))

coords_df

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


## Combine and clean

This next step combines the data into the neighborhood dataframe so that the latitude and longitude are part of the original dataframe.

In [6]:
# make sure the datasets are in the correct order to transfer
coords_df.sort_values(by=['Postal Code'], inplace=True)
neighborhood_df.sort_values(by=['PostalCode'], inplace=True)

# adjust indices to ensure proper copying 
coords_df.index = neighborhood_df.index

# add columns to neighborhood_df
neighborhood_df['Latitude'] = coords_df['Latitude']
neighborhood_df['Longitude'] = coords_df['Longitude']

# return to initial indices
neighborhood_df.sort_index(inplace=True)

In [7]:
neighborhood_df.head(15)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
7,M3B,North York,Don Mills North,43.745906,-79.352188
8,M4B,East York,"Woodbine Gardens, Parkview Hill",43.706397,-79.309937
9,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


## Defining Foursquare Credentials and Version


In [8]:
CLIENT_ID = 'TNRANFLIRZLGJKHVNIFB3G3LCKS1ZXNU1ICKP0Q1AKODF0ZH' # your Foursquare ID
CLIENT_SECRET = 'EQCLWMJBSHKDQYWKHYAQA1Z0DITJVK2DOYDSW1Q5FOOIM0ZH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TNRANFLIRZLGJKHVNIFB3G3LCKS1ZXNU1ICKP0Q1AKODF0ZH
CLIENT_SECRET:EQCLWMJBSHKDQYWKHYAQA1Z0DITJVK2DOYDSW1Q5FOOIM0ZH


### Next, setting venue limit and a search radius. Followed by defining two functions
get_category_type gets the category type of a venue
get_nearby_venues takes a list of latitudes and longitudes and finds the nearby venues.

In [9]:
LIMIT = 100
radius = 1000

In [10]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
# uses the above methods to find the venues

toronto_venues = getNearbyVenues(names=neighborhood_df['Neighborhood'],
                                 latitudes=neighborhood_df['Latitude'],
                                 longitudes=neighborhood_df['Longitude'],
                                 radius = radius)

Parkwoods
Victoria Village
Harbourfront, Regent Park
Lawrence Heights, Lawrence Manor
Queen's Park
Islington Avenue
Rouge, Malvern
Don Mills North
Woodbine Gardens, Parkview Hill
Ryerson, Garden District
Glencairn
Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park
Highland Creek, Rouge Hill, Port Union
Flemingdon Park, Don Mills South
Woodbine Heights
St. James Town
Humewood-Cedarvale
Bloordale Gardens, Eringate, Markland Wood, Old Burnhamthorpe
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Downsview North, Wilson Heights
Thorncliffe Park
Adelaide, King, Richmond
Dovercourt Village, Dufferin
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto
Harbourfront East, Toronto Islands, Union Station
Little Portugal, Trinity
East Birchmount Park, Ionview, Kennedy Park
Bayview Village
CFB Toronto, Downsview East
The D

### Checking what venues are in a neighborhood

In [14]:
toronto_venues.groupby('Neighborhood').count().head()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Agincourt,49,49,49,49,49,49
"Agincourt North, L'Amoreaux East, Milliken, Steeles East",29,29,29,29,29,29
"Albion Gardens, Beaumond Heights, Humbergate, Jamestown, Mount Olive, Silverstone, South Steeles, Thistletown",15,15,15,15,15,15
"Alderwood, Long Branch",24,24,24,24,24,24


### Apply one hot encoding

In [23]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood']

fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Group the encoded dataframe by neighborhoods

In [24]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

In [25]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Check the most common venues per neighborhood

In [94]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] =toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Café,Hotel,Theater,Coffee Shop,American Restaurant,Japanese Restaurant,Restaurant,Concert Hall,Sushi Restaurant,Cosmetics Shop
1,Agincourt,Chinese Restaurant,Shopping Mall,Bakery,Pizza Place,Supermarket,Caribbean Restaurant,Coffee Shop,Bank,Discount Store,Japanese Restaurant
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Chinese Restaurant,Pizza Place,Bakery,Pharmacy,Park,Korean Restaurant,Dessert Shop,Coffee Shop,Noodle House,Caribbean Restaurant
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Pizza Place,Coffee Shop,Bus Line,Fast Food Restaurant,Sandwich Place,Pharmacy,Hardware Store,Fried Chicken Joint,Park
4,"Alderwood, Long Branch",Discount Store,Pizza Place,Grocery Store,Pharmacy,Liquor Store,Sandwich Place,Park,Intersection,Trail,Coffee Shop


### Finally, cluster the neighborhoods

In [95]:
from sklearn.cluster import KMeans
import folium

k = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood',1)

k_means = KMeans(n_clusters=k,random_state=0)

k_means.fit(toronto_grouped_clustering)

k_means.labels_

array([1, 4, 4, 4, 4, 4, 0, 0, 1, 0, 4, 1, 1, 1, 1, 1, 4, 1, 4, 1, 1, 1,
       1, 0, 4, 0, 4, 1, 1, 1, 1, 0, 1, 0, 0, 1, 3, 0, 4, 0, 1, 4, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 4, 1, 4, 4, 4, 4, 4, 1, 0, 0, 1, 0, 1,
       4, 0, 1, 1, 1, 4, 0, 1, 4, 1, 4, 1, 0, 1, 1, 0, 2, 1, 1, 1, 1, 1,
       1, 1, 4, 1, 1, 1, 0, 4, 1, 4, 4, 0, 4, 0])

### Label the neighborhoods with the cluster labels.

In [96]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', k_means.labels_)

toronto_merged = neighborhood_df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.dropna(inplace=True)
toronto_merged['Cluster Labels'] = toronto_merged['Cluster Labels'].astype(int)

toronto_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,4,Park,Convenience Store,Bus Stop,Pharmacy,Shopping Mall,Pizza Place,Supermarket,Tennis Court,Chinese Restaurant,Caribbean Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,1,Coffee Shop,Portuguese Restaurant,Hockey Arena,Golf Course,Pizza Place,Lounge,Men's Store,Park,Athletics & Sports,Café
2,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,1,Coffee Shop,Café,Italian Restaurant,Pub,Diner,Park,Breakfast Spot,Theater,Bakery,Restaurant
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763,0,Coffee Shop,Fast Food Restaurant,Furniture / Home Store,Fried Chicken Joint,Miscellaneous Shop,Restaurant,Vietnamese Restaurant,Dessert Shop,Bank,Rental Car Location
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,1,Coffee Shop,Sushi Restaurant,Gastropub,Park,Ramen Restaurant,Bubble Tea Shop,Burger Joint,Pizza Place,Japanese Restaurant,Seafood Restaurant


## Plot data

The clusters are shown on a map of toronto.

In [97]:
import matplotlib.cm as cm
import matplotlib.colors as colors
%matplotlib inline
toronto_lat = 43.6532
toronto_lng = -79.3832
map_clusters = folium.Map(location=[toronto_lat,toronto_lng], zoom_start=11)

x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []

for lat,lon,poi,cluster in zip(toronto_merged['Latitude'],toronto_merged['Longitude'], toronto_merged['Neighborhood'],toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Note that there are 3 common distributions of venues. 2 Neighborhoods are different from all of the others.