# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto

Imports `requests` and `pandas`.  
Imports BeautifulSoup, which provides functions for parsing HTML pages such as website tables.

In [14]:
import requests
import pandas as pd
from bs4 import BeautifulSoup


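Downloads the Wikipedia page listing the "M" postal codes and parses its HTML with BeautifulSoup (a confirmation message is printed rather than the HTML itself)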

In [15]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops here on an HTTP error instead of silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# Despite its name, website_url holds the page's HTML text, not a URL; the
# name is kept unchanged so any other cell referring to it still works.
website_url = response.text
soup = BeautifulSoup(website_url, 'lxml')

print('Wikipedia HTML retrieved')

Wikipedia HTML retrieved


Using BeautifulSoup, locate the table with class `wikitable sortable` in the parsed page

In [16]:
# Locate the first <table> whose class attribute is "wikitable sortable".
My_table = soup.find('table', {'class': 'wikitable sortable'})

# find() returns None when nothing matches. Fail here with a clear message
# instead of printing a success message and crashing in a later cell.
if My_table is None:
    raise ValueError("No 'wikitable sortable' table found in the downloaded page")

print('Wikipedia Table located')

Wikipedia Table located


Find all of the cell (`td`) elements in the table

In [17]:
# Collect every <td> element found inside My_table; despite the name `links`,
# these are table cells, not hyperlinks.
# NOTE(review): findAll is the legacy spelling of find_all — consider switching.
links = My_table.findAll('td')
# Uncomment to inspect the raw cells:
#links

print('Wikipedia Table-Elements located')

Wikipedia Table-Elements located


Walks through the cells, assuming that every consecutive group of three forms one row of the Wikipedia table: the first cell in a group is the Postcode, the second the Borough, and the third the Neighborhood.

In [18]:
# The cells arrive as one flat sequence in which each consecutive triple is
# taken to be (Postcode, Borough, Neighborhood), so a stride of three starting
# at offsets 0, 1 and 2 yields the three columns.
Postcode = [cell.text for cell in links[0::3]]
Borough = [cell.text for cell in links[1::3]]
# Only the neighborhood text has its newline characters removed.
Neighborhood = [cell.text.replace("\n", "") for cell in links[2::3]]

print('Wikipedia Table-Elements parsed')

Wikipedia Table-Elements parsed


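Creates the dataframe, drops the rows whose Borough is "Not assigned", and replaces any "Not assigned" Neighborhood with the name of its Borough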

In [19]:
# Assemble the parsed columns into a frame with a fixed column order.
toronto_df = pd.DataFrame(
    {'Postcode': Postcode, 'Borough': Borough, 'Neighborhood': Neighborhood},
    columns=['Postcode', 'Borough', 'Neighborhood'],
)

# Keep only rows with an assigned borough. The explicit copy() makes the
# result an independent frame so the assignment below modifies it directly.
toronto_df = toronto_df[toronto_df['Borough'] != "Not assigned"].copy()

# Where the neighborhood is unassigned, fall back to the borough name.
# A single .loc call replaces the chained form df[col][mask] = ..., which
# assigns through a temporary object, triggers SettingWithCopyWarning, and has
# no effect when pandas Copy-on-Write is enabled.
unassigned = toronto_df['Neighborhood'] == "Not assigned"
toronto_df.loc[unassigned, 'Neighborhood'] = toronto_df.loc[unassigned, 'Borough']

print('Wikipedia Table recreated')

Wikipedia Table recreated


Groups the dataframe by Postcode and Borough, then joins the Neighborhoods of each group into a single comma-separated string

In [20]:
# One entry per (Postcode, Borough) pair; the neighborhood names in each group
# are joined into a single comma-separated string.
neighborhoods_by_area = toronto_df.groupby(['Postcode', 'Borough'])['Neighborhood'].agg(', '.join)

# Turn the group keys back into columns and store the joined text under
# 'Neighborhoods'.
toronto_df = neighborhoods_by_area.reset_index(name='Neighborhoods')
toronto_df

Unnamed: 0,Postcode,Borough,Neighborhoods
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Shape of the final dataframe, toronto_df

In [21]:
# Report the (rows, columns) dimensions of toronto_df.
toronto_df.shape

(103, 3)

In [22]:
# NOTE(review): disabled geopy-based lookup, kept for reference only; the cell
# below assigns `geocode` from a CSV instead. Before re-enabling, note that:
#   - the final line concatenates `appended_data`, which is never defined here
#     (the per-postcode frames are appended to the list named `geocode`);
#   - axis=1 would place those frames side by side rather than stacking rows;
#   - the result of geolocator.geocode() is used without a None check —
#     TODO confirm whether it can return None for an unknown address.
#from geopy.geocoders import Nominatim
#geolocator = Nominatim()
#geocode = []
#
#for postal_code in toronto_df['Postcode']:
#    print(postal_code)
#    address = postal_code + ', Toronto, Ontario'
#    location = geolocator.geocode(address)
#    latitude = location.latitude
#    longitude = location.longitude
#    print(address + ': ' + str(latitude) + ' : ' + str(longitude))
#    geocode.append(pd.DataFrame({'Postcode':[postal_code],'latitude':[latitude],'longitude':[longitude]}))
#
#
#geocode = pd.concat(appended_data, axis=1)

In [23]:
# Read the postal-code coordinate table from a hosted CSV into `geocode`.
geocode = pd.read_csv('https://cocl.us/Geospatial_data')


In [24]:
# Display the coordinate table read from the CSV.
geocode

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [25]:
# Left join: every toronto_df row is kept and receives the coordinates of the
# geocode row whose 'Postal Code' equals its 'Postcode'.
toronto_df_geocode = toronto_df.merge(
    geocode,
    how='left',
    left_on='Postcode',
    right_on='Postal Code',
)
toronto_df_geocode

Unnamed: 0,Postcode,Borough,Neighborhoods,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,M1J,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",M1K,43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",M1L,43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",M1M,43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",M1N,43.692657,-79.264848


In [27]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

address = 'Toronto, Canada'

# geopy needs an explicit user_agent identifying the application: a bare
# Nominatim() call is deprecated in geopy 1.x and raises in geopy 2.x.
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; stop with a
# clear message instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude

# Base map centred on the geocoded coordinates of the city.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per row of toronto_df_geocode.
for lat, lng, borough, neighborhood in zip(toronto_df_geocode['Latitude'], toronto_df_geocode['Longitude'], toronto_df_geocode['Borough'], toronto_df_geocode['Neighborhoods']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option (set on the popup above), not a marker
    # style option, so it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto



The postal-code areas appear to be fairly evenly spaced, with the exception of the area around the Financial District, where they are somewhat more concentrated.