# Segmenting and Clustering Neighborhoods in Toronto


In [1]:
!pip install BeautifulSoup4
!pip install requests

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/3b/c8/a55eb6ea11cd7e5ac4bacdf92bac4693b90d3ba79268be16527555e186f0/beautifulsoup4-4.8.1-py3-none-any.whl (101kB)
[K     |████████████████████████████████| 102kB 25.0MB/s ta 0:00:01
[?25hCollecting soupsieve>=1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.8.1 soupsieve-1.9.5


In [2]:
#libraries
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests

# Part 1

In [3]:
# Download the Wikipedia page listing Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into an immediate, readable
# failure instead of letting error HTML flow into the parser.
wiki_url = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
wiki_url.raise_for_status()

# Parse the returned HTML.
soupstuff = BeautifulSoup(wiki_url.text, 'html.parser')

# Pull out the first table whose class attribute is "wikitable sortable".
My_table = soupstuff.find('table', 'wikitable sortable')

# find() returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in the cell that iterates over the table.
if My_table is None:
    raise ValueError("No 'wikitable sortable' table found on the page; "
                     "the page layout may have changed.")

In [4]:
# Turn the scraped HTML table into a list of rows, one list of cell texts per <tr>.
# strip() removes the trailing newline each cell carries as well as any stray
# leading whitespace, so later string comparisons are not thrown off.
rows = [
    [cell.text.strip() for cell in tr.find_all(['th', 'td'])]
    for tr in My_table.find_all('tr')
]

# The first row of the table is the header; everything after it is data.
columns = rows[0] if rows else []
data = rows[1:]

# Convert the rows into a DataFrame and normalise the column names.
toronto_df = (
    pd.DataFrame(data=data, columns=columns)
    .rename(columns={'Postcode': 'PostalCode', 'Neighbourhood': 'Neighborhood'})
)

# Drop postal codes that have no borough. The explicit .copy() makes the
# filtered frame independent, so the column assignment below is unambiguous.
toronto_df = toronto_df[toronto_df['Borough'] != 'Not assigned'].copy()

# Where the neighbourhood is unassigned, fall back to the borough name.
# This is a plain column assignment instead of an inplace replace() on a column
# of a filtered frame: the latter is chained assignment, which raises
# SettingWithCopyWarning and does not update the frame under copy-on-write.
toronto_df['Neighborhood'] = toronto_df['Neighborhood'].mask(
    toronto_df['Neighborhood'] == 'Not assigned',
    toronto_df['Borough'],
)

# One row per (PostalCode, Borough), with its neighbourhoods joined by ", ".
toronto_df_clean = (
    toronto_df
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
toronto_df_clean

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [5]:
# Sanity check: (number of rows, number of columns) of the cleaned table.
toronto_df_clean.shape

(103, 3)

# Part 2

In [20]:
# Load the postal-code -> coordinates lookup table from the course CSV.
# `latlong` keeps the frame with the CSV's original column names.
latlong = pd.read_csv('http://cocl.us/Geospatial_data')

# `latlon` is the same data with the key column renamed to 'PostalCode', so it
# matches the key column name used by the scraped table.
latlon = latlong.rename(columns={'Postal Code': 'PostalCode'})
latlon.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [21]:
# Attach latitude/longitude to each postal code. Both frames share the
# 'PostalCode' key; the join is an inner join, so only postal codes present
# in both tables are kept.
toronto_df_latlon = pd.merge(
    toronto_df_clean,
    latlon,
    how='inner',
    on='PostalCode',
)
toronto_df_latlon.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
