## Segmenting and Clustering Neighborhoods in Toronto - Part 1

Start by downloading and parsing the source page:

In [14]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page listing the "M" postal codes.
# The timeout keeps this cell from hanging forever on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of
# silently parsing an error page.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()

# NOTE: despite its name, `website_url` holds the page's HTML text, not a URL.
website_url = response.text

soup = BeautifulSoup(website_url,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of postal codes of Canada: M - Wikipedia
  </title>
  <script>
   document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );
  </script>
  <script>
   (window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":815646878,"wgRevisionId":815646878,"wgArticleId":539066,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Communications in Ontario","Postal codes in Canada","Toronto","Ontario-related lists"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wg

Isolating the table, which is the only part we will need:

In [15]:
# soup.find returns the first <table> element in the document, or None when
# the page contains no table at all.
table = soup.find('table')
if table is None:
    # Fail here with a clear message rather than with an AttributeError in a
    # later cell that calls table.find_all(...).
    raise ValueError("No <table> element found in the downloaded page")
print(table)

<table cellpadding="2" cellspacing="0" rules="all" style="width:100%; border-collapse:collapse; border:1px solid #ccc;">
<tbody><tr>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M1A</b><br/><span style="font-size:80%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top; color:#ccc;">
<p><b>M2A</b><br/><span style="font-size:80%;"><i>Not assigned</i></span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M3A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M4A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>
</p>
</td>
<td style="width:11%; vertical-align:top;">
<p><b>M5A</b><br/><span style="font-size:80%;"><a hr

We only need the active cells (those with black text), so let's select them:

In [16]:
# Active cells carry one of two inline styles; select each kind separately and
# concatenate them, keeping every cell of the first style ahead of the second.
active_styles = ("width:11%; vertical-align:top;", "vertical-align:top;")
td1, td2 = (table.find_all('td', {'style': style}) for style in active_styles)

table_black = [*td1, *td2]

# Serialise the selected cells back into one HTML string
table_clr = ''.join(str(cell) for cell in table_black)
table_str = table_clr
table_str

'<td style="width:11%; vertical-align:top;">\n<p><b>M3A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>)</span>\n</p>\n</td><td style="width:11%; vertical-align:top;">\n<p><b>M4A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/>(<a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>)</span>\n</p>\n</td><td style="width:11%; vertical-align:top;">\n<p><b>M5A</b><br/><span style="font-size:80%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/>(<a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a> / <a class="mw-redirect" href="/wiki/Harbourfront,_Toronto" title="Harbourfront, Toronto">Harbourfront</a>)</span>\n</p>\n</td><td style="width:11%; vertical-align:top;">\n<p><b>M6A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">N

Selecting the neighbourhoods:

In [17]:
# Pull out the text inside parentheses - the neighbourhoods
import re

# Raw string: "\(" in an ordinary string literal is an invalid escape sequence
# (a warning in current Python versions, slated to become an error later).
regex = re.compile(r".*?\((.*?)\)")

# Flatten the selected cells to plain text, joining fragments with one space
table_txt = BeautifulSoup(table_str,'html.parser').getText(' ', strip=True)

# One entry per parenthesised group, in document order
neighb = regex.findall(table_txt)

neighb

[' Parkwoods ',
 ' Victoria Village ',
 ' Regent Park / Harbourfront ',
 ' Lawrence Manor / Lawrence Heights ',
 ' Islington Avenue ',
 ' Malvern / Rouge ',
 ' Don Mills ',
 ' Parkview Hill / Woodbine Gardens ',
 ' Garden District , Ryerson ',
 'Glencairn',
 ' West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale',
 ' Rouge Hill / Port Union / Highland Creek ',
 ' Don Mills ',
 ' Flemingdon Park ',
 ' Woodbine Heights ',
 ' St. James Town ',
 ' Humewood-Cedarvale ',
 'Eringate / Bloordale Gardens / Old Burnhamthorpe / Markland Wood ',
 ' Guildwood / Morningside / West Hill ',
 ' The Beaches ',
 'Berczy Park',
 ' Caledonia-Fairbanks ',
 ' Woburn ',
 ' Leaside ',
 'Central Bay Street ',
 'Christie',
 ' Cedarbrae ',
 ' Hillcrest Village ',
 ' Bathurst Manor / Wilson Heights / Downsview North',
 ' Thorncliffe Park ',
 ' Richmond / Adelaide / King ',
 ' Dufferin / Dovercourt Village ',
 ' Scarborough Village ',
 'Fairview / Henry Farm / Oriole',
 ' Northwood Park / York

Now remove the neighbourhood values from the rest of the table:

In [18]:
# Drop every "(...)" or "[...]" group from the HTML string, leaving the postal
# code and borough markup. Raw string: "\(" and friends are invalid escape
# sequences in an ordinary string literal.
# NOTE: the substitution runs over raw markup, so bracketed text inside tag
# attributes (href/title) is removed as well.
m_and_b_temp = re.sub(r"[\(\[].*?[\)\]]", "", table_str)

# re.sub already returns a str, so it can be parsed directly
m_and_b = BeautifulSoup(m_and_b_temp,'html.parser')
print(m_and_b)

<td style="width:11%; vertical-align:top;">
<p><b>M3A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/></span>
</p>
</td><td style="width:11%; vertical-align:top;">
<p><b>M4A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/></span>
</p>
</td><td style="width:11%; vertical-align:top;">
<p><b>M5A</b><br/><span style="font-size:80%;"><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a><br/></span>
</p>
</td><td style="width:11%; vertical-align:top;">
<p><b>M6A</b><br/><span style="font-size:80%;"><a href="/wiki/North_York" title="North York">North York</a><br/></span>
</p>
</td><td style="width:11%; vertical-align:top;">
<p><b>M7A</b><br/><span style="font-size:80%;"><a href="/wiki/Queen%27s_Park_" title="Queen's Park ">Queen's Park</a><br/>
</span></p>
<hr/><b>Ontario Provincial Government</b>
</td><td style="width:11%; vertical-align:top;">
<p><b>M9A</b><br

Now get the postal codes:

In [19]:
# Eliminate noise by only processing <p> fields
postal_c_s = m_and_b.find_all('p')

# Get the values for postal numbers: the bold text inside each <p>.
# The tags are searched directly instead of stringifying the result list and
# parsing it a second time; find_all replaces the deprecated findAll alias.
postal = [
    bold.getText()
    for paragraph in postal_c_s
    for bold in paragraph.find_all('b')
]
postal

['M3A',
 'M4A',
 'M5A',
 'M6A',
 'M7A',
 'M9A',
 'M1B',
 'M3B',
 'M4B',
 'M5B',
 'M6B',
 'M9B',
 'M1C',
 'M3C',
 'M4C',
 'M5C',
 'M6C',
 'M9C',
 'M1E',
 'M4E',
 'M5E',
 'M6E',
 'M1G',
 'M4G',
 'M5G',
 'M6G',
 'M1H',
 'M2H',
 'M3H',
 'M4H',
 'M5H',
 'M6H',
 'M1J',
 'M2J',
 'M3J',
 'M4J',
 'M5J',
 'M6J',
 'M1K',
 'M2K',
 'M3K',
 'M4K',
 'M5K',
 'M6K',
 'M1L',
 'M2L',
 'M3L',
 'M4L',
 'M5L',
 'M6L',
 'M9L',
 'M1M',
 'M2M',
 'M3M',
 'M4M',
 'M5M',
 'M6M',
 'M9M',
 'M1N',
 'M2N',
 'M3N',
 'M4N',
 'M5N',
 'M6N',
 'M9N',
 'M1P',
 'M2P',
 'M4P',
 'M5P',
 'M6P',
 'M9P',
 'M1R',
 'M2R',
 'M4R',
 'M5R',
 'M6R',
 'M7R',
 'M9R',
 'M1S',
 'M4S',
 'M5S',
 'M6S',
 'M1T',
 'M4T',
 'M5T',
 'M1V',
 'M4V',
 'M5V',
 'M8V',
 'M9V',
 'M1W',
 'M4W',
 'M5W',
 'M8W',
 'M9W',
 'M1X',
 'M4X',
 'M5X',
 'M8X',
 'M4Y',
 'M7Y',
 'M8Y',
 'M8Z']

Finally, get the borough values:

In [20]:
# (text as scraped, cleaned label) pairs, applied to every cell in this order
borough_fixes = (
    ("Queen's Park\n", "Queen's Park"),
    ("Mississauga Canada Post Gateway Processing Centre", "Mississauga"),
    ("East Toronto Business reply mail Processing Centre 969 Eastern", "East Toronto"),
    ("Downtown Toronto Stn A PO Boxes 25 The Esplanade", "Downtown Toronto"),
    ("EtobicokeNorthwest", "Etobicoke Northwest"),
)

boroughs = []
for line in m_and_b.find_all("td"):
    for b in line.find_all("span"):
        # Keep only the text before the first double quote (html leftover)
        cell = b.getText(' ', strip=True).partition('"')[0]
        for scraped, cleaned in borough_fixes:
            cell = cell.replace(scraped, cleaned)
        # Strip padding so that e.g. 'Downtown Toronto ' and 'Downtown Toronto'
        # end up as the same borough label.
        boroughs.append(cell.strip())

boroughs

['North York',
 'North York',
 'Downtown Toronto',
 'North York',
 "Queen's Park",
 'Etobicoke',
 'Scarborough',
 'North York North',
 'East York',
 'Downtown Toronto',
 'North York',
 'Etobicoke',
 'Scarborough',
 'North York South',
 'East York',
 'Downtown Toronto',
 'York',
 'Etobicoke',
 'Scarborough',
 'East Toronto',
 'Downtown Toronto',
 'York',
 'Scarborough',
 'East York',
 'Downtown Toronto',
 'Downtown Toronto',
 'Scarborough',
 'North York',
 'North York',
 'East York',
 'Downtown Toronto ',
 'West Toronto',
 'Scarborough',
 'North York',
 'North York',
 'East York East Toronto',
 'Downtown Toronto ',
 'West Toronto',
 'Scarborough',
 'North York',
 'North York East',
 'East Toronto',
 'Downtown Toronto',
 'West Toronto',
 'Scarborough',
 'North York',
 'North York West',
 'East Toronto ',
 'Downtown Toronto ',
 'North York',
 'North York',
 'Scarborough',
 'North York',
 'North York Central',
 'East Toronto',
 'North York',
 'York',
 'North York',
 'Scarborough',
 'North 

A cleanup before making the dataframe:

In [21]:
# Rebuild the list from the scraped text instead of editing `neighb` in place:
# the index-based fixes below are only valid on the freshly extracted list, so
# starting from `table_txt` every time makes this cell safe to re-run.
neighb_fixed = regex.findall(table_txt)

# Entries 12 and 13 ('Don Mills', 'Flemingdon Park') are merged into one entry
neighb_fixed[12:14] = ["Don Mills / Flemingdon Park"]

# Position 4 lines up with M7A in `postal`; the extraction produced no entry
# for it (presumably that cell has no parenthesised neighbourhood).
neighb_fixed.insert(4, "Queen's Park")

# Entries 41 and 42 (after the insert above) are merged the same way
neighb_fixed[41:43] = ["Downsview / CFB Toronto"]

# Add commas, and drop the inconsistent leading/trailing spaces left by the
# text extraction
neighb = [name.replace(' / ',', ').strip() for name in neighb_fixed]

# Check the lengths
print("postal: ",len(postal))
print("boroughs: ",len(boroughs))
print("neighb: ",len(neighb))

# The three lists become DataFrame columns, so they must line up one-to-one
assert len(postal) == len(boroughs) == len(neighb), "column lists differ in length"

postal:  103
boroughs:  103
neighb:  103


Time for a dataframe:

In [22]:
# Create pandas dataframe
import pandas as pd

# Report the length of each list that will become a column
for label, values in (("postal: ", postal), ("boroughs: ", boroughs), ("neighb: ", neighb)):
    print(label, len(values))

# Assemble the frame with the column order fixed up front
column_order = ['PostalCode','Borough','Neighborhood']
column_values = [postal, boroughs, neighb]
df = pd.DataFrame(dict(zip(column_order, column_values)), columns=column_order)

df

postal:  103
boroughs:  103
neighb:  103


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Queen's Park
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York North,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District , Ryerson"


Last step: check the shape of the dataframe:

In [23]:
# (rows, columns) of the assembled frame
df.shape

(103, 3)