In [2]:
import pandas as pd

## Part 1: Scraping Postal Codes from Wikipedia
"Explore and cluster the neighborhoods in Toronto."
This part scrapes the list of Canadian postal codes whose first letter is M. Postal codes beginning with M are located within the city of Toronto in the province of Ontario.
The script below scrapes the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M.
The contents of the postal codes table are extracted and then transformed into a pandas DataFrame.

In [3]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup

In [4]:
# Wikipedia page holding the table of postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Fail here on a 4xx/5xx reply instead of parsing an error page further down.
response.raise_for_status()

In [5]:
response  # a 200 status in the repr means the page was fetched successfully

<Response [200]>

In [6]:
# Parse the downloaded HTML with Python's built-in parser.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [7]:
# Locate the postal-code table by its CSS classes.
My_table = soup.find("table", {"class": "wikitable sortable"})
# find() returns None when nothing matches; stop with a clear message here
# rather than with an AttributeError on the next use of My_table.
if My_table is None:
    raise ValueError(
        "No 'wikitable sortable' table found on the page; the layout may have changed."
    )

In [11]:
# Collect every <td> cell of the table as one flat list, in document order.
rows = My_table.find_all(name='td')
# Display the raw cells for inspection.
rows

[<td>M1A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M2A</td>, <td>Not assigned</td>, <td>Not assigned
 </td>, <td>M3A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
 </td>, <td>M4A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
 </td>, <td>M5A</td>, <td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>, <td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
 </td>, <td>M6A</td>, <td><a href="/wiki/North_York" title="North York">North York</a></td>, <td><a href="/wiki/Lawrence_Heights" title="Lawrence Heights">Lawrence Heights

In [12]:
# Text content of every table cell, in the same order as `rows`.
# A comprehension replaces the index loop and its throwaway globals.
b = [cell.get_text() for cell in rows]


## Data Cleanup and re-grouping.
The scraped Wikipedia table contains some unwanted entries and needs cleanup. The following tasks will be performed:

- Drop/ignore cells with unassigned boroughs.
- If a cell has a borough but a "Not assigned" neighbourhood, then the neighbourhood will be the same as the borough.
- Group the table by PostCode/Borough; neighbourhoods that share the same postal code will be combined in the 'Neighbourhood' column, separated by commas.


In [13]:
# The flat cell list repeats as (postal code, borough, neighbourhood),
# so every third entry of `b` belongs to the same column.
a1 = b[0::3]  # postal codes
a2 = b[1::3]  # boroughs
a3 = b[2::3]  # neighbourhoods, still carrying the newline that ends each table row

# Strip only trailing newlines. Dropping the last character unconditionally
# would cut a real letter off any neighbourhood whose cell has no newline.
a4 = [name.rstrip('\n') for name in a3]


In [14]:
# Assemble the three parallel column lists into a single table.
raw_data = pd.DataFrame(dict(PostCode=a1, Borough=a2, Neighbourhood=a4))

In [15]:
raw_data.head(5)  # preview the first five scraped rows before any cleanup

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [16]:
# Keep only the rows whose borough has actually been assigned.
raw_data = raw_data.loc[raw_data['Borough'].ne('Not assigned')]

In [17]:
raw_data.head()  # rows whose Borough was 'Not assigned' are gone after the filter above

Unnamed: 0,PostCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


In [18]:
# A single postal code can cover several neighbourhoods. On the Wikipedia
# table, M5A appears twice: once for Harbourfront and once for Regent Park.
# Collapse such duplicates into one row per PostCode/Borough pair, listing
# the neighbourhoods as a comma-separated string.
raw_data = (
    raw_data
    .groupby(['PostCode', 'Borough'])['Neighbourhood']
    .apply(lambda names: ', '.join(names))
    .reset_index()
)

In [19]:
raw_data.head()  # preview after grouping: one row per PostCode/Borough pair

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [20]:
raw_data.head(4)  # same preview as the previous cell, limited to the first four rows

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn


In [21]:
# If a row has a borough but its neighbourhood is still 'Not assigned',
# reuse the borough name as the neighbourhood
# (e.g. both Borough and Neighbourhood become Queen's Park).

# Select the same rows on both sides with a scalar column label, so the
# assignment does not depend on implicit alignment of a full-length Series.
unassigned = raw_data['Neighbourhood'] == 'Not assigned'
raw_data.loc[unassigned, 'Neighbourhood'] = raw_data.loc[unassigned, 'Borough']

raw_data.head()

Unnamed: 0,PostCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [204]:
# Persist the cleaned table; the DataFrame index is not written to the file.
raw_data.to_csv(path_or_buf='Toronto.TASK_1_df.csv', index=False)

In [205]:
raw_data.shape  # (rows, columns) of the cleaned table

(103, 3)