In [1]:
import urllib.request
import bs4 as bs
import pandas as pd

In [2]:
# Read the Wikipedia page and build the initial dataframe.
# The dataframe consists of three columns: PostalCode, Borough, and Neighborhood.
# The response is opened in a `with` block so the HTTP connection is closed
# as soon as the page body has been read.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') as response:
    source = response.read()
soup = bs.BeautifulSoup(source, 'lxml')
table = soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("No table with class 'wikitable sortable' was found on the page")
table_rows = table.find_all('tr')
# One list of cell texts per <tr>. A row without any <td> cells (e.g. a header
# row) yields an empty list, which pandas turns into an all-None dataframe row;
# that row is removed in a later step.
l = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]
df = pd.DataFrame(l, columns=["PostalCode", "Borough", "Neighborhood"])

In [3]:
# Discard every row that has a missing value in any column.
# This is what removes the first row, which is all None.
df = df.dropna(axis=0, how='any')

In [4]:
# Only process the cells that have an assigned borough; rows whose borough is
# "Not assigned" are dropped.
# The rows are filtered directly instead of first overwriting the borough with
# None through .loc on an already-filtered frame (which can trigger
# SettingWithCopyWarning) and then dropping the None values.
# regex=False: the pattern is a literal string.
# na=True: a missing borough counts as unassigned, so such rows are dropped too.
unassigned_borough = df['Borough'].str.contains('Not assigned', regex=False, na=True)
# .copy() makes the result an independent frame that is safe to modify later.
df = df[~unassigned_borough].copy()

In [5]:
# More than one neighborhood can exist in one postal code area. For example, M5A
# has two neighborhoods: Harbourfront and Regent Park. Such neighborhoods belong
# in a single row, separated by a comma.
# Because the wiki already removed duplicate postal codes, no rows need to be
# combined; only the text in the columns is cleaned up:
#   * remove the newline characters left over from the HTML cells
#   * turn the ' / ' separator between neighborhoods into ', '
# The replacements are vectorized and the result is rebound to `df`. The previous
# per-row `df[col][i] = ...` form is chained assignment, which raises
# SettingWithCopyWarning and has no effect when pandas copy-on-write is enabled.
df = df.assign(
    PostalCode=df['PostalCode'].str.replace('\n', '', regex=False),
    Borough=df['Borough'].str.replace('\n', '', regex=False),
    Neighborhood=(
        df['Neighborhood']
        .str.replace('\n', '', regex=False)
        .str.replace(' / ', ', ', regex=False)
    ),
)

In [6]:
# If a cell has a borough but a "Not assigned" neighborhood, then the
# neighborhood will be the same as the borough.
# Previously this cell only selected the affected rows without changing them;
# now the borough name is actually copied into those rows.
needs_borough_name = df['Neighborhood'].str.contains('Not assigned', regex=False, na=False)
df = df.assign(Neighborhood=df['Neighborhood'].mask(needs_borough_name, df['Borough']))
# Sanity check: display any neighborhood that is still "Not assigned"
# (an empty Series means there is nothing left to fix).
df.loc[df['Neighborhood'].str.contains('Not assigned', regex=False, na=False), 'Neighborhood']

Series([], Name: Neighborhood, dtype: object)

In [7]:
# Report how many rows the cleaned dataframe has, taken from the first entry of .shape.
row_total = df.shape[0]
print("3. count: ", row_total)

3. count:  103


In [8]:
# Download the coordinates file with the standard library instead of shelling
# out to wget, so the cell also works where wget is not installed.
# urlretrieve raises on a failed download, so the success message below is only
# printed when the file was actually fetched.
urllib.request.urlretrieve('https://cocl.us/Geospatial_data', 'Geospatial_Coordinates.csv')
print('Data downloaded!')


Data downloaded!


In [9]:
# Load the coordinates file under our own column names; skiprows=1 skips the
# file's first line (presumably its own header row) so it is not read as data.
header_list = ["PostalCode", "Latitude", "Longitude"]
ll_df = pd.read_csv('Geospatial_Coordinates.csv', names=header_list, skiprows=1)
# PostalCode deliberately stays a regular column with the default integer index:
# the lookups that follow filter on ll_df['PostalCode']. The former
# `ll_df.set_index('PostalCode')` call discarded its result and did nothing,
# so it has been removed.
ll_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [10]:
# Look up the latitude of a postal code in the coordinates dataframe (ll_df)
def find_latitude(postal_code):
    """Return the Latitude value of the ll_df row whose PostalCode equals postal_code.

    Reads the module-level dataframe ``ll_df``. ``.item()`` requires exactly one
    matching row and raises ValueError otherwise.
    """
    is_match = ll_df['PostalCode'] == postal_code
    matching_rows = ll_df.loc[is_match]
    return matching_rows['Latitude'].item()

In [11]:
# Look up the longitude of a postal code in the coordinates dataframe (ll_df)
def find_longitude(postal_code):
    """Return the Longitude value of the ll_df row whose PostalCode equals postal_code.

    Reads the module-level dataframe ``ll_df``. ``.item()`` requires exactly one
    matching row and raises ValueError otherwise.
    """
    is_match = ll_df['PostalCode'] == postal_code
    matching_rows = ll_df.loc[is_match]
    return matching_rows['Longitude'].item()

In [12]:
# Attach Latitude and Longitude to every row by looking its postal code up in ll_df.
# A single vectorized map replaces the per-row loop, which used chained
# assignment (`df[col][i] = ...`: SettingWithCopyWarning, and no effect under
# pandas copy-on-write) and scanned ll_df once per row. The new columns are
# float64 rather than object, and the original row index of df is preserved.
coordinates = ll_df.set_index('PostalCode')
latitude = df['PostalCode'].map(coordinates['Latitude'])
longitude = df['PostalCode'].map(coordinates['Longitude'])
# Keep failing loudly when a postal code has no coordinates rather than
# silently leaving NaN values behind.
missing = df.loc[latitude.isna() | longitude.isna(), 'PostalCode']
if not missing.empty:
    raise ValueError("No coordinates found for postal codes: " + ", ".join(missing.astype(str)))
df = df.assign(Latitude=latitude, Longitude=longitude)

In [13]:
# Spot check: show the rows for a hand-picked set of postal codes.
some_values = [
    'M5G', 'M2H', 'M4B', 'M1J',
    'M4G', 'M4M', 'M1R', 'M9V',
    'M9L', 'M5V', 'M1B', 'M5A',
]
is_selected = df['PostalCode'].isin(some_values)
df[is_selected]

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
5,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6543,-79.3606
10,M1B,Scarborough,"Malvern, Rouge",43.8067,-79.1944
13,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7064,-79.3099
40,M4G,East York,Leaside,43.7091,-79.3635
41,M5G,Downtown Toronto,Central Bay Street,43.658,-79.3874
47,M2H,North York,Hillcrest Village,43.8038,-79.3635
55,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
81,M9L,North York,Humber Summit,43.7563,-79.566
85,M4M,East Toronto,Studio District,43.6595,-79.3409
109,M1R,Scarborough,"Wexford, Maryvale",43.7501,-79.2958
