In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Download the Wikipedia page that lists Canadian postal codes starting with "M".
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() aborts on an HTTP error instead of parsing an error page.
data = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
data.raise_for_status()
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(data.content, 'html.parser')


In [3]:
# Take the first <tbody> element in the parsed page and collect its <tr> rows.
tDot = soup.find('tbody')
rows = tDot.select('tr')
# Keep only the text content of each row (cells end up separated by newlines).
row = [tr.get_text() for tr in rows]

In [4]:
# Put the raw text of every table row into a single-column DataFrame.
df_Tdot = pd.DataFrame(row)
# Split each row's text on newlines so that every field lands in its own column.
df_rows = df_Tdot[0].str.split('\n', expand=True)
# Use the values of the first row (the table header) as the column names.
df_cols = df_rows.rename(columns=df_rows.loc[0])
# The header row is now redundant as data, so remove it.
df = df_cols.drop(index=df_cols.index[0])
df.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
1,,M1A,Not assigned,Not assigned,
2,,M2A,Not assigned,Not assigned,
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,


In [5]:
# Keep only the rows whose Borough has actually been assigned.
df_clean = df.loc[df['Borough'] != 'Not assigned']

df_clean.head()

Unnamed: 0,Unnamed: 1,Postcode,Borough,Neighbourhood,Unnamed: 5
3,,M3A,North York,Parkwoods,
4,,M4A,North York,Victoria Village,
5,,M5A,Downtown Toronto,Harbourfront,
6,,M5A,Downtown Toronto,Regent Park,
7,,M6A,North York,Lawrence Heights,


In [6]:
# Merge neighbourhoods that share a postcode into one comma-separated string
# (e.g. M5A -> Harbourfront,Regent Park).
# Only the Neighbourhood column is aggregated, so the blank-named columns left
# over from splitting the scraped row text cannot leak into the result.
df_comb = (
    df_clean
    .groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(','.join)
    .reset_index()  # turn the Postcode/Borough group keys back into columns
)
# Show 86 rows so that the Queen's Park row is visible.
df_comb.head(86)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [7]:
# Where a row has no neighbourhood assigned, use that row's own borough name.
# Copying the Borough value per row (instead of substituting one hard-coded
# name everywhere) keeps the spelling identical to the borough and stays
# correct if more than one row is unassigned.
df_comb2 = df_comb.copy()
unassigned_mask = df_comb2['Neighbourhood'] == 'Not assigned'
df_comb2.loc[unassigned_mask, 'Neighbourhood'] = df_comb2.loc[unassigned_mask, 'Borough']
df_comb2.head(86)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


In [8]:
# Final check: display the (rows, columns) shape of the cleaned frame.
df_comb2.shape

(103, 3)

In [13]:
# Load the geospatial CSV (postal code, latitude, longitude) from its URL
# into a DataFrame and preview the first rows.
geo_file='https://cocl.us/Geospatial_data'
df_geo=pd.read_csv(geo_file)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
# Rename the geo table's 'Postal Code' column to 'Postcode2' so it stays
# distinguishable from the 'Postcode' column of the neighbourhood table.
df_geo = df_geo.rename(columns={'Postal Code': 'Postcode2'})
df_geo.head()

Unnamed: 0,Postcode2,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [23]:
# Attach coordinates by matching postal codes rather than by row position:
# a positional concat silently pairs the wrong rows if the two tables ever
# differ in order or length. The left join keeps every row of df_comb2, and
# validate= raises if a postal code occurs more than once in the geo data.
df_all = df_comb2.merge(
    df_geo,
    how='left',
    left_on='Postcode',
    right_on='Postcode2',
    validate='many_to_one',
)
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Postcode2,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [26]:
# Postcode2 repeats the Postcode column once the coordinates are attached,
# so remove it. Reassigning (instead of dropping in place) and ignoring an
# already-missing column lets this cell be re-run without raising KeyError.
df_all = df_all.drop(columns=['Postcode2'], errors='ignore')
df_all.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
