In [75]:
from bs4 import BeautifulSoup
import requests

In [76]:
website_url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

soup = BeautifulSoup(website_url,"lxml")
print(soup.prettify())

In [77]:
My_table = soup.find("table",{"class":"wikitable sortable"})

My_table

In [78]:
#creating lists for each of the columns I know to be in my table.
A=[]
B=[]
C=[]

In [79]:
#utilizing HTML tags for rows <tr> and elements <td> to iterate through each row of data and append data elements to their appropriate lists:
for row in My_table.findAll("tr"):
    cells = row.findAll('td')
    if len(cells)==3: #Only extract table body not heading
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))

In [80]:
#import pandas to convert list to data frame
import pandas as pd
df=pd.DataFrame(A,columns=['Postcode'])
df['Borough']=B
df['Neighbourhood']=C

In [81]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [84]:
#To drop the rows which has "not assigned" in them, I change the data type into string 
df.dtypes
df = df.astype(str)
df_clean = df[~df.Borough.str.contains("assigned")]

In [85]:
df_clean

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned\n
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


In [86]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough:
import numpy as np
df_clean['Neighbourhood'] = np.where(df_clean['Neighbourhood'] == "Not assigned\n", df_clean['Borough'], df_clean['Neighbourhood'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


df_clean

In [88]:
# More than one neighborhood can exist in one postal code area:

df_new = df_clean.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()

df_new

In [16]:
#  number of rows of your dataframe.
df_new.shape[0]

103

In [31]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [32]:
# Change the name of the Postal Code column to Postcode (same name as in our first dataframe)

df_geo.rename(columns={'Postal Code':'Postcode'}, 
                 inplace=True)
df_geo.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
# merge the 2 dataframe

df_merged = pd.merge(df_new, df_geo, on='Postcode')
df_merged

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood\n,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park\n,Ionview,Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West\n",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West\n",43.692657,-79.264848
