In [10]:
from io import StringIO

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [351]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
page_wiki_can = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
page_wiki_can.raise_for_status()
# Parse the raw response body into a navigable HTML tree.
soup_can = BeautifulSoup(page_wiki_can.content, 'html.parser')

#### I used pandas' `read_html` to read the data directly from the HTML table.
#### By reviewing the page's source code, I found that the table's class is "wikitable sortable".

In [352]:
# Take the first element with class "wikitable sortable" and let pandas parse it,
# using the first table row as the header.
# The HTML is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated; a file-like object is accepted by old and new pandas alike.
list_can = pd.read_html(StringIO(str(soup_can.find_all(class_="wikitable sortable")[0])), header=0)

In [353]:
# read_html returns a list of tables; build the working frame from the first one.
df_can = pd.DataFrame(data=list_can[0])

In [354]:
# Preview the first rows of the freshly parsed table.
df_can.head(n=20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [355]:
# Preview the last rows of the freshly parsed table.
df_can.tail(n=20)

Unnamed: 0,Postcode,Borough,Neighbourhood
269,M8Y,Etobicoke,Kingsway Park South East
270,M8Y,Etobicoke,Mimico NE
271,M8Y,Etobicoke,Old Mill South
272,M8Y,Etobicoke,The Queensway East
273,M8Y,Etobicoke,Royal York South East
274,M8Y,Etobicoke,Sunnylea
275,M9Y,Not assigned,Not assigned
276,M1Z,Not assigned,Not assigned
277,M2Z,Not assigned,Not assigned
278,M3Z,Not assigned,Not assigned


#### Filtered the DataFrame to drop the rows whose Borough is "Not assigned"; where only the Neighbourhood is "Not assigned", it is replaced with the Borough name.

In [356]:
# Keep only the rows that have a borough assigned.
# The explicit .copy() makes the result an independent frame, so assigning into it
# afterwards with .loc does not raise SettingWithCopyWarning.
df_can = df_can[df_can.Borough != 'Not assigned'].copy()

In [357]:
# Where a row has no neighbourhood assigned, fall back to the name of its borough.
column_name = 'Neighbourhood'
mask = df_can[column_name].eq('Not assigned')
df_can.loc[mask, column_name] = df_can.loc[mask, 'Borough']

In [358]:
# Preview the first rows after filtering and filling the neighbourhoods.
df_can.head(n=20)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


#### To gather all the neighborhoods that belong to the same Postcode and Borough, I concatenated them, adding a comma and a space after each one.

In [359]:
# Build one string per (Postcode, Borough) pair: every neighbourhood of the group
# followed by ', ' (so the result still ends with a trailing separator).
grouped_neighbourhoods = df_can.groupby(['Postcode', 'Borough'])['Neighbourhood']
df_can_sum = grouped_neighbourhoods.apply(lambda names: (names + ', ').sum())

In [360]:
# Turn the grouped result into a DataFrame so that further columns can be added to it.
df_can_sum = pd.DataFrame(data=df_can_sum)

In [361]:
# Preview the first grouped rows.
df_can_sum.head(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M1B,Scarborough,"Rouge, Malvern,"
M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union,"
M1E,Scarborough,"Guildwood, Morningside, West Hill,"
M1G,Scarborough,"Woburn,"
M1H,Scarborough,"Cedarbrae,"
M1J,Scarborough,"Scarborough Village,"
M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park,"
M1L,Scarborough,"Clairlea, Golden Mile, Oakridge,"
M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West,"
M1N,Scarborough,"Birch Cliff, Cliffside West,"


In [362]:
# Preview the last grouped rows.
df_can_sum.tail(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighbourhood
Postcode,Borough,Unnamed: 2_level_1
M6R,West Toronto,"Parkdale, Roncesvalles,"
M6S,West Toronto,"Runnymede, Swansea,"
M7A,Queen's Park,"Queen's Park,"
M7R,Mississauga,"Canada Post Gateway Processing Centre,"
M7Y,East Toronto,Business reply mail Processing Centre969 Easte...
M8V,Etobicoke,"Humber Bay Shores, Mimico South, New Toronto,"
M8W,Etobicoke,"Alderwood, Long Branch,"
M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North,"
M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
M8Z,Etobicoke,"Kingsway Park South West, Mimico NW, The Queen..."


In [363]:
# Number of (Postcode, Borough) groups, i.e. rows of the summarised frame.
len(df_can_sum)

103

#### We need to remove the comma and space from the end of the list of Neighborhoods

In [364]:
# Look at the first concatenated value. The frame is indexed by (Postcode, Borough),
# so the row is selected by position with .iloc rather than through the deprecated
# positional fallback of series[0].
df_can_sum['Neighbourhood'].iloc[0]

'Rouge, Malvern, '

In [365]:
# Look at the second concatenated value, selected by position with .iloc because the
# frame is indexed by (Postcode, Borough) and integer keys on series[...] are deprecated.
df_can_sum['Neighbourhood'].iloc[1]

'Highland Creek, Rouge Hill, Port Union, '

#### To do that, I took the substring of each Neighborhood list that leaves out the last two characters (the comma and the space).

In [366]:
# Drop the trailing ", " separator from every concatenated neighbourhood list.
# The pattern is anchored to the end of the string, so only a trailing separator is
# removed: re-running the cell cannot eat into the real names. The whole-column
# assignment also replaces the row-by-row loop, which relied on chained,
# position-based indexing.
df_can_sum['Neighbourhood'] = df_can_sum['Neighbourhood'].str.replace(r', $', '', regex=True)

In [393]:
# Check the first value after stripping. The column is spelled 'Neighbourhood'
# (the previous 'Neighbuorhood' raises KeyError), and the row is taken by position
# with .iloc because the frame is indexed by (Postcode, Borough).
df_can_sum['Neighbourhood'].iloc[0]

'Rouge, Malvern'

In [368]:
# Check the second value after stripping, selected by position with .iloc because the
# frame is indexed by (Postcode, Borough) and integer keys on series[...] are deprecated.
df_can_sum['Neighbourhood'].iloc[1]

'Highland Creek, Rouge Hill, Port Union'

In [369]:
# Check the last value after stripping (position 102 of the 103-row frame).
# .iloc[-1] selects the last row by position without hard-coding the row count and
# without relying on the deprecated positional fallback of series[102].
df_can_sum['Neighbourhood'].iloc[-1]

'Northwest'

#### I copied the index of the summarized dataframe (Postal Code and Borough) to a new Column called Index.

In [370]:
# Store the (Postcode, Borough) index entries in an 'Index' column, then replace the
# index with a plain 0..n-1 range.
df_can_sum = df_can_sum.assign(Index=df_can_sum.index).reset_index(drop=True)

In [371]:
# Add the two target columns, initialised with empty strings.
df_can_sum = df_can_sum.assign(PostalCode='', Borough='')

In [380]:
# Preview the first five rows of the summarised frame.
df_can_sum.head(n=5)

Unnamed: 0,Neighbourhood,Index,PostalCode,Borough
0,"Rouge, Malvern","(M1B, Scarborough)",M1B,Scarborough
1,"Highland Creek, Rouge Hill, Port Union","(M1C, Scarborough)",M1C,Scarborough
2,"Guildwood, Morningside, West Hill","(M1E, Scarborough)",M1E,Scarborough
3,Woburn,"(M1G, Scarborough)",M1G,Scarborough
4,Cedarbrae,"(M1H, Scarborough)",M1H,Scarborough


#### Then I can split each Index tuple into its two components (postal code and borough).

In [374]:
# First element of the tuple stored in row 0 of 'Index'.
df_can_sum.loc[0, 'Index'][0]

'M1B'

In [375]:
# Second element of the tuple stored in row 0 of 'Index'.
df_can_sum.loc[0, 'Index'][1]

'Scarborough'

#### Populated the PostalCode and Borough Columns

In [377]:
# Unpack the tuples stored in 'Index' into their own columns.
# .str[n] takes the n-th element of each tuple for the whole column at once, which
# replaces the row-by-row loop and its chained assignment (df[col][i] = ...), a
# pattern that triggers SettingWithCopyWarning and has no effect under copy-on-write.
df_can_sum['PostalCode'] = df_can_sum['Index'].str[0]
df_can_sum['Borough'] = df_can_sum['Index'].str[1]

In [379]:
# Preview the first five rows with PostalCode and Borough filled in.
df_can_sum.head(n=5)

Unnamed: 0,Neighbourhood,Index,PostalCode,Borough
0,"Rouge, Malvern","(M1B, Scarborough)",M1B,Scarborough
1,"Highland Creek, Rouge Hill, Port Union","(M1C, Scarborough)",M1C,Scarborough
2,"Guildwood, Morningside, West Hill","(M1E, Scarborough)",M1E,Scarborough
3,Woburn,"(M1G, Scarborough)",M1G,Scarborough
4,Cedarbrae,"(M1H, Scarborough)",M1H,Scarborough


#### Included only the necessary columns, in the expected order

In [383]:
# Keep just the three output columns, in the required order.
df_can_sum = df_can_sum.loc[:, ['PostalCode', 'Borough', 'Neighbourhood']]

In [384]:
# Preview the first five rows after the column selection.
df_can_sum.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Renamed the column names to the ones expected

In [385]:
# Switch the last column to the expected spelling 'Neighborhood'.
# rename() maps the old name to the new one explicitly, so the result does not depend
# on the columns' positions and re-running the cell is a harmless no-op.
df_can_sum = df_can_sum.rename(columns={'Neighbourhood': 'Neighborhood'})

In [387]:
# Preview the first five rows with the final column names.
df_can_sum.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [395]:
# Preview the last five rows with the final column names.
df_can_sum.tail(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."
102,M9W,Etobicoke,Northwest


#### Obtained a 103 x 3 Data Frame

In [390]:
# Show the (rows, columns) dimensions of the final frame.
df_can_sum.shape

(103, 3)