In [1]:
import pandas as pd # import required library
# download the Wikipedia page and parse every HTML table on it
# (pd.read_html returns a list of DataFrames, one per table found)
tables = pd.read_html(
    io="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
)

In [2]:
# the postal-code table is the first one parsed from the page; keep it as df0
df0 = tables[0]
# preview the first five rows
df0.head(5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# keep only the rows whose borough has actually been assigned
df0 = df0.loc[df0['Borough'].ne('Not assigned')]
# show the first ten remaining rows
df0.head(10)

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


In [4]:
# Build the result frame: one row per distinct postal code, in order of
# first appearance in df0.
# note: starting from the unique codes is an alternative to using groupby

df = pd.DataFrame(
    data=list(df0['Postcode'].unique()),
    columns=['PostalCode'],
)
df

Unnamed: 0,PostalCode
0,M3A
1,M4A
2,M5A
3,M6A
4,M7A
...,...
98,M8X
99,M4Y
100,M7Y
101,M8Y


In [5]:
# helper functions for getting boroughs and neighborhoods

# the borough is stored in the 'Borough' column of df0
def getBorough(code):
    """Return the borough recorded for a postal code.

    Looks at the rows of the module-level ``df0`` whose 'Postcode' equals
    ``code`` and returns the 'Borough' value of the first matching row.

    The column is selected by label rather than by position so the lookup
    keeps working if the scraped table gains or reorders columns.

    Raises:
        IndexError: if no row of ``df0`` has the given postal code.
    """
    boroughs = df0.loc[df0['Postcode'] == code, 'Borough']
    return boroughs.iloc[0]

# the neighborhoods are stored in the 'Neighbourhood' column of df0
# extract the values and join them using a comma
def getNeighborhood(code):
    """Return all neighborhoods of a postal code as one comma-joined string.

    Collects the 'Neighbourhood' values of every row of the module-level
    ``df0`` whose 'Postcode' equals ``code`` and joins them with ',' in row
    order. Returns an empty string when no row matches.

    The column is selected by label rather than by position so the lookup
    keeps working if the scraped table gains or reorders columns.
    """
    neighborhoods = df0.loc[df0['Postcode'] == code, 'Neighbourhood']
    return ','.join(neighborhoods.tolist())



In [6]:
# fill in the remaining columns of df by looking up each postal code in df0

# borough of every postal code
df['Borough'] = df['PostalCode'].map(getBorough)

# comma-joined neighborhoods of every postal code
df['Neighborhood'] = df['PostalCode'].map(getNeighborhood)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Not assigned
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


If a postal code has a borough but its neighborhood is "Not assigned", the neighborhood is set to the name of the borough.

In [7]:
# Wherever the neighborhood is exactly 'Not assigned', fall back to the
# borough name. One label-based, vectorized assignment replaces the
# row-by-row loop over positional column indices.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Downtown Toronto,Queen's Park
5,M9A,Queen's Park,Queen's Park
6,M1B,Scarborough,"Rouge,Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens,Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson,Garden District"


In [8]:
# report the final dimensions of df as (rows, columns)
df.shape

(103, 3)