In [1]:
import pandas as pd

### Use pandas to read the data from Wikipedia and load it into a DataFrame

In [2]:
# Parse the HTML tables on the Wikipedia page, skipping one row per table, and keep the first table.
postal_codes_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(postal_codes_url, skiprows=1)[0]

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,0,1,2
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Update the column names to the appropriate labels

In [4]:
# Give the integer-labelled columns (0, 1, 2) descriptive names.
df = df.rename({0: 'PostalCode', 1: 'Borough', 2: 'Neighborhood'}, axis='columns')
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Delete any rows where the Borough is "Not assigned"

In [5]:
# Keep only the rows whose Borough is something other than the 'Not assigned' placeholder.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.head(n=5)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront
5,M6A,North York,Lawrence Manor / Lawrence Heights
6,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [6]:
# Spot-check a single postal code that lists more than one neighborhood.
df.loc[df['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhood
4,M5A,Downtown Toronto,Regent Park / Harbourfront


### Replace the "/" in the Neighborhood column with a ",", which separates multiple neighborhoods that share the same postal code.

In [7]:
# Swap every "/" separator for a ",".
# regex=False states explicitly that "/" is matched as a literal string, so the
# result does not depend on the pandas version's default for `regex`.
# assign() builds a new frame rather than writing a column into the frame that
# came out of the boolean filter above.
df = df.assign(Neighborhood=df['Neighborhood'].str.replace('/', ',', regex=False))
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
5,M6A,North York,"Lawrence Manor , Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


### If a row has a Borough but no assigned Neighborhood, the Neighborhood should be set to the Borough value

In [14]:
# Count the rows that have no usable Neighborhood.
# pandas represents missing cells as NaN, and `NaN is None` is always False, so an
# identity test against None can never detect them; isna() is the correct check.
# The literal 'Not assigned' placeholder is treated as unassigned as well.
neighborhood_missing = df['Neighborhood'].isna() | df['Neighborhood'].eq('Not assigned')
unassigned_neighborhoods = int(neighborhood_missing.sum())

print('There are a total of {} unassigned neighborhoods.'.format(unassigned_neighborhoods))

There are a total of 0 unassigned neighborhoods.


### No additional cleansing is needed: every row with an assigned Borough also has an assigned Neighborhood, so no values need to be replaced

In [15]:
# Confirm that no row still carries the 'Not assigned' placeholder as its Neighborhood.
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


In [16]:
# Dimensions of the cleaned frame as (rows, columns).
df.shape

(103, 3)