## <span style='color:Green'> First, I installed some libraries in order to proceed with the scraping. </span>

In [2]:
pip install lxml html5lib beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd

In [41]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [42]:
df = pd.read_html(url)

## <span style='color:Green'> The dataframe from the Wikipedia page: </span>

In [43]:
df[0].head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [44]:
df = df[0]

## <span style='color:Green'> In the code below, the cells with a Borough that is 'Not assigned' are ignored. </span>

In [45]:
(df == 'Not assigned').any()

Postal Code      False
Borough           True
Neighbourhood     True
dtype: bool

In [49]:
import numpy as np
df.loc[:, 'Borough'].replace('Not assigned', np.nan, inplace=True)

In [50]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,Not assigned
1,M2A,,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [52]:
df.dropna(subset=['Borough'], axis=0, inplace=True)

In [53]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [54]:
df.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## <span style='color:Green'> More than one Neighborhood can exist in one postal code area; so, let us group by Postal Code: </span>

In [55]:
df.groupby('Postal Code')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1220a1e750>

In [56]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## <span style='color:Green'> In the cell below, I show that there are no cell in Neighborhood columns with 'Not assigned' value, so we can skip the replace of values. </span>

In [58]:
(df == 'Not assigned').any()

Postal Code      False
Borough          False
Neighbourhood    False
dtype: bool

## <span style='color:Green'> In conclusion, the dataframe has 103 rows and 3 columns, as shown below: </span>

In [59]:
df.shape

(103, 3)