## <span style='color:Green'> First, I installed some libraries in order to proceed with the scraping. </span>

In [6]:
pip install lxml html5lib beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd

In [8]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [9]:
df = pd.read_html(url)

## <span style='color:Green'> The dataframe from the Wikipedia page: </span>

In [10]:
df[0].head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [11]:
df = df[0]

## <span style='color:Green'> In the code below, the cells with a Borough that is 'Not assigned' are ignored. </span>

In [12]:
(df == 'Not assigned').any()

Postal Code      False
Borough           True
Neighbourhood     True
dtype: bool

In [13]:
import numpy as np
df.loc[:, 'Borough'].replace('Not assigned', np.nan, inplace=True)

In [14]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,,Not assigned
1,M2A,,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [15]:
df.dropna(subset=['Borough'], axis=0, inplace=True)

In [16]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [17]:
df.reset_index(drop=True)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


## <span style='color:Green'> More than one Neighborhood can exist in one postal code area; so, let us group by Postal Code: </span>

In [18]:
df.groupby('Postal Code')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fafaa4d8b10>

In [19]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## <span style='color:Green'> In the cell below, I show that there are no cell in Neighborhood columns with 'Not assigned' value, so we can skip the replace of values. </span>

In [20]:
(df == 'Not assigned').any()

Postal Code      False
Borough          False
Neighbourhood    False
dtype: bool

## <span style='color:Green'> In conclusion, the dataframe has 103 rows and 3 columns, as shown below: </span>

In [21]:
df.shape

(103, 3)

# <span style='color:Blue'> -> The second assignment begins from this point. </span>

## <span style='color:Blue'> I used the .csv file because I could not install nor import 'geocoder'. </span>

In [26]:
import pandas as pd
url = 'https://cocl.us/Geospatial_data'
geo_data = pd.read_csv(url)

In [24]:
geo_data.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## <span style='color:Blue'> So, we can do the join between the two dataframes, on the common key 'Postal Code': </span>

In [28]:
df.join(geo_data.set_index('Postal Code'), on='Postal Code')

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.753259,-79.329656
3,M4A,North York,Victoria Village,43.725882,-79.315572
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
165,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
