## Importing all the Required Library

In [13]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
print('Importing Done......')

Importing Done......


## Getting the tabel from Wikipedia

In [14]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
source = requests.get(url).text

## Extraction of the table from html text

In [15]:
soup = BeautifulSoup(source, 'lxml')
match = soup.find('table', {'class':'wikitable sortable'})
link = match.findAll('td')
brough = []
postcode = []
neighbourhood =[]
for i in range(0,len(link),3):
    postcode.append(link[i].text)
    brough.append(link[i+1].text)
    neighbourhood.append(link[i+2].text.rstrip())

## Making the dataframe from list of values

In [16]:
df = pd.DataFrame()
df['Postcode'] = postcode
df['Brough'] = brough
df['Neighbourhood'] = neighbourhood
df.head(5)

Unnamed: 0,Postcode,Brough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Ignoring Not assigned
#### As per question no 1
<p>  - Ignore cells with a borough that is Not assigned.</p>
<p>  - If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.</p>

In [17]:
df_toronto = df.copy()
df_toronto = df_toronto.drop(df_toronto[(df_toronto.Brough == 'Not assigned') & (df.Brough == 'Not assigned')].index)
df_toronto.loc[df_toronto['Neighbourhood'] == 'Not assigned', ['Neighbourhood']] = df_toronto['Brough']
print('Row no in df_toronto data frame {} & columns {}'.format(df_toronto.shape[1],df_toronto.shape[0]))
df_toronto.head(10)

Row no in df_toronto data frame 3 & columns 210


Unnamed: 0,Postcode,Brough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Queen's Park,Queen's Park
9,M9A,Queen's Park,Queen's Park
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


## Group by the data by Psdt code and Brough

In [18]:
df_toronto_groupby = df_toronto.groupby(['Postcode','Brough'])['Neighbourhood'].apply(', '.join).reset_index()
df_toronto_groupby.head(5)

Unnamed: 0,Postcode,Brough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## Building a coordinate data frame of all the Postcode

In [19]:
df_geo_coord = pd.read_csv('https://cocl.us/Geospatial_data')
df_geo_coord.columns = ['Postcode', 'Latitude', ' Longitude']
df_geo_coord.head(5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Now merge the two data frame to get the coordinate of all Post code

In [20]:
df_toronto_with_coord = pd.merge(df_toronto_groupby, df_geo_coord, on = 'Postcode', how = 'inner')
df_toronto_with_coord.columns = ['Postcode', 'Brough', 'Neighbourhood', 'Latitude', 'Longitude']
df_toronto_with_coord.head(5)

Unnamed: 0,Postcode,Brough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
