# Part 1

#### Import necessary modules

In [20]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd
from geopy.geocoders import Nominatim 

#### Use BeautifulSoup to scrape the webpage and find the required table

In [21]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Parse inside a context manager so the HTTP response is closed as soon as the
# document has been read, instead of staying open for the life of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# Every <table> element on the page (kept for ad-hoc inspection).
all_tables = soup.find_all("table")

# The first table whose class attribute matches 'wikitable sortable'.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail here with a clear message rather than with an AttributeError on None
    # in the cell that iterates over the table rows.
    raise ValueError("No table with class 'wikitable sortable' found at " + url)

#### Copy the contents of the table into three separate lists, one for each of the three columns

In [22]:
# Collect the table column by column:
#   A <- text of the first cell, B <- second cell, C <- third cell of every data row.
A, B, C = [], [], []
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; anything else
    # (presumably the header row, which has no <td>) is skipped.
    if len(cells) != 3:
        continue
    # get_text() returns all of the text inside the cell, and returns '' for an
    # empty cell. The previous find(text=True) returned only the first text node
    # and None for an empty cell, which str() turned into the literal 'None'.
    postal_code, borough, neighborhood = (cell.get_text().strip() for cell in cells)
    A.append(postal_code)
    B.append(borough)
    # Neighbourhoods sharing a postal code are separated by ' / ' on the page;
    # store them comma-separated instead.
    C.append(neighborhood.replace(' / ', ', '))


#### Convert the lists to a DataFrame

In [23]:
# Assemble the three scraped lists into one frame; dict order fixes the column order.
df = pd.DataFrame({
    'PostalCode': A,
    'Borough': B,
    'Neighborhood': C,
})
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Drop all the rows containing 'Not assigned' Boroughs

In [24]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the index from 0 so it has no gaps.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Check for duplicates in the PostalCode so that Neighborhoods can be aggregated, if duplicates are present

In [25]:
 # duplicated() flags every repeat of an already-seen PostalCode;
 # any() reduces those flags to a single True/False answer.
 df.duplicated(subset='PostalCode').any()

False

#### ==> Since there are no duplicates, and Neighborhoods are already aggregated, no rearrangements are required

#### The code to make rearrangements anyway is as follows

In [26]:
# Group rows that share a postal code and borough (keeping first-seen order)
# and join the remaining string column(s) of each group with ', '.
group_keys = ['PostalCode', 'Borough']
result = df.groupby(group_keys, sort=False).aggregate(', '.join)
result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Check for 'Not assigned' or empty Neighborhoods

In [27]:
# A neighbourhood counts as unassigned when it is the literal 'Not assigned',
# an empty string (how blank table cells come out of the scrape), or missing.
# The earlier check only matched 'Not assigned' and so could not see blank cells.
unassigned_neighborhood = (
    df['Neighborhood'].isna() | df['Neighborhood'].isin(['Not assigned', ''])
)
df.loc[unassigned_neighborhood]

Unnamed: 0,PostalCode,Borough,Neighborhood


#### ==> All the Neighborhoods are assigned, so no rearrangements are required

#### The shape of the final dataframe

In [28]:
# (number of rows, number of columns) of the cleaned frame.
print((len(df.index), len(df.columns)))

(103, 3)


# Part 2

In [29]:
# Latitude/longitude per postal code, read from a CSV file in the working directory.
coordinates_csv = 'Geospatial_Coordinates.csv'
df_lat_lon = pd.read_csv(coordinates_csv)
df_lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
# Rename 'Postal Code' to 'PostalCode' so it matches the key column of `df`.
# Renaming by name (rather than overwriting all column labels by position) cannot
# mislabel Latitude/Longitude if the CSV's column order ever changes, and
# re-running this cell is a harmless no-op.
df_lat_lon = df_lat_lon.rename(columns={'Postal Code': 'PostalCode'})
df_lat_lon.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
# Attach coordinates to each postal code. This is an inner join (the pandas
# default), so only postal codes present in both frames appear in the result.
coordinate_columns = ['PostalCode', 'Latitude', 'Longitude']
df_cord = df.merge(df_lat_lon[coordinate_columns], how='inner', on='PostalCode')
df_cord

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
