# Part 1

#### Import necessary modules

In [14]:
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

#### Use BeautifulSoup to scrape the webpage and find the required table

In [15]:
# Download the Wikipedia page listing Canadian postal codes starting with "M"
# and parse it with the lxml parser.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# The context manager closes the HTTP response as soon as parsing is finished,
# instead of leaving the connection open for the lifetime of the kernel.
with urllib.request.urlopen(url) as page:
    soup = BeautifulSoup(page, "lxml")

# Every <table> element on the page.
all_tables = soup.find_all("table")

# The table looked up by the class string 'wikitable sortable'.
right_table = soup.find('table', class_='wikitable sortable')
if right_table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None when the table is iterated later.
    raise ValueError(
        "No table with class 'wikitable sortable' was found at " + url
    )

#### Copy the contents of the table into three separate lists, one for each of the three columns

In [16]:
# A, B and C collect the first, second and third cell of every table row that
# has exactly three <td> cells.
A, B, C = [], [], []
for table_row in right_table.findAll('tr'):
    tds = table_row.findAll('td')
    if len(tds) != 3:
        # Rows without exactly three <td> cells are skipped.
        continue
    # First text node of each cell, converted to str and stripped of
    # surrounding whitespace.
    first_txt, second_txt, third_txt = [
        str(td.find(text=True)).strip() for td in tds
    ]
    A.append(first_txt)
    B.append(second_txt)
    # Turn ' / ' separators in the third column into ', '.
    C.append(third_txt.replace(' / ', ', '))


#### Convert the lists to a DataFrame

In [17]:
# Assemble the three scraped lists into one frame; the mapping order fixes the
# column order: PostalCode, Borough, Neighborhood.
columns_by_name = {
    'PostalCode': A,
    'Borough': B,
    'Neighborhood': C,
}
df = pd.DataFrame(columns_by_name)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### Drop all the rows containing 'Not assigned' Boroughs

In [18]:
# Keep only the rows whose Borough is not the placeholder 'Not assigned'.
# Boolean filtering builds a new frame instead of mutating the existing one
# in place, so re-running the cell is safe; the index is then renumbered
# from 0.
df = df.loc[df['Borough'] != 'Not assigned'].reset_index(drop=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


#### Check for duplicate PostalCode values; if any are present, their Neighborhoods need to be aggregated

In [19]:
 # True means at least one PostalCode value occurs on more than one row.
 df.duplicated(subset=['PostalCode']).any()

False

#### ==> Since there are no duplicates and the Neighborhoods are already aggregated, no rearrangements are required

#### For reference, the code that would perform the aggregation is as follows

In [20]:
# Group rows sharing the same (PostalCode, Borough) pair, keeping the groups
# in order of first appearance, and join each group's Neighborhood strings
# with ', '.
group_keys = ['PostalCode', 'Borough']
grouped = df.groupby(group_keys, sort=False)
result = grouped.agg({'Neighborhood': ', '.join})
result.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Neighborhood
PostalCode,Borough,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### Check for 'Not assigned' or empty Neighborhoods

In [21]:
# Rows whose Neighborhood is effectively missing: either the literal
# placeholder 'Not assigned' or an empty string (whitespace-only cell text is
# stripped to '' when the table is scraped). An empty result means every
# remaining row has a Neighborhood.
df.loc[df['Neighborhood'].isin(['Not assigned', ''])]

Unnamed: 0,PostalCode,Borough,Neighborhood


#### ==> All the Neighborhoods are assigned, so no rearrangements are required

#### The shape of the final dataframe

In [22]:
# Report the final size of the frame as (number of rows, number of columns).
print((len(df.index), len(df.columns)))

(103, 3)
