### import the required libraries

In [169]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests
from bs4 import BeautifulSoup

### parse the wikipedia page to get html data, use the package BeautifulSoup to make data extraction easier

In [170]:
# Download the Wikipedia page. The timeout stops the cell from hanging
# indefinitely on a stalled connection, and raise_for_status() turns an HTTP
# error (4xx/5xx) into an exception instead of silently parsing an error page.
page = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
page.raise_for_status()

# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

### define the dataframe columns and instantiate the dataframe

In [171]:
# Column layout of the scraped table, in the order the cells are read.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Start from an empty frame that already carries those column labels.
wikipedia_data = pd.DataFrame(data=None, columns=column_names)

### extract the table data from html tags and append in dataframe

In [172]:
# Gather one dict per table row and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from a list also makes this cell safe to re-run,
# because it no longer adds rows to whatever wikipedia_data already held.
table_rows = []
for table in soup.find_all('table', class_='wikitable sortable'):
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without any <td> cells (e.g. the header row) carry no data.
            continue
        # Strict three-way unpack: a row with a different number of cells
        # raises ValueError, so a change in the page layout is noticed
        # instead of producing misaligned data.
        postcode, borough, neighbourhood = [td.text.strip() for td in tds]
        table_rows.append({'PostalCode': postcode,
                           'Borough': borough,
                           'Neighborhood': neighbourhood})

# Passing columns= keeps the expected column order even if no rows were found.
wikipedia_data = pd.DataFrame(table_rows, columns=column_names)

### initial raw data shape

In [173]:
# (rows, columns) of the scraped table, before any filtering is applied
wikipedia_data.shape

(289, 3)

### process the cells that have an assigned borough, ignore cells with a borough that is Not assigned

In [174]:
# Boolean mask: True for rows whose borough is something other than the
# 'Not assigned' placeholder.
has_borough = wikipedia_data['Borough'] != 'Not assigned'

# Keep only those rows and renumber the index from 0.
df_data = wikipedia_data[has_borough].reset_index(drop=True)

### shape of the new derived dataframe after removing the rows

In [175]:
# (rows, columns) left after dropping rows whose borough is 'Not assigned'
df_data.shape

(212, 3)

### combine the neighborhoods that share a postal code into one comma-separated string

In [176]:
# One output row per (PostalCode, Borough) pair; the neighborhoods that share
# that pair are concatenated into a single ', '-separated string.
df_data_grouped = (
    df_data
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index(name='Neighborhood')
)

### shape of the new derived dataframe after grouping the rows

In [177]:
# (rows, columns) after collapsing to one row per (PostalCode, Borough) pair
df_data_grouped.shape

(103, 3)

### where the Neighborhood is 'Not assigned', replace it with the Borough name

In [178]:
# Rows whose neighborhood is the 'Not assigned' placeholder.
not_assigned = df_data_grouped['Neighborhood'] == 'Not assigned'

# Snapshot of the affected rows, taken before they are overwritten below.
df_data_sliced = df_data_grouped[not_assigned]

# Copy the borough name into the neighborhood column for those rows.
# A single .loc[mask, column] assignment writes to df_data_grouped itself.
# The previous df.iloc[i]['Neighborhood'] = ... form was chained assignment:
# it may modify a temporary row copy and leave the frame unchanged (always so
# under pandas Copy-on-Write), and it also fed index labels to the
# position-based iloc.
df_data_grouped.loc[not_assigned, 'Neighborhood'] = df_data_grouped.loc[not_assigned, 'Borough']

### final shape of the dataframe

In [179]:
# (rows, columns) of the grouped frame after the neighborhood replacement step
df_data_grouped.shape

(103, 3)