### import the required libraries

In [51]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests
from bs4 import BeautifulSoup

### fetch the Wikipedia page and parse its HTML with BeautifulSoup to make data extraction easier

In [52]:
# Download the page; the timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page into an empty table.
page.raise_for_status()
# Parse the raw response bytes with the built-in HTML parser.
soup = BeautifulSoup(page.content, 'html.parser')

### define the dataframe columns and instantiate the dataframe

In [53]:
# Column layout shared by the scraped table and the frames derived from it.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]
# Start from an empty frame that already carries those columns.
wikipedia_data = pd.DataFrame(data=None, columns=column_names)

### extract the table data from html tags and append in dataframe

In [54]:
# Collect one dict per table row and build the frame once at the end.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call; building from a list is also idempotent when the cell is re-run.
records = []
for table in soup.find_all('table', class_='wikitable sortable'):
    for tr in table.find_all('tr'):
        tds = tr.find_all('td')
        if not tds:
            # Rows without any <td> cell (e.g. a header row) carry no data.
            continue
        # Each data row is expected to hold exactly three cells; anything else
        # raises ValueError here rather than being silently mis-assigned.
        postcode, borough, neighbourhood = [td.text.strip() for td in tds]
        records.append({'PostalCode': postcode,
                        'Borough': borough,
                        'Neighborhood': neighbourhood})

# Passing columns= keeps the expected layout even if no rows were found.
wikipedia_data = pd.DataFrame(records, columns=column_names)

### initial raw data shape

In [173]:
# Show (rows, columns) of the raw scraped frame.
wikipedia_data.shape

(289, 3)

### process the cells that have an assigned borough, ignore cells with a borough that is Not assigned

In [55]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
has_borough = wikipedia_data['Borough'].ne('Not assigned')
df_data = wikipedia_data.loc[has_borough].reset_index(drop=True)

### shape of the new derived dataframe after removing the rows

In [175]:
# Show (rows, columns) after dropping rows with an unassigned borough.
df_data.shape

(212, 3)

### combine neighborhoods that share a postal code into one comma-separated row

In [56]:
# One row per (PostalCode, Borough) pair; the neighborhoods of each pair are
# joined into a single ", "-separated string.
df_data_grouped = (
    df_data
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(lambda names: ', '.join(names))
    .reset_index(name='Neighborhood')
)

### shape of the new derived dataframe after grouping the rows

In [177]:
# Show (rows, columns) after grouping by postal code and borough.
df_data_grouped.shape

(103, 3)

### for rows whose Neighborhood is 'Not assigned', replace the value with the Borough

In [57]:
# Boolean mask of the rows whose Neighborhood is the placeholder 'Not assigned'.
not_assigned = df_data_grouped['Neighborhood'] == 'Not assigned'

# Snapshot of the affected rows before they are fixed (name kept for later cells).
df_data_sliced = df_data_grouped[not_assigned]

# Write through a single .loc call so the assignment lands in df_data_grouped
# itself. The previous chained form `df.iloc[i]['Neighborhood'] = ...` assigned
# into a temporary row object, so the frame could be left unchanged, and it
# passed index labels to the positional .iloc indexer.
df_data_grouped.loc[not_assigned, 'Neighborhood'] = df_data_grouped.loc[not_assigned, 'Borough']

### final shape of the dataframe

In [18]:
# Show (rows, columns) after the 'Not assigned' neighborhoods were replaced.
df_data_grouped.shape

(103, 3)

### taking the co-ordinates from the csv file

In [58]:
# Location of the CSV that is joined to the postal codes in the next step.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
latlong_data = pd.read_csv(GEOSPATIAL_CSV_URL)

### merging the two dataframes to get the required dataframe

In [59]:
# Inner join: keep postal codes present in both frames. The key column is
# named 'PostalCode' on the left and 'Postal Code' on the right.
df_data_merged = df_data_grouped.merge(
    latlong_data,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)

In [62]:
# Remove the right-hand join key, which duplicates 'PostalCode' after the merge.
# Rebinding instead of inplace=True, together with errors='ignore', lets this
# cell be re-run without raising KeyError once the column is already gone.
df_data_merged = df_data_merged.drop(columns='Postal Code', errors='ignore')

In [63]:
# Preview the first five rows of the merged frame.
df_data_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
