### Libraries

In [None]:
import numpy as np
import pandas as pd 
import requests

#### We use the pandas method "set_option" when we need to analyze dataframes with a lot of columns or rows

In [77]:
# Remove pandas' display limits so every column and every row is shown
# when a dataframe is rendered.
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

#### The simplest way to scrape a web page is the pandas method read_html, but we can also use libraries like BeautifulSoup/requests

In [78]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with one DataFrame per <table> found on the page;
# we take the first table and use its first row (header=0) as column names.
# pandas is already imported in the first cell, so it is not re-imported here.
torontoDF = pd.read_html(url, header=0)[0]
print(torontoDF.shape)
torontoDF.head()

(288, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


#### As you can see, the dataframe has many "Not assigned" values in the Borough and Neighbourhood columns. We should delete these rows, because they don't give us useful information

In [79]:
# Keep only the rows whose Borough is assigned. Chaining reset_index(drop=True)
# renumbers the rows from 0 and discards the old index in a single step, instead
# of mutating the filtered frame in place and then dropping the leftover
# 'index' column (in-place edits on a filtered slice can raise
# SettingWithCopyWarning).
has_borough = torontoDF['Borough'] != 'Not assigned'
torontoDF = torontoDF[has_borough].reset_index(drop=True)
print(torontoDF.shape)
torontoDF.head()

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### If we find 'Not assigned' values in the Neighbourhood column, we should replace them with the Borough value. The best way to do this could be a for statement, because you can create conditions and check row by row

In [80]:
# Replace 'Not assigned' neighbourhoods with the Borough of the same row.
# This is done with a vectorized mask rather than by assigning to `row` inside
# iterrows(): iterrows() may yield a copy of each row, in which case writing to
# it has no effect on the dataframe.
unassigned = torontoDF['Neighbourhood'] == 'Not assigned'
torontoDF = torontoDF.assign(
    # mask() swaps in the Borough value wherever `unassigned` is True and keeps
    # the existing Neighbourhood everywhere else.
    Neighbourhood=torontoDF['Neighbourhood'].mask(unassigned, torontoDF['Borough'])
)

print(torontoDF.shape)
torontoDF.head(15)

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Queen's Park
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


#### The last step is the most complicated: first we group the information by Postcode, and then we use the aggregate method on the neighbourhoods, so that the neighbourhoods of duplicated Postcodes are joined together

In [81]:
# Collapse the rows that share a Postcode into a single row:
#   - Borough: keep the first value found in the group
#   - Neighbourhood: join all names of the group into one comma-separated string
column_aggregations = {
    'Borough': 'first',
    'Neighbourhood': ', '.join,
}
rows_by_postcode = torontoDF.groupby('Postcode')
# reset_index() turns the Postcode group key back into a regular column.
torontoDF = rows_by_postcode.agg(column_aggregations).reset_index()

print(torontoDF.shape)
torontoDF.head()

(103, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [82]:
# Final check: display the (rows, columns) shape of the grouped dataframe.
torontoDF.shape

(103, 3)