# Libraries  

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import csv

# Getting the source code webpage

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text 
soup = BeautifulSoup(source, 'lxml')

# Creating a csv File and adding the columns name to it

In [3]:
csv_file = open('toronto_postal_codes.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Postcode', 'Borough', 'Neighbourhood'])

32

# Scrapping the webpage to only get the table

In [5]:
table = soup.find('table', class_ = 'wikitable sortable') # Gets the table from the webpage
rows = table.find_all('tr') # Gets the table rows

postcodes = [] # Initializes the raw postcodes list
boroughs = [] # Initializes the raw boroughs list
neighbourhoods = [] # Initializes the raw neighbourhoods list

for row in rows:    
    columns = row.find_all('td')
    try :
        if columns[1].text != 'Not assigned':  # To skip if the borough name is 'Not Assigned'
            
            postcode = columns[0].text
            postcodes.append(postcode)
            
            borough = columns[1].text
            boroughs.append(borough)
            
            neighbourhood = columns[2].text.split('\n')[0] # Removing the newline character at the end     
            
            if neighbourhood == 'Not assigned': # Assigning the same name to neighbourhood if it is 'Not Assigned'
                neighbourhood = borough            
                
            neighbourhoods.append(neighbourhood)
             
    except Exception as e : # To skip the first row which contains column names
        pass 
    
postcode_explored = [] # Initializing the list of explored postcodes
for index_i, postcode_i in enumerate(postcodes) :   
    if postcode_i not in postcode_explored :
        nbds = neighbourhoods[index_i]
        for index_f, postcode_f in enumerate(postcodes) :
            if postcode_i == postcode_f and index_i != index_f:
                nbds = nbds + ', ' + neighbourhoods[index_f] # Concatenating the neighbourhood names
        csv_writer.writerow([postcode_i, boroughs[index_i], nbds]) # Writing the rows in the csv file
        postcode_explored.append(postcode_i)

# Close the csv file

In [6]:
csv_file.close()

# Creating the dataframe

In [8]:
df = pd.read_csv('toronto_postal_codes.csv')

# Checking the dataframe

In [9]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A\r\n,Not assigned\r\n,Not assigned\r\n
1,M2A\r\n,Not assigned\r\n,Not assigned\r\n
2,M3A\r\n,North York\r\n,Parkwoods
3,M4A\r\n,North York\r\n,Victoria Village
4,M5A\r\n,Downtown Toronto\r\n,"Regent Park, Harbourfront"


Notice that there are "\r\n" in all the dataframe, let's remove them

# Removing the "\r\n"

In [10]:
df['Postcode'] = df['Postcode'].replace('\r\n;','', regex=True)
df['Borough'] = df['Borough'].replace('\r\n;','', regex=True)
df['Neighbourhood'] = df['Neighbourhood'].replace('\r\n;','', regex=True)

# Result:

In [11]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A\r\n,Not assigned\r\n,Not assigned\r\n
1,M2A\r\n,Not assigned\r\n,Not assigned\r\n
2,M3A\r\n,North York\r\n,Parkwoods
3,M4A\r\n,North York\r\n,Victoria Village
4,M5A\r\n,Downtown Toronto\r\n,"Regent Park, Harbourfront"
