In [1]:
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Gathering the data:
Send a GET request for the wiki page, then create a BeautifulSoup instance from the response.

In [2]:
# URL of the live wiki page. The page is edited over time, so the data is no
# longer in the order it had when this notebook was first written.
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.text

soup = BeautifulSoup(results, features='html5lib')

### Web Scraping:
Scrape the desired table from the wiki page and save the information as a list of tuples then convert it into a dataframe.

In [3]:
toronto_list = []  # Holds one (postal_code, borough, neighborhoods) tuple per table cell.

# Walk every <td> of the first table on the page and pull out the postal code,
# the borough and the list of neighborhoods.
for td in soup.table.find_all('td'):
    code_tag = td.find('b')
    if code_tag is None:
        # No bold postal code in this cell (e.g. an empty cell): skip it rather
        # than appending the values left over from the previous iteration.
        continue
    postal_code = code_tag.text

    # The borough is the text before the first "(". The first four characters
    # are dropped -- presumably a leading newline plus the three-character
    # postal code; confirm against the page markup.
    borough = td.text.split('(')[0][4:].strip()
    # Change any borough with a value of "Not assigned" to NaN:
    if borough == 'Not assigned':
        borough = np.nan

    # The neighborhoods are the "/"-separated text after the last "(".
    hoods_list = td.text.strip().split('(')[-1].strip(')').split('/')
    if hoods_list[0][3:] == 'Not assigned':
        # No "(" in the cell, so the single element is the whole stripped text;
        # skipping its first three characters leaves "Not assigned".
        # Use the borough value (NaN in this case) for the neighborhoods too.
        hoods_list = borough
    else:
        # Strip surrounding whitespace from every neighborhood name.
        hoods_list = [hood.strip() for hood in hoods_list]

    toronto_list.append((postal_code, borough, hoods_list))

# Create a dataframe from the list of tuples, "toronto_list":
toronto_df = pd.DataFrame(toronto_list, columns=['PostalCode', 'Borough', 'Neighborhoods'])
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M1A,,
1,M2A,,
2,M3A,North York,[Parkwoods]
3,M4A,North York,[Victoria Village]
4,M5A,Downtown Toronto,"[Regent Park, Harbourfront]"


## Data Wrangling:

In [4]:
# Keep only rows that have at least 2 non-NaN values. In this frame that drops
# the unassigned postal codes, whose Borough and Neighborhoods are both NaN.
toronto_df.dropna(thresh=2, inplace=True)

# Join each list of neighborhoods into one comma-separated string.
# Only list values are joined, so re-running this cell leaves already-joined
# strings untouched (`.str.join(',')` on a string would put a comma between
# every character).
toronto_df['Neighborhoods'] = toronto_df['Neighborhoods'].map(
    lambda hoods: ','.join(hoods) if isinstance(hoods, list) else hoods
)

toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhoods
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park,Harbourfront"
5,M6A,North York,"Lawrence Manor,Lawrence Heights"
6,M7A,Queen's Park,Ontario Provincial Government


___
## Data Exploration:
Looking for anything unusual and correcting any errors.

### Inspecting the boroughs for irregularities:

In [5]:
# Scan the distinct borough names for anomalies:
toronto_df['Borough'].unique()

array(['North York', 'Downtown Toronto', "Queen's Park", 'Etobicoke',
       'Scarborough', 'East York', 'York', 'East Toronto', 'West Toronto',
       'East YorkEast Toronto', 'Central Toronto',
       'MississaugaCanada Post Gateway Processing Centre',
       'Downtown TorontoStn A PO Boxes25 The Esplanade',
       'EtobicokeNorthwest',
       'East TorontoBusiness reply mail Processing Centre969 Eastern'],
      dtype=object)

In [6]:
# Collect the index labels of the rows whose borough is one of the anomalous values:
anomolies = [
    'East YorkEast Toronto',
    'MississaugaCanada Post Gateway Processing Centre',
    'Downtown TorontoStn A PO Boxes25 The Esplanade',
    'EtobicokeNorthwest',
    'East TorontoBusiness reply mail Processing Centre969 Eastern',
]

is_anomalous = toronto_df['Borough'].isin(anomolies)
anom_indexs = toronto_df.index[is_anomalous].tolist()
anom_indexs

[57, 114, 148, 152, 168]

In [7]:
# Print each anomalous row in full to work out which corrections are needed:
for row_label in anom_indexs:
    anomalous_row = toronto_df.loc[row_label]
    print(anomalous_row, '\n')

PostalCode                         M4J
Borough          East YorkEast Toronto
Neighborhoods       The Danforth  East
Name: 57, dtype: object 

PostalCode                                                    M7R
Borough          MississaugaCanada Post Gateway Processing Centre
Neighborhoods                                      Enclave of L4W
Name: 114, dtype: object 

PostalCode                                                  M5W
Borough          Downtown TorontoStn A PO Boxes25 The Esplanade
Neighborhoods                                    Enclave of M5E
Name: 148, dtype: object 

PostalCode                                                     M9W
Borough                                         EtobicokeNorthwest
Neighborhoods    Clairville,Humberwood,Woodbine Downs,West Humb...
Name: 152, dtype: object 

PostalCode                                                     M7Y
Borough          East TorontoBusiness reply mail Processing Cen...
Neighborhoods                                      

In [8]:
# Make the corrections to the data.
# Rows are matched on their anomalous Borough value rather than on hard-coded
# index labels: the wiki page changes over time, and `.loc[label, col] = value`
# with a stale label would overwrite the wrong row, or append a new row if the
# label no longer exists.
borough_corrections = {
    'East YorkEast Toronto': 'East York',
    'MississaugaCanada Post Gateway Processing Centre':
        'Mississauga Canada Post Gateway Processing Centre',
    'Downtown TorontoStn A PO Boxes25 The Esplanade':
        'Downtown Toronto Stn A PO Boxes 25 The Esplanade',
    'EtobicokeNorthwest': 'Etobicoke',
    'East TorontoBusiness reply mail Processing Centre969 Eastern':
        'East Toronto Business reply mail Processing Centre 969 Eastern',
}

# The 'East YorkEast Toronto' row also needs its Neighborhoods value fixed.
# This must happen before the Borough replacement below, because the mask
# relies on the uncorrected Borough value.
danforth_rows = toronto_df['Borough'] == 'East YorkEast Toronto'
toronto_df.loc[danforth_rows, 'Neighborhoods'] = 'The Danforth'

# Exact-value replacement; boroughs not listed in the mapping are left as they are.
toronto_df['Borough'] = toronto_df['Borough'].replace(borough_corrections)

### Inspecting the neighborhoods for irregularities:

In [9]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels:
toronto_df = toronto_df.reset_index(drop=True)

In [10]:
# Eyeball the first ten Neighborhoods values for any irregularities:
toronto_df['Neighborhoods'].head(10)

0                          Parkwoods
1                   Victoria Village
2           Regent Park,Harbourfront
3    Lawrence Manor,Lawrence Heights
4      Ontario Provincial Government
5                   Islington Avenue
6                      Malvern,Rouge
7                    Don Mills)North
8     Parkview Hill,Woodbine Gardens
9           Garden District, Ryerson
Name: Neighborhoods, dtype: object

In [11]:
# Replace the stray closing parenthesis observed in some neighborhood names with a space.
# A literal (non-regex) replacement avoids the invalid "\)" escape sequence in a
# non-raw string. The vectorised .str accessor keeps the result aligned with the
# dataframe's own index, and missing values stay NaN instead of becoming the
# string "nan".
toronto_df['Neighborhoods'] = toronto_df['Neighborhoods'].str.replace(')', ' ', regex=False)

# Verify that the parenthesis has been removed:
toronto_df['Neighborhoods'].head(10)

0                          Parkwoods
1                   Victoria Village
2           Regent Park,Harbourfront
3    Lawrence Manor,Lawrence Heights
4      Ontario Provincial Government
5                   Islington Avenue
6                      Malvern,Rouge
7                    Don Mills North
8     Parkview Hill,Woodbine Gardens
9           Garden District, Ryerson
Name: Neighborhoods, dtype: object

### Verification:
Verifying the quality of the data within the dataframe.

In [12]:
# Show every row whose Postal Code repeats an earlier one (an empty result means no duplicates):
has_repeated_code = toronto_df['PostalCode'].duplicated()
toronto_df[has_repeated_code]

Unnamed: 0,PostalCode,Borough,Neighborhoods


In [13]:
# Confirm that the Neighborhoods for Postal Code 'M5A' match the specifications in the assignment:
toronto_df[toronto_df['PostalCode'].eq('M5A')]

Unnamed: 0,PostalCode,Borough,Neighborhoods
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"


### The Final Data Frame:

In [14]:
# Reminder: the wiki page has been updated since the original run, so the
# postal codes now appear in a different order than before.
toronto_df.head(n=12)

Unnamed: 0,PostalCode,Borough,Neighborhoods
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park,Harbourfront"
3,M6A,North York,"Lawrence Manor,Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern,Rouge"
7,M3B,North York,Don Mills North
8,M4B,East York,"Parkview Hill,Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [15]:
# Persist the dataframe to a CSV file (index column omitted) for later manipulation:
output_csv = 'toronto_wrangled.csv'
toronto_df.to_csv(output_csv, index=False)

In [16]:
# Dimensions of the final dataframe as (rows, columns):
toronto_df.shape

(103, 3)