In [1]:
#Import Packages
import pandas as pd
import numpy as np

In [2]:
# Scrape the "List of postal codes of Canada: M" table from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns one DataFrame per table found on the page; the table of
# interest is taken to be the first one, with its first row used as the header.
tables = pd.read_html(url, header=0)
html = tables[0]

df = pd.DataFrame(data=html)
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [3]:
# Transformations to apply before building the model
# 1) Process only where BOROUGH is assigned
# 2) If a postal code has more than one neighborhood, combine them separated by commas
# 3) If a Neighborhood is not assigned then BOROUGH becomes the neighborhood

In [4]:
# Summarise the scraped frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal code     180 non-null object
Borough         180 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


### **`DATA PREPROCESSING`**

In [5]:
# 1) Keep only the rows whose Borough has been assigned

has_borough = df['Borough'] != 'Not assigned'  # False for the 'Not assigned' placeholder rows
df = df.loc[has_borough].reset_index(drop=True)  # renumber the index from 0 after filtering

In [6]:
# Preview the first rows after the 'Not assigned' boroughs were removed and the index reset.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [7]:
# 2) If a postal code has more than one neighborhood, combine them separated by commas
# The source table already lists them separated by forward slashes, so only the
# separator has to change.
#
# The whole column is assigned in one step instead of writing element by element
# through `df.Neighborhood[i] = ...`: that chained assignment triggers
# SettingWithCopyWarning and is not guaranteed to write back into `df`
# (with pandas copy-on-write enabled it does not). The vectorised `.str`
# accessor also leaves missing values untouched instead of raising on them.
# regex=False makes '/' a literal character rather than a pattern.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

df['Neighborhood']

0                                              Parkwoods
1                                       Victoria Village
2                             Regent Park , Harbourfront
3                      Lawrence Manor , Lawrence Heights
4           Queen's Park , Ontario Provincial Government
                             ...                        
98       The Kingsway , Montgomery Road , Old Mill North
99                                  Church and Wellesley
100                Business reply mail Processing CentrE
101    Old Mill South , King's Mill Park , Sunnylea ,...
102    Mimico NW , The Queensway West , South of Bloo...
Name: Neighborhood, Length: 103, dtype: object

In [8]:
# Preview the frame to confirm the '/' separators in Neighborhood are now commas.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [14]:
# 3) If a Neighborhood is not assigned then BOROUGH becomes the neighborhood
# An unassigned neighborhood can show up as a missing value (NaN), as a blank
# string, or as the 'Not assigned' placeholder (the spelling used in Borough).
# Normalise the text before comparing so that case and surrounding whitespace
# do not hide a match.
neighborhood_text = df['Neighborhood'].fillna('').astype(str).str.strip()
is_placeholder = neighborhood_text.str.lower() == 'not assigned'
is_blank = neighborhood_text == ''  # also True for values that were NaN

# Show any offending rows before they are repaired.
print(df[is_placeholder])
print(df[is_blank])

# Apply the rule itself: the Borough name stands in for an unassigned Neighborhood.
needs_borough = is_placeholder | is_blank
df.loc[needs_borough, 'Neighborhood'] = df.loc[needs_borough, 'Borough']

# For the data shown in this notebook both checks print an empty frame,
# so the fallback changes nothing here.

Empty DataFrame
Columns: [Postal code, Borough, Neighborhood]
Index: []
Empty DataFrame
Columns: [Postal code, Borough, Neighborhood]
Index: []


In [17]:
# Preview the frame after the unassigned-Neighborhood check.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [18]:
# (rows, columns) of the frame after preprocessing.
df.shape

(103, 3)