In [2]:
#Import Packages
import pandas as pd
import numpy as np

In [3]:
# Scrape the postal-code table from Wikipedia
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# read_html returns a list with every table found on the page; keep the first one
# and use its first row as the column header.
html = pd.read_html(url, header=0)[0]

# Work on an independent copy: pd.DataFrame(html) does not copy a DataFrame input by
# default, so df would share its data with html and in-place edits made while cleaning
# df could leak back into the raw scraped table.
df = html.copy()
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [4]:
# Transformations to apply before building the model:
# 1) Process only rows where BOROUGH is assigned
# 2) If a postal code has more than one neighborhood, combine them separated by commas
# 3) If a Neighborhood is not assigned, then BOROUGH becomes the neighborhood

In [5]:
# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
Postal code     180 non-null object
Borough         180 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


### **`DATA PREPROCESSING`**

In [6]:
# 1) process only where borough is assigned

# Keep every row whose Borough is something other than 'Not assigned',
# then renumber the index from 0 (the old index is discarded, not kept as a column).
df = df.loc[df['Borough'].ne('Not assigned')].reset_index(drop=True)

In [7]:
# Preview the first rows after removing the 'Not assigned' boroughs.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Regent Park / Harbourfront
3,M6A,North York,Lawrence Manor / Lawrence Heights
4,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government


In [8]:
# 2) If a postal code has more than one neighborhood, combine them separated by commas
# They are already separated by forward slash, so just swap each '/' for ','.
#
# The whole column is rewritten with one vectorised call instead of assigning element by
# element through df.Neighborhood[counter]: that chained assignment raises
# SettingWithCopyWarning (and has no effect under pandas copy-on-write), and it only hit
# the right rows while the index was exactly 0..n-1.
# regex=False treats '/' as a literal string; missing values stay NaN instead of raising.
df['Neighborhood'] = df['Neighborhood'].str.replace('/', ',', regex=False)

df.Neighborhood

0                                              Parkwoods
1                                       Victoria Village
2                             Regent Park , Harbourfront
3                      Lawrence Manor , Lawrence Heights
4           Queen's Park , Ontario Provincial Government
                             ...                        
98       The Kingsway , Montgomery Road , Old Mill North
99                                  Church and Wellesley
100                Business reply mail Processing CentrE
101    Old Mill South , King's Mill Park , Sunnylea ,...
102    Mimico NW , The Queensway West , South of Bloo...
Name: Neighborhood, Length: 103, dtype: object

In [9]:
# Preview the frame to confirm the separator change in Neighborhood.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [10]:
# 3) If a Neighborhood is not assigned then BOROUGH becomes the neighborhood
# A missing neighborhood can appear as NaN, as an empty/whitespace-only string, or as the
# 'Not assigned' marker, so all three are checked. The marker comparison ignores case and
# surrounding whitespace because the table spells it 'Not assigned'.
neighborhood_text = df['Neighborhood'].fillna('').astype(str).str.strip()
is_not_assigned = neighborhood_text.str.lower() == 'not assigned'
is_blank = neighborhood_text == ''  # NaN was mapped to '' above, so this covers NaN too

# Show the affected rows (an empty DataFrame means nothing needs fixing)
print(df[is_not_assigned])
print(df[is_blank])

# Fall back to the Borough name wherever no neighborhood is given.
# This is a no-op when both masks are empty.
needs_borough = is_not_assigned | is_blank
df.loc[needs_borough, 'Neighborhood'] = df.loc[needs_borough, 'Borough']

Empty DataFrame
Columns: [Postal code, Borough, Neighborhood]
Index: []
Empty DataFrame
Columns: [Postal code, Borough, Neighborhood]
Index: []


In [11]:
# Preview the frame after the Neighborhood check.
df.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government"


In [12]:
# (rows, columns) of the cleaned frame.
df.shape

(103, 3)

In [36]:
# Read the coordinates (latitude/longitude) for the postal codes from a remote CSV.
# NOTE(review): this is a plain-HTTP short link, so the target and its contents are not
# pinned — consider confirming the final URL or caching a local copy.
# NOTE(review): `url` is rebound here, replacing the Wikipedia address assigned earlier.
url = 'http://cocl.us/Geospatial_data'

df_coordinates = pd.read_csv(url)
df_coordinates.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [26]:
# Check whether the column names match those of the first dataframe, so the two can be merged
df_coordinates.columns

Index(['Postal Code', 'Latitude', 'Longitude'], dtype='object')

In [41]:
# Align the key column's spelling with the first dataframe ('Postal Code' -> 'Postal code')
# so that both frames can be merged on the same column name.
df_coordinates = df_coordinates.rename(columns={'Postal Code': 'Postal code'})

In [45]:
# Merge both the dataframes on the shared 'Postal code' key.
# how='inner' keeps only postal codes present in both frames.
# validate='one_to_one' makes pandas raise a MergeError if a postal code is duplicated in
# either frame, instead of silently multiplying rows in the result.
df_new = pd.merge(df, df_coordinates, how='inner', on='Postal code', validate='one_to_one')

In [47]:
# Check the merged frame's columns, dtypes and non-null counts.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
Postal code     103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
Latitude        103 non-null float64
Longitude       103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


In [48]:
# Display the merged frame (postal code, borough, neighborhood, latitude, longitude).
df_new

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park , Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway , Montgomery Road , Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,Business reply mail Processing CentrE,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South , King's Mill Park , Sunnylea ,...",43.636258,-79.498509
