## Load and Preprocess Dataframe

#### import libraries

In [4]:
import pandas as pd
import numpy as np

#### read dataframe via url, and have an initial view

In [5]:
# Parse every HTML table found on the Wikipedia page; the first one is the
# postal-code table used in the rest of the notebook.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
d = pd.read_html(io=wiki_url)
df = d[0]

In [6]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [7]:
# Number of distinct non-missing values in each column.
df.nunique(axis=0, dropna=True)

Postal code     180
Borough          11
Neighborhood     98
dtype: int64

#### Clean data: 
1. check if 'Not assigned' in Borough has Neighborhood
2. if yes, assign Borough
3. remove the rows whose Borough is 'Not assigned'
4. merge the Neighborhoods that share the same Postal code

In [8]:
# Inspect the rows whose Borough is 'Not assigned' to see whether any of them
# carries a Neighborhood value.
df.loc[df['Borough'].eq('Not assigned')]

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
7,M8A,Not assigned,
10,M2B,Not assigned,
15,M7B,Not assigned,
...,...,...,...
174,M4Z,Not assigned,
175,M5Z,Not assigned,
176,M6Z,Not assigned,
177,M7Z,Not assigned,


In [9]:
# Remove, in place, every row whose Borough is 'Not assigned'.
not_assigned_labels = df.index[df['Borough'].eq('Not assigned')]
df.drop(index=not_assigned_labels, inplace=True)

In [10]:
# Replace the column labels positionally (the first column becomes 'Postalcode').
new_column_names = ['Postalcode', 'Borough', 'Neighborhood']
df.columns = new_column_names

In [11]:
# Renumber the rows 0..n-1 now that some rows have been dropped.
df.index = pd.RangeIndex(len(df))

In [12]:
# Count the distinct postal codes. The Wikipedia page has been updated so that
# each postal code appears once, so there is no need to merge the Neighborhoods
# that share a Postalcode.
len(df['Postalcode'].dropna().unique())

103

In [13]:
# Neighborhood names arrive separated by ' / '; show them comma-separated instead.
# A literal (regex=False) string replacement does this directly, without
# round-tripping each value through repr()/eval(), and it leaves missing values
# untouched instead of failing on them.
df['Neighborhood'] = df['Neighborhood'].str.replace(' / ', ', ', regex=False)

In [14]:
# Preview the first five rows of the cleaned table.
df.head(n=5)

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [15]:
# (number of rows, number of columns) of the cleaned table.
len(df.index), len(df.columns)

(103, 3)

## Add location information

In [17]:
!pip install geocoder 
import geocoder # import geocoder

Looking in indexes: http://pypi.douban.com/simple


In [18]:
# all postal codes
postal_codes = df['Postalcode']
postal_codes

0      M3A
1      M4A
2      M5A
3      M6A
4      M7A
      ... 
98     M8X
99     M4Y
100    M7Y
101    M8Y
102    M8Z
Name: Postalcode, Length: 103, dtype: object

#### find the location information by using geocoder

In [None]:
# Looking up the latitude and longitude of every postal code with geocoder takes
# too long, so this cell only gives the code and was not executed.
# The location information is added from the provided csv file instead.

MAX_GEOCODE_ATTEMPTS = 5  # stop retrying a postal code instead of looping forever

latitude = []
longitude = []

for postal_code in postal_codes:
    lat_lng_coords = None

    # Retry a bounded number of times; a missing (None) or empty result is
    # treated as a failed lookup.
    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break

    if not lat_lng_coords:
        # Keep both lists the same length as postal_codes; NaN marks a postal
        # code whose coordinates could not be retrieved.
        lat_lng_coords = (np.nan, np.nan)

    # list.append mutates the list and returns None, so its result must not be
    # assigned back to the list variable.
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

df['Latitude'] = latitude
df['Longitude'] = longitude
df.head()

#### find the location information by using the provided csv

In [20]:
# Load the provided coordinates file (read relative to the working directory).
geo_csv_path = 'Geospatial_Coordinates.csv'
df_geo = pd.read_csv(geo_csv_path)

In [21]:
# Preview the first five rows of the coordinates table.
df_geo.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [22]:
# Relabel the columns positionally so the key column is named 'Postalcode',
# matching the postal-code table.
geo_column_names = ['Postalcode', 'Latitude', 'Longitude']
df_geo.columns = geo_column_names

In [28]:
# Attach Latitude/Longitude to each postal code. This is an inner join, so only
# postal codes present in both tables are kept.
df_g = df.merge(df_geo, how='inner', on='Postalcode')
df_g.head(n=10)

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
