## Importing dependencies

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis

# Part 1 - Import and clean data
## Import the data

In [256]:
source_url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M" #As given in the assignment

raw_page = pd.read_html(source_url)

toronto_data_raw = raw_page[0] #The 1st table from the raw page
print("Raw data shape",toronto_data_raw.shape)
toronto_data_raw.head(10)

Raw data shape (180, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


## Clean the dataframe

In [257]:
#Remove not assigned boroughs
toronto_data = toronto_data_raw[toronto_data_raw['Borough'] != 'Not assigned'].reset_index(drop=True)

print("Data shape",toronto_data.shape)
toronto_data.head(10)

Data shape (103, 3)


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [258]:
#Replace not assigned neighbohoods with the borough's name
changes = 0

for index, row in toronto_data.iterrows():
    if row['Neighborhood'] == 'Not assigned':
        row['Neighborhood'] = row['Borough']
        changes = changes + 1


print(changes, "changes were made")
toronto_data.head(10)

0 changes were made


Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [9]:
print("Clean data shape",toronto_data.shape)

Clean data shape (103, 3)


# Part 2 - Add Location Data

In [259]:
# Using the code to fetch coordinates from the csv file as published in the assignment
file_name = "Geospatial_Coordinates.csv"
postal_codes_coord = pd.read_csv(file_name)
postal_codes_coord.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### As column names are the same we'll just merger the 2 dataframes

In [260]:
toronto_data = pd.merge(toronto_data, postal_codes_coord, on='Postal Code', how='inner')
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [172]:
postal_codes_coord['Longitude']

0     -79.194353
1     -79.160497
2     -79.188711
3     -79.216917
4     -79.239476
         ...    
98    -79.518188
99    -79.532242
100   -79.554724
101   -79.588437
102   -79.594054
Name: Longitude, Length: 103, dtype: float64

In [171]:
postal_codes_coord['Longitude'][0]

-79.19435340000001

In [192]:
temp_row = postal_codes_coord.loc[postal_codes_coord['Postal Code'] == 'M1B']
temp_row

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353


In [194]:
k = temp_row['Longitude'][0]

In [195]:
k

-79.19435340000001

In [197]:
print(k)

-79.19435340000001


In [163]:
q[[0]]

KeyError: "None of [Int64Index([0], dtype='int64')] are in the [columns]"

In [166]:
temp_row = postal_codes_coord.loc[postal_codes_coord['Postal Code'] == 'M3A']
temp_row

Unnamed: 0,Postal Code,Latitude,Longitude
25,M3A,43.753259,-79.329656


In [169]:
type(temp_row['Longitude'])

pandas.core.series.Series

In [158]:
q['Latitude'] = temp_row['Latitude'].astype(float)
q


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",,
6,M1B,Scarborough,"Malvern, Rouge",,
7,M3B,North York,Don Mills,,
8,M4B,East York,"Parkview Hill, Woodbine Gardens",,
9,M5B,Downtown Toronto,"Garden District, Ryerson",,


In [147]:
t = q.loc[q['Postal Code'] == 'M5G']
q['Latitude'] = lat
q.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


In [108]:
h = 0.34


In [109]:
h = lat

In [110]:
h

57    50.657952
Name: Latitude, dtype: float64

In [111]:
type(h)

pandas.core.series.Series

In [97]:
type(temp_row['Latitude'])


pandas.core.series.Series

In [84]:
type(temp_row['Latitude'].astype(float))

pandas.core.series.Series

In [88]:
f = temp_row.to_numeric(s, errors='coerce')
f

AttributeError: 'DataFrame' object has no attribute 'to_numeric'

In [53]:
temp_row = postal_codes_coord.loc[postal_codes_coord['Postal Code'] == 'M5G']
type(temp_row)
temp_row['Latitude']
# row['Longitude'] = temp_row['Longitude']

57    43.657952
Name: Latitude, dtype: float64