# Applied Data Science Capstone Project

We will start by importing all the necessary libraries. 

In [None]:
import json


!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim 
import folium

import pandas as pd

In [75]:
# Wikipedia page that lists the Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [76]:
# pd.read_html returns a list with one DataFrame per HTML table it parses,
# so `df` is a list of frames, not a single frame. header = 0 uses each
# table's first row as its column names.
df = pd.read_html(url, header = 0)

Check to see the number of tables on the webpage

In [77]:
# df is the list produced by pd.read_html, so this counts the parsed tables
len(df)

3

Out of the 3 tables, we only want the table with the info about different neighbourhoods

In [78]:
# first parsed table: one row per postal code with its Borough and Neighbourhood
df[0]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Create a dataframe for Canadian neighbourhoods using the first table and get its shape. We can see that there are 180 rows and 3 columns in this dataframe.

In [137]:
# df_ca is bound to the first parsed table itself (no copy is made here)
df_ca = df[0]
# (rows, columns) of the raw table, before any cleaning
df_ca.shape

(180, 3)

We clean the data, only keeping the Boroughs with an assigned name.

In [139]:
# Keep only the rows whose Borough is assigned, then renumber the index from 0.
# reset_index(drop=True) discards the old index directly, so no temporary
# 'index' column is created that would have to be dropped afterwards. Nothing
# is mutated in place, so this cell gives the same result when it is re-run.
df_ca = df_ca.loc[df_ca['Borough'] != 'Not assigned'].reset_index(drop=True)

In [142]:
# preview the first 10 rows of the filtered, re-indexed frame
df_ca.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Check the shape of the new dataframe. We now have 103 rows and 3 columns.

In [143]:
# (rows, columns) after removing the 'Not assigned' boroughs
df_ca.shape

(103, 3)

Load the geographic coordinates and have a look at the first few rows. I am loading them from the CSV file.

In [88]:
# CSV fetched over HTTP; per the preview below it holds one row per
# postal code with its Latitude and Longitude
geo_df = pd.read_csv('http://cocl.us/Geospatial_data')
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Create a new dataframe by merging the 2 dataframes and check if we have the correct number of rows. We can see that we did not lose any rows during this merge/join.

In [144]:
# Attach coordinates to every neighbourhood row, joining on the shared
# 'Postal Code' column. how='left' keeps every row of df_ca.
# validate='many_to_one' makes pandas raise a MergeError if a postal code
# occurs more than once in geo_df; without it such duplicates would silently
# add rows to the result.
new_df = pd.merge(df_ca, geo_df, on='Postal Code', how='left', validate='many_to_one')
new_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [145]:
# the row count should equal df_ca's; Latitude and Longitude are the added columns
new_df.shape

(103, 5)

Next, get all the unique Borough names. This is in order to create a dataframe that only contains Toronto Boroughs. 

In [146]:
# distinct borough names, in order of first appearance
new_df['Borough'].unique()

array(['North York', 'Downtown Toronto', 'Etobicoke', 'Scarborough',
       'East York', 'York', 'East Toronto', 'West Toronto',
       'Central Toronto', 'Mississauga'], dtype=object)

In [149]:
# Sanity check only: the rows whose borough is NOT one of the four
# "... Toronto" boroughs, i.e. the complement of the Toronto selection.
toronto_boroughs = ['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']
outside_toronto = ~new_df['Borough'].isin(toronto_boroughs)
else_data = new_df[outside_toronto]
else_data.shape

(64, 5)

In [206]:
# Rows belonging to the four boroughs whose name contains "Toronto"
toronto_boroughs = ['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']
toronto_data = new_df.loc[new_df['Borough'].isin(toronto_boroughs)]
toronto_data.shape

(39, 5)

If we add the number of rows in the two dataframes (64 + 39), we get 103, the total number of rows in the merged dataframe, so no rows were lost or counted twice. Below are the first few rows of the dataframe that contains only Toronto neighbourhoods.

In [209]:
# Renumber the rows from 0 after filtering. Reassigning avoids an in-place
# mutation of a frame that was produced by a boolean filter (which pandas may
# flag as modifying a copy of a slice).
toronto_data = toronto_data.reset_index(drop=True)
toronto_data.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In order to create the map, start by getting the geographic coordinates of Toronto.

In [210]:
from geopy.geocoders import Nominatim
address = 'Toronto'

# Look the address up with the Nominatim geocoder to get the map's centre point
geolocator = Nominatim(user_agent="explore_toronto")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Next, create the Folium map for Toronto by using the latitudes and longitudes. Add markers. I would like the popup to show both borough and neighbourhood. 

In [208]:
# Base map centred on the geocoded Toronto coordinates
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# One circle marker per row of toronto_data; the popup reads
# "<neighbourhood>, <borough>".
marker_rows = zip(
    toronto_data['Latitude'],
    toronto_data['Longitude'],
    toronto_data['Borough'],
    toronto_data['Neighbourhood'],
)
for lat, lng, borough, neighbourhood in marker_rows:
    # Must use the loop variable `neighbourhood` (British spelling, matching
    # the column name); any other spelling is an undefined name.
    label = '{}, {}'.format(neighbourhood, borough)
    # parse_html=True makes folium escape the label text, so names containing
    # apostrophes or other special characters render correctly in the popup.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)

# last expression of the cell, so the notebook renders the map
map_toronto

Thank you for your feedback!