# Applied Data Science Capstone Week 3

## Toronto WIKI Page Wrangling

In [33]:
import pandas as pd

In [34]:
#URL for the postcal codes of Canada
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"


In [35]:
import requests 
from bs4 import BeautifulSoup
response = requests.get(URL).text

In [36]:
#load the html contents of the URL into a response object
toronto = BeautifulSoup(response, 'lxml')

In [37]:
toronto_table = toronto.find('table',{'class': 'wikitable sortable'})
toronto_table_rows = toronto_table.find_all('tr')

In [38]:
#Get the data needed to create a dataframe
data = []
for row in toronto_table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])


In [39]:
#Create the dataframe and delete null values
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [40]:
#Delete dataframs that have a Neighbourhood of "Not Assigned"
df = df[~df['Borough'].str.contains('Not assigned')]
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [41]:
#group the dataframe by Postcode and Borough and concatenate all neighborhoods into comma separated list
df = df.groupby(['PostalCode', 'Borough'])['Neighbourhood'].apply(list).apply(lambda x: ', '.join(x)).to_frame()
df.reset_index(inplace = True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [42]:
#create a list of neighborhoods, replacing the borough where neighborhood is 'Not assigned'
new_neigh = df['Neighbourhood'].where(df['Neighbourhood'] != 'Not assigned', other = df['Borough'], axis = 0)
#construct new dataframe using postcode and borough from the previous dataframe and neighborhood from the above list
df = pd.concat([df['PostalCode'], df['Borough'], new_neigh], axis = 1)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [43]:
#get the shape
df.shape

(103, 3)

## Geocoding 

In [12]:
#install the geocoder package
!conda install -c conda-forge geocoder -y
import geocoder

Solving environment: done

# All requested packages already installed.



In [44]:
#function to return lat/long for a Postal Code
#Google doesn't work very well, so we are using arcgis as the service

def get_lat_long(postal_code):

    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    return latitude, longitude

In [45]:
## Use the new function to get the Lat/Long for each row of the dataframe
df[['Latitude','Longitude']] = df.apply(
    lambda row: pd.Series(get_lat_long(row['PostalCode'])), axis = 1)


In [77]:
df.head

<bound method NDFrame.head of     PostalCode           Borough  \
0          M1B       Scarborough   
1          M1C       Scarborough   
2          M1E       Scarborough   
3          M1G       Scarborough   
4          M1H       Scarborough   
5          M1J       Scarborough   
6          M1K       Scarborough   
7          M1L       Scarborough   
8          M1M       Scarborough   
9          M1N       Scarborough   
10         M1P       Scarborough   
11         M1R       Scarborough   
12         M1S       Scarborough   
13         M1T       Scarborough   
14         M1V       Scarborough   
15         M1W       Scarborough   
16         M1X       Scarborough   
17         M2H        North York   
18         M2J        North York   
19         M2K        North York   
20         M2L        North York   
21         M2M        North York   
22         M2N        North York   
23         M2P        North York   
24         M2R        North York   
25         M3A        North York  

# Using Foursquare to explore and cluster

Import the necessary libraries

In [48]:
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 
    
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          91 KB

The following NEW packages will be INSTALLED:

    geographiclib: 1.50-py_0   conda-forge
    geopy:         1.20.0-py_0 conda-forge


Downloading and Extracting Packages
geopy-1.20.0         | 57 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environ

In [198]:
# Create the map and visualize the boroughs
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) #

# add a red circle marker to represent Toronto Center City
folium.features.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Toronto Center',
    fill = True,
    fill_color = 'red',
    fill_opacity = 0.6
).add_to(venues_map)

# add the Boroughs as blue circle markers
for lat, lng, Borough in zip(toronto_borough.Latitude, toronto_borough.Longitude, toronto_borough.Borough):
    folium.features.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=Borough,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)
    

venues_map

In [199]:
#Get neigborhoods that area in Toronto
toronto_borough = df[df['Borough'].str.contains("Toronto")]

In [200]:
#Add my foursquare developer account info
CLIENT_ID = 'KLAFVCRKMQGPB453J343CKDZ410JEOGZSZ2VIWOA3GJYHKES' # your Foursquare ID
CLIENT_SECRET = '5FVMTXLCMLB5BI2VG00ABYZPI5ZAPMTSTXP42MPE2KVJRF2N' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: KLAFVCRKMQGPB453J343CKDZ410JEOGZSZ2VIWOA3GJYHKES
CLIENT_SECRET:5FVMTXLCMLB5BI2VG00ABYZPI5ZAPMTSTXP42MPE2KVJRF2N


## Identify medical facilities around toronto within a 500 meter 

In [211]:
## Identify Emergency Rooms around Toronto within 500 meters of lat/long

search_query = 'Medical'
radius = 20000
print(search_query + ' .... OK!')

Medical .... OK!


In [212]:
address = 'Toronto'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [213]:
#Define the Foursquire URL we will use for our Queries
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, search_query, radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/search?client_id=KLAFVCRKMQGPB453J343CKDZ410JEOGZSZ2VIWOA3GJYHKES&client_secret=5FVMTXLCMLB5BI2VG00ABYZPI5ZAPMTSTXP42MPE2KVJRF2N&ll=43.653963,-79.387207&v=20180604&query=Medical&radius=20000&limit=30'

In [214]:
#Get Request and examine results
results = requests.get(url).json()


In [215]:
#Get the relevant part of the returned JSON and convert it to a pandas dataframe
# assign relevant part of JSON to venues
venues = results['response']['venues']

# tranform venues into a dataframe
df_venues = json_normalize(venues)
df_venues.head()

Unnamed: 0,categories,hasPerk,id,location.address,location.cc,location.city,location.country,location.crossStreet,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.neighborhood,location.postalCode,location.state,name,referralId
0,"[{'id': '4bf58dd8d48988d1b3941735', 'name': 'M...",False,4adc80b5f964a520bb2c21e3,1 King's College Circle,CA,Toronto,Canada,University of Toronto,984,[1 King's College Circle (University of Toront...,"[{'label': 'display', 'lat': 43.66136154313333...",43.661362,-79.3939,,M5S 1A8,ON,Medical Sciences Building,v-1571421973
1,"[{'id': '4bf58dd8d48988d104941735', 'name': 'M...",False,4df0e138d4c04d0392c7e652,180 Dundas St. W,CA,Toronto,Canada,at Chestnut St.,170,"[180 Dundas St. W (at Chestnut St.), Toronto O...","[{'label': 'display', 'lat': 43.655395, 'lng':...",43.655395,-79.386459,,,ON,CIRA Medical Centre,v-1571421973
2,"[{'id': '4bf58dd8d48988d104941735', 'name': 'M...",False,4bd8761309ecb713ffe6487c,"150 York Street, Suite 910",CA,Toronto,Canada,Adelaide Street,533,"[150 York Street, Suite 910 (Adelaide Street),...","[{'label': 'display', 'lat': 43.64960424693243...",43.649604,-79.384442,,M5H 3S5,ON,The Toronto Centre For Medical Imaging,v-1571421973
3,"[{'id': '4bf58dd8d48988d104941735', 'name': 'M...",False,50772cc5498eb6cb971c71b8,14 College St,CA,Toronto,Canada,Yonge St,879,"[14 College St (Yonge St), Toronto ON, Canada]","[{'label': 'display', 'lat': 43.66144809396126...",43.661448,-79.383711,,,ON,Maple Leaf Medical Clinic,v-1571421973
4,"[{'id': '4bf58dd8d48988d104941735', 'name': 'M...",False,4cdf5495f8cdb1f738339112,30 Bond St.,CA,Toronto,Canada,,685,"[30 Bond St., Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65368444376282...",43.653684,-79.378706,,,ON,St Michael's Hospital Medical Imaging,v-1571421973


In [216]:
#Define data of interest and filter dataframe for just what we want
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in df_venues.columns if col.startswith('location.')] + ['id']
df_filtered = df_venues.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
df_filtered['categories'] = df_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
df_filtered.columns = [column.split('.')[-1] for column in df_filtered.columns]

df_filtered.head()

Unnamed: 0,name,categories,address,cc,city,country,crossStreet,distance,formattedAddress,labeledLatLngs,lat,lng,neighborhood,postalCode,state,id
0,Medical Sciences Building,Medical School,1 King's College Circle,CA,Toronto,Canada,University of Toronto,984,[1 King's College Circle (University of Toront...,"[{'label': 'display', 'lat': 43.66136154313333...",43.661362,-79.3939,,M5S 1A8,ON,4adc80b5f964a520bb2c21e3
1,CIRA Medical Centre,Medical Center,180 Dundas St. W,CA,Toronto,Canada,at Chestnut St.,170,"[180 Dundas St. W (at Chestnut St.), Toronto O...","[{'label': 'display', 'lat': 43.655395, 'lng':...",43.655395,-79.386459,,,ON,4df0e138d4c04d0392c7e652
2,The Toronto Centre For Medical Imaging,Medical Center,"150 York Street, Suite 910",CA,Toronto,Canada,Adelaide Street,533,"[150 York Street, Suite 910 (Adelaide Street),...","[{'label': 'display', 'lat': 43.64960424693243...",43.649604,-79.384442,,M5H 3S5,ON,4bd8761309ecb713ffe6487c
3,Maple Leaf Medical Clinic,Medical Center,14 College St,CA,Toronto,Canada,Yonge St,879,"[14 College St (Yonge St), Toronto ON, Canada]","[{'label': 'display', 'lat': 43.66144809396126...",43.661448,-79.383711,,,ON,50772cc5498eb6cb971c71b8
4,St Michael's Hospital Medical Imaging,Medical Center,30 Bond St.,CA,Toronto,Canada,,685,"[30 Bond St., Toronto ON, Canada]","[{'label': 'display', 'lat': 43.65368444376282...",43.653684,-79.378706,,,ON,4cdf5495f8cdb1f738339112


In [217]:
#add medical facilities to map 

for lat, lon, neigh, flg in zip(df_filtered['lat'], df_filtered['lng'], df_filtered['name'], df_filtered['categories']):
    label = folium.Popup(neigh, parse_html=True)
    folium.Marker(
    [lat, lon],
    popup=label).add_to(venues_map)
       
venues_map