In [238]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes the column/row display limits, so DataFrames are shown in full
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# Confirmation message that every import above succeeded
print('Libraries imported.')

Libraries imported.


# Part 1: Get a table of postal codes as a pandas DataFrame

In [239]:
#Install beautifulsoup requirements
#!conda install beautifulsoup4
#!conda install lxml
#!conda install requests

from bs4 import BeautifulSoup
import lxml
import requests

In [240]:
#Get the data from the Wikipedia website
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Bound the request so a stalled connection cannot hang the notebook
response = requests.get(url, timeout=30)
#Fail loudly on an HTTP error instead of parsing an error page as if it were the article
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
tables = soup.find_all('table')
if not tables:
    raise ValueError('No <table> element found at {}'.format(url))
#The first table on the page is used as the postal code table
my_table = tables[0]
#Collect every <th> and <tr> element of that table; elements without <td>
#children contribute no cell values when the rows are read out
table_rows = my_table.find_all(['th', 'tr'])

In [241]:
#Transform the scraped HTML table rows into a pandas df
#Column labels for the three cells of each table row (defined here so the
#cell runs on a fresh kernel)
column_names = ['PostalCode', 'Borough', 'Neighborhood']
a = []
for tr in table_rows:
    td = tr.find_all('td')
    #One list of cell texts per element; elements without <td> cells give an empty list
    row = [cell.text for cell in td]
    a.append(row)
df = pd.DataFrame(a, columns=column_names)

In [242]:
# Preview the first rows of the scraped table
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,,,
2,,,
3,,,
4,M1A,Not assigned,Not assigned\n


In [243]:
#Remove surrounding whitespace (such as trailing newlines) from each text column
for text_column in ('PostalCode', 'Borough', 'Neighborhood'):
    df[text_column] = df[text_column].str.strip()

In [244]:
#Clean up dataframe: remove 'Not assigned' boroughs, then drop rows with None values.
#The explicit copy makes df2 independent of df, so the column assignment below
#never writes into (or warns about) a slice of the original frame
df2 = df.loc[df['Borough'] != 'Not assigned'].dropna().copy()
#Assign the borough to Not assigned neighborhoods.
#np.where is called directly: the pd.np alias is deprecated and absent from pandas 2.0
df2['Neighborhood'] = np.where(
    df2['Neighborhood'].str.contains('Not assigned'),
    df2['Borough'],
    df2['Neighborhood'],
)

In [245]:
#Collect all neighborhoods that share a postal code and borough into one row
neighborhoods_by_code = df2.groupby(['PostalCode', 'Borough'])['Neighborhood']
df3 = neighborhoods_by_code.apply(np.hstack).to_frame().reset_index()
#Make sure the third column carries the name "Neighborhood"
df3 = df3.rename(columns={df3.columns[2]: "Neighborhood"})
display(df3)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"[Rouge, Malvern]"
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]"
3,M1G,Scarborough,[Woburn]
4,M1H,Scarborough,[Cedarbrae]
5,M1J,Scarborough,[Scarborough Village]
6,M1K,Scarborough,"[East Birchmount Park, Ionview, Kennedy Park]"
7,M1L,Scarborough,"[Clairlea, Golden Mile, Oakridge]"
8,M1M,Scarborough,"[Cliffcrest, Cliffside, Scarborough Village West]"
9,M1N,Scarborough,"[Birch Cliff, Cliffside West]"


In [246]:
#Show the (rows, columns) shape of the cleaned, grouped dataframe
df3.shape

(103, 3)

# Part 2: Add latitude and longitude

In [247]:
#!pip install geocoder
import geocoder
#Work on an independent copy: a plain `df4 = df3` only creates a second name for
#the same object, so adding columns to df4 would silently modify df3 as well
df4 = df3.copy()

In [264]:
#Create function to retrieve coordinates for a postal code in Toronto
def get_geocoder(postal_code_from_df, max_attempts=10):
    """Look up the latitude and longitude of a Toronto postal code.

    The lookup is retried while the geocoder returns no coordinates, up to
    ``max_attempts`` times, so that a failing service cannot block forever.

    Parameters
    ----------
    postal_code_from_df : str
        Postal code to look up; it is combined with ', Toronto, Ontario'
        to form the query.
    max_attempts : int, optional
        Maximum number of lookups before giving up (default 10).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` on success, or ``(None, None)`` when no
        attempt produced coordinates.
    """
    query = '{}, Toronto, Ontario'.format(postal_code_from_df)
    for _ in range(max_attempts):
        lat_lng_coords = geocoder.google(query).latlng
        # Only unpack an actual result; a missing result means "try again".
        # (Previously the unpacking ran inside the loop body before the retry
        # check, so a missing result raised TypeError instead of retrying.)
        if lat_lng_coords:
            return lat_lng_coords[0], lat_lng_coords[1]
    return None, None

#Look up every postal code, then split the (latitude, longitude) pairs
#into two sequences and store them as new columns of the df
latitudes, longitudes = zip(*df4['PostalCode'].apply(get_geocoder))
df4['Latitude'] = latitudes
df4['Longitude'] = longitudes

df4.head()

KeyboardInterrupt: 

In [267]:
#The geocoder lookup above is taking very long, so read the coordinates from a csv instead
geo = pd.read_csv('https://cocl.us/Geospatial_data')
#Rename the postal code column so it matches the key column of df4
geo = geo.rename(columns={'Postal Code': 'PostalCode'})
#Drop coordinate columns that a (partial) geocoder run may have left on df4, so the
#merge neither joins on them nor produces suffixed duplicate columns
df4_without_coords = df4.drop(columns=['Latitude', 'Longitude'], errors='ignore')
#Merge geo with df, joining explicitly on the postal code only
df5 = pd.merge(df4_without_coords, geo, how='left', on='PostalCode')
df5.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"[Rouge, Malvern]",43.806686,-79.194353
1,M1C,Scarborough,"[Highland Creek, Rouge Hill, Port Union]",43.784535,-79.160497
2,M1E,Scarborough,"[Guildwood, Morningside, West Hill]",43.763573,-79.188711
3,M1G,Scarborough,[Woburn],43.770992,-79.216917
4,M1H,Scarborough,[Cedarbrae],43.773136,-79.239476


# Part 3: Visualize the clusters of neighborhoods on a map

In [268]:
#Look up the coordinates of Toronto to use as the center of the map
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
#Give the lookup an explicit, more generous timeout than the library default
location = geolocator.geocode(address, timeout=10)
#geocode() returns None when nothing matches; report that clearly instead of
#failing later with an AttributeError on None
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [273]:
#Create map of Toronto using latitude and longitude values from the previously created df
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

#Add one circle marker per postal code to the map
for lat, lng, borough, neighborhood in zip(df5['Latitude'], df5['Longitude'], df5['Borough'], df5['Neighborhood']):
    #Each Neighborhood entry holds a sequence of names; join them so the popup shows
    #readable text ("Rouge, Malvern") instead of the sequence's repr
    if isinstance(neighborhood, str) or not hasattr(neighborhood, '__iter__'):
        neighborhood_names = str(neighborhood)
    else:
        neighborhood_names = ', '.join(str(name) for name in neighborhood)
    label = '{}, {}'.format(neighborhood_names, borough)
    #parse_html=True makes folium treat the label as plain text rather than raw HTML
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
#Show map
map_toronto