# PART 1: Scrape wikipedia page for Toronto data

In [None]:
#!conda update -n base -c defaults conda
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c conda-forge geopy --yes
#!conda install -c conda-forge geocoder --yes

In [1]:
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import pandas as pd
import numpy as np
import requests
import re
import folium
import os
import json
import matplotlib.cm as cm
import matplotlib.colors as colors
#import geocoder

<b>Use Beautiful Soup to fetch and parse the postal-code page from Wikipedia</b>

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fail fast on a hung connection or an HTTP error status instead of silently
# parsing whatever body came back.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_doc = response.content
soup = BeautifulSoup(html_doc, 'html.parser')

<b>Extract the table rows from the HTML parsed by Beautiful Soup, clean them up a bit, and put them in a list named "rows"</b>

In [3]:
columns = ['Postcode', 'Borough', 'Neighbourhood']
rows = []
# Walk every <tr> of the first table on the page except the first one
# (presumably the header row).
for table_row in soup.table.tbody.find_all('tr')[1:]:
    # get_text() flattens the whole cell -- plain text and the text of any
    # nested tags -- so nothing after the first child node is lost and no
    # regex tag stripping is needed.
    row = [cell.get_text().replace('\n', '').strip()
           for cell in table_row.find_all('td')]
    # A row without any <td> cells would otherwise end up as an all-empty
    # record in the dataframe built from this list.
    if row:
        rows.append(row)

<b>Create a dataframe from the list</b>

In [4]:
# One dataframe row per scraped table row, labelled with the three column names.
df = pd.DataFrame(rows, columns=columns)

<b>Only keep rows with an assigned borough</b>

In [5]:
# Keep only rows with an assigned borough. The .str accessor tolerates missing
# values, which apply(str.lower) would raise on.
df_cleaned = df[df['Borough'].str.lower() != 'not assigned']


def replace_not_assigned(df_row):
    """Return the row with an unassigned neighbourhood replaced by its borough.

    A row whose Neighbourhood is anything other than 'not assigned'
    (compared case-insensitively) is returned unchanged.
    """
    if df_row.Neighbourhood.lower() == 'not assigned':
        # Work on a copy: mutating the object handed to DataFrame.apply is
        # not supported by pandas.
        df_row = df_row.copy()
        df_row['Neighbourhood'] = df_row['Borough']
    return df_row


df_cleaned = df_cleaned.apply(replace_not_assigned, axis=1)

<b>Group neighborhoods that share a common postcode</b>

In [6]:
# One row per (Postcode, Borough) pair; the neighbourhood names of each pair
# are concatenated with ', '.
neighbourhoods_joined = df_cleaned.groupby(['Postcode', 'Borough']).agg({'Neighbourhood': ', '.join})
df_cleaned = neighbourhoods_joined.reset_index()

In [7]:
df_cleaned.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


<b>Print the dataframe's dimensions</b>

In [8]:
df_cleaned.shape

(103, 3)

# PART 2: Get latitude and longitude for each row of dataframe

<b>Extract the coordinate data from the provided csv file on coursera</b>

In [9]:
# Coordinates table read from a CSV in the working directory.
coordinates_csv = 'Geospatial_Coordinates.csv'
geo = pd.read_csv(coordinates_csv)
geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


<b>Join the coordinates dataframe to the dataframe made in part 1</b>

In [10]:
# Index both frames by postal code so that join() lines them up, then turn
# the postal code back into an ordinary column.
postcode_indexed = df_cleaned.set_index('Postcode')
coordinates_indexed = geo.set_index('Postal Code')
df_final = postcode_indexed.join(coordinates_indexed).reset_index()
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


<b>Geocoder didn't seem to work so the provided csv file had to be used</b>

In [11]:
"""
latitude = []
longitude = []
lat_long = None

# loop until you get the coordinates
for postal_code in df_cleaned['Postcode']:
    while(lat_long is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
        print(g.latlng)
        lat_long = g.lat_long
    latitude.append(lat_long[0])
    longitude.append(lat_long[1])
""";

# PART 3: Clustering and visualizing the data

<b>Get Toronto's coordinates</b>

In [12]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the address cannot be resolved; report that
# clearly instead of failing on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto, Ontario are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto, Ontario are 43.653963, -79.387207.


<b>Here is a map of Toronto with markers for each of its neighborhoods</b>

In [13]:
# Base map centred on the latitude/longitude obtained for Toronto.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One blue circle marker per dataframe row, with a "<neighbourhood>, <borough>" popup.
marker_fields = df_final[['Latitude', 'Longitude', 'Borough', 'Neighbourhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

<b>Read Foursquare credentials from a local file on my machine</b>

In [14]:
# Build the path with os.path so it resolves on any operating system, not
# only where the backslash is the path separator.
local_filepath = os.path.join(os.pardir, 'foursquare_credentials.txt')
# The context manager closes the file even if reading fails.
with open(local_filepath, "r") as f:
    contents = f.read()
# The file is expected to hold a JSON object with the three keys read below.
credentials = json.loads(contents)

CLIENT_ID = credentials['CLIENT_ID']
CLIENT_SECRET = credentials['CLIENT_SECRET']
VERSION = credentials['VERSION']

<b>Define function for getting nearby venues for a particular neighborhood</b>

In [15]:
LIMIT = 100
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each given location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location to query; iterated in
        lockstep with zip(), so the shortest one bounds the number of queries.
    radius : int, default 500
        Sent as the ``radius`` query parameter (presumably metres -- confirm
        against the Foursquare API documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhood',
        'Neighbourhood Latitude', 'Neighbourhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        keeps these columns even when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    venue_columns = ['Neighbourhood',
                     'Neighbourhood Latitude',
                     'Neighbourhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests assemble and escape the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(explore_url, params=params, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]["groups"][0]["items"]

        for item in results:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may come back with an empty category list; keep the
                # venue and leave its category missing instead of raising.
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the frame well formed
    # even when venue_rows is empty.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)
    return(nearby_venues)

<b>Get nearby venues for Toronto neighborhoods and store them in the dataframe toronto_venues</b>

In [16]:
# Query venues around every row of df_final, labelled by its neighbourhood string.
toronto_venues = getNearbyVenues(
    df_final['Neighbourhood'],
    df_final['Latitude'],
    df_final['Longitude'],
)
toronto_venues.head()

Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


<b>One hot encode the venue category column to prepare it for analysis</b>

In [17]:
# Indicator columns, one per venue category; the empty prefix and separator
# leave each column named exactly after its category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Attach the neighbourhood as the last column ...
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood']

# ... then rotate that last column to the front.
all_columns = list(toronto_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Rouge, Malvern",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Highland Creek, Rouge Hill, Port Union",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Guildwood, Morningside, West Hill",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<b>Group the one hot encoded dataframe by neighborhood</b>

In [18]:
# Average every indicator column within each neighbourhood, then restore
# 'Neighbourhood' as a regular column.
category_means = toronto_onehot.groupby('Neighbourhood').mean()
toronto_grouped = category_means.reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,...,0.0,0.02,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<b>Define a function which takes in a dataframe row and returns the venue types most common in its neighborhood</b>

In [19]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first entry of ``row`` is skipped; the remaining entries are sorted
    by value in descending order and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

<b>Create a dataframe containing the neighborhoods and their top venue types</b>

In [20]:
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# Column headers: "1st Most Common Venue", "2nd ...", "3rd ...", then "4th ...".
# The suffix is chosen explicitly rather than by catching an exception, so an
# unrelated error can never be swallowed.
columns = ['Neighbourhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# Rank the categories of every neighbourhood first, then build the frame in
# one go instead of assigning into it row by row.
top_venues = [
    return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(top_venues, columns=columns[1:])
neighborhoods_venues_sorted.insert(0, 'Neighbourhood', toronto_grouped['Neighbourhood'].values)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai Restaurant
1,Agincourt,Latin American Restaurant,Lounge,Skating Rink
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Park,Playground,Yoga Studio
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",Grocery Store,Sandwich Place,Liquor Store
4,"Alderwood, Long Branch",Pizza Place,Gym,Coffee Shop


<b>Set the number of clusters and run kmeans on the dataframe after dropping its neighborhood column</b>

In [21]:
kclusters = 2
# Cluster on the category columns only. The axis is passed by keyword because
# recent pandas releases no longer accept it positionally.
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', axis=1)
# random_state is fixed so that repeated runs give the same labels.
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

<b>Add the cluster labels back to the toronto_merged dataframe</b>

In [22]:
# DataFrame.insert raises if the column already exists, which made this cell
# fail when run a second time; overwrite the labels in that case.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the label and top venues to every row of df_final by neighbourhood
# string. Rows with any missing value after the join are dropped, after which
# the labels can be stored as integers.
toronto_merged = (
    df_final
    .join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')
    .dropna()
    .astype({'Cluster Labels': 'int32'})
)
toronto_merged.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,0,Fast Food Restaurant,Department Store,Event Space
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,0,Bar,Yoga Studio,Drugstore
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0,Medical Center,Electronics Store,Rental Car Location
3,M1G,Scarborough,Woburn,43.770992,-79.216917,0,Coffee Shop,Korean Restaurant,Mexican Restaurant
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,0,Fried Chicken Joint,Gas Station,Hakka Restaurant


<b>Mark the clusters on the map, where the color is determined by the cluster label</b>

In [23]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced colour of the rainbow colormap per cluster, as hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Index the palette with the label itself: rainbow[cluster-1] only worked
    # for cluster 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

<b>Print out the number of rows assigned to each cluster</b>

In [24]:
# Report how many rows of toronto_merged carry each cluster label.
for i in range(kclusters):
    cluster = toronto_merged[toronto_merged['Cluster Labels'] == i]
    row_count = str(len(cluster)).rjust(3)
    print('Cluster {} has {} rows'.format(i, row_count))

Cluster 0 has  86 rows
Cluster 1 has  15 rows


<b>Print out the most popular venues in each cluster</b>

<b><font color="red">It appears that cluster 0 mainly features places to eat or drink while cluster 1 mainly includes recreational places and activities</font></b>

In [25]:
# For each cluster, count how often every venue type appears in the three
# "most common venue" columns and print the five largest totals.
rank_columns = ['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue']
for i in range(kclusters):
    print('Top 5 venue types in Cluster {}:\n'.format(i))
    cluster = toronto_merged[toronto_merged['Cluster Labels'] == i]
    totals = None
    for rank_column in rank_columns:
        counts = cluster[rank_column].value_counts()
        # fill_value=0 keeps venue types that are missing from one of the columns.
        totals = counts if totals is None else totals.add(counts, fill_value=0)
    top_five = totals.sort_values(ascending=False)[:5]
    print(top_five, '\n')
    print('___________________________________\n')

Top 5 venue types in Cluster 0:

Coffee Shop       36.0
Café              18.0
Park              10.0
Pizza Place        9.0
Sandwich Place     8.0
dtype: float64 

___________________________________

Top 5 venue types in Cluster 1:

Park           13.0
Playground      3.0
Yoga Studio     2.0
Trail           2.0
Bus Line        2.0
dtype: float64 

___________________________________



<b>Print out the Boroughs and neighborhoods that belong to each cluster</b>

In [26]:
# Print the set of boroughs that occur in each cluster. The leftover
# "Neighborhoods in Cluster" header, which had nothing printed under it, is
# dropped here.
for i in range(kclusters):
    cluster = toronto_merged[toronto_merged['Cluster Labels'] == i]
    print('Boroughs in Cluster {}:\n'.format(i))
    boroughs = set()
    # Reuse the already filtered frame instead of filtering a second time.
    for group in cluster['Borough'].unique():
        # Split in case a value holds several comma-separated names.
        for borough in group.split(', '):
            boroughs.add(borough)
    print(boroughs, '\n')
    print('___________________________________\n')

Boroughs in Cluster 0:

{'York', 'Scarborough', 'Mississauga', "Queen's Park", 'North York', 'Central Toronto', 'Etobicoke', 'West Toronto', 'East Toronto', 'Downtown Toronto', 'East York'} 

Neighborhoods in Cluster 0:

___________________________________

Boroughs in Cluster 1:

{'York', 'Scarborough', 'North York', 'Central Toronto', 'Etobicoke', 'Downtown Toronto', 'East York'} 

Neighborhoods in Cluster 1:

___________________________________



In [27]:
# Print the set of individual neighbourhood names found in each cluster; the
# comma-joined neighbourhood strings are split back into single names.
for i in range(kclusters):
    cluster = toronto_merged[toronto_merged['Cluster Labels'] == i]
    print('Neighborhoods in Cluster {}:\n'.format(i))
    neighs = {
        name
        for joined_names in cluster['Neighbourhood'].unique()
        for name in joined_names.split(', ')
    }
    print(neighs)
    print('___________________________________\n')

Neighborhoods in Cluster 0:

{'Cabbagetown', 'Underground city', 'Thistletown', 'Willowdale South', 'Roselawn', "King's Mill Park", 'Bedford Park', 'The Queensway West', 'Adelaide', 'Beaumond Heights', 'Victoria Hotel', 'Hillcrest Village', 'Cliffcrest', 'Cliffside West', 'Summerhill West', 'Roncesvalles', 'King', 'Lawrence Manor East', 'The Junction South', 'Humber Summit', 'Birch Cliff', 'Forest Hill SE', 'Woodbine Gardens', 'Bathurst Quay', 'Humewood-Cedarvale', 'Sunnylea', 'Railway Lands', 'Harbourfront East', 'Ryerson', 'Oriole', 'The Junction North', 'Silverstone', 'CN Tower', 'North Toronto West', 'Dufferin', 'Exhibition Place', 'Guildwood', 'India Bazaar', 'The Queensway East', 'Albion Gardens', 'West Hill', 'Keelesdale', 'Trinity', 'Riverdale', 'Maryvale', 'Agincourt', 'The Beaches', 'Rouge Hill', 'North Midtown', 'Dovercourt Village', 'Little Portugal', 'Downsview North', 'The Beaches West', 'Toronto Islands', "Queen's Park", 'Bloordale Gardens', 'Northwest', 'Harbord', 'Clov