# Toronto Clustering Assignment

## Part 1

In [168]:
import numpy as np
import pandas as pd 
import json
from geopy.geocoders import Nominatim
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.



In [3]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [4]:
parse = BeautifulSoup(data, 'html.parser')

In [5]:
postalcode = []
borough = []
neighborhood = []

In [6]:
# append the data into the respective lists
for row in parse.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalcode.append(cells[0].text)
        borough.append(cells[1].text)
        neighborhood.append(cells[2].text.rstrip('\n'))

In [7]:
toronto = pd.DataFrame({"Postalcode": postalcode,
                           "Borough": borough,
                           "Neighborhood": neighborhood})

toronto_df = toronto[toronto.Borough != "Not assigned"].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Borough,Neighborhood,Postalcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


In [8]:
toronto_df_agg = toronto_df.groupby(["Postalcode", "Borough"], as_index=False).agg(lambda a: ", ".join(a))
toronto_df_agg.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
for index, row in toronto_df_agg.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
toronto_df_agg.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
toronto_df_agg.shape

(103, 3)

## Part 2

In [11]:
coordinates = pd.read_csv("http://cocl.us/Geospatial_data")
coordinates.rename(columns={"Postal Code": "Postalcode"}, inplace=True)
coordinates.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
toronto_coor_df = toronto_df_agg.merge(coordinates, on="Postalcode", how="left")
toronto_coor_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Part 3

In [13]:
borough_list = list(toronto_coor_df.Borough.unique())
toronto_only = []
for a in borough_list:
    if "toronto" in a.lower():
        toronto_only.append(a)
        
toronto_coor_df = toronto_coor_df[toronto_coor_df['Borough'].isin(toronto_only)].reset_index(drop=True)
toronto_coor_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [19]:
geolocator = Nominatim(user_agent="my-application")
address = 'Toronto'
location = geolocator.geocode(address)

toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

for latitude1, longitude1, borough, neighborhood in zip(toronto_coor_df['Latitude'], toronto_coor_df['Longitude'], toronto_coor_df['Borough'], toronto_coor_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude1, longitude1],
        radius=3,
        popup=label,
        color='red').add_to(toronto_map)  
    
toronto_map

In [22]:
CLIENT_ID = 'PI4RVCMHSFBLP5J502OMYO0OZ32YSLX2GDE15VSIKCQKGM1O'
CLIENT_SECRET = 'I3U1KDCGSZ4LVGHZGEHKS33YA1LXXZ0IVESAEHPMOFFIVNLP'
VERSION = '20180605'

radius = 300
max = 100
venuelist = []

for latitude, longitude, postalcode, borough, neighborhood in zip(toronto_coor_df['Latitude'], toronto_coor_df['Longitude'], toronto_coor_df['Postalcode'], toronto_coor_df['Borough'], toronto_coor_df['Neighborhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        latitude,
        longitude,
        radius, 
        max)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    for venue in results:
        venuelist.append((
            postalcode, 
            borough,
            neighborhood,
            latitude, 
            longitude,
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [23]:
toronto_df_venue = pd.DataFrame(venuelist)
toronto_df_venue.columns = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude',  'Venue','VenueLat', 'VenueLong', 'VenueCat']
toronto_df_venue.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Venue,VenueLat,VenueLong,VenueCat
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
3,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Dolce Gelato,43.677773,-79.351187,Ice Cream Shop


In [126]:
toronto_summary = toronto_df_venue.groupby(["Postalcode", "Borough", "Neighborhood"]).count().reset_index()
toronto_summary.sort_values(["VenueCat"], inplace=True,ascending=False)
toronto_summary.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Venue,VenueLat,VenueLong,VenueCat
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",80,80,80,80,80,80
25,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",74,74,74,74,74,74
28,M5X,Downtown Toronto,"First Canadian Place, Underground city",72,72,72,72,72,72
20,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",72,72,72,72,72,72
19,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",66,66,66,66,66,66


In [183]:
clusters = 4
toronto_summary2 = toronto_summary.drop(["Postalcode", "Borough", "Neighborhood"], 1)
kmeans = KMeans(n_clusters=clusters, random_state=0).fit(toronto_summary2)
toronto_summary["Clusters"] = kmeans.labels_
toronto_summary.reset_index()
toronto_summary.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Venue,VenueLat,VenueLong,VenueCat,Clusters
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",80,80,80,80,80,80,0
25,M5T,Downtown Toronto,"Chinatown, Grange Park, Kensington Market",74,74,74,74,74,74,0
28,M5X,Downtown Toronto,"First Canadian Place, Underground city",72,72,72,72,72,72,0
20,M5K,Downtown Toronto,"Design Exchange, Toronto Dominion Centre",72,72,72,72,72,72,0
19,M5J,Downtown Toronto,"Harbourfront East, Toronto Islands, Union Station",66,66,66,66,66,66,0


In [184]:
toronto_final = toronto_summary.copy()
toronto_final = toronto_final.drop(["Latitude", "Longitude", "Venue", "VenueLat", "VenueLong", "VenueCat"], 1)
toronto_final = toronto_final.join(toronto_df_venue.drop(["Borough", "Neighborhood"], 1).set_index("Postalcode"), on="Postalcode")
toronto_final.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Clusters,Latitude,Longitude,Venue,VenueLat,VenueLong,VenueCat
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Equinox Bay Street,43.6481,-79.379989,Gym
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Canoe,43.647452,-79.38132,Restaurant
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Mos Mos Coffee,43.648159,-79.378745,Café
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Walrus Pub & Beer Hall,43.647375,-79.379515,Pub
21,M5L,Downtown Toronto,"Commerce Court, Victoria Hotel",0,43.648198,-79.379817,Maman,43.648309,-79.382253,Café


In [191]:
import matplotlib.colors as colors
final_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=12)

a = np.arange(clusters)
b = [i+a+(i*a)**2 for i in range(clusters)]
diffcolor = cm.rainbow(np.linspace(0, 1, len(b)))
colorscheme = [colors.rgb2hex(i) for i in diffcolor]

for postal, borough, neighbor, lat, long, cluster in zip(toronto_final['Postalcode'], toronto_final['Borough'], toronto_final['Neighborhood'], toronto_final['Latitude'], toronto_final['Longitude'], toronto_final['Clusters']):
    label = '({}) {}: {} - Cluster {}'.format(postal, borough, neighbor, cluster)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius = 3,
        popup = label,
        color = colorscheme[cluster]).add_to(final_map)
       
final_map

Toronto was clustered by number of venues within each neighborhood.