# Toronto Clustering Assignment

In [2]:
import json
import os

import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd 
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

In [3]:
# Download the Wikipedia page that lists the "M" postal codes.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as if it were data.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

In [4]:
# Turn the downloaded HTML text into a searchable tree using the 'html.parser' backend.
parse = BeautifulSoup(markup=data, features='html.parser')

In [5]:
# Three independent, initially empty accumulators: one per table column.
postalcode, borough, neighborhood = [], [], []

In [6]:
# Walk every row of the first table on the page and append its first three
# cells (postal code, borough, neighborhood) to the respective lists.
for row in parse.find('table').find_all('tr'):
    cells = row.find_all('td')
    # Skip rows without data cells, and rows with fewer than three <td> cells,
    # which would otherwise raise IndexError on cells[2].
    if len(cells) >= 3:
        # Strip surrounding whitespace from every cell (not only the last one) so
        # that later exact comparisons such as `Borough != "Not assigned"` are not
        # defeated by a stray trailing newline.
        postalcode.append(cells[0].text.strip())
        borough.append(cells[1].text.strip())
        neighborhood.append(cells[2].text.strip())

In [7]:
# Assemble the three scraped lists into a single table.
scraped_columns = {
    "Postalcode": postalcode,
    "Borough": borough,
    "Neighborhood": neighborhood,
}
toronto = pd.DataFrame(scraped_columns)

# Keep only rows whose borough is assigned, then renumber the rows from zero.
has_borough = toronto["Borough"] != "Not assigned"
toronto_df = toronto.loc[has_borough].reset_index(drop=True)
toronto_df.head()

Unnamed: 0,Borough,Neighborhood,Postalcode
0,North York,Parkwoods,M3A
1,North York,Victoria Village,M4A
2,Downtown Toronto,Harbourfront,M5A
3,Downtown Toronto,Regent Park,M5A
4,North York,Lawrence Heights,M6A


In [8]:
# Collapse to one row per (postal code, borough) pair; the neighborhood names
# that share a pair are joined into a single comma-separated string.
by_postcode = toronto_df.groupby(["Postalcode", "Borough"], as_index=False)
toronto_df_agg = by_postcode.agg(lambda names: ", ".join(names))
toronto_df_agg.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
# Where a postal code has no neighborhood name, fall back to its borough name.
# A boolean mask with .loc writes to the frame itself. The rows yielded by
# DataFrame.iterrows() may be copies, so assigning to them is not guaranteed
# to change the frame at all.
unnamed = toronto_df_agg["Neighborhood"] == "Not assigned"
toronto_df_agg.loc[unnamed, "Neighborhood"] = toronto_df_agg.loc[unnamed, "Borough"]
toronto_df_agg.head()

Unnamed: 0,Postalcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
# Load the postal-code coordinate table and rename its "Postal Code" column
# to "Postalcode" so it matches the key column of the scraped table.
coordinates = pd.read_csv("http://cocl.us/Geospatial_data").rename(
    columns={"Postal Code": "Postalcode"}
)
coordinates.head()

Unnamed: 0,Postalcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Attach latitude/longitude to each postal code. The left join keeps every
# aggregated row even if the coordinate table has no matching postal code.
toronto_coor_df = pd.merge(toronto_df_agg, coordinates, how="left", on="Postalcode")
toronto_coor_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [13]:
# Distinct borough names present in the table.
borough_list = toronto_coor_df["Borough"].unique().tolist()

# Boroughs whose name contains "toronto", compared case-insensitively.
toronto_only = [name for name in borough_list if "toronto" in name.lower()]

# Restrict the table to those boroughs and renumber the rows from zero.
in_toronto = toronto_coor_df["Borough"].isin(toronto_only)
toronto_coor_df = toronto_coor_df[in_toronto].reset_index(drop=True)
toronto_coor_df.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [14]:
# Geocode the city name to obtain a centre point for the map.
geolocator = Nominatim(user_agent="my-application")
address = 'Toronto'
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError on `location.latitude` below.
if location is None:
    raise ValueError("Could not geocode address {!r}".format(address))

toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# Draw one small red circle per row, with a "<neighborhood>, <borough>" popup.
# The loop variables are deliberately NOT called `borough` / `neighborhood`:
# those names hold the module-level lists filled by the scraping cells, and
# rebinding them to strings would make those cells fail when re-run.
for marker_lat, marker_lon, borough_name, neighborhood_name in zip(toronto_coor_df['Latitude'], toronto_coor_df['Longitude'], toronto_coor_df['Borough'], toronto_coor_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [marker_lat, marker_lon],
        radius=3,
        popup=label,
        color='red').add_to(toronto_map)

toronto_map

In [15]:
# Foursquare API credentials are read from the environment so that secrets are
# never stored in the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel; a missing variable
# raises a KeyError that names it.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20180605'  # sent as the API's `v` query parameter

radius = 300  # sent as the `radius` query parameter
limit = 100   # sent as the `limit` query parameter; not named `max`, which would shadow the builtin
venuelist = []

# Query the "explore" endpoint once per row and record every returned venue
# together with the postal code / borough / neighborhood it was found for.
# The string loop variables use their own names so they do not rebind the
# module-level `postalcode`, `borough` and `neighborhood` lists.
for latitude, longitude, postalcode_value, borough_name, neighborhood_name in zip(toronto_coor_df['Latitude'], toronto_coor_df['Longitude'], toronto_coor_df['Postalcode'], toronto_coor_df['Borough'], toronto_coor_df['Neighborhood']):
    # Let requests build and escape the query string instead of formatting it by hand.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(latitude, longitude),
            "radius": radius,
            "limit": limit,
        },
        timeout=30,  # do not hang the kernel on a stalled connection
    )
    # Surface HTTP errors (bad credentials, quota, ...) here rather than as a
    # confusing KeyError while indexing the JSON payload.
    response.raise_for_status()

    results = response.json()["response"]['groups'][0]['items']
    for venue in results:
        # A venue may come back with an empty category list; record None
        # instead of raising IndexError on categories[0].
        categories = venue['venue']['categories']
        venuelist.append((
            postalcode_value,
            borough_name,
            neighborhood_name,
            latitude,
            longitude,
            venue['venue']['name'],
            venue['venue']['location']['lat'],
            venue['venue']['location']['lng'],
            categories[0]['name'] if categories else None))

In [16]:
# Build the venue table with its column names in one step. Passing `columns=`
# to the constructor (instead of assigning `.columns` afterwards) also yields a
# correctly labelled empty table when no venues were collected, rather than a
# length-mismatch error.
venue_columns = ['Postalcode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude',  'Venue','VenueLat', 'VenueLong', 'VenueCat']
toronto_df_venue = pd.DataFrame(venuelist, columns=venue_columns)
toronto_df_venue.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude,Venue,VenueLat,VenueLong,VenueCat
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Park,43.675278,-79.294647,Park
1,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Stewart Ravine,43.6763,-79.294784,Other Great Outdoors
2,M4E,East Toronto,The Beaches,43.676357,-79.293031,Balmy Beach Playground,43.676078,-79.290805,Playground
3,M4E,East Toronto,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [17]:
# One-hot encode the venue category: one indicator column per distinct
# category, named exactly after the category (empty prefix and separator).
category_frame = toronto_df_venue[['VenueCat']]
tor_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Carry each venue's neighborhood alongside its indicator columns.
# NOTE(review): a venue category literally named "Neighborhood" would be
# overwritten by this column - confirm no such category occurs in the data.
tor_onehot = tor_onehot.assign(Neighborhood=toronto_df_venue['Neighborhood'])

print(tor_onehot.shape)
tor_onehot.head()

(942, 189)


Unnamed: 0,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Average the indicator columns within each neighborhood, giving the share of
# that neighborhood's venues falling in each category; then turn the group
# key back into an ordinary column.
neighborhood_groups = tor_onehot.groupby(["Neighborhood"])
tor_neighbor = neighborhood_groups.mean().reset_index()
tor_neighbor

Unnamed: 0,Neighborhood,Adult Boutique,Airport Food Court,Airport Gate,Airport Lounge,Airport Terminal,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,...,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.052632,0.0,0.0,0.052632,...,0.0,0.0,0.017544,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014286,0.0,...,0.014286,0.0,0.057143,0.0,0.028571,0.0,0.014286,0.0,0.0,0.0
8,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Church and Wellesley,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.018519,0.0,0.0,0.018519,0.018519,0.018519


In [19]:
# Keep only the neighborhood name and its "Park" share.
tor_park = tor_neighbor.loc[:, ["Neighborhood", "Park"]]
tor_park.head()

Unnamed: 0,Neighborhood,Park
0,"Adelaide, King, Richmond",0.017544
1,Berczy Park,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.142857
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0


In [20]:
# Number of clusters to fit.
kclusters = 3

# Cluster on the "Park" column alone. The axis is given by keyword
# (`columns=`) because passing it positionally, as in drop(labels, 1), is
# rejected by newer pandas releases.
tor_cluster = tor_park.drop(columns=["Neighborhood"])

# Run k-means clustering. random_state fixes the initialisation, and n_init is
# pinned to 10 so the fit does not depend on which default the installed
# scikit-learn version uses.
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(tor_cluster)

# Cluster labels assigned to the first ten rows.
kmeans.labels_[0:10]

array([0, 0, 0, 2, 0, 0, 0, 0, 0, 0], dtype=int32)

In [21]:
# Neighborhood name plus its coordinates, used to look up a position by name.
temp = toronto_coor_df.loc[:, ["Neighborhood", "Latitude", "Longitude"]]
temp.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,The Beaches,43.676357,-79.293031
1,"The Danforth West, Riverdale",43.679557,-79.352188
2,"The Beaches West, India Bazaar",43.668999,-79.315572
3,Studio District,43.659526,-79.340923
4,Lawrence Park,43.72802,-79.38879


In [22]:
# Combine, per neighborhood: its "Park" share, the k-means cluster label, and
# the latitude/longitude looked up by neighborhood name.
tor_final = tor_park.assign(**{"Cluster Labels": kmeans.labels_})
coords_by_name = temp.set_index("Neighborhood")
tor_final = tor_final.join(coords_by_name, on="Neighborhood")
tor_final

Unnamed: 0,Neighborhood,Park,Cluster Labels,Latitude,Longitude
0,"Adelaide, King, Richmond",0.017544,0,43.650571,-79.384568
1,Berczy Park,0.0,0,43.644771,-79.373306
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0,43.636847,-79.428191
3,Business Reply Mail Processing Centre 969 Eastern,0.142857,2,43.662744,-79.321558
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0,43.628947,-79.39442
5,"Cabbagetown, St. James Town",0.0,0,43.667967,-79.367675
6,Central Bay Street,0.03125,0,43.657952,-79.387383
7,"Chinatown, Grange Park, Kensington Market",0.0,0,43.653206,-79.400049
8,Christie,0.0,0,43.669542,-79.422564
9,Church and Wellesley,0.018519,0,43.66586,-79.38316


In [23]:
# Order the rows by cluster label so that members of a cluster appear together.
tor_final = tor_final.sort_values(by="Cluster Labels")
tor_final

Unnamed: 0,Neighborhood,Park,Cluster Labels,Latitude,Longitude
0,"Adelaide, King, Richmond",0.017544,0,43.650571,-79.384568
31,Studio District,0.0,0,43.659526,-79.340923
30,Stn A PO Boxes 25 The Esplanade,0.022222,0,43.646435,-79.374846
29,St. James Town,0.025641,0,43.651494,-79.375418
28,"Ryerson, Garden District",0.0,0,43.657162,-79.378937
27,"Runnymede, Swansea",0.0,0,43.651571,-79.48445
26,Roselawn,0.0,0,43.711695,-79.416936
24,"Parkdale, Roncesvalles",0.0,0,43.64896,-79.456325
23,North Toronto West,0.0,0,43.715383,-79.405678
21,"Little Portugal, Trinity",0.0,0,43.647927,-79.41975


In [27]:
# Centre the cluster map on the geocoded city location. The bare names
# `latitude` / `longitude` must not be used here: they are loop variables of
# the venue-download cell and only hold the coordinates of the last row that
# loop processed.
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# One distinct colour per cluster, sampled evenly from the rainbow colormap
# and converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per neighborhood, coloured by its cluster label.
for lat, lon, poi, cluster in zip(tor_final['Latitude'], tor_final['Longitude'], tor_final['Neighborhood'], tor_final['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # Index cluster-1 wraps cluster 0 around to the last colour; every cluster
    # still gets its own colour.
    marker_color = rainbow[cluster - 1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

Conclusion:

Cluster 1 has relatively many parks, while cluster 2 has a moderate number of parks. Cluster 0, on the other hand, has few to no parks. Parks should therefore be built in neighborhoods in cluster 0. Cluster 0 also contains the most neighborhoods, providing several options for potential park locations.