In [3]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
import os

from project_lib import Project

# Credentials are read from the environment so they are never stored in the
# notebook file. PROJECT_ID and PROJECT_ACCESS_TOKEN must be set before this
# cell runs; a missing variable raises KeyError.
# NOTE(review): a token that was ever saved in a shared notebook should be revoked and reissued.
project = Project(
    project_id=os.environ['PROJECT_ID'],
    project_access_token=os.environ['PROJECT_ACCESS_TOKEN'],
)
pc = project.project_context


In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

from sklearn.cluster import KMeans 
from sklearn.datasets import make_blobs

from geopy.geocoders import Nominatim

import folium

import matplotlib.cm as cm
import matplotlib.colors as colors

## Open the csv file:

In [2]:
# Load the borough / neighborhood coordinate table from the CSV file that
# sits next to this notebook.
tc_df = pd.read_csv('Toronto_Coordinates.csv')

# Preview the first five rows.
tc_df.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhoods,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


Find the geographical coordinates for the city of Toronto:

In [3]:
# Look up the coordinates of Toronto with the Nominatim geocoding service.
geolocator = Nominatim(user_agent="Toronto_explorer")
location = geolocator.geocode('Toronto, Canada')

# geocode() returns None when the query has no match; stop with a clear
# message instead of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError("Geocoding returned no result for 'Toronto, Canada'")

t_lat = location.latitude
t_lng = location.longitude
print('Toronto coordinates:\nLatitude = {}\nLongitude = {}'.format(t_lat, t_lng))

Toronto coordinates:
Latitude = 43.6534817
Longitude = -79.3839347


## Map of the Toronto Boroughs:
Create a folium map of Toronto and all of the boroughs based on the coordinates in our table.

In [4]:
# Base map centred on the geocoded Toronto coordinates.
tmap = folium.Map(location=[t_lat, t_lng])

# Appearance shared by every marker on this overview map.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='blue',
    fill_opacity=0.6,
)

# One circle marker per table row; the popup shows the borough and its neighborhoods.
for borough, neighborhoods, latitude, longitude in zip(
    tc_df.Borough, tc_df.Neighborhoods, tc_df.Latitude, tc_df.Longitude
):
    popup_text = '{}:\n{}'.format(borough, neighborhoods)
    marker = folium.CircleMarker(
        location=[latitude, longitude],
        popup=folium.Popup(html=popup_text, parse_html=True),
        **marker_style
    )
    marker.add_to(tmap)

tmap

## Define functions to perform the desired tasks:

In [5]:
def FilterDF(borough_list, df=None):
    """ Creates a new data frame containing only the rows of the desired boroughs.
    Input:
        borough_list (list of str) = The borough names to keep. Must be list-like;
        a bare string is rejected by pandas `isin`.
        df (Pandas DataFrame, optional) = The table to filter; it must contain a
        'Borough' column. Defaults to the notebook-level `tc_df`.
    Output:
        A new DataFrame holding the matching rows. Index labels, column order and
        column dtypes are preserved. The result is an independent copy, so adding
        columns to it does not modify the source table.
    """
    source_df = tc_df if df is None else df

    # Select by column name rather than by position so the filter keeps working
    # when columns are added in front of 'Borough'.
    keep_rows = source_df['Borough'].isin(borough_list)

    return source_df.loc[keep_rows].copy()

In [6]:
def TorontoClusterMap(num_clusters, cluster_df, map_zoom):
    """ Creates a folium circle marker map of Toronto boroughs identified in the provided dataframe (cluster_df).
    Input:
        num_clusters (int) = The number of desired clusters; one colour is generated per cluster.
        cluster_df (Pandas DataFrame) = A dataframe containing the
        following columns: ['ClusterLabel','Borough','Neighborhoods','Latitude','Longitude']
        map_zoom (int): The number identifying the zoom of the generated map.
    Output:
        A Folium map with one circle marker per row of cluster_df, coloured by ClusterLabel.
    """
    # The map is always centred on these fixed Toronto coordinates.
    cluster_map = folium.Map(location=[43.6534817,-79.3839347], zoom_start=map_zoom)

    # One evenly spaced colour from the rainbow colormap per cluster, as hex strings.
    rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, num_clusters))]

    # add markers to the map:
    for lat, lng, bor, hoods, cluster in zip(cluster_df['Latitude'],
                                             cluster_df['Longitude'],
                                             cluster_df['Borough'],
                                             cluster_df['Neighborhoods'],
                                             cluster_df['ClusterLabel']):

        # The palette is indexed with label - 1, so label 0 wraps around to the
        # last palette entry. Kept as-is so existing maps keep their colours.
        marker_color = rainbow[cluster-1]

        label = folium.Popup(html='{},{} - Cluster:{}'.format(bor,hoods,cluster), parse_html=True)

        folium.CircleMarker(location=[lat,lng],
                            radius=5,
                            color=marker_color,
                            fill=True,
                            fill_color=marker_color,
                            fill_opacity=0.8,
                            popup=label).add_to(cluster_map)
    return cluster_map

___
# Clustering the boroughs with Toronto in their name:

In [7]:
# Keep only the boroughs with "Toronto" in their name, cluster the rows,
# and store each row's cluster label in a new first column.
toronto_name_bors = [
    'Downtown Toronto',
    'East Toronto',
    'West Toronto',
    'Central Toronto',
]
toronto_name_df = FilterDF(toronto_name_bors)

# One cluster is requested per borough name in the list.
k_toronto_name = len(toronto_name_bors)

# KMeans is fitted on the columns that remain once the text columns are dropped.
toronto_name_clustering = toronto_name_df.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])
kmeans = KMeans(n_clusters=k_toronto_name, random_state=0)
kmeans.fit(toronto_name_clustering)

toronto_name_df.insert(0, 'ClusterLabel', kmeans.labels_)
toronto_name_df.head()

Unnamed: 0,ClusterLabel,PostalCode,Borough,Neighborhoods,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.6543,-79.3606
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.6572,-79.3789
15,0,M5C,Downtown Toronto,St. James Town,43.6515,-79.3754
19,2,M4E,East Toronto,The Beaches,43.6764,-79.293
20,0,M5E,Downtown Toronto,Berczy Park,43.6448,-79.3733


In [8]:
# Cluster map for the boroughs with "Toronto" in their name.
TorontoClusterMap(
    num_clusters=k_toronto_name,
    cluster_df=toronto_name_df,
    map_zoom=12,
)

In [9]:
# Report how many clusters were used for the "Toronto" boroughs.
# NOTE(review): "thier" in the message is misspelled; the string is left untouched so the printed text does not change.
print(
    'The number of clusters for boroughs with Toronto in thier name is {}.'.format(
        k_toronto_name
    )
)

The number of clusters for boroughs with Toronto in thier name is 4.


### Observations:
The boroughs with Toronto in their name are evenly spread throughout the Toronto area. The largest concentration of these boroughs is in Downtown Toronto and is clearly indicated by the red circles.

____

## Clustering the boroughs with York in their name:

In [10]:
# Boroughs with "York" in their name.
york_bors = [
    'North York',
    'East York',
    'York',
]

In [11]:
# Keep only the boroughs with "York" in their name, cluster the rows,
# and store each row's cluster label in a new first column.
york_df = FilterDF(york_bors)

# One cluster is requested per borough name in the list.
k_york = len(york_bors)

# KMeans is fitted on the columns that remain once the text columns are dropped.
york_clustering = york_df.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])
kmeans = KMeans(n_clusters=k_york, random_state=0)
kmeans.fit(york_clustering)

york_df.insert(0, 'ClusterLabel', kmeans.labels_)
york_df.head()

Unnamed: 0,ClusterLabel,PostalCode,Borough,Neighborhoods,Latitude,Longitude
0,2,M3A,North York,Parkwoods,43.7533,-79.3297
1,2,M4A,North York,Victoria Village,43.7259,-79.3156
3,1,M6A,North York,"Lawrence Manor,Lawrence Heights",43.7185,-79.4648
7,2,M3B,North York,Don Mills North,43.7459,-79.3522
8,2,M4B,East York,"Parkview Hill,Woodbine Gardens",43.7064,-79.3099


In [12]:
# Cluster map for the boroughs with "York" in their name.
TorontoClusterMap(
    num_clusters=k_york,
    cluster_df=york_df,
    map_zoom=11,
)

In [13]:
# Report how many clusters were used for the "York" boroughs.
# NOTE(review): "thier" in the message is misspelled; the string is left untouched so the printed text does not change.
print(
    'The number of clusters for the boroughs with York in thier name is {}.'.format(
        k_york
    )
)

The number of clusters for the boroughs with York in thier name is 3.


### Observations:
The boroughs with York in their name are all located in northern Toronto. The East York boroughs are the only boroughs that tend to be closer to Downtown Toronto.

___

## Clustering the other boroughs:

In [14]:
# Boroughs with neither "Toronto" nor "York" in their name.
other_bors = [
    "Queen's Park",
    "Scarborough",
    "Etobicoke",
]

In [15]:
# Keep only the 'other' remaining boroughs, cluster the rows,
# and store each row's cluster label in a new first column.
other_bors_df = FilterDF(borough_list=other_bors)

# One cluster is requested per borough name in the list.
k_other = len(other_bors)

# KMeans is fitted on the columns that remain once the text columns are dropped.
other_clustering = other_bors_df.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])
kmeans = KMeans(n_clusters=k_other, random_state=0)
kmeans.fit(other_clustering)

other_bors_df.insert(0, 'ClusterLabel', kmeans.labels_)
other_bors_df.head()

Unnamed: 0,ClusterLabel,PostalCode,Borough,Neighborhoods,Latitude,Longitude
4,2,M7A,Queen's Park,Ontario Provincial Government,43.6623,-79.3895
5,1,M9A,Etobicoke,Islington Avenue,43.6679,-79.5322
6,0,M1B,Scarborough,"Malvern,Rouge",43.8067,-79.1944
11,1,M9B,Etobicoke,"West Deane Park,Princess Gardens,Martin Grove,...",43.6509,-79.5547
12,0,M1C,Scarborough,"Rouge Hill,Port Union,Highland Creek",43.7845,-79.1605


In [16]:
# Cluster map for the other remaining boroughs.
TorontoClusterMap(
    num_clusters=k_other,
    cluster_df=other_bors_df,
    map_zoom=10,
)

In [17]:
# Report how many clusters were used for the "other" boroughs.
print(
    'The number of clusters for the "other" remaining boroughs is {}.'.format(
        k_other
    )
)

The number of clusters for the "other" remaining boroughs is 3.


### Observations:
The remaining boroughs can be found flanking the boroughs with Toronto and York in their names. Scarborough appears to be intermingled with East York.

# Clustering all of the boroughs:

In [18]:
# Work on a real copy of the original dataframe. A plain assignment would only
# alias tc_df, so the insert() below would add 'ClusterLabel' to tc_df itself
# and this cell would fail when run a second time.
toronto_bors_df = tc_df.copy()

# The number of clusters to fit for the full table.
# NOTE(review): the borough lists used earlier in this notebook name ten boroughs; confirm that 7 is intended.
k = 7

# KMeans is fitted on the columns that remain once the text columns are dropped.
toronto_clustering = toronto_bors_df.drop(columns=['PostalCode', 'Borough', 'Neighborhoods'])
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

toronto_bors_df.insert(loc=0, column='ClusterLabel', value=kmeans.labels_)
toronto_bors_df.head()

Unnamed: 0,ClusterLabel,PostalCode,Borough,Neighborhoods,Latitude,Longitude
0,4,M3A,North York,Parkwoods,43.753259,-79.329656
1,4,M4A,North York,Victoria Village,43.725882,-79.315572
2,0,M5A,Downtown Toronto,"Regent Park,Harbourfront",43.65426,-79.360636
3,5,M6A,North York,"Lawrence Manor,Lawrence Heights",43.718518,-79.464763
4,0,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


In [19]:
# Cluster map for all boroughs. The cluster count comes from `k`, so the
# palette size always matches the number of clusters that were fitted.
TorontoClusterMap(num_clusters=k, cluster_df=toronto_bors_df, map_zoom=10)

In [20]:
# Report how many clusters were used for the full table.
print(
    'The number of clusters for all of the boroughs is {}.'.format(
        k
    )
)

The number of clusters for all of the boroughs is 7.


### Observations:
The clustering is based on geographical location and does not take the names of the boroughs into consideration. As a consequence, we see that Scarborough, North York and East York are lumped into the fourth cluster due to their geographical proximity.