# Segment and Cluster Toronto Neighborhoods

## Author: C. Lee Allan - 7/2020

### ***

## Section I - scape list of Toronto neighborhoods

The first step is to scrape the table found at https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M
with the list of postal codes, boroughs, and neighborhoods

In [42]:
#!pip install lxml # install required package. RIMINDER if you have to install you will need to restart kernel

In [6]:
import pandas as pd

#Load Data
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M') #Reads all the tables on the page
df_toronto = df[0] # assign only the table we want to a dataframe


df_toronto = df_toronto[df_toronto.Borough != 'Not assigned']   # stip out unassigned boroughs
df_toronto.reset_index(drop=True, inplace=True)                            # reset the index
df_toronto.groupby(['Postal Code'])                             # sort by postal code
df_toronto

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [3]:
# print .shape
df_toronto.shape

(103, 4)

# Section II - geocoding neighborhoods

In [41]:
#!pip install geocoder # Install geocoder. comment out if already installed

In [26]:
#copy the datafram to preserve the original and add the Long and Lat columns
df_tmp = df_toronto
df_tmp['Latitude'], df_tmp['Longitude'] = ['lat', 'long']
df_tmp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,lat,long
1,M4A,North York,Victoria Village,lat,long
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",lat,long
3,M6A,North York,"Lawrence Manor, Lawrence Heights",lat,long
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",lat,long
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",lat,long
99,M4Y,Downtown Toronto,Church and Wellesley,lat,long
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",lat,long
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",lat,long


In [27]:
import geocoder # import geocoder

df_tmp = df_tmp.rename(columns={'Postal Code': 'PostCode'}) # change the name of the column to remove the space
# loop over the post codes
print('Looping over the list of Postal Codes.')
for index, row in df_tmp.iterrows():
    g=geocoder.arcgis('{}, Toronto, Ontario'.format(row.PostCode)) #get each tal, long
    lat_lng_coords = g.latlng
    latitude = lat_lng_coords[0] # assign values
    longitude = lat_lng_coords[1]
    df_tmp.at[index, 'Latitude'] = latitude    # update the dataframe
    df_tmp.at[index, 'Longitude'] = longitude
    print('\x1b'+row.PostCode+'\x1b[0m', end='\r')
print(' ')
print('Complete')
df_tmp

Looping over the list of Postal Codes.
 M8Z[0m
Complete


Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7519,-79.3304
1,M4A,North York,Victoria Village,43.7304,-79.3128
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6551,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7232,-79.4514
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6645,-79.393
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.6537,-79.5111
99,M4Y,Downtown Toronto,Church and Wellesley,43.6666,-79.3813
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.6487,-79.3854
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.6329,-79.4895


## Section III - cluster neighborhoods

In [35]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

df_tmp2 = df_tmp
df_tmp2 = df_tmp2.drop(['Neighborhood', 'PostCode'], axis = 1) 
#
df_cluster = pd.get_dummies(df_tmp2, columns=['Borough']) # get dummies for Borough to cluster on like areas


# run k-means clustering
kclusters = 5
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

df_cluster.insert(0, 'Cluster Labels', kmeans.labels_)
#add PostCode back into the df
df_cluster['PostCode'] = df_tmp['PostCode']
df_cluster

Unnamed: 0,Cluster Labels,Latitude,Longitude,Borough_Central Toronto,Borough_Downtown Toronto,Borough_East Toronto,Borough_East York,Borough_Etobicoke,Borough_Mississauga,Borough_North York,Borough_Scarborough,Borough_West Toronto,Borough_York,PostCode
0,2,43.7519,-79.3304,0,0,0,0,0,0,1,0,0,0,M3A
1,2,43.7304,-79.3128,0,0,0,0,0,0,1,0,0,0,M4A
2,1,43.6551,-79.3626,0,1,0,0,0,0,0,0,0,0,M5A
3,2,43.7232,-79.4514,0,0,0,0,0,0,1,0,0,0,M6A
4,1,43.6645,-79.393,0,1,0,0,0,0,0,0,0,0,M7A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,0,43.6537,-79.5111,0,0,0,0,1,0,0,0,0,0,M8X
99,1,43.6666,-79.3813,0,1,0,0,0,0,0,0,0,0,M4Y
100,4,43.6487,-79.3854,0,0,1,0,0,0,0,0,0,0,M7Y
101,0,43.6329,-79.4895,0,0,0,0,1,0,0,0,0,0,M8Y


In [40]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab


In [39]:
import folium # map rendering library
import numpy as np # library to handle data in a vectorized manner
import matplotlib.cm as cm
import matplotlib.colors as colors


# create map
map_clusters = folium.Map(location=[latitude, longitude], width='80%', height='80%', zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_cluster['Latitude'], df_cluster['Longitude'], df_cluster['PostCode'], kmeans.labels_):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters