# Segmenting and Clustering Neighborhoods in Toronto
###### *David Vazquez*

___

## First Task of the Assignment

In [54]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
!pip install folium
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors



In [22]:
#Import data from Wikipedia table using Beautiful Soup

URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')
print(soup.title)
from IPython.display import display_html
tab = str(soup.table)
display_html(tab,raw=True)

<title>List of postal codes of Canada: M - Wikipedia</title>


Postal Code,Borough,Neighbourhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


In [23]:
#Convert html file into a dataframe
dfa=pd.read_html(tab)
df=dfa[0]
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [25]:
#Let's drop the rows where Borough has the value 'Not Assigned'
indexNames = df[df['Borough'] =='Not assigned'].index
df.drop(indexNames, inplace=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [26]:
#Let's combine neighbourhoods iaw same Postalcode
df1 = df.groupby(['Postal Code','Borough'], sort=False).agg(', '.join)
df1.reset_index(inplace=True)

In [27]:
#Let's replace the name of the neighborhoods with no value with the name of their Borough
df1['Neighbourhood'] = np.where(df['Neighbourhood']=='Not assigned',df['Borough'],df['Neighbourhood'])
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [28]:
df1.shape

(103, 3)

___

## Second Task of the Assignment

In [29]:
#Import latitudes and longitudes from csv file into a dataframe
latlon=pd.read_csv('https://cocl.us/Geospatial_data')
latlon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [30]:
#Merge both dataframes
df2 = pd.merge(df1,latlon,on='Postal Code')
df2.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


___

## Third Task of the Assignment

I will work only with boroughs that contain the word Toronto as mentioned in the assignment

In [48]:
#Make a new dataframe selecting only the boroughs containing the word 'Toronto'
df3=df2[df2['Borough'].str.contains('Toronto',regex=False)]
df3.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [49]:
#Get the latitude and longitude for Toronto
from geopy.geocoders import Nominatim
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.6534817, -79.3839347.


In [50]:
#Create a map of Toronto and its neighbourhoods
toronto_map = folium.Map(location=[latitude,longitude],zoom_start=11)
for lat,lng,borough,neighbourhood in zip(df3['Latitude'],df3['Longitude'],df3['Borough'],df3['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat,lng],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(toronto_map)
toronto_map

##### K Means Clustering

In [51]:
#Cluster neihbourhoods using KMeans
kclusters=5
toronto_clustering=df3.drop(['Postal Code','Borough','Neighbourhood'],1)
kmeans= KMeans(n_clusters=kclusters,random_state=0).fit(toronto_clustering)
kmeans.labels_
df3.insert(0,'Cluster Labels', kmeans.labels_)

In [52]:
df3

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,1,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,3,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,2,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [62]:
#Create a map of how the neighbourhoods cluster together
# create map
map_clusters = folium.Map(location=[latitude,longitude],zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, neighbourhood, cluster in zip(df3['Latitude'], df3['Longitude'], df3['Neighbourhood'], df3['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Let's examine each Cluster

In [57]:
#Cluster 1
df3.loc[df3['Cluster Labels'] == 0, df3.columns[[1] + list(range(5, df3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
2,M5A,-79.360636
4,M7A,-79.389494
9,M5B,-79.378937
15,M5C,-79.375418
20,M5E,-79.373306
24,M5G,-79.387383
30,M5H,-79.384568
36,M5J,-79.381752
42,M5K,-79.381576
48,M5L,-79.379817


In [58]:
#Cluster 2
df3.loc[df3['Cluster Labels'] == 1, df3.columns[[1] + list(range(5, df3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
19,M4E,-79.293031
41,M4K,-79.352188
47,M4L,-79.315572
54,M4M,-79.340923
100,M7Y,-79.321558


In [59]:
#Cluster 3
df3.loc[df3['Cluster Labels'] == 2, df3.columns[[1] + list(range(5, df3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
31,M6H,-79.442259
69,M6P,-79.464763
75,M6R,-79.456325
81,M6S,-79.48445


In [60]:
#Cluster 4
df3.loc[df3['Cluster Labels'] == 3, df3.columns[[1] + list(range(5, df3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
25,M6G,-79.422564
37,M6J,-79.41975
43,M6K,-79.428191
74,M5R,-79.405678
80,M5S,-79.400049
84,M5T,-79.400049


In [61]:
#Cluster 5
df3.loc[df3['Cluster Labels'] == 4, df3.columns[[1] + list(range(5, df3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
61,M4N,-79.38879
62,M5N,-79.416936
67,M4P,-79.390197
68,M5P,-79.411307
73,M4R,-79.405678
79,M4S,-79.38879
83,M4T,-79.38316
86,M4V,-79.400049
