Segmenting and clustering neighborhoods in the city of Toronto

First, I will import the needed libraries

In [1]:
import pandas as pd
import numpy as np

 I will obtain the data and transform it into a pandas dataframe 

In [5]:
# Parse every HTML table on the Wikipedia list of Canadian postal codes starting
# with "M" and keep the first table as the working dataframe.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_tables = pd.read_html(wiki_url)
df = wiki_tables[0]
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


I will rename the 'Neighbourhood' column to the American spelling 'Neighborhood'.

In [6]:
# Switch the column header from the British spelling used in the source table
# to the American spelling; the rename is applied to df in place.
spelling_fix = {'Neighbourhood': 'Neighborhood'}
df.rename(columns=spelling_fix, inplace=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


I will remove the rows whose borough is not assigned.

In [7]:
# Keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


I will group the neighborhoods that share the same postal code into a single row.

In [8]:
# Collapse rows that share a postal code into a single row, indexed by postal code.
# Neighborhood names are comma-joined, while the borough is taken once: joining
# the borough as well would turn repeated rows into e.g. "North York, North York".
# sort=False keeps the postal codes in the order in which they first appear.
df1 = df.groupby('Postal Code', sort=False).agg(
    {'Borough': 'first', 'Neighborhood': ', '.join}
)
df1.head(10)

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"
M3B,North York,Don Mills
M4B,East York,"Parkview Hill, Woodbine Gardens"
M5B,Downtown Toronto,"Garden District, Ryerson"


For rows that have a borough but a 'Not assigned' neighborhood, I will set the neighborhood to the name of the borough.

In [9]:
# Rows with a borough but no named neighborhood inherit the borough name.
unnamed = df1['Neighborhood'] == 'Not assigned'
df1.loc[unnamed, 'Neighborhood'] = df1.loc[unnamed, 'Borough']

# Move the 'Postal Code' group index back into an ordinary column (in place).
df1.reset_index(inplace=True)
df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Displaying shape

In [11]:
# (rows, columns) of the cleaned dataframe: one row per postal code.
df1.shape

(103, 3)

Second part

Import Geocoder library

In [12]:
!pip install geocoder
import geocoder
print('done')

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 8.0 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
done


Add empty Latitude and Longitude columns to the dataframe

In [15]:
# Create the two coordinate columns as empty placeholders; they are filled in
# by the geocoding step.
for coord_column in ('Latitude', 'Longitude'):
    df1[coord_column] = None
df1.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,,
1,M4A,North York,Victoria Village,,
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",,
3,M6A,North York,"Lawrence Manor, Lawrence Heights",,
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",,


Get the latitude and longitude of each postal code from geocoder and add them to the dataframe

In [17]:
# Geocode every postal code with the ArcGIS provider and store the coordinates
# in df1. Lookups are retried a bounded number of times so that a persistent
# failure raises a clear error instead of hanging the notebook forever.
MAX_GEOCODE_ATTEMPTS = 10

for row_label, postal_code in zip(df1.index, df1['Postal Code']):
    lat_lng_coords = None

    for _attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
        lat_lng_coords = g.latlng
        # Treat both None and an empty result as a failed lookup, so a row can
        # never silently receive the coordinates of the previous postal code.
        if lat_lng_coords:
            break
    else:
        raise RuntimeError(
            'Could not geocode postal code {} after {} attempts'.format(
                postal_code, MAX_GEOCODE_ATTEMPTS
            )
        )

    # Address the row by its index label rather than by its position, so the
    # assignment stays correct even if df1 does not have a default 0..n-1 index.
    df1.loc[row_label, 'Latitude'] = lat_lng_coords[0]
    df1.loc[row_label, 'Longitude'] = lat_lng_coords[1]

df1.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.7525,-79.3299
1,M4A,North York,Victoria Village,43.7306,-79.3131
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6551,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.7233,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6625,-79.3919
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.6626,-79.5283
6,M1B,Scarborough,"Malvern, Rouge",43.8114,-79.1966
7,M3B,North York,Don Mills,43.7492,-79.3619
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.7072,-79.3119
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.6574,-79.378


Part 3 for visualization

Import additional needed libraries

In [20]:
from geopy.geocoders import Nominatim
!pip install folium
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting folium
  Downloading folium-0.12.0-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 3.5 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.0


Finding the coordinates of Toronto

In [23]:
# Resolve the city itself with Nominatim; the resulting latitude/longitude are
# used to centre the map.
geolocator = Nominatim(user_agent="toronto_explorer")
address = 'Toronto'
location = geolocator.geocode(address)
latitude, longitude = location.latitude, location.longitude
print(f'The coordinate of Toronto are {latitude}, {longitude}.')

The coordinate of Toronto are 43.6534817, -79.3839347.


In [30]:
# Keep only the boroughs whose name contains 'Toronto', then renumber the rows
# from zero.
is_toronto_borough = df1['Borough'].str.contains('Toronto')
df2 = df1[is_toronto_borough]
df3 = df2.reset_index(drop=True)
df3.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6551,-79.3626
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6625,-79.3919
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6574,-79.378
3,M5C,Downtown Toronto,St. James Town,43.6522,-79.3759
4,M4E,East Toronto,The Beaches,43.6771,-79.2955


In [31]:
# Number of rows (postal codes) in each of the selected boroughs.
df3['Borough'].value_counts()

Downtown Toronto    19
Central Toronto      9
West Toronto         6
East Toronto         5
Name: Borough, dtype: int64

In [35]:
# (rows, columns) of the Toronto-only dataframe.
# NOTE(review): the saved output (39, 6) appears to come from running this cell
# after the 'Label' column was added in a later cell; on a fresh top-to-bottom
# run df3 has only five columns at this point.
df3.shape

(39, 6)

In [33]:
# Give every borough an integer label (1-4). An explicit mapping keeps each
# name next to its label and avoids the implicit string-to-int downcasting
# that Series.replace relies on.
BOROUGH_LABELS = {
    'Downtown Toronto': 1,
    'Central Toronto': 2,
    'West Toronto': 3,
    'East Toronto': 4,
}
df3['Label'] = df3['Borough'].map(BOROUGH_LABELS)

# Fail early if a borough is missing from the mapping (it would become NaN).
assert df3['Label'].notna().all(), 'unmapped borough in df3'
df3.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Label
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.6551,-79.3626,1
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.6625,-79.3919,1
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.6574,-79.378,1
3,M5C,Downtown Toronto,St. James Town,43.6522,-79.3759,1
4,M4E,East Toronto,The Beaches,43.6771,-79.2955,4
5,M5E,Downtown Toronto,Berczy Park,43.6454,-79.3731,1
6,M5G,Downtown Toronto,Central Bay Street,43.6561,-79.3849,1
7,M6G,Downtown Toronto,Christie,43.6687,-79.4207,1
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.6497,-79.3826,1
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.6651,-79.4389,3


In [45]:
# Draw one circle marker per row of df3 on a map centred on Toronto, coloured
# by the row's integer borough label.
# The labels in df3['Label'] are expected to be the integers 1..kclusters, so
# the largest label gives the number of colours needed. Using the number of
# rows instead would squeeze all labels into nearly identical colours.
kclusters = int(df3['Label'].max())

Toronto_Map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Spread the label colours evenly over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

for lat, lon, borough, cluster in zip(
    df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Label']
):
    marker_color = rainbow[cluster - 1]
    # The popup shows the borough of this marker only, not the whole column.
    label = folium.Popup(str(borough) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=1).add_to(Toronto_Map)

In [46]:
# Display the map as the output of this cell.
Toronto_Map