# Segmenting and Clustering Neighborhoods in Toronto

## Part One - Creating the dataframe

Importing libraries

In [1]:
import pandas as pd
import numpy as np

* Reading the given wikipedia page URL to a pandas dataframe
* Looking for the table containing "Postal Code"s
* Setting the first row to headers
* Setting "Not assigned" to be considered missing values so we can use the dropna function

In [2]:
df = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
             match= 'Postal Code', header=0, na_values='Not assigned')[0]

Drop "Not assigned" boroughs

In [3]:
df.dropna(subset=['Borough'], inplace=True)

Making sure there are no postal codes with a borough but a not assigned neighborhood

In [4]:
df[df['Neighborhood'].isna()]

Unnamed: 0,Postal Code,Borough,Neighborhood


Reset the index and view the dataframe

In [5]:
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


Using the .shape method to print the number of rows of the dataframe

In [6]:
df.shape

(103, 3)

## Part Two - Get Latitude and Longitude

Installing & importing the Geocoder package

In [7]:
import geocoder

Tried to use the geocoder package to get the latitudes & longitudes for all the postal codes but it was taking forever

In [8]:
lats, longs = [], []
codes = df['Postal Code']
for code in codes:
    coords = None
    while coords == None:
        g = geocoder.google('{}, Toronto, Ontario'.format(code))
        coords = g.latlng
    lats.append(coords[0])
    longs.append(coords[1])

KeyboardInterrupt: 

Then tried to just get the coordinates for a single postal code and limited it to 100 tries

In [9]:
# initialize your variable to None
lat_lng_coords = None
postal_code = 'M5G'
tries = 0

# loop until you get the coordinates
while tries < 100:
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    tries += 1
    if lat_lng_coords:
        break

if lat_lng_coords:
    print('got em')
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
else:
    print('no go')

no go


Still not working - got the coordiantes from the csv file

In [10]:
coords = pd.read_csv('Geospatial_Coordinates.csv')
coords.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the original dataframe with the new dataframe containing latitudes & longitudes

In [11]:
TorontoMetro = df.join(coords.set_index('Postal Code'), on='Postal Code')
TorontoMetro.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


## Part Three - Explore and Cluster

Importing Stuff

In [12]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [13]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [41]:
import requests # library to handle requests

In [14]:
import folium # map rendering library

In [15]:
Tlat, Tlong = 43.6532, -79.3832

Create a map of Tornto and its neighborhoods

In [17]:
# create map of Toronto
TorontoMetroMap = folium.Map(location=[Tlat, Tlong], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(TorontoMetro['Latitude'], 
                                           TorontoMetro['Longitude'], 
                                           TorontoMetro['Borough'], 
                                           TorontoMetro['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TorontoMetroMap)  
    
TorontoMetroMap

Check how many neighborhoods are in each borough

In [18]:
TorontoMetro.groupby(['Borough']).count()

Unnamed: 0_level_0,Postal Code,Neighborhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


Create a dataframe for just neighborhoods in Toronto proper (boroughs named Toronto) 

In [19]:
TorontoProper = TorontoMetro[TorontoMetro['Borough'].str.contains('Toronto')]
TorontoProper.reset_index(drop=True, inplace=True)
TorontoProper.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


Check how many neighborhoods are in Toronto proper & list them

In [20]:
TorontoProper.shape

(39, 5)

In [21]:
for hood in TorontoProper['Neighborhood']:
    print(hood)

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
The Danforth West, Riverdale
Toronto Dominion Centre, Design Exchange
Brockton, Parkdale Village, Exhibition Place
India Bazaar, The Beaches West
Commerce Court, Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West, Forest Hill Road Park
High Park, The Junction South
North Toronto West, Lawrence Park
The Annex, North Midtown, Yorkville
Parkdale, Roncesvalles
Davisville
University of Toronto, Harbord
Runnymede, Swansea
Moore Park, Summerhill East
Kensington Market, Chinatown, Grange Park
Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
R

Get rid of the PO Boxes & Business reply neighborhoods

In [32]:
pobox = TorontoProper[TorontoProper['Neighborhood'].str.contains('PO Boxes')]
reply = TorontoProper[TorontoProper['Neighborhood'].str.contains('Business reply')]
TorontoProper.drop(index=[pobox.index[0], reply.index[0]], inplace=True)
TorontoProper.reset_index(drop=True, inplace=True)
TorontoProper

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031
5,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
6,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
7,M6G,Downtown Toronto,Christie,43.669542,-79.422564
8,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
9,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


Plot all of the neighborhoods in Toronto proper on a map

In [35]:
# create map of Toronto Proper
TorontoProperMap = folium.Map(location=[Tlat, Tlong], zoom_start=12)

# add markers to map
for lat, lng, borough, neighborhood in zip(TorontoProper['Latitude'], 
                                           TorontoProper['Longitude'], 
                                           TorontoProper['Borough'], 
                                           TorontoProper['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(TorontoProperMap)  
    
TorontoProperMap

Define function to get the venues in each neighborhood

In [39]:
def getNearbyVenues(names, latitudes, longitudes, radius=500, LIMIT=100):
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):  
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

Call the function to get all the venues

In [42]:
toronto_venues = getNearbyVenues(names=TorontoProper['Neighborhood'],
                                   latitudes=TorontoProper['Latitude'],
                                   longitudes=TorontoProper['Longitude'])

In [43]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
3,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
4,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa


In [45]:
print(toronto_venues.shape)
toronto_venues.groupby('Neighborhood').count()

(1501, 7)


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,54,54,54,54,54,54
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,63,63,63,63,63,63
Christie,17,17,17,17,17,17
Church and Wellesley,77,77,77,77,77,77
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,34,34,34,34,34,34
Davisville North,8,8,8,8,8,8
"Dufferin, Dovercourt Village",16,16,16,16,16,16


One-hot encode then group by neighborhood

In [60]:
TorontoHot = pd.get_dummies(toronto_venues['Venue Category'])
TorontoHot['Neighborhood'] = toronto_venues['Neighborhood']
TorontoGrouped = TorontoHot.groupby('Neighborhood').mean().reset_index()
TorontoGrouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.018519,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0625,0.0625,0.125,0.1875,0.125,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.015873,0.0,0.0,0.015873,0.0,0.0,0.0,0.015873
4,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Run k-means to get clusters

In [116]:
clusters = 6
TorontoClustered = TorontoGrouped.drop('Neighborhood', 1)
kmeans = KMeans(n_clusters = i, random_state=0)
kmeans.fit(TorontoClustered)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=0, tol=0.0001, verbose=0)

Merge clusters with Neighborhoods

In [97]:
TorontoProper['Cluster'] = kmeans.labels_
TorontoProper.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636,5
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494,5
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937,5
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,0
4,M4E,East Toronto,The Beaches,43.676357,-79.293031,5


See how many neighborhoods are in each cluster

In [117]:
TorontoProper.groupby('Cluster').count()

Unnamed: 0_level_0,Postal Code,Borough,Neighborhood,Latitude,Longitude
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,14,14,14,14,14
1,2,2,2,2,2
2,1,1,1,1,1
3,1,1,1,1,1
4,1,1,1,1,1
5,18,18,18,18,18


Visualize the different clusters on a map

In [118]:
# create map
map_clusters = folium.Map(location=[Tlat, Tlong], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(TorontoProper['Latitude'], 
                                  TorontoProper['Longitude'], 
                                  TorontoProper['Neighborhood'], 
                                  TorontoProper['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters