# Segmenting and Clustering Neighborhoods in Toronto

In [56]:
import pandas as pd
import numpy as np
import requests

In [57]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M' #url to the wiki site
df_list = pd.read_html(url) #get the list of data_frames from website
df = df_list[0] #we just want the first dataframe

In [58]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


good now we have export table into dataframe

In [59]:
df.shape

(287, 3)

In [60]:
df.Borough.value_counts()

Not assigned        77
Etobicoke           44
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

drop raws where Borough=='Not assigned'

In [61]:
df.drop(df[df['Borough']=="Not assigned"].index, axis=0, inplace=True)

In [62]:
df.shape

(210, 3)

In [63]:
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


In [64]:
df.groupby('Borough').describe()

Unnamed: 0_level_0,Postcode,Postcode,Postcode,Postcode,Neighbourhood,Neighbourhood,Neighbourhood,Neighbourhood
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Borough,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Central Toronto,17,9,M4V,5,17,17,Davisville,1
Downtown Toronto,37,19,M5V,7,37,36,St. James Town,2
East Toronto,7,5,M4L,2,7,7,The Danforth West,1
East York,6,5,M4B,2,6,6,Woodbine Heights,1
Etobicoke,44,11,M9V,8,44,44,Beaumond Heights,1
Mississauga,1,1,M7R,1,1,1,Canada Post Gateway Processing Centre,1
North York,38,24,M6L,3,38,38,Humberlea,1
Queen's Park,1,1,M9A,1,1,1,Not assigned,1
Scarborough,37,17,M1V,4,37,37,Scarborough Town Centre,1
West Toronto,13,6,M6K,3,13,13,The Junction South,1


In [65]:
df.groupby('Postcode').describe()

Unnamed: 0_level_0,Borough,Borough,Borough,Borough,Neighbourhood,Neighbourhood,Neighbourhood,Neighbourhood
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq
Postcode,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
M1B,2,1,Scarborough,2,2,2,Rouge,1
M1C,3,1,Scarborough,3,3,3,Rouge Hill,1
M1E,3,1,Scarborough,3,3,3,Morningside,1
M1G,1,1,Scarborough,1,1,1,Woburn,1
M1H,1,1,Scarborough,1,1,1,Cedarbrae,1
M1J,1,1,Scarborough,1,1,1,Scarborough Village,1
M1K,3,1,Scarborough,3,3,3,East Birchmount Park,1
M1L,3,1,Scarborough,3,3,3,Golden Mile,1
M1M,3,1,Scarborough,3,3,3,Cliffside,1
M1N,2,1,Scarborough,2,2,2,Birch Cliff,1


check how many Neighbourhoods available for Postcode=M5A

In [66]:
df.groupby('Postcode')['Neighbourhood'].unique()['M5A']

array(['Harbourfront'], dtype=object)

only one Neighbourhood is available but in assignment guidelines it says there will be 2.  
further check for Neighbourhood='Regent Park' in the dataset

In [67]:
df[df['Neighbourhood']=='Regent Park']

Unnamed: 0,Postcode,Borough,Neighbourhood


Eventhough in assignment guide lines it says that "M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park." actually our downloaded data set does not have Neighbourhood call 'Regent Park', so we can assume the data has been updated recently

In [68]:
s1 = df.groupby('Postcode')['Neighbourhood'].unique() #get list of Neighbourhood against unique Postcode
s1.head()

Postcode
M1B                            [Rouge, Malvern]
M1C    [Highland Creek, Rouge Hill, Port Union]
M1E         [Guildwood, Morningside, West Hill]
M1G                                    [Woburn]
M1H                                 [Cedarbrae]
Name: Neighbourhood, dtype: object

In [69]:
s2 = df.groupby('Postcode')['Borough'].unique() ##get Borough against unique Postcode
s2.head()

Postcode
M1B    [Scarborough]
M1C    [Scarborough]
M1E    [Scarborough]
M1G    [Scarborough]
M1H    [Scarborough]
Name: Borough, dtype: object

In [70]:
df_combined = pd.concat([s2, s1], axis=1)

In [71]:
df_combined.reset_index(inplace=True) #reset_index will remove Postcode from index and make it a column

In [72]:
df_combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,[Scarborough],"[Rouge, Malvern]"
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]"
3,M1G,[Scarborough],[Woburn]
4,M1H,[Scarborough],[Cedarbrae]


Now we have got combined Borough and Neighbourhood against each unique Postcode.  
However, still we need to remove bracets in each field

In [73]:
df_combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,[Scarborough],"[Rouge, Malvern]"
1,M1C,[Scarborough],"[Highland Creek, Rouge Hill, Port Union]"
2,M1E,[Scarborough],"[Guildwood, Morningside, West Hill]"
3,M1G,[Scarborough],[Woburn]
4,M1H,[Scarborough],[Cedarbrae]


In [74]:
#','.join(map(str,[10,"test",10.5]))

In [75]:
#df_combined.applymap(lambda x: ','.join(map(str,x)) if np.where(df_combined.values==x)[1] in [1,2] else x)

apply below lambda function to 'Brough' and 'Neighbourhood' to make each element to a string

In [76]:
df_combined['Borough']=df_combined['Borough'].apply(lambda x: ','.join(map(str,x)))
df_combined['Neighbourhood']=df_combined['Neighbourhood'].apply(lambda x: ','.join(map(str,x)))

In [77]:
df_combined.tail()

Unnamed: 0,Postcode,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie..."
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam..."
102,M9W,Etobicoke,Northwest


In [78]:
df_combined.shape

(103, 3)

## Second Part of the assignment

In [79]:
#!conda install -c conda-forge geocoder --yes
#import geocoder

#print ("geocoder installed and import successfully")

Define the basic structure of the dataframe and load data from df_combined

In [80]:
#columns = ['PostalCode','Borough','Neighbourhood','Latitude','Longitude']
#df_toro = pd.DataFrame(columns=columns)
#df_toro
#df_toro[['PostalCode','Borough','Neighbourhood']]=df_combined[['Postcode','Borough','Neighbourhood']]
#df_toro.head()

Fill latitude and longitude using geocoder

In [81]:
#df_toro.set_index('PostalCode', inplace=True)
#df_toro.head()

In [82]:
#postcode='M5G'
#for postcode in df_combined.Postcode:
#lat_lng_coord=None
#while(lat_lng_coord is None):
#        g = geocoder.google('{}, Toronto, Ontario'.format(postcode))
#        lat_lng_coord = g.latlng
#
#df_toro.loc[postcode,'Latitude']=lat_lng_coord[0]
#df_toro.loc[postcode,'Longitude']=lat_lng_coord[1]
#print(lat_lng_coord[0])
#print(lat_lng_coord[1])
    

Since geocoder data is unstable, we will import longitude and latitude from given csv

In [83]:
df_geo = pd.read_csv('http://cocl.us/Geospatial_data')

In [84]:
df_combined.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [85]:
df_toro = df_combined.merge(df_geo, left_on='Postcode', right_on='Postal Code').drop('Postal Code', axis=1)
df_toro.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Rename Postcode to Postalcode as shown in the assignment

In [86]:
df_toro.rename(columns={'Postcode':'PostalCode'}, inplace=True)

In [87]:
df_toro.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [88]:
df_toro.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 5 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(3)
memory usage: 4.8+ KB


In [89]:
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Libraries imported.


## Create a map of Toronto and superimpose Neighbourhood

In [90]:
df_toro['Borough'].value_counts()

North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           11
Central Toronto      9
West Toronto         6
York                 5
East York            5
East Toronto         5
Queen's Park         1
Mississauga          1
Name: Borough, dtype: int64

As suggested in assignment guidelines we will use only those boroughs that contain the word Toronto ("Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto")

In [91]:
only_toro = df_toro[df_toro['Borough'].isin(['Central Toronto','Downtown Toronto','West Toronto','East Toronto'])]
print ("shape =",only_toro.shape)
only_toro.head()

shape = (39, 5)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Neighborhood reduced to 39.  
Lets plot entire dataset and superimpose those selected points in the same map

In [92]:
map_toro = folium.Map(location=[43.715383,-79.405678], zoom_start=10)

# add markers to map for total toro dataset
for lat, lng, borough, neighborhood in zip(df_toro['Latitude'], df_toro['Longitude'], df_toro['Borough'], df_toro['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toro)
    
map_toro

In [None]:
#map_toro = folium.Map(location=[43.715383,-79.405678], zoom_start=10)

# add markers to map from only_toro dataframe
for lat, lng, borough, neighborhood in zip(only_toro['Latitude'], only_toro['Longitude'], only_toro['Borough'], only_toro['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='#39cc31',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toro)
    
map_toro

Those red circles are the neighborhoods in "only_toro" dataframe where as blue circles are the neighbours in the "df_toro" other than in "only_toro" dataframe  
from here onwards we will use "only_toro" dataframe for clustering

In [None]:
Toronto_data = only_toro.reset_index(drop=True)

## Define Foursquare Credintial and Version

In [None]:
CLIENT_ID = 'NCMXYN2A0HNWQH4GXNMUO1VPSBL3CP5C54B0UZBXZVSC2MWK' # your Foursquare ID
CLIENT_SECRET = 'GCIX1PKEONNQLXYCGH33ZIMU1F2CJQ031HUIAYRUMZPHDADU' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
Toronto_data.head()

we will explore the first Neighbourhood, though we refers Neighbourhood we are acually exploring each unique PostalCode where Latitude and Longitude is available not for Neighbourhood but for each unique PostaCode  
However, we will explore first Neighbourhood, The Beaches (i.e. PostalCode = M4E) using Foursquare

## Now lets get top 100 venues in 'The Beaches'

In [None]:
limit = 100
radius = 500
lat = 43.676357
lng = -79.293031
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    lat,lng,
    VERSION,
    radius,
    limit
)

In [None]:
results = requests.get(url).json()
results

In [None]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

## we will only get the required fields
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues.head()


Now we will define a function that will extracts the category of the venue given a row of above dataframe

In [None]:
#explore the first venur.catagories data
nearby_venues['venue.categories'][0]

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
#get_category_type(nearby_venues.loc[0,:])

now we will replace venues.categories field with category names and change column names

In [None]:
# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

# Explore Neighborhood in Toronto

Now lets get all the venues nearby each postal code in Toronto dataset

In [None]:
Toronto_data.head()

In [None]:
#define variables

PostalCodes=Toronto_data['PostalCode']
borough=Toronto_data['Borough']
n_hood=Toronto_data['Neighbourhood']
latitudes=Toronto_data['Latitude']
longitudes=Toronto_data['Longitude']

radius=500
LIMIT=100
venues_list=[] ##list to get venues for each neighborhood
for pc, borough, n_hood, lat, lng in zip(PostalCodes, borough, n_hood, latitudes, longitudes):
    print(n_hood)
    
    # create the API request URL
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items'] #['response']['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    venues_list.append([(
            pc,
            borough,
            n_hood, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['categories'][0]['name'],
            v['venue']['location']['lat'], 
            v['venue']['location']['lng']) for v in results])

Toronto_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
Toronto_venues.columns = ['PostalCode',
                  'Borough',
                  'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Category', 
                  'Venue Latitude', 
                  'Venue Longitude']

In [None]:
len(venues_list)

In [None]:
Toronto_venues.shape

In [None]:
Toronto_venues.head()

Now let's see how many unique venur categories are there in the dataframe

In [None]:
len(Toronto_venues['Venue Category'].unique())

# Analyze each Neighbourhood

Note: 'Venue Category' contain a category name called "Neighborhood", which confuse with the 'Neighborhood' column of the Toronto_venues  
since we want original 'Neighborhood' in the onehot dataset, we will include it as 'Neighbourhood'

In [None]:
#{'Neighborhood'}.issubset(Toronto_onehot.columns)

In [None]:
#Toronto_venues[Toronto_venues['Venue Category']=='Neighborhood']

In [None]:
# one hot encoding
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
Toronto_onehot['Neighbourhood'] = Toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Toronto_onehot.columns[-1]] + list(Toronto_onehot.columns[:-1])
Toronto_onehot = Toronto_onehot[fixed_columns]

Toronto_onehot.shape

In [None]:
Toronto_onehot.head()

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
Toronto_grouped = Toronto_onehot.groupby('Neighbourhood').mean().reset_index()
Toronto_grouped.head()

### Lets confirm the new size

In [None]:
Toronto_grouped.shape

## Clustering Neighbourhoods

Now we have a dataframe with weight of venue categories against each Neighbourhood
So now we are verymuch ready to use that data to cluster Neighbourhoods based on weight of venue categories

we will run k-means to cluster

In [None]:
# set number of clusters
kclusters = 5

Toronto_clustering = Toronto_grouped.drop('Neighbourhood', 1) ## be careful about the word Neighbourhood, otherwise we will drop the feature called Neighborhood

# run k-means clustering
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=12).fit(Toronto_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:100]

### Now let's create a dataframe including cluster lables

In [None]:
# add clustering labels to Toronto_grouped dataframe
Toronto_grouped.insert(0, 'Cluster Labels', kmeans.labels_)
Toronto_grouped.head()

In [None]:
Toronto_data[Toronto_data['Neighbourhood']=='Business Reply Mail Processing Centre 969 Eastern']

In [None]:
## Now lets mearge Toronto_data with Toronto_grouped on "Neighbourhood" so that we will have other required information for mapping
Toronto_merged = Toronto_data.join(Toronto_grouped.set_index('Neighbourhood'), on='Neighbourhood')
Toronto_merged.head()

### Now lets visualize resulting clusters on map

In [None]:
Toronto_merged['Cluster Labels'].value_counts()

In [None]:
# create map
map_clusters = folium.Map(location=[43.676357, -79.293031], zoom_start=11) #we use "The Beaches" location to initiate the map

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### It seems like k=5 is not a best value, let's use elbow method to find the best k value

we can use inertia_ value of the k-means model to evaluate the performance of the model

In [None]:
kmeans.inertia_

In [None]:
# let's plot inertia for different values of k

k_values = []
inertias = []
for k in range(3,39):
    # run k-means clustering
    kmeans = KMeans(init="k-means++", n_clusters=k, n_init=12).fit(Toronto_clustering)
    k_values.append(k)
    inertias.append(kmeans.inertia_)

#print("k values", k_values)
#print("inertias", inertias)


In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot inertia against k-values
plt.figure(figsize=(6,10))
plt.plot(k_values, inertias)

plt.xlabel("k-value")
plt.ylabel("value of inertia")
plt.show()

according to above figure there is no clear lbow point, so this might be not suitable to cluster using k-means clustering