## Segmenting and Clustering Neighborhoods in Toronto

Lets import the libraries

In [7]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from IPython.display import Image 
from IPython.core.display import HTML 
from pandas.io.json import json_normalize
!pip install folium==0.5.0
import folium
!pip install geopy
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes


#### Webscraping

. Webscraping from Wikipedia

In [8]:
url_postal = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html_data_POSTAL = requests.get(url_postal).text

In [9]:
soup_POSTAL = BeautifulSoup(html_data_POSTAL,"html5lib")

#### Find all the tables within a webpage

In [10]:
tables_POSTAL = soup_POSTAL.find_all("table")

In [11]:
len(tables_POSTAL)

3

#### Find which table matched the string below

In [13]:
for index,table in enumerate(tables_POSTAL):
    if ("M4A" in str(table)):
        table_index_POSTAL = index
print(table_index_POSTAL)

0


#### Ordering the postal codes,neighbourhoods and boroughs

By examining the webpage I saw that there are 9 columns so a for loop for traversing all 9 columns and appending each column to an existing dataframe generated prior to the for loop

In [14]:
postal_data = pd.DataFrame(columns=['PostalCodes',"Borough","Neighborhood"])
for row in tables_POSTAL[table_index_POSTAL].tbody.find_all('tr'):
    col = row.find_all("td")
    if(col != []):
        for i in range(0,9):
            #postal codes
            c0 = col[i].text.rstrip().replace("\n","").split("(")[0][0:3]
            #city
            city_0 = col[i].text.rstrip().replace("\n","").split("(")[0][3:]
            #neighbors
            neigh_0 = col[i].text.rstrip().split('(')
            if len(neigh_0) > 1:
                neigh_0 = col[i].text.rstrip().split('(')[1].replace(')','').replace('/',',')
            postal_data = postal_data.append({'PostalCodes':c0,"Borough":city_0,"Neighborhood":neigh_0},ignore_index=True)
postal_data

Unnamed: 0,PostalCodes,Borough,Neighborhood
0,M1A,Not assigned,[\nM1ANot assigned]
1,M2A,Not assigned,[\nM2ANot assigned]
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park , Harbourfront"
...,...,...,...
175,M5Z,Not assigned,[\nM5ZNot assigned]
176,M6Z,Not assigned,[\nM6ZNot assigned]
177,M7Z,Not assigned,[\nM7ZNot assigned]
178,M8Z,Etobicoke,"Mimico NW , The Queensway West , South of Bloo..."


In [15]:
postal_data.Borough.replace("Not assigned",np.nan,inplace=True)
postal_data.dropna(inplace=True)
postal_data.reset_index(inplace=True,drop=True)

In [16]:
postal_data['Borough']=postal_data['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

#### Now displaying the updated table

In [19]:
postal_data.head(10)

Unnamed: 0,PostalCodes,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park , Harbourfront"
3,M6A,North York,"Lawrence Manor , Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Malvern , Rouge"
7,M3B,North York,Don MillsNorth
8,M4B,East York,"Parkview Hill , Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


#### Lets check the shape of the dataframe now

In [20]:
postal_data.shape

(103, 3)

Now the dataframe is ready and clean


### Lets download the Geospatial datset

In [21]:
!wget -q -O 'Geospatial_Coordinates.csv' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv
print('Data downloaded!')

Data downloaded!


#### Read the dataframe using Pandas and change the column name as per our postal_data

In [22]:
df = pd.read_csv('Geospatial_Coordinates.csv')
df.rename(columns={'Postal Code':"PostalCodes"},inplace=True)
df.head()

Unnamed: 0,PostalCodes,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Now lets merge the two dataframes together based on postal codes

In [23]:
postal_data = postal_data.merge(df,how='outer',on='PostalCodes')
postal_data.head()

Unnamed: 0,PostalCodes,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park , Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494


### Getting the map of Canada

In [24]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Canada are 43.6534817, -79.3839347.


In [25]:
map_canada = folium.Map(location=[latitude, longitude], zoom_start=11)

for lat, lng, label in zip(postal_data['Latitude'], postal_data['Longitude'], postal_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_canada)  
    
map_canada

##### I chose North York Borough

In [26]:
north_york_data = postal_data[postal_data['Borough'] == 'North York'].reset_index(drop=True)
north_york_data.head()

Unnamed: 0,PostalCodes,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763
3,M3B,North York,Don MillsNorth,43.745906,-79.352188
4,M6B,North York,Glencairn,43.709577,-79.445073


#### Get the location of the  North York Borough

In [27]:
address = 'North York, Canada'
geolocator = Nominatim(user_agent="canada_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of North York are 43.7543263, -79.44911696639593.


In [29]:
map_ny = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(north_york_data['Latitude'], north_york_data['Longitude'], north_york_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny

## Foursquare API Credentials

In [30]:
CLIENT_ID = 'AYQM1NBATJ220LXCDOUQRV10U04EZBOCRZMBV5QSDMDKG1UV' # your Foursquare ID
CLIENT_SECRET = 'Z3JQ0MKI5WFYNB3GIP0VPKF21TAKSQPEFLCSD5CALQLPCAMC' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: AYQM1NBATJ220LXCDOUQRV10U04EZBOCRZMBV5QSDMDKG1UV
CLIENT_SECRET:Z3JQ0MKI5WFYNB3GIP0VPKF21TAKSQPEFLCSD5CALQLPCAMC


### Lets get the nearby venues of each neighborhood

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [32]:
north_york_venues = getNearbyVenues(names=north_york_data['Neighborhood'],
                                   latitudes=north_york_data['Latitude'],
                                   longitudes=north_york_data['Longitude']
                                  )

Parkwoods
Victoria Village
Lawrence Manor , Lawrence Heights
Don MillsNorth
Glencairn
Don MillsSouth
Hillcrest Village
Bathurst Manor , Wilson Heights , Downsview North
Fairview , Henry Farm , Oriole
Northwood Park , York University
Bayview Village
DownsviewEast  
York Mills , Silver Hills
DownsviewWest
North Park , Maple Leaf Park , Upwood Park
Humber Summit
Willowdale , Newtonbrook
DownsviewCentral
Bedford Park , Lawrence Manor East
Humberlea , Emery
WillowdaleSouth
DownsviewNorthwest
York Mills West
WillowdaleWest


#### Lets see the shape of the dataframe 

In [33]:
print(north_york_venues.shape)
north_york_venues.head()

(245, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.753259,-79.329656,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.753259,-79.329656,KFC,43.754387,-79.333021,Fast Food Restaurant
2,Parkwoods,43.753259,-79.329656,Variety Store,43.751974,-79.333114,Food & Drink Shop
3,Victoria Village,43.725882,-79.315572,Victoria Village Arena,43.723481,-79.315635,Hockey Arena
4,Victoria Village,43.725882,-79.315572,Tim Hortons,43.725517,-79.313103,Coffee Shop


In [34]:
north_york_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Bathurst Manor , Wilson Heights , Downsview North",23,23,23,23,23,23
Bayview Village,4,4,4,4,4,4
"Bedford Park , Lawrence Manor East",26,26,26,26,26,26
Don MillsNorth,4,4,4,4,4,4
Don MillsSouth,22,22,22,22,22,22
DownsviewCentral,3,3,3,3,3,3
DownsviewEast,3,3,3,3,3,3
DownsviewNorthwest,4,4,4,4,4,4
DownsviewWest,4,4,4,4,4,4
"Fairview , Henry Farm , Oriole",68,68,68,68,68,68


##### Let's find out how many unique categories can be curated from all the returned venues

In [35]:
print('There are {} uniques categories.'.format(len(north_york_venues['Venue Category'].unique())))

There are 102 uniques categories.


## ANALYSING EACH NEIGHBOURHOOD

In [36]:
# one hot encoding
north_york_onehot = pd.get_dummies(north_york_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
north_york_onehot['Neighborhood'] = north_york_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [north_york_onehot.columns[-1]] + list(north_york_onehot.columns[:-1])
north_york_onehot = north_york_onehot[fixed_columns]

north_york_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


And let's examine the new dataframe size.

In [37]:
north_york_onehot.shape

(245, 103)

### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [38]:

north_york_grouped = north_york_onehot.groupby('Neighborhood').mean().reset_index()
north_york_grouped

Unnamed: 0,Neighborhood,Accessories Store,Airport,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,Bank,...,Supermarket,Supplement Shop,Sushi Restaurant,Thai Restaurant,Theater,Toy / Game Store,Trail,Video Game Store,Vietnamese Restaurant,Women's Store
0,"Bathurst Manor , Wilson Heights , Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.086957,...,0.043478,0.0,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bedford Park , Lawrence Manor East",0.0,0.0,0.038462,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.038462,0.038462,0.0,0.0,0.0,0.0,0.0,0.0
3,Don MillsNorth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Don MillsSouth,0.0,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,...,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,DownsviewCentral,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,DownsviewEast,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,DownsviewNorthwest,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,DownsviewWest,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"Fairview , Henry Farm , Oriole",0.0,0.0,0.014706,0.0,0.0,0.014706,0.0,0.029412,0.029412,...,0.0,0.014706,0.0,0.0,0.014706,0.029412,0.0,0.014706,0.0,0.029412


#### Let's confirm the new size

In [39]:
north_york_grouped.shape

(22, 103)

## Let's print each neighborhood

In [41]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [42]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = north_york_grouped['Neighborhood']

for ind in np.arange(north_york_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(north_york_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Bathurst Manor , Wilson Heights , Downsview North",Coffee Shop,Bank,Sandwich Place,Pharmacy,Pizza Place
1,Bayview Village,Chinese Restaurant,Café,Bank,Japanese Restaurant,Women's Store
2,"Bedford Park , Lawrence Manor East",Italian Restaurant,Restaurant,Pizza Place,Coffee Shop,Sandwich Place
3,Don MillsNorth,Gym,Caribbean Restaurant,Café,Japanese Restaurant,Distribution Center
4,Don MillsSouth,Gym,Restaurant,Coffee Shop,Grocery Store,Shopping Mall


### lets Cluster Neighborhoods

In [43]:
kclusters = 5 #no of clusters

north_york_grouped_clustering = north_york_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=141).fit(north_york_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([1, 0, 1, 0, 1, 2, 3, 1, 1, 1], dtype=int32)

#### Merging neighborhood data with the venue data

In [45]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

north_york_merged = north_york_data


north_york_merged = north_york_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

north_york_merged.head()

Unnamed: 0,PostalCodes,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,1.0,Park,Food & Drink Shop,Fast Food Restaurant,Discount Store,Coffee Shop
1,M4A,North York,Victoria Village,43.725882,-79.315572,1.0,Coffee Shop,Hockey Arena,French Restaurant,Portuguese Restaurant,Distribution Center
2,M6A,North York,"Lawrence Manor , Lawrence Heights",43.718518,-79.464763,1.0,Clothing Store,Accessories Store,Furniture / Home Store,Miscellaneous Shop,Arts & Crafts Store
3,M3B,North York,Don MillsNorth,43.745906,-79.352188,0.0,Gym,Caribbean Restaurant,Café,Japanese Restaurant,Distribution Center
4,M6B,North York,Glencairn,43.709577,-79.445073,1.0,Japanese Restaurant,Pizza Place,Bakery,Italian Restaurant,Distribution Center


Finally, let's visualize the resulting clusters

In [46]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(north_york_merged['Latitude'], north_york_merged['Longitude'], north_york_merged['Neighborhood'], north_york_merged['Cluster Labels']):
    try:
        label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
        #print(cluster)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster-1)],
            fill=True,
            fill_color=rainbow[int(cluster-1)],
            fill_opacity=0.7).add_to(map_clusters)
    except:
        print('sorry cannot do nans')
       
map_clusters

sorry cannot do nans
sorry cannot do nans
