# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto (Part 3)

## The previous work from Parts 1 and 2 has been copied into the next cell to simplify review for peers

In [1]:
####Part 1 work:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

# Retrieve the postal-code table from the Wikipedia page.

# Send the GET request. A timeout keeps the cell from hanging forever and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
data = response.text

# parse data from the html into a beautifulsoup object
soup = BeautifulSoup(data, 'html.parser')

# Collect the required information from the soup object.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found on the Wikipedia page; its layout may have changed.')

# three parallel lists, one entry per table row
postalCodeList = []
boroughList = []
neighborhoodList = []

for row in table.find_all('tr'):
    cells = row.find_all('td')
    # Header rows contain no <td>; any row with fewer than three cells cannot
    # supply postal code, borough and neighborhood, so it is skipped.
    if len(cells) < 3:
        continue
    postalCodeList.append(cells[0].text.rstrip('\n'))
    boroughList.append(cells[1].text.rstrip('\n'))
    neighborhoodList.append(cells[2].text.rstrip('\n'))

# Assemble the lists into the dataframe.
torontodf = pd.DataFrame({"PostalCode": postalCodeList,"Borough": boroughList,"Neighborhood": neighborhoodList})

# Preprocessing.

# Keep only rows whose borough is assigned.
torontodf = torontodf[torontodf['Borough'] != "Not assigned"]

# Combine the neighborhoods that share a postal code (and borough) into one
# comma-separated string.
torontodf = torontodf.groupby(["PostalCode", "Borough"], as_index=False).agg({"Neighborhood": ", ".join})

# For Neighborhood == "Not assigned", use the borough name instead.
# This has to be a .loc assignment on the frame itself: rows yielded by
# iterrows() are copies, so writing to them never reaches the dataframe.
neighborhood_missing = torontodf["Neighborhood"] == "Not assigned"
torontodf.loc[neighborhood_missing, "Neighborhood"] = torontodf.loc[neighborhood_missing, "Borough"]
        
####Part 2 work:
# Load the geographical coordinates file from Coursera and rename its
# "Postal Code" column so it matches the key used in torontodf.
coordinatesdf = pd.read_csv("http://cocl.us/Geospatial_data").rename(
    columns={"Postal Code": "PostalCode"}
)

# Left-merge so every postal code from torontodf is kept.
torontowithcordinatesdf = pd.merge(torontodf, coordinatesdf, how="left", on="PostalCode")

## Use geopy library to get the latitude and longitude values of Toronto

In [2]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


## Create a map of Toronto with neighborhoods superimposed on top

In [3]:
# Base map centred on the Toronto coordinates obtained above.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per row, with a "neighborhood, borough" popup.
marker_style = dict(radius=5, color='blue', fill=True, fill_color='#3186cc', fill_opacity=0.7)
for area in torontowithcordinatesdf.itertuples(index=False):
    popup = folium.Popup('{}, {}'.format(area.Neighborhood, area.Borough), parse_html=True)
    marker = folium.CircleMarker([area.Latitude, area.Longitude], popup=popup, **marker_style)
    marker.add_to(map_toronto)

map_toronto

### Filter only boroughs that contain the word Toronto

In [4]:
# Keep only the rows whose borough name contains "Toronto".
borough_has_toronto = torontowithcordinatesdf['Borough'].str.contains("Toronto")
torontofilterddf = torontowithcordinatesdf[borough_has_toronto]

In [5]:
# Rebuild the map, this time showing only the filtered boroughs.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Plot each remaining area as a blue circle with a "neighborhood, borough" popup.
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in torontofilterddf[marker_columns].itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    circle = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    circle.add_to(map_toronto)

map_toronto

## Define Foursquare Credentials and Version

In [6]:
# Define Foursquare credentials and API version.
# Replace the placeholders below before running; never commit real values.
CLIENT_ID = 'TO RUN IT PROVIDE YOUR CLIENT ID' # your Foursquare ID
CLIENT_SECRET = 'TO RUN IT PROVIDE YOUR CLIENT SQUARE' # your Foursquare Secret
VERSION = 'YYYYMMDD' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# The secret is deliberately not echoed: cell outputs are saved with the
# notebook and would leak it to anyone the notebook is shared with.
print('CLIENT_SECRET:' + '********')

Your credentails:
CLIENT_ID: <redacted>
CLIENT_SECRET:<redacted>


## Now, let's get the top 100 venues that are within a radius of 500 meters

In [7]:
radius = 500
LIMIT = 100

venues = []

# Query the Foursquare "explore" endpoint once per postal-code area and collect
# one tuple per returned venue.
for lat, lng, post, borough, neighborhood in zip(torontofilterddf['Latitude'], torontofilterddf['Longitude'], torontofilterddf['PostalCode'], torontofilterddf['Borough'], torontofilterddf['Neighborhood']):
    # Let requests build and escape the query string; the timeout keeps one
    # stalled call from hanging the whole loop.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30)
    # Surface bad credentials / quota errors as an HTTP error rather than a
    # KeyError on the missing 'groups' key.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or [{}]
    results = groups[0].get('items', [])

    for item in results:
        venue = item['venue']
        categories = venue.get('categories') or []
        # A venue without any category cannot be one-hot encoded later, so it
        # is skipped instead of raising IndexError on categories[0].
        if not categories:
            continue
        venues.append((
            post,
            borough,
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name']))

In [8]:
# Convert the list of venue tuples into a DataFrame.
# Passing the column names to the constructor (instead of assigning them
# afterwards) also yields a correctly-labelled empty frame when no venues were
# returned, rather than raising a length-mismatch ValueError.
venuesdf = pd.DataFrame(
    venues,
    columns=['PostalCode', 'Borough', 'Neighborhood', 'BoroughLatitude', 'BoroughLongitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'])

In [9]:
# How many unique categories can be curated from all the returned venues?
print('There are {} uniques categories.'.format(venuesdf['VenueCategory'].nunique(dropna=False)))

# How many venues were returned for each PostalCode?
# Kept as the final expression of the cell so the table is actually rendered;
# as a non-final statement its result was silently discarded.
venuesdf.groupby(["PostalCode", "Borough", "Neighborhood"]).count()

There are 239 uniques categories.


## Analyze Each Neighborhood

In [10]:
# One-hot encode the venue categories (one indicator column per category).
toronto_onehot = pd.get_dummies(venuesdf[['VenueCategory']], prefix="", prefix_sep="")

# Re-attach the area identifiers. The neighborhood column is stored under the
# name 'Neighborhoods'.
for target_column, source_column in (
    ('PostalCode', 'PostalCode'),
    ('Borough', 'Borough'),
    ('Neighborhoods', 'Neighborhood'),
):
    toronto_onehot[target_column] = venuesdf[source_column]

# Rotate the last three columns to the front.
column_order = toronto_onehot.columns.tolist()
toronto_onehot = toronto_onehot[column_order[-3:] + column_order[:-3]]

print(toronto_onehot.shape)

(1613, 242)


### Next, let's group rows by neighborhood and take the mean of the frequency of occurrence of each category

In [11]:
# Average the one-hot indicators within each area; the result holds, per area,
# the share of its venues that fall in each category.
area_keys = ["PostalCode", "Borough", "Neighborhoods"]
toronto_grouped = toronto_onehot.groupby(area_keys, as_index=False).mean()
print(toronto_grouped.shape)

(39, 242)


### Now let's create the new dataframe and display the top 10 venues for each PostalCode

In [12]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Build the column labels: the three area columns followed by
# "1st Most Common Venue" ... "<n>th Most Common Venue".
areaColumns = ['PostalCode', 'Borough', 'Neighborhoods']
freqColumns = []
for rank in range(1, num_top_venues + 1):
    # Choose the ordinal suffix explicitly instead of relying on a bare
    # `except:` around an out-of-range list lookup. 11-13 take 'th' even though
    # they end in 1-3; every other number ending in 1/2/3 takes st/nd/rd.
    if rank % 100 in (11, 12, 13) or rank % 10 not in (1, 2, 3):
        suffix = 'th'
    else:
        suffix = indicators[rank % 10 - 1]
    freqColumns.append('{}{} Most Common Venue'.format(rank, suffix))
columns = areaColumns+freqColumns

# create a new dataframe and copy over the area identifiers
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = toronto_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = toronto_grouped['Borough']
neighborhoods_venues_sorted['Neighborhoods'] = toronto_grouped['Neighborhoods']

# For every area, rank the category columns by mean frequency (highest first)
# and store the names of the top `num_top_venues` categories.
for ind in range(toronto_grouped.shape[0]):
    row_categories = toronto_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]

print(neighborhoods_venues_sorted.shape)

(39, 13)


### Cluster Areas

In [13]:
# Run k-means to cluster the Toronto areas into 5 clusters.

# set number of clusters
kclusters = 5

# Keep only the numeric category-frequency columns as features.
# `columns=` is used because the positional axis argument of DataFrame.drop
# is no longer accepted by pandas 2.x.
toronto_grouped_clustering = toronto_grouped.drop(columns=["PostalCode", "Borough", "Neighborhoods"])

# Run k-means clustering. n_init is pinned so the result does not depend on
# which scikit-learn default (10 vs 'auto') happens to be installed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([4, 0, 0, 0, 3, 0, 0, 0, 2, 0])

In [14]:
# Create a new dataframe that includes the cluster label as well as the top 10
# venues for each area.

# kmeans.labels_ is ordered like the rows of toronto_grouped, so key the labels
# by postal code and join on that key. Assigning the raw array to
# torontofilterddf by position is only correct when both frames happen to have
# the same rows in the same order, and breaks as soon as one postal code
# returned no venues.
cluster_labels = pd.Series(
    kmeans.labels_,
    index=toronto_grouped["PostalCode"].values,
    name="Cluster Labels")

top_venues_by_postcode = (
    neighborhoods_venues_sorted
    .drop(columns=["Borough", "Neighborhoods"])
    .set_index("PostalCode"))

toronto_merged = (
    torontofilterddf
    .join(cluster_labels, on="PostalCode")
    # areas that returned no venues were never clustered; leave them out
    .dropna(subset=["Cluster Labels"])
    # the left join may have upcast the labels to float; they are used as
    # list indices later, so restore an integer dtype
    .astype({"Cluster Labels": int})
    .join(top_venues_by_postcode, on="PostalCode"))

print(toronto_merged.shape)

(39, 16)


In [15]:
# Report the frame's shape, then order its rows by cluster label.
print(toronto_merged.shape)
toronto_merged = toronto_merged.sort_values(by="Cluster Labels")

(39, 16)


### Finally, let's visualize the resulting clusters

In [16]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, post, bor, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['PostalCode'], toronto_merged['Borough'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # Labels are 0-based, so index the palette directly; `cluster - 1` only
    # worked for cluster 0 through negative-index wrap-around.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## Examine Clusters

### Cluster 1

In [17]:
# Rows with label 0, showing the Borough column (position 1) and every column
# from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'] == 0].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
59,Downtown Toronto,0,Coffee Shop,Aquarium,Café,Hotel,Italian Restaurant,Scenic Lookout,Sporting Goods Shop,Fried Chicken Joint,Brewery,Restaurant
60,Downtown Toronto,0,Coffee Shop,Hotel,Café,Restaurant,Salad Place,Italian Restaurant,American Restaurant,Japanese Restaurant,Seafood Restaurant,Deli / Bodega
61,Downtown Toronto,0,Coffee Shop,Restaurant,Café,Hotel,Gym,American Restaurant,Japanese Restaurant,Seafood Restaurant,Deli / Bodega,Italian Restaurant
65,Central Toronto,0,Sandwich Place,Café,Coffee Shop,Pharmacy,History Museum,Donut Shop,Burger Joint,Middle Eastern Restaurant,Flower Shop,Indian Restaurant
66,Downtown Toronto,0,Café,Bar,Italian Restaurant,Japanese Restaurant,Bookstore,Restaurant,Bakery,Yoga Studio,Pub,Beer Bar
67,Downtown Toronto,0,Café,Bakery,Vietnamese Restaurant,Coffee Shop,Mexican Restaurant,Bar,Dessert Shop,Gaming Cafe,Vegetarian / Vegan Restaurant,Donut Shop
68,Downtown Toronto,0,Airport Service,Airport Terminal,Plane,Rental Car Location,Boat or Ferry,Sculpture Garden,Harbor / Marina,Boutique,Airport Lounge,Airport Gate
69,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Seafood Restaurant,Japanese Restaurant,Italian Restaurant,Beer Bar,Cocktail Bar,Hotel,Bakery
70,Downtown Toronto,0,Coffee Shop,Café,Restaurant,Gym,Hotel,Japanese Restaurant,Salad Place,Steakhouse,Asian Restaurant,Deli / Bodega
75,Downtown Toronto,0,Grocery Store,Café,Park,Nightclub,Coffee Shop,Italian Restaurant,Baby Store,Diner,Athletics & Sports,Candy Store


### Cluster 2

In [18]:
# Rows with label 1, showing the Borough column (position 1) and every column
# from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'] == 1].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
63,Central Toronto,1,Home Service,Garden,Yoga Studio,Department Store,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop


### Cluster 3

In [19]:
# Rows with label 2, showing the Borough column (position 1) and every column
# from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'] == 2].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
48,Central Toronto,2,Restaurant,Park,Trail,Deli / Bodega,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop,Doner Restaurant
64,Central Toronto,2,Trail,Jewelry Store,Park,Sushi Restaurant,Diner,Discount Store,Distribution Center,Dog Run,Doner Restaurant,Yoga Studio
50,Downtown Toronto,2,Park,Trail,Playground,Doner Restaurant,Dessert Shop,Diner,Discount Store,Distribution Center,Dog Run,Yoga Studio


### Cluster 4

In [20]:
# Rows with label 3, showing the Borough column (position 1) and every column
# from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'] == 3].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
44,Central Toronto,3,Park,Bus Line,Swim School,Yoga Studio,Department Store,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Donut Shop


### Cluster 5

In [21]:
# Rows with label 4, showing the Borough column (position 1) and every column
# from position 5 onward.
toronto_merged[toronto_merged['Cluster Labels'] == 4].iloc[:, [1] + list(range(5, toronto_merged.shape[1]))]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
37,East Toronto,4,Neighborhood,Health Food Store,Pub,Trail,Eastern European Restaurant,Electronics Store,Donut Shop,Doner Restaurant,Dance Studio,Dog Run
