# Segmenting and Clustering Neighborhoods in Toronto -- PART 3

### Preparations (PART 1 & PART 2)

#### Data Precprocessing in previsous sections

In [15]:
from requests import get       # import the get function from requests module
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = get(url)            # retrieve HTML file from the given Wikipedia URL

from bs4 import BeautifulSoup                       # import the "beautifulsoup4" library
soup=BeautifulSoup(response.text,"html.parser")     # make the soup, the format convenient for both extracting and preprocessing of the data
table = soup.find("tbody")    # find the table data and save as "table"

import pandas as pd
data=[]                                          # create an empty dataset "data"

rows = table.find_all('tr')                      # find all rows in the "table" 
for row in rows:                                 # use for loop to read each entry of the "Wikitable" to the "data"
    cols = row.find_all('td')               
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])

df=pd.DataFrame(data)                            # create the dataframe requried using "Pandas"


df.columns=["PostalCode","Borough","Neighborhood"]   # Rename the columns as required

for i in df.index:                         # use the for loop to find all rows with their "Borough" values to be "Not Assigned"
    if df.iloc[i,1]=="Not assigned":       #
        df.iloc[i,1]=None                  # replace the "Not Assigned" to "None", which can be droped using dropna()

df.dropna(inplace=True)                 # remove all rows with "None" values
df.reset_index(drop=True,inplace=True)     # reset the index

for j in df.index:                      # use the for loop to find the all rows with their "Neighborhood" values to be "Not Assigned"                
    if df.iloc[j,2]=="Not assigned":    #       
        df.iloc[j,2]=df.iloc[j,1]       # set their neighborhood to be equivalent to their "Borgough" values
        

pd.options.mode.chained_assignment = None               # avoid the warn of chained assignment, default='warn'

for k in range(1,212):                                  # use for loop to combine the rows with the same "Borough" values
    if df.iloc[k-1,1]==df.iloc[k,1]:                    # if i-1th and ith row share the same borough
        df.iloc[k,2]=df.iloc[k-1,2]+","+df.iloc[k,2]    # append i-1th neighborhood to ith neighborhood separated with ","
        df.iloc[k-1,2]=None                             # set the i-1th neighborhood to "None" in order to remove by dropna later

df.dropna(inplace=True)                                 # drop all the rows with "None" values
df.reset_index(drop=True,inplace=True)                  # reset the index

latlng=pd.read_csv("https://cocl.us/Geospatial_data")    # read the online CSV file into pandas dataframe
latlng.head()                                            # check the dataframe

df_latlng=df.merge(latlng,left_on="PostalCode",right_on="Postal Code")    # inner join two dataframes, "df" as left set and "latlng" as right set, keyword is the Postal Code from both sets                                                       #                                                        # checked the merged dataframe "df_latlng"
df_latlng=df_latlng[["Postal Code","Borough","Neighborhood","Latitude","Longitude"]] 

In [16]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.

Libraries imported.


#### Plotting on the `Folium` Map - Overview of Toronto

In [17]:
df_latlng.head

<bound method NDFrame.head of    Postal Code           Borough  \
0          M4A        North York   
1          M5A  Downtown Toronto   
2          M6A        North York   
3          M7A      Queen's Park   
4          M9A         Etobicoke   
5          M1B       Scarborough   
6          M3B        North York   
7          M4B         East York   
8          M5B  Downtown Toronto   
9          M6B        North York   
10         M9B         Etobicoke   
11         M1C       Scarborough   
12         M3C        North York   
13         M4C         East York   
14         M5C  Downtown Toronto   
15         M6C              York   
16         M9C         Etobicoke   
17         M1E       Scarborough   
18         M4E      East Toronto   
19         M5E  Downtown Toronto   
20         M6E              York   
21         M1G       Scarborough   
22         M4G         East York   
23         M6G  Downtown Toronto   
24         M1H       Scarborough   
25         M3H        North York  

In [18]:
import folium
import numpy as np
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
import requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

In [19]:
address = 'Toronto,ON'

geolocator = Nominatim()
location = geolocator.geocode(address)
toronto_latitude = location.latitude
toronto_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(toronto_latitude, toronto_longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinate of Toronto are 43.653963, -79.387207.


In [20]:
#toronto_latitude=43.6532
#toronto_longitude=-79.3832
toronto_map=folium.Map(location=[toronto_latitude, toronto_longitude],zoom_start=12)

for lat, lng, label in zip(df_latlng['Latitude'], df_latlng['Longitude'], df_latlng['Neighborhood']):
    label = folium.Popup(label)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

toronto_map

In [21]:

X=df_latlng.drop(['Postal Code','Neighborhood','Borough'],axis=1)
X.head()

Unnamed: 0,Latitude,Longitude
0,43.725882,-79.315572
1,43.65426,-79.360636
2,43.718518,-79.464763
3,43.662301,-79.389494
4,43.667856,-79.532242


In [22]:
num_clusters = 5

k_means = KMeans(init = "k-means++", n_clusters = num_clusters, n_init = 12)
k_means.fit(X)
labels = k_means.labels_

print(labels)

[0 4 2 4 1 3 0 0 4 2 1 3 0 0 4 4 1 3 0 4 2 3 0 4 3 2 0 4 4 3 2 0 4 4 3 2 0
 4 4 0 2 0 4 1 3 2 0 2 2 1 0 2 2 1 3 2 4 4 1 3 2 4 4 1 1 3 4 4 1 3 4 4 3 4
 4 1 3 4 1 3 4 1 4 0 1]


In [23]:
labels.shape

(85,)

In [24]:
X["Label"]=labels

In [25]:
data=df_latlng.merge(X,left_on=["Latitude","Longitude"],right_on=["Latitude","Longitude"])    # inner join two dataframes, "df" as left set and "latlng" as right set, keyword is the Postal Code from both sets                                                       #                                                        # checked the merged dataframe "df_latlng"
#data=data[["Postal Code","Borough","Neighborhood","Latitude","Longitude","Label"]] 
data.head()


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Label
0,M4A,North York,"Parkwoods,Victoria Village",43.725882,-79.315572,0
1,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,4
2,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,2
3,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,4
4,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242,1


In [26]:
data_0=data.loc[data["Label"]==0]
data_0.head()

data_1=data.loc[data["Label"]==1]
data_1.head()

data_2=data.loc[data["Label"]==2]
data_2.head()

data_3=data.loc[data["Label"]==3]
data_3.head()

data_4=data.loc[data["Label"]==4]
data_4.head()

#data_5=data.loc[data["Label"]==5]
#data_5.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Label
1,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,4
3,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,4
8,M5B,Downtown Toronto,"Ryerson,Garden District",43.657162,-79.378937,4
14,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418,4
15,M6C,York,Humewood-Cedarvale,43.693781,-79.428191,4


In [27]:
toronto_map=folium.Map(location=[toronto_latitude, toronto_longitude],zoom_start=12)

for lat, lng, label in zip(data_0['Latitude'], data_0['Longitude'], data_0["Neighborhood"]):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

for lat, lng, label in zip(data_1['Latitude'], data_1['Longitude'], data_1['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='red',
        fill=True,
        fill_color='lightpink',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
for lat, lng, label in zip(data_2['Latitude'], data_2['Longitude'], data_2['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='darkorange',
        fill=True,
        fill_color='yellow',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
    
for lat, lng, label in zip(data_3['Latitude'], data_3['Longitude'], data_3['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='lightgreen',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  

for lat, lng, label in zip(data_4['Latitude'], data_4['Longitude'], data_4['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='purple',
        fill=True,
        fill_color='violet',
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)  
 
    
toronto_map

#### Detailed Analysis on Downtown Toronto

In [28]:
data.loc[data[Label]]

NameError: name 'Label' is not defined

In [None]:
Borough=data["Borough"].unique()

In [None]:
print(Borough)

There are 11 distinct boroughs in Toronto. Here we will focus on the analysis of "Downtown Toronto" firstly

In [None]:
data_dt=data.loc[data["Borough"]=="Downtown Toronto"]

In [None]:
data_dt.head()

In [None]:
Neighborhoods=data_dt.iloc[0,2]
for i in range(1,data_dt.shape[1]):
    Neighborhoods=Neighborhoods+","+ data_dt.iloc[i,2]

In [None]:
Neighborhoods

In [None]:
df_dt=pd.DataFrame(Neighborhoods.split(","))

In [None]:
df_dt.columns=["Neighborhood"]

In [None]:
df_dt.head()

In [None]:
df_dt.shape

There are 11 different neighborhoods contained in Downtown Toronto, and we will focus on analyzing on these neighborhoods using FourSquare API.

In [None]:
# entering the credentials for FourSquare API 
CLIENT_ID = 'O34LIZWJVCJGRPLWKRYKIXDK13ZDUHDSM4IFLI4KINLJIYA1' # your Foursquare ID
CLIENT_SECRET = 'AJRTIEBCWY51QE0DFMY1PQDM2SE0AMZQE5LJ0XLI2KMH3ZSM' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [None]:
address = 'Harbourfront,Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

In [None]:
latitude=[]
longitude=[]

In [None]:
for i in range(11):
    address=df_dt.iloc[i,0]+",Toronto"
    geolocator = Nominatim()
    location = geolocator.geocode(address)
    latitude.append(location.latitude)
    longitude.append(location.longitude)
    

In [None]:
df_dt["Latitude"]=latitude

In [None]:
df_dt["Longitude"]=longitude

In [None]:
df_dt.head()

After getting the latitudes and longitudes for these neighbohoods spots in Donwtown Toronto, now let's create a function to find all neighborhoods' venues in Downtown Toronto.

In [None]:
import requests

In [None]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500,LIMIT=100):
    
   
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
toronto_venues = getNearbyVenues(names=df_dt['Neighborhood'],
                                   latitudes=df_dt["Latitude"],
                                   longitudes=df_dt['Longitude']
                                  )

In [None]:
toronto_venues.head()

In [None]:
toronto_venues.groupby("Neighborhood").count()

In [None]:
toronto_venues.reset_index(inplace=True)

In [None]:
toronto_venues.drop(["index"],axis=1,inplace=True)

In [None]:
toronto_venues.head()

In [None]:
toronto_venues.groupby('Neighborhood').count()

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
toronto_onehot.set_index("Neighborhood",inplace=True)

In [None]:
toronto_onehot.reset_index(inplace=True)

In [None]:
toronto_onehot.head()

In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

In [None]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
toronto_merged = df_dt

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[toronto_latitude, toronto_longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
toronto_merged.sort_values(by=["Cluster Labels"])