# Jupyter Notebook containing code for the IBM Capstone Project Course

In [1]:
import pandas as pd
import numpy as np

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


# Code for the Week 3 Peer-review Assignment

## From here on, the code refers to the segmentation and clustering of the neighborhoods of Toronto

# #### 1st part ####

In [3]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Since the information in the Wikipedia is already in a table (structured),  we can use pandas to directly read this information.

In [4]:
# Reading Wiki page into pandas object
temp_wiki = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')
toronto_df = temp_wiki[0]

# Rename the columns to be like the ones shown in the assignment instructions
toronto_df.columns = ['PostalCode','Borough','Neighborhood']

# Drop non-assigned boroughs by first substituting the string by NaN and then droping it
toronto_df.replace('Not assigned', np.NaN, inplace=True)
toronto_df.dropna(inplace=True)
toronto_df.reset_index(inplace=True, drop=True)
toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [5]:
# Check dataframe's shape
toronto_df.shape

(103, 3)

# #### 2nd part

In [6]:
# Installing and importing geocoder library
!pip install geocoder
import geocoder
print('Library successfully loaded!')

Collecting geocoder
  Using cached https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl
Collecting ratelim (from geocoder)
  Using cached https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting requests (from geocoder)
  Using cached https://files.pythonhosted.org/packages/29/c1/24814557f1d22c56d50280771a17307e6bf87b70727d975fd6b2ce6b014a/requests-2.25.1-py2.py3-none-any.whl
Collecting future (from geocoder)
Collecting six (from geocoder)
  Using cached https://files.pythonhosted.org/packages/ee/ff/48bde5c0f013094d729fe4b0316ba2a24774b3ff1c52d924a8a4cb04078a/six-1.15.0-py2.py3-none-any.whl
Collecting click (from geocoder)
  Using cached https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl
Collecting decorator (from 

### WARNING: The geocoder library is not working due to a lack of API key to make requests to Google Maps.
### Since this API key is now paid, I will directly download the .csv file containing latitude and longitude data

In [7]:
# Loop through each postal code to get the latitude and longitude
# As described in the assignment instructions, we should loop through each one because sometimes the request fail

# Create a list containing the lat. and long. for each neighborhood so we can add to the dataframe later
#lat_list=[]
#long_list=[]
#for i, pc in enumerate(toronto_df['PostalCode']):
#    ll_coords=None
#    while (ll_coords == None):
#        print('Trying to get coordinates for {} postal code'.format(pc))
#        g = geocoder.google('{}, Toronto, Ontario'.format(pc))
#        ll_coords = g.latlng
#    lat_list.append(ll_coords[0])
#    long_list.append(ll_coords[1])
    
#lat_list
#long_list

In [8]:
# Download .csv file containing latitude and longitude information (geocoder not working)
!wget -q -O 'latlong_data.csv' https://cocl.us/Geospatial_data
latlong_df = pd.read_csv('latlong_data.csv')
latlong_df.columns = ['PostalCode', latlong_df.columns[1], latlong_df.columns[2]]
latlong_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Join dataframes in order to add lat. and long. columns to the main dataframe

In [9]:
toronto_df = toronto_df.join(latlong_df.set_index('PostalCode'), on='PostalCode')
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [10]:
# Check shape
toronto_df.shape

(103, 5)

# ### 3rd part

### With the latitude and longitude of the postal codes, we can start exploring all these neighborhood using the Foursqure API

### First, we set up the credentials

In [19]:
client_id = 'OXRTGFVDOOXWP4YWZZIFL2MYJX0CZ1BUQIFLY3GYQW2CZZKQ'
client_secret = 'JL2GNLHNDSPVEV4LO1STXL4UYBEORBVFIWVDWVQGGSQY3DNV'
version = '20180605'
limit = 100

### Now we loop through each postal code and find the venues within 500 meters of each, for a limit of 100 

### With the retrieved info, we create our working dataframe

In [20]:
import requests
from pandas.io.json import json_normalize

In [22]:
rad = 500
temp_df = toronto_df.set_index('PostalCode')
base_url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&radius={}&limit={}'.format(client_id,client_secret,version,rad,limit)

venues_list=[]
for pc in toronto_df['PostalCode']:
    lat = temp_df.loc[pc, 'Latitude']
    long = temp_df.loc[pc, 'Longitude']
    
    # Set up the url for the API request by adding the lat. and long. info to the base url
    url = base_url + '&ll={},{}'.format(lat,long)
    print(url)
    result = requests.get(url).json()
    print(result)
    result = result['response']['groups'][0]['items']
    
    for venue in result:
        venues_list.append([pc, venue['venue']['name'], venue['venue']['categories'][0]['name']])

https://api.foursquare.com/v2/venues/explore?client_id=OXRTGFVDOOXWP4YWZZIFL2MYJX0CZ1BUQIFLY3GYQW2CZZKQ&client_secret=JL2GNLHNDSPVEV4LO1STXL4UYBEORBVFIWVDWVQGGSQY3DNV&v=20180605&radius=500&limit=100&ll=43.7532586,-79.3296565
{'meta': {'code': 429, 'errorType': 'quota_exceeded', 'errorDetail': 'Quota exceeded', 'requestId': '601ab30d2efe1b20da00f4ee'}, 'response': {}}


KeyError: 'groups'

In [None]:
venues_df = pd.DataFrame(venues_list)
venues_df.columns=['PostalCode','Venue Name','Venue Category']
venues_df.head()

### Now we create dummie variables/features (one-hot encoding) for each category so we can calculate the frequency (and mean) of each venue for each postal code
### Remember that we are doing all in terms of postal codes because they are the ones holding the latitude/longitude information, not the boroughs
### We may experiment grouping the dataframe by boroughs later for the clustering

In [None]:
# First, let's count the number of unique categories
len(pd.unique(venues_df['Venue Category']))

In [None]:
# One-hot enconding
venues_onehot = pd.get_dummies(venues_df[['Venue Category']], prefix="", prefix_sep="")

# We now add the postal codes
venues_onehot['PostalCode'] = venues_df['PostalCode']

# Fix columns
col_order = [[venues_onehot.columns[-1]] + list(venues_onehot.columns[:-1])]
venues_onehot = venues_onehot[col_order[0]]
venues_onehot.head(10)

In [None]:
# Take the mean of category appearances for each postal code
venues_grouped = venues_onehot.groupby('PostalCode').mean().reset_index()
venues_grouped.head()

### With our working dataframe created, we are now ready to apply the K-Means algorithms

In [None]:
from sklearn.cluster import KMeans

In [None]:
venues_cluster = venues_grouped.drop('PostalCode', 1)

kclusters = 5

kmm = KMeans(n_clusters=kclusters, init="k-means++", n_init=10, random_state=0)

kmm.fit(venues_cluster)

kmm.labels_

### With the labels obtained, we insert it into the previous dataframe

In [None]:
venues_grouped.insert(1, 'Cluster', kmm.labels_)
venues_grouped.head()

### Let's clean the database, dropping the categories and adding latitude/longitude so we can finally plot it

In [None]:
# Drop the category information
venues_grouped = venues_grouped.iloc[:,[0,1]]

# Add latitude/longitude and borough/neighborhood information
postal_clusters = venues_grouped.join(toronto_df.set_index('PostalCode'), on='PostalCode')

# Rearrange columns so that cluster goes to the end
new_cols = [col for col in postal_clusters.columns if not col == 'Cluster']
new_cols.append('Cluster')

postal_clusters = postal_clusters[new_cols]
postal_clusters.head()

### Now we can finally create a Folium map and insert our postal code-based clusters

In [None]:
import folium

import matplotlib.pyplot as plt
import matplotlib.colors as colors_mpl

In [None]:
toronto_lat = 43.6532
toronto_long = -79.3832

# Create our map centered in Toronto
postal_map = folium.Map(location=[toronto_lat, toronto_long], zoom_start=11)

# Now we create our colored markers for each postal code and cluster number
cols = plt.cm.Spectral(np.linspace(0,1,len(set(kmm.labels_))))
cols = [colors_mpl.rgb2hex(x) for x in cols]

for postal, lat, long, clus, bor in zip(postal_clusters['PostalCode'], postal_clusters['Latitude'], postal_clusters['Longitude'], postal_clusters['Cluster'], postal_clusters['Borough']):
    label = folium.Popup('{}: {}, {}'.format(postal,bor,clus))
    folium.CircleMarker([lat,long],
                       radius=5,
                       popup=label,
                       fill=True,
                       color=cols[clus],
                       fill_color=cols[clus],
                       fill_opacity=0.9).add_to(postal_map)
postal_map

### We can see that most of Toronto's postal codes are clustered as "0"
### We could then analyze what are the characteristics of the 0-cluster so that we can make assertions about these neighborhoods

## From the above result, we see that clustering by postal codes may not be the best choice, since many different postal codes have very similar lat/long. locations

## We now try something different
### We will segment and cluster only the neighborhoods pertaining to Downtown Toronto borough

In [None]:
# We first form our dataframe containing Downtown Toronto information
dt_df = toronto_df[toronto_df['Borough']=='Downtown Toronto']
dt_df.head()

### Now we explore venues for each (set of) neighborhood in a radius of 200 meters

In [None]:
rad = 200
temp_df = dt_df.set_index('Neighborhood')
base_url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&radius={}&limit={}'.format(client_id,client_secret,version,rad,limit)

dtvenues_list=[]
for nb in dt_df['Neighborhood']:
    lat = temp_df.loc[nb, 'Latitude']
    long = temp_df.loc[nb, 'Longitude']
    
    # Set up the url for the API request by adding the lat. and long. info to the base url
    url = base_url + '&ll={},{}'.format(lat,long)
    
    result = requests.get(url).json()['response']['groups'][0]['items']
    
    for venue in result:
        dtvenues_list.append([nb, venue['venue']['name'], venue['venue']['categories'][0]['name']])

In [None]:
# We now create a dataframe with the venues found from the above list
dtvenues_df = pd.DataFrame(dtvenues_list, columns=['Neighborhood', 'Venue Name', 'Venue Category'])
dtvenues_df.head()

In [None]:
# One-hot encoding of the above df
dtvenues_onehot = pd.get_dummies(dtvenues_df[['Venue Category']], prefix="", prefix_sep="")

# Now we add the Neighborhood and fix columns
dtvenues_onehot['Neighborhood'] = dtvenues_df['Neighborhood']
col_order = ['Neighborhood'] + [col for col in dtvenues_onehot.columns if not col == 'Neighborhood']
dtvenues_onehot = dtvenues_onehot[col_order]
dtvenues_onehot.head()

### Finally, we get the mean frequency for each neighborhood and apply our K-Means algorithm

In [None]:
dtvenues_mean = dtvenues_onehot.groupby('Neighborhood').mean().reset_index()
dtvenues_mean.head()

In [None]:
dtvenues_cluster = dtvenues_mean.drop('Neighborhood', axis=1)

kclusters=5

dt_kmm = KMeans(init="k-means++", n_clusters=kclusters, n_init=10, random_state=0)
dt_kmm.fit(dtvenues_cluster)

dt_kmm.labels_

### Now we once again clean the dataframe and insert appropriate information for plotting

In [None]:
dtvenues_mean.insert(1, 'Cluster', dt_kmm.labels_)
dtvenues_mean.head()

In [None]:
dtcluster_df = dtvenues_mean.iloc[:, :2]
dtcluster_df.head()

In [None]:
# Join Postal Code and Lat/Long information
dtcluster_df = dtcluster_df.join(dt_df.drop('Borough',axis=1).set_index('Neighborhood'), on='Neighborhood')

# Let Cluster column be the last one and PostalCode the first, just for the sake of consistency
col_order = ['PostalCode'] + [col for col in dtcluster_df.columns if not (col=='PostalCode' or col=='Cluster')] + ['Cluster']
dtcluster_df = dtcluster_df[col_order]
dtcluster_df.head()

### Finally, we plot the neighborhoods with cluster colors on a Folium map

In [None]:
# Let us centralize the map in Downtown Toronto area
# In order to do that, we get the mean of lat/long from all postal codes in such area
lat,long = float(dt_df.groupby('Borough').mean()['Latitude']), float(dt_df.groupby('Borough').mean()['Longitude'])

dt_map = folium.Map(location=[lat,long], zoom_start=13)

# Now we create our colored markers for each postal code and cluster number
cols = plt.cm.Spectral(np.linspace(0,1,len(set(dt_kmm.labels_))))
cols = [colors_mpl.rgb2hex(x) for x in cols]

for postal, lat, long, clus, nb in zip(dtcluster_df['PostalCode'], dtcluster_df['Latitude'], dtcluster_df['Longitude'], dtcluster_df['Cluster'], dtcluster_df['Neighborhood']):
    label = folium.Popup('{}: {}, {}'.format(postal,nb,clus))
    folium.CircleMarker([lat,long],
                       radius=5,
                       popup=label,
                       fill=True,
                       color=cols[clus],
                       fill_color=cols[clus],
                       fill_opacity=0.9).add_to(dt_map)
dt_map

###### We can see that, since this area is very dense, and hence the Neighborhoods are too close to each other, almost all of them pertain to a single cluster