# IBM Data Science Professional Certificate - Capstone Project

Author: Carl-Michael Edeling

##### This final project demonstrates the skills and knowledge gained during the course.

In [538]:
import pandas as pd
import numpy as np
import requests

In [539]:
print("Hello Capstone Project Course!")

Hello Capstone Project Course!


---

# Webscraping - Toronto Neighbourhood Data

Convert contents from URL to Data Frame.

In [540]:
urlwiki1 = "https://en.wikipedia.org/w/index.php?title=List_of_postal_codes_of_Canada:_M&oldid=945633050."
list = pd.read_html(urlwiki1)
type(list)

list

In [541]:
len(list)

3

In [542]:
list[0][0:5]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


The first table is the one we're interested in.  So assign it to the "table" variable.

In [543]:
table = list[0]
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


We need to drop the rows that have unassigned Boroughs.

In [544]:
# get names of indexes for which column Borough has value "Not assigned"
index_names = table[table['Borough'] == 'Not assigned'].index
  
# drop these row indexes from dataFrame
table.drop(index_names, inplace = True)
  
table.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


The index needs to be reset.

In [545]:
table.reset_index(inplace=False).head()

Unnamed: 0,index,Postcode,Borough,Neighbourhood
0,2,M3A,North York,Parkwoods
1,3,M4A,North York,Victoria Village
2,4,M5A,Downtown Toronto,Harbourfront
3,5,M6A,North York,Lawrence Heights
4,6,M6A,North York,Lawrence Manor


Check to see if any neighbourhoods are "Not assigned"

In [546]:
table[table['Neighbourhood'] == 'Not assigned'].count()

Postcode         0
Borough          0
Neighbourhood    0
dtype: int64

There are no unassigned Neighbourhoods

So now we need to group neighbourhoods together with postal code and boroughs.

In [574]:
table_grouped = table.groupby(['Postcode','Borough'])['Neighbourhood'].apply(', '.join).reset_index()
table_grouped.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Let's see the shape of the new dataframe.

In [554]:
print("The shape is: ", table_grouped.shape)

The shape is:  (103, 3)


---

# Combining Dataframe with Geospatial Data

In [None]:
geo = pd.read_csv(r"C:\A - Work\Courses\Data Science - IBM\Lessons\10 - Applied Data Science Capstone\Final Assignment\Geospatial_Coordinates.csv")

In [None]:
geo.head()

In [None]:
print("The shape is: ", geo.shape)

But this dataframe also has a column for Postal Code, so let's drop it.

In [None]:
geo1 = geo.drop(labels='Postal Code', axis=1, index=None, columns=None, level=None, inplace=False, errors='raise')
geo1.head()

In [None]:
# Join the 2 dfs
frames = [table_grouped, geo1]

In [None]:
comb_table = pd.concat(
    frames,
    axis=1,
    join="outer",
    ignore_index=False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity=False,
    copy=True,
)

In [None]:
comb_table.head()

In [None]:
print("The shape is: ", comb_table.shape)

---

# Clustering

Import needed Libraries

In [None]:
import os
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

#### Create a map of Toronto with neighbourhoods superimposed on top.

In [None]:
from geopy.geocoders import ArcGIS

address = 'Toronto'

geolocator = ArcGIS(scheme="https")     # Problem was using Nominatim.  Also, ArcGIS(scheme="http") DIDN'T work.
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

In [None]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(comb_table['Latitude'], 
                                           comb_table['Longitude'], 
                                           comb_table['Borough'], 
                                           comb_table['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

---

Get nearby venues for each neighbourhood.

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [None]:
toronto_venues = getNearbyVenues(names=comb_table['Neighbourhood'],
                                   latitudes=comb_table['Latitude'],
                                   longitudes=comb_table['Longitude'])

In [None]:
print("The shape is: ", toronto_venues.shape)
toronto_venues.head()

Let's check how many venues were returned for each neighborhood


In [None]:
total_venue_categories = toronto_venues.groupby('Neighbourhood').count()

In [None]:
total_venue_categories.head()

In [None]:
print("The total number of venues = ", sum(total_venue_categories["Venue"]))

In [None]:
print('There are {} uniques venues.'.format(len(toronto_venues['Venue'].unique())))

In [None]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

---

# Analyzing each neighbourhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

In [None]:
toronto_onehot.shape

#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category


In [None]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

In [None]:
toronto_grouped.shape

#### Let's print each neighbourhood along with the top 5 most common venues

In [None]:
"""num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')"""

A function to sort the venues in descending order.

In [None]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Create the new dataframe and display the top 10 venues for each neighbourhood.

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted.head()

# Cluster Neighbourhoods

Run _k_-means to cluster the neighbourhood into 10 clusters.


In [None]:
# set number of clusters
kclusters = 10

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

test_kmeans = KMeans(n_clusters=kclusters, 
                init='k-means++', 
                n_init=100,
                max_iter=5000,
                tol=0.001, 
                random_state=12)
test_labels = test_kmeans.fit_predict(toronto_grouped_clustering) #toronto_grouped_clustering
test_labels

### What is the optimal value of k?

I tried to calculate the efficacy of using various numbers of clusters, ranging from 2 to 20.

I tried 3 methods:

1. Method - model.score()
2. Attribute - model.inertia_
3. Function - calculate the WSS score.  Unfortunately I couldn't get this function to work.

#### Using .score() 

"Opposite of the value of X on the K-means objective."

By comparing it to inertia, it gives the negative of the inertia.

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
kclusters = 20
kscores = []
for k in range(2, kclusters):
    kscore = KMeans(n_clusters=k, 
                init='k-means++', 
                n_init=100,
                max_iter=3000,
                tol=0.01, 
                random_state=7).fit(toronto_grouped_clustering).score(toronto_grouped_clustering, 
                                                                      y=None, 
                                                                      sample_weight=None)
    kscores.append(kscore)
    #print(kscore1)
#print(kscore)

#### Using .inertia_

"Sum of squared distances of samples to their closest cluster center."

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
kclusters = 20
kinertia = []
for k in range(2, kclusters):
    kscore = KMeans(n_clusters=k, 
                init='k-means++', 
                n_init=100,
                max_iter=3000,
                tol=0.01, 
                random_state=7).fit(toronto_grouped_clustering)
    kinertia.append(kscore.inertia_)
    #print(kscore1)
#print(kscore)

"""test_kmeans = KMeans(n_clusters=kclusters, 
                init='k-means++', 
                n_init=100,
                max_iter=5000,
                tol=0.001, 
                random_state=12)
test_labels = test_kmeans.fit_predict(toronto_grouped_clustering) #toronto_grouped_clustering
test_labels"""

In [None]:
kinertia

In [None]:
kscores

Let's graph number of clusters against kscores and kinertia

In [None]:
x = np.linspace(2, 19, 18)

In [None]:
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot()
ax.set(xlim=[0,20],ylim=[-8,-18])
#plt.xlim([2, 20])
ax.plot(x, kscores, color='lightblue', linewidth=1)
ax.scatter(x,kscores,
color='darkgreen',
marker='^')
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("k-Means Scores")


In [None]:
import matplotlib.pyplot as plt
fig = plt.figure()
ax = fig.add_subplot()
ax.set(xlim=[0,20],ylim=[8,18])
ax.plot(x, kinertia, color='lightblue', linewidth=1)
ax.scatter(x,kinertia,
color='darkgreen',
marker='^')
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("Inertia Scores")

Unfortuantely there is a poorly defined "elbow-point" on both graphs, so it is difficult to tell.  It appears that the "elbow point" is around 9 clusters.

Now, let's run the algorithm again with the optimal number of clusters: 9

In [None]:
# set number of clusters
kclusters = 9

kmeans_8_model = KMeans(n_clusters=kclusters, 
                init='k-means++', 
                n_init=100,
                max_iter=5000,
                tol=0.001, 
                random_state=12)
kmeans_8 = kmeans_8_model.fit_predict(toronto_grouped_clustering)
kmeans_8

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood.


In [None]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans_8_model.labels_)

toronto_merged = comb_table

# merge manhattan_grouped with manhattan_data to add latitude/longitude for each neighbourhood
toronto_merged = toronto_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

In [None]:
toronto_merged.head(5)

Finally, let's visualize the resulting clusters


In [None]:
toronto_merged[toronto_merged["Cluster Labels"] == 1].head()

In [None]:
toronto_merged[toronto_merged['Cluster Labels'].isna()]

In [None]:
toronto_merged = toronto_merged.dropna(axis = 0, how = 'any')
toronto_merged.head()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], 
                                  toronto_merged['Longitude'], 
                                  toronto_merged['Neighbourhood'], 
                                  toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)],
        fill=True,
        fill_color=rainbow[int(cluster)],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters