# Coursera IBM Data Science Professional Specialization Capstone Project

## Description
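This notebook contains the final capstone project: it scrapes Toronto's neighbourhoods from Wikipedia, geocodes them, fetches nearby venues from Foursquare, and clusters the neighbourhoods by venue category with k-means.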

In [1]:
import pandas as pd
import numpy as np

In [2]:
print('Hello Capstone Project Course!')

Hello Capstone Project Course!


## Scrape Wikipedia for Toronto neighbourhoods

In [None]:
# Install BeautifulSoup
!conda install -c anaconda beautifulsoup4 --yes

In [1]:
from bs4 import BeautifulSoup
import requests

# Download the Wikipedia page that lists Toronto's neighbourhoods.
r = requests.get("https://en.wikipedia.org/wiki/List_of_neighbourhoods_in_Toronto")
# Fail loudly on an HTTP error instead of silently parsing an error page,
# which would only surface later as missing headings or tables.
r.raise_for_status()
soup = BeautifulSoup(r.content)

In [3]:
# Check that we have the right page
print(soup.title)

<title>List of neighbourhoods in Toronto - Wikipedia</title>


In [93]:
# Get coordinates for a neighbourhood
# Returns a LatLng array
def get_coordinates(neighbourhood):
    """Fetch a neighbourhood's Wikipedia article and read its coordinates.

    Parameters
    ----------
    neighbourhood
        Parsed element whose first ``<a>`` child carries a site-relative
        ``href`` to the neighbourhood's article.

    Returns
    -------
    list
        ``[lat, lng]`` holding the text of the first ``span.latitude`` and
        ``span.longitude`` elements on the article page. An entry is ``None``
        when the corresponding span is absent; both are ``None`` when the
        element has no usable link.
    """
    base_uri = 'https://en.wikipedia.org'

    # An entry without a link has no article to look up.
    link = neighbourhood.a
    if link is None or not link.get('href'):
        return [None, None]

    r_neigh = requests.get(base_uri + link['href'])
    soup_neigh = BeautifulSoup(r_neigh.content)

    def span_text(css_class):
        # Explicit None check instead of a bare ``except``: only the
        # "span not on the page" case is treated as missing data, so real
        # errors (and Ctrl-C) are no longer swallowed.
        span = soup_neigh.find('span', class_=css_class)
        return span.get_text() if span is not None else None

    return [span_text('latitude'), span_text('longitude')]

# Build dataframe
column_names = ['District', 'Neighbourhood', 'Latitude', 'Longitude']

# Collect one dict per neighbourhood and build the frame once at the end.
# DataFrame.append copied the whole frame on every call and was removed in
# pandas 2.0.
records = []

# Iterate through districts
for district in soup.find_all('h3'):
    # Get district name
    heading = district.find('span', class_='mw-headline')
    if heading is None:
        # Stop at the first <h3> that has no headline span.
        break
    district_name = heading.get_text()
    print('Getting data for {}'.format(district_name))

    # Get neighbourhoods in district: every <li> of the table inside the
    # next class-less <div> sibling of the heading.
    for neighbourhood in district.find_next_sibling('div', class_=None).table.find_all('li'):
        lat_lng = get_coordinates(neighbourhood)
        records.append({'District': district_name,
                        'Neighbourhood': neighbourhood.get_text(),
                        'Latitude': lat_lng[0],
                        'Longitude': lat_lng[1]})

# Passing columns= keeps the column order and also yields a correctly
# shaped (empty) frame when nothing was scraped.
neigh = pd.DataFrame(records, columns=column_names)

neigh

Getting data for Old Toronto
Getting data for East York
Getting data for Etobicoke
Getting data for North York
Getting data for Scarborough
Getting data for York


Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43°39′N,79°24′W
1,Old Toronto,The Annex,43°40′12″N,79°24′14″W
2,Old Toronto,Baldwin Village,43°39′22″N,79°23′36″W
3,Old Toronto,Cabbagetown,43°39′59″N,79°21′46″W
4,Old Toronto,CityPlace,43°38′24″N,79°23′43″W
5,Old Toronto,Chinatown,43°39′10″N,79°23′53″W
6,Old Toronto,Church and Wellesley,43°39′56.50″N,79°22′51.44″W
7,Old Toronto,Corktown,43°39′20″N,79°21′35″W
8,Old Toronto,Discovery District,43°39′29″N,79°23′17″W
9,Old Toronto,Distillery District,,


In [101]:
# Save data as CSV
neigh.to_csv('toronto_neigh.csv')

In [95]:
# Report how many distinct districts and how many rows were scraped.
district_count = len(neigh['District'].unique())
neighbourhood_count = neigh.shape[0]
summary = 'The dataframe has {} districts and {} neighbourhoods.'.format(district_count, neighbourhood_count)
print(summary)

The dataframe has 6 districts and 212 neighbourhoods.


In [96]:
!conda install -c conda-forge geopy --yes

Collecting package metadata: ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\Desmond\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.3.9           |           py37_0         149 KB  conda-forge
    conda-4.6.14               |           py37_0         2.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.19.0               |             py_0          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.3 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.49-py_0
  geopy              conda-forge/noarch::geopy-1.19.0-py_0

The following packages will be UPDATED:





  current version: 4.6.11
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda


'ET' is not recognized as an internal or external command,
operable program or batch file.


## Get coordinates for Toronto

In [2]:
from geopy.geocoders import Nominatim

In [3]:
address = 'Toronto, Canada'

# Geocode the city centre; latitude/longitude are reused by later cells to
# centre the maps.
geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when the service finds no match; fail with a
    # clear message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


## Wrangle data

In [82]:
# Reload the scraped neighbourhoods; the first CSV column is used as the index.
df = pd.read_csv('toronto_neigh.csv', index_col=0)

# Convert latitude and longitude from degree/minute/second text to decimal degrees.
# Parsing is purely positional: characters 0-1 are read as degrees, 3-4 as
# minutes and, for longer strings, character 6 up to the last two characters
# as seconds. The hemisphere letter is never inspected.
# NOTE(review): statement order matters. Each pair of assignments writes numbers
# back into the same column that the next statement parses as text; the second
# statement appears to rely on .str accessors yielding NaN for the rows that
# were already converted, so that its mask skips them - confirm before reordering.

# 7-character values: degrees and minutes only (no seconds), e.g. 43°39′N
df.loc[df['Latitude'].str.len() == 7, 'Latitude'] = \
    pd.to_numeric(df['Latitude'].str.slice(start=0, stop=2)) + \
    pd.to_numeric(df['Latitude'].str.slice(start=3, stop=5))/60

# Longer values: seconds are included, e.g. 43°40′12″N
# np.nan_to_num replaces NaN seconds with 0 before dividing by 3600.
df.loc[df['Latitude'].str.len() > 7, 'Latitude'] = \
    pd.to_numeric(df['Latitude'].str.slice(start=0, stop=2)) + \
    pd.to_numeric(df['Latitude'].str.slice(start=3, stop=5))/60 + \
    np.nan_to_num(pd.to_numeric(df['Latitude'].str.slice(start=6, stop=-2)))/3600

# Longitude, 7-character values: degrees and minutes only (no seconds).
# Every term is negated, i.e. all longitudes are treated as west.
df.loc[df['Longitude'].str.len() == 7, 'Longitude'] = \
    - pd.to_numeric(df['Longitude'].str.slice(start=0, stop=2)) - \
    pd.to_numeric(df['Longitude'].str.slice(start=3, stop=5))/60

# Longitude, longer values: seconds are included (also negated).
df.loc[df['Longitude'].str.len() > 7, 'Longitude'] = \
    - pd.to_numeric(df['Longitude'].str.slice(start=0, stop=2)) - \
    pd.to_numeric(df['Longitude'].str.slice(start=3, stop=5))/60 - \
    np.nan_to_num(pd.to_numeric(df['Longitude'].str.slice(start=6, stop=-2)))/3600

df.head()

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43.65,-79.4
1,Old Toronto,The Annex,43.67,-79.4039
2,Old Toronto,Baldwin Village,43.6561,-79.3933
3,Old Toronto,Cabbagetown,43.6664,-79.3628
4,Old Toronto,CityPlace,43.64,-79.3953


In [83]:
# Find NaN coordinates
df[df['Latitude'].isnull()]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
9,Old Toronto,Distillery District,,
10,Old Toronto,The Entertainment District,,
19,Old Toronto,Little Japan[5],,
78,Old Toronto,Little Tibet,,
109,Etobicoke,Humberwood,,
193,Scarborough,Steeles,,


In [84]:
# Clean Neighbourhood name
# Row 19 carries a trailing Wikipedia footnote marker ("Little Japan[5]").
# Read and write through a single .loc[row, column] call:
# df.loc[19]['Neighbourhood'] = ... is chained assignment, which may write to
# a temporary copy and leave df unchanged.
footnoted_name = df.loc[19, 'Neighbourhood']
if footnoted_name.endswith(']') and '[' in footnoted_name:
    # Cut at the last '[' so that re-running the cell does not strip
    # further characters from an already cleaned name.
    df.loc[19, 'Neighbourhood'] = footnoted_name[:footnoted_name.rindex('[')]

df.loc[19]

District          Old Toronto
Neighbourhood    Little Japan
Latitude                  NaN
Longitude                 NaN
Name: 19, dtype: object

In [85]:
# Get coordinates for missing coordinates
# Geocode every neighbourhood whose latitude is still missing. Rows the
# geocoder cannot resolve (geocode returns None) are left untouched.
missing_labels = df.index[df['Latitude'].isnull()]
for row_label in missing_labels:
    location = geolocator.geocode(df.loc[row_label, 'Neighbourhood'] + ', Toronto, Canada')

    if location is not None:
        # Single .loc[row, column] assignment writes into df itself;
        # df.loc[row]['Latitude'] = ... is chained assignment and may only
        # modify a temporary copy.
        df.loc[row_label, 'Latitude'] = location.latitude
        df.loc[row_label, 'Longitude'] = location.longitude

In [86]:
# Find NaN coordinates
df[df['Latitude'].isnull()]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
19,Old Toronto,Little Japan,,


In [87]:
# drop row 19 as location cannot be found
# index= names the axis explicitly (the positional axis argument was removed
# in pandas 2.0); errors='ignore' lets the cell be re-run after the row is gone.
df = df.drop(index=[19], errors='ignore')

In [88]:
df

Unnamed: 0,District,Neighbourhood,Latitude,Longitude
0,Old Toronto,Alexandra Park,43.65,-79.4
1,Old Toronto,The Annex,43.67,-79.4039
2,Old Toronto,Baldwin Village,43.6561,-79.3933
3,Old Toronto,Cabbagetown,43.6664,-79.3628
4,Old Toronto,CityPlace,43.64,-79.3953
5,Old Toronto,Chinatown,43.6528,-79.3981
6,Old Toronto,Church and Wellesley,43.6657,-79.381
7,Old Toronto,Corktown,43.6556,-79.3597
8,Old Toronto,Discovery District,43.6581,-79.3881
9,Old Toronto,Distillery District,43.6503,-79.3595


In [89]:
# Find NaN coordinates
df[df['Latitude'].isnull()]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude


In [94]:
df.to_csv('toronto_neigh_clean.csv')

## Create map

In [10]:
!conda install -c conda-forge folium -y

Collecting package metadata: ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [8]:
import folium

# Reload the cleaned neighbourhoods. index_col=0 restores the saved index
# instead of loading it as an extra "Unnamed: 0" column.
df = pd.read_csv('toronto_neigh_clean.csv', index_col=0)

# Named map_toronto rather than `map` so the builtin map() is not shadowed
# for the rest of the notebook.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers
for lat, lng, district, neighbourhood in zip(df['Latitude'], df['Longitude'], df['District'], df['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, district)
    # parse_html=True makes the popup treat the label as plain text.
    label = folium.Popup(label, parse_html=True)
    # parse_html is a Popup option, so it is no longer passed to CircleMarker.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
map_toronto

### [Screenshot of result](./screenshots/toronto_map.png)
![screenshot of map](./screenshots/toronto_map.png)

In [5]:
import json

# import foursquare credentials
# credentials.json is expected to hold the keys CLIENT_ID and CLIENT_SECRET.
with open('credentials.json') as f:
    data = json.load(f)

CLIENT_ID, CLIENT_SECRET = data['CLIENT_ID'], data['CLIENT_SECRET']
# Value sent as the "v" query parameter of every Foursquare request.
VERSION = '20190519'

In [10]:
# get venues near neighbourhoods

import requests

LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT and
    prints each name as it is processed.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving the label and coordinates of each location.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame
        has these columns even when no venue was returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore'
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting the credentials into the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; surface HTTP errors (bad credentials, quota)
        # directly rather than as a KeyError while unpacking the payload
        response = requests.get(explore_url, params=params)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # a venue without categories no longer raises IndexError
                categories[0]['name'] if categories else None))

    # Passing columns= here (instead of assigning .columns afterwards) keeps
    # the result well-formed when venue_rows is empty.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)

    return(nearby_venues)

toronto_venues = getNearbyVenues(df['Neighbourhood'], df['Latitude'], df['Longitude'])

Alexandra Park
The Annex
Baldwin Village
Cabbagetown
CityPlace
Chinatown
Church and Wellesley
Corktown
Discovery District
Distillery District
The Entertainment District
East Bayfront
Fashion District
Financial District
Garden District
Grange Park
Harbord Village
Harbourfront
Kensington Market
Moss Park
Old Town
Quayside
Queen Street West
Regent Park
South Core
St. James Town
St. Lawrence
Toronto Islands
Trefann Court
University (includes Huron–Sussex)
Yorkville
The Beaches (also known as The Beach)
East Chinatown
East Danforth
Gerrard Street East (Gerrard India Bazaar)
Greektown (also known as the Danforth)
Leslieville
Main Square
Playter Estates
Port Lands
Riverdale
Upper Beaches
Bedford Park
Casa Loma
Chaplin Estates
Davisville Village
Deer Park (Yonge and St. Clair)
Forest Hill (and Forest Hill Village)
Lawrence Park
Lytton Park
Midtown
Moore Park
North Toronto
Rosedale
South Hill (includes Rathnelly)
Summerhill
Wanless Park
Wychwood Park
Yonge–Eglinton (considered centre of Midtown

In [11]:
toronto_venues.shape

(5375, 7)

In [12]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Alexandra Park,43.65,-79.4,Maker Pizza,43.650401,-79.39804,Pizza Place
1,Alexandra Park,43.65,-79.4,Saku Sushi,43.648038,-79.400268,Sushi Restaurant
2,Alexandra Park,43.65,-79.4,Sonic Boom,43.650859,-79.396985,Record Shop
3,Alexandra Park,43.65,-79.4,Drom Taberna,43.648134,-79.399675,Bar
4,Alexandra Park,43.65,-79.4,Core Studio Yoga & Pilates,43.64792,-79.400196,Yoga Studio


In [52]:
# Count distinct venue categories and distinct neighbourhoods that returned venues.
category_total = len(toronto_venues['Venue Category'].unique())
neighbourhood_total = len(toronto_venues['Neighborhood'].unique())

print('There are {} uniques categories.'.format(category_total))
print('There are {} uniques neighbourhoods.'.format(neighbourhood_total))

There are 330 uniques categories.
There are 203 uniques neighbourhoods.


## Analyse data

In [20]:
# One indicator column per venue category (empty prefix, so each column is
# named after the category itself).
category_frame = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

# Re-attach the neighbourhood of each venue; it is appended as the last column.
toronto_onehot['Neighbourhood'] = toronto_venues['Neighborhood']

# Rotate the column order so that the last column comes first.
all_columns = list(toronto_onehot.columns)
fixed_columns = all_columns[-1:] + all_columns[:-1]
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Alexandra Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Alexandra Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Alexandra Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Alexandra Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Alexandra Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
toronto_onehot.shape

(5375, 331)

In [23]:
# Mean of every indicator column per neighbourhood, then turn the
# group key back into a regular column.
category_means = toronto_onehot.groupby('Neighbourhood').mean()
toronto_grouped = category_means.reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Animal Shelter,Antique Shop,Aquarium,Arcade,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Birch Cliff Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alderwood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alexandra Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.02
4,Amesbury,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
toronto_grouped.shape

(203, 331)

In [45]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, skipping its first entry.

    The first element of ``row`` is ignored; the remaining values are sorted
    in descending order and the index labels of the first ``num_top_venues``
    of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10

# Ordinal suffixes for ranks ending in 1, 2 and 3; every other rank uses 'th'.
indicators = ['st', 'nd', 'rd']


def ordinal_suffix(rank):
    """Return the English ordinal suffix ('st', 'nd', 'rd' or 'th') for a positive integer."""
    # 11, 12 and 13 are irregular, and only last digits 1-3 take a special suffix.
    if 11 <= rank % 100 <= 13 or not 1 <= rank % 10 <= 3:
        return 'th'
    return indicators[rank % 10 - 1]


# create columns according to number of top venues
# An explicit suffix rule replaces the bare `except` that used an IndexError
# as control flow, and stays correct for ranks beyond 20 (21st, 22nd, ...).
columns = ['Neighbourhood'] + [
    '{}{} Most Common Venue'.format(rank, ordinal_suffix(rank))
    for rank in range(1, num_top_venues + 1)
]

# Start from an empty frame with the prepared column names and copy the
# neighbourhood names across.
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

# Fill the remaining columns row by row with each neighbourhood's top categories.
row_count = toronto_grouped.shape[0]
for position in range(row_count):
    grouped_row = toronto_grouped.iloc[position, :]
    top_venues = return_most_common_venues(grouped_row, num_top_venues)
    neighbourhoods_venues_sorted.iloc[position, 1:] = top_venues

neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Birch Cliff Heights,Bank,Thai Restaurant,Discount Store,Diner,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Service,Event Space,Exhibit
1,Agincourt,Park,Yoga Studio,Farm,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Service,Event Space,Exhibit,Falafel Restaurant
2,Alderwood,Park,Playground,Athletics & Sports,Market,Yoga Studio,Falafel Restaurant,Empanada Restaurant,Ethiopian Restaurant,Event Service,Event Space
3,Alexandra Park,Bar,Café,Vegetarian / Vegan Restaurant,Coffee Shop,Dessert Shop,French Restaurant,Yoga Studio,Record Shop,Thrift / Vintage Store,Sushi Restaurant
4,Amesbury,Grocery Store,Gym,Supermarket,Yoga Studio,Falafel Restaurant,Electronics Store,Empanada Restaurant,Ethiopian Restaurant,Event Service,Event Space


## Cluster neighbourhoods

In [29]:
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Cluster on the category frequencies only. columns= names the axis
# explicitly; the positional axis argument of drop() was removed in pandas 2.0.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighbourhood')

# run k-means clustering (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 1, 2, 0, 0, 0, 0, 0, 0, 2])

Merge cluster label and common venues

In [46]:
# add clustering labels
# insert() raises ValueError when the column already exists, which made this
# cell fail on a second run; overwrite the existing column in that case.
if 'Cluster Labels' in neighbourhoods_venues_sorted.columns:
    neighbourhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

df = pd.read_csv('toronto_neigh_clean.csv', index_col=0)

# Add the cluster label and top venues to each neighbourhood's district and
# coordinates, matching rows on the neighbourhood name. Neighbourhoods with
# no match keep NaN in the joined columns.
toronto_merged = df.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Old Toronto,Alexandra Park,43.65,-79.4,0.0,Bar,Café,Vegetarian / Vegan Restaurant,Coffee Shop,Dessert Shop,French Restaurant,Yoga Studio,Record Shop,Thrift / Vintage Store,Sushi Restaurant
1,Old Toronto,The Annex,43.67,-79.403889,0.0,Coffee Shop,Park,Pizza Place,Thai Restaurant,Pub,Gym,Social Club,Grocery Store,Korean Restaurant,Clothing Store
2,Old Toronto,Baldwin Village,43.656111,-79.393333,0.0,Coffee Shop,Sandwich Place,Chinese Restaurant,Café,Ice Cream Shop,Bubble Tea Shop,Bar,Arts & Crafts Store,Dumpling Restaurant,Japanese Restaurant
3,Old Toronto,Cabbagetown,43.666389,-79.362778,0.0,Coffee Shop,Park,Breakfast Spot,Snack Place,Baseball Field,Thai Restaurant,General Entertainment,Taiwanese Restaurant,Beer Store,Sushi Restaurant
4,Old Toronto,CityPlace,43.64,-79.395278,0.0,Coffee Shop,Gym,Café,Park,Pub,Japanese Restaurant,Pizza Place,Light Rail Station,Grocery Store,Historic Site


In [71]:
# Drop every row that still has a missing value in any column, then
# display the rows without a cluster label.
toronto_merged.dropna(axis=0, how='any', inplace=True)
unlabelled = toronto_merged['Cluster Labels'].isnull()
toronto_merged[unlabelled]

Unnamed: 0,District,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [75]:
# Cast the cluster labels to integers.
cluster_labels_as_int = toronto_merged['Cluster Labels'].astype(int)
toronto_merged['Cluster Labels'] = cluster_labels_as_int

In [76]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per
# cluster. (The previous x / ys arrays were only used for their length,
# which equals kclusters, and markers_colors was never read.)
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # cluster - 1 is kept so the colours match the existing screenshot;
    # cluster 0 wraps around to the last palette entry, so every cluster
    # still gets its own colour.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

## [Screenshot of result](./screenshots/toronto_map_clustered.png)
![Screenshot of result](./screenshots/toronto_map_clustered.png)