# Applied Data Science Capstone

## Week 3 Assignment

### Obtain Toronto data

Import libraries.

In [1]:
import os
from io import StringIO

import pandas as pd
import requests

Download the Wikipedia table of Canadian postal codes beginning with "M".

In [2]:
# Download the Wikipedia page listing Canadian postal codes that start with "M"
WIKIPEDIA_CANADIAN_POSTAL_CODE_M_URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
r = requests.get(WIKIPEDIA_CANADIAN_POSTAL_CODE_M_URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data
r.raise_for_status()
# Take the first table found on the page. The markup is wrapped in StringIO because
# passing a literal HTML string to read_html is deprecated in recent pandas releases.
df = pd.read_html(StringIO(r.text))[0]


In [3]:
# Clean data
# Drop rows whose borough is "Not assigned". The explicit copy makes the result an
# independent frame, so the assignment below modifies it rather than a temporary slice.
df = df[df["Borough"] != "Not assigned"].copy()

# Replace "Not assigned" neighbourhoods with their borough name.
# A single .loc assignment replaces the original chained indexing
# (df["Neighbourhood"][mask] = ...), which triggers SettingWithCopyWarning and
# is a silent no-op under pandas copy-on-write.
neighbourhood_missing = df["Neighbourhood"] == "Not assigned"
df.loc[neighbourhood_missing, "Neighbourhood"] = df.loc[neighbourhood_missing, "Borough"]

# Collapse to one row per postcode: keep the first borough and join all
# neighbourhood names into a single comma-separated string.
df = df.groupby("Postcode", as_index=False).agg(
    {"Borough": "first", "Neighbourhood": ", ".join}
)

In [4]:
# Display the cleaned table (one row per postcode after the aggregation above)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


### Get postcode coordinates

Import libraries.

In [5]:
import geocoder

Download the latitude and longitude of each postal code and join them onto the postcode table.

In [6]:
# Download the lookup table that is keyed by "Postal Code"
COORDINATES_DATA_URL = "https://cocl.us/Geospatial_data"
df_coordinates = pd.read_csv(COORDINATES_DATA_URL)

# Attach the lookup columns to every row of df by matching df["Postcode"]
# against the lookup table's "Postal Code" values
coordinates_by_postcode = df_coordinates.set_index("Postal Code")
df = df.join(coordinates_by_postcode, on="Postcode")

In [7]:
# Display the postcode table with the joined coordinate columns
df

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437


In [8]:
# Check the table's (rows, columns) after the coordinate join
df.shape

(103, 5)

### Clustering neighborhoods

Import libraries.

In [9]:
import folium

#### Display map of Toronto

In [10]:
TORONTO_COORDINATES = (43.72, -79.38)

# Base map centred on the coordinates above
map_toronto = folium.Map(location=list(TORONTO_COORDINATES), zoom_start=11)

# Draw one circle marker per row of df, with a "neighbourhood, borough" popup
marker_columns = ["Latitude", "Longitude", "Borough", "Neighbourhood"]
for lat, lng, borough, neighborhood in df[marker_columns].itertuples(index=False, name=None):
    popup = folium.Popup(f"{neighborhood}, {borough}", parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color="blue",
        fill=True,
        fill_color="#3186cc",
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

#### Obtain extra data

Set up Foursquare interface

In [11]:
# Foursquare credentials are read from the environment so real secrets are never
# saved in the notebook; the "xxx" placeholders are used when the variables are unset.
CLIENT_ID = os.environ.get("FOURSQUARE_CLIENT_ID", "xxx")  # your Foursquare ID
CLIENT_SECRET = os.environ.get("FOURSQUARE_CLIENT_SECRET", "xxx")  # your Foursquare Secret
VERSION = "20180605"  # Foursquare API version

In [12]:
LIMIT = 100  # value sent as the "limit" query parameter of every request

# Column layout of the table returned by getNearbyVenues
_VENUE_COLUMNS = [
    "Neighbourhood",
    "Neighbourhood Latitude",
    "Neighbourhood Longitude",
    "Venue",
    "Venue Latitude",
    "Venue Longitude",
    "Venue Category",
]


def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint for each location and tabulate the venues.

    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Label and coordinates of each location; consumed in parallel with ``zip``.
    radius : int, default 500
        Forwarded unchanged as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns in ``_VENUE_COLUMNS``.
        When no venue is returned at all, the frame is empty but still has
        those columns.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string
        response = requests.get(
            "https://api.foursquare.com/v2/venues/explore",
            params={
                "client_id": CLIENT_ID,
                "client_secret": CLIENT_SECRET,
                "v": VERSION,
                "ll": f"{lat},{lng}",
                "radius": radius,
                "limit": LIMIT,
            },
            timeout=30,
        )
        # Surface bad credentials / quota errors as an HTTP error rather than a
        # KeyError raised while indexing into an error payload
        response.raise_for_status()
        results = response.json()["response"]["groups"][0]["items"]

        if len(results) == 0:
            print(f"No venues found for {name}")

        for item in results:
            venue = item["venue"]
            # A venue may come back with an empty category list; record it
            # without a category instead of failing on categories[0]
            categories = venue["categories"]
            category = categories[0]["name"] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue["name"],
                venue["location"]["lat"],
                venue["location"]["lng"],
                category,
            ))

    # Passing the columns to the constructor keeps the schema even with zero rows;
    # assigning .columns afterwards raises a length-mismatch ValueError on an empty frame.
    return pd.DataFrame(venue_rows, columns=_VENUE_COLUMNS)

Get venues in Toronto

In [13]:
# Fetch venues around the coordinates of every row in df, labelled by the row's
# neighbourhood string (default search radius)
df_venues = getNearbyVenues(df["Neighbourhood"], df["Latitude"], df["Longitude"])

No venues found for Upper Rouge
No venues found for Newtonbrook, Willowdale
No venues found for Islington Avenue
No venues found for Cloverdale, Islington, Martin Grove, Princess Gardens, West Deane Park


#### Prepare data for neighborhoods

One hot encoding

In [14]:
# one hot encoding
df_onehot = pd.get_dummies(df_venues[['Venue Category']], prefix="", prefix_sep="")

# Add Neighborhood column
df_onehot["Neighbourhood"] = df_venues["Neighbourhood"]
# Move Neighborhood column to front
df_onehot_columns = df_onehot.columns.to_list()
df_onehot_columns.insert(0, df_onehot_columns.pop())
df_onehot = df_onehot[df_onehot_columns]

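Group the rows by neighbourhood and take the mean of each one-hot column, which gives the share of each venue category per neighbourhood.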

In [15]:
# Mean of every indicator column per neighbourhood; "Neighbourhood" stays a regular column
df_grouped = df_onehot.groupby("Neighbourhood", as_index=False).mean()

#### Run clustering

Import libraries

In [16]:
from sklearn.cluster import KMeans

Run clustering

In [17]:
# set number of clusters
kclusters = 8

df_clustering = df_grouped.drop("Neighbourhood", 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters).fit(df_clustering)

And annotate neighbourhood data with clusters

In [18]:
# add clustering labels
df_clusters = df_grouped["Neighbourhood"].to_frame()
df_clusters.insert(0, "Cluster", kmeans.labels_)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
df_clusters = df.join(df_clusters.set_index("Neighbourhood"), on="Neighbourhood", how="outer")

#### Visualize clusters

Import libraries

In [19]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

In [20]:
map_toronto_clusters = folium.Map(location=[*TORONTO_COORDINATES], zoom_start=11)

# One hex colour per cluster, sampled evenly from the Spectral colormap.
# (The previous x / ys arrays were only used for len(ys) == kclusters.)
colors_array = cm.Spectral(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map
for lat, lng, borough, neighborhood, cluster in zip(
    df_clusters["Latitude"],
    df_clusters["Longitude"],
    df_clusters["Borough"],
    df_clusters["Neighbourhood"],
    df_clusters["Cluster"]
):
    label = "{}, {}".format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)

    # Rows with no cluster (NaN after the outer join) are drawn in white.
    # Labels are 0-based, so index the palette directly; the previous
    # rainbow[int(cluster) - 1] coloured cluster 0 only via negative-index wrap-around.
    color = "white"
    if not np.isnan(cluster):
        color = rainbow[int(cluster)]

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7,
        parse_html=False
    ).add_to(map_toronto_clusters)

map_toronto_clusters

In [21]:
# Display the postcode table with its "Cluster" column (float, because unmatched rows hold NaN)
df_clusters

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,6.0
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,5.0
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1.0
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1.0
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1.0
...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,2.0
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,1.0
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.688905,-79.554724,1.0
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.739416,-79.588437,1.0
