### Table of Contents

<font size = 3>

1. <a href="#item1">Data Scraping and Cleaning</a>

2. <a href="#item2">Merging Geolocation Data</a>

3. <a href="#item3">Neighborhood Clustering</a>

</font>

<a id='item1'></a>
## Part 1: Data Scraping

#### Get Data

In [None]:
import pandas as pd

# Wikipedia list of Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# read_html returns one DataFrame per HTML table found; only the first is kept.
# Note: `tables` is bound to that same single DataFrame, not to the list of tables.
rawdf = tables = pd.read_html(url)[0] # Read raw data

In [None]:
rawdf.head()

#### Clean up formatting

In [None]:
# Rename columns by position: 0 -> "M1", 1 -> "M2", ...
column_labels = {pos: "M%d" % (pos + 1) for pos in range(rawdf.shape[1])}
df = rawdf.rename(columns = column_labels)

# Rename indices: each row is labelled with the THIRD character of its
# first-column entry, so that column label + row label (e.g. "M1" + "A")
# rebuilds the three-character code
row_labels = {pos: df.iloc[pos, 0][2] for pos in range(df.shape[0])}
df = df.rename(index = row_labels)

# Remove FSAs (first 3 characters) from entries.
# Column-wise .str slicing replaces DataFrame.applymap, which is deprecated in
# recent pandas, and leaves missing cells as NaN instead of raising on them.
df = df.apply(lambda column: column.str[3:])

In [None]:
df.head()

#### Convert data into desired format

In [None]:
import re

data = [] # Entries of the resulting dataframe: [postal code, borough, neighborhoods]
for col in df.columns:
    for row in df.index:
        entry = df.loc[row, col]    # Cell at this index and column

        # Ignore missing entries (non-string cells such as NaN, or the
        # explicit 'Not assigned' marker)
        if not isinstance(entry, str) or entry == 'Not assigned':
            continue

        # Extract boroughs. Conditions:
        # 1. Keep only first part of a borough.
        # E.g. "North York (Don Mills) South (Flemingdon Park)" (M3C) becomes "North York"
        # 2. If there are no neighbourhoods in parentheses, take the entry to be a borough
        # split() returns the whole entry when there is no "(", so no special
        # case is needed (the previous version also depended on IPython's `_`).
        borough = entry.split("(", 1)[0].strip()

        # Find all neighbourhoods (all entries surrounded by parentheses)
        groups = re.findall(r'\((.*?)\)', entry)
        # 1. Concatenate if there are multiple, e.g. "Don Mills, Flemingdon Park"
        # 2. Replace slash separator with comma
        neighborhoods = ", ".join(group.replace(" / ", ", ") for group in groups)

        # 3. Take borough if there are no neighbourhoods
        data.append([col + row, borough, neighborhoods or borough])

In [None]:
# Build the FSA dataframe from the collected [code, borough, neighborhoods] rows
fsa_columns = ['PostalCode', 'Borough', 'Neighborhood']
fsa = pd.DataFrame(data, columns = fsa_columns)
fsa.head()

#### Final cleanup

In [None]:
# Some boroughs seem to be a special case of more common boroughs
fsa.Borough.value_counts()

In [None]:
# Boroughs that appear more than once.
# The counts Series is filtered directly instead of going through
# reset_index(): the label of the reset column depends on the pandas version
# ("index" before 2.0, the Series name from 2.0 on), so ["index"] is not portable.
borough_counts = fsa.Borough.value_counts()
boroughs = borough_counts[borough_counts > 1].index.values

In [None]:
# Fold every borough whose name begins with a recurring borough name
# into that recurring ("parent") borough
for parent in boroughs:
    is_variant = fsa.Borough.str.startswith(parent)
    fsa.loc[is_variant, "Borough"] = parent

In [None]:
fsa.Borough.value_counts()

#### Shape of data frame

In [None]:
fsa.shape

<a id='item2'></a>
## Part 2: Merge Geolocation Data

In [None]:
import geocoder

def get_coords_from_geocoder(postal_code, max_tries = 10):
    """Look up the coordinates of a postal code with geocoder's Google provider.

    The lookup is repeated until it yields a result or `max_tries` attempts
    have been made.

    Parameters
    ----------
    postal_code : str
        Postal code; the query sent is "<postal_code>, Toronto, Ontario".
    max_tries : int, optional
        Maximum number of lookups before giving up (default 10).

    Returns
    -------
    The `latlng` value of the first successful lookup (callers index it as
    [0] = latitude, [1] = longitude), or None if every attempt came back empty.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for attempt in range(max_tries):
        lat_lng_coords = geocoder.google(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords

    # Reached only when no attempt succeeded; a success on the very last
    # attempt returns above and therefore no longer prints this message.
    print("Could not get coordinates for {} within {} tries".format(postal_code, max_tries))
    return None

In [None]:
use_geocoder = False # Geocoder doesn't work

if use_geocoder:
    # Look every postal code up individually; codes without a result keep
    # missing coordinates
    for code in fsa.PostalCode.values:
        lat_lng_coords = get_coords_from_geocoder(code)
        if lat_lng_coords is not None:
            fsa.loc[fsa.PostalCode == code, "Latitude"] = lat_lng_coords[0]
            fsa.loc[fsa.PostalCode == code, "Longitude"] = lat_lng_coords[1]
else:
    # Read geospatial data from csv
    coords = pd.read_csv("https://cocl.us/Geospatial_data/Geospatial_coordinates.csv")

    # Sanity check: both frames list the same postal codes in the same order
    same_codes = list(fsa.PostalCode.values) == list(coords.loc[:, "Postal Code"].values)
    print("Data Frames are compatible" if same_codes else "Data Frames are not compatible!")

    # Merge data frames on Postal Code.
    # Coordinates left over from an earlier run of this cell are dropped first;
    # otherwise a re-run would produce Latitude_x / Latitude_y columns.
    fsa = fsa.drop(columns = ["Latitude", "Longitude"], errors = "ignore") \
        .merge(coords, left_on = "PostalCode", right_on = "Postal Code") \
        .drop("Postal Code", axis = 1)

#### Dataframe with geolocation data

In [None]:
fsa.head()

<a id='item3'></a>
## Part 3: Neighborhood Clustering

In [None]:
# Imports
import requests
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
import folium
import numpy as np

import matplotlib.cm as cm
import matplotlib.colors as colors

#### Consider only neighbourhoods in outer Toronto

In [None]:
# Outer Toronto: boroughs whose name does NOT contain 'Toronto';
# Queen's Park is excluded as well
is_queens_park = fsa.Borough == "Queen's Park / Ontario Provincial Government"
mentions_toronto = fsa.Borough.str.contains("Toronto")
idx = ~(is_queens_park | mentions_toronto)
outer = fsa.loc[idx, :].reset_index(drop = True)
print(outer.shape)
outer.head()

Geographical coordinates of Toronto

In [None]:
# Geocode the city itself; `location` is reused later as the map centre
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode('Toronto, Ontario')
toronto_message = 'The geograpical coordinate of Toronto are {}, {}.'
print(toronto_message.format(location.latitude, location.longitude))

Visualize outer Toronto neighborhoods

In [None]:
# Base map centred on the geocoded Toronto coordinates
toronto_centre = [location.latitude, location.longitude]
map_outer = folium.Map(location=toronto_centre, zoom_start=10)

# One circle marker per row of `outer`, with the neighborhood name as popup
marker_data = zip(outer['Latitude'], outer['Longitude'], outer['Neighborhood'])
for lat, lng, label in marker_data:
    label = folium.Popup(label, parse_html = True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_outer)

map_outer

#### Foursquare Credentials and Version

In [None]:
def read_foursquare_creds(path = "foursquare.creds"):
    """Read the Foursquare client id and secret from the first line of `path`.

    The line is expected to look like ``<client_id>,<client_secret>``.
    Whitespace around both fields is stripped, including the trailing newline
    that readline() keeps, which would otherwise end up inside the secret.

    Returns
    -------
    tuple of (client_id, client_secret)

    Raises
    ------
    FileNotFoundError
        If `path` does not exist.
    ValueError
        If the first line does not hold exactly two comma-separated fields.
    """
    with open(path, 'r') as creds:
        fields = [field.strip() for field in creds.readline().split(",")]
    if len(fields) != 2:
        raise ValueError(
            "Expected '<client_id>,<client_secret>' on the first line of {!r}".format(path))
    return tuple(fields)


try:
    CLIENT_ID, CLIENT_SECRET = read_foursquare_creds("foursquare.creds")
    # Only the lengths are shown so the credentials never appear in the output
    print("CLIENT_ID = {}".format('*' * len(CLIENT_ID)))
    print("CLIENT_SECRET = {}".format('*' * len(CLIENT_SECRET)))
except FileNotFoundError:
    print("Make a 'foursquare.creds' file storing your Foursquare credentials!")
    CLIENT_ID = CLIENT_SECRET = ''

VERSION = '20180605' # Foursquare API version

#### Explore neighborhoods of outer Toronto

In [None]:
# Use function from example notebook
def getNearbyVenues(names, latitudes, longitudes, radius = 750, LIMIT = 500):
    """Query the Foursquare "explore" endpoint around each location and tabulate the venues.

    Uses the module-level CLIENT_ID, CLIENT_SECRET and VERSION for every request.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        Name and coordinates of each location to explore.
    radius : number, optional
        Value sent as the `radius` query parameter (default 750).
    LIMIT : int, optional
        Value sent as the `limit` query parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.
        'Venue Category' is None for venues that carry no category.

    Notes
    -----
    A location whose request fails, or whose reply contains no result group,
    is reported with print() and skipped, so one bad request does not discard
    the venues already collected.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        try:
            reply = requests.get('https://api.foursquare.com/v2/venues/explore',
                                 params=query, timeout=30)
            reply.raise_for_status()
            results = reply.json()['response']['groups'][0]['items']
        except (requests.RequestException, ValueError, KeyError, IndexError) as err:
            print("Skipping '{}': no venues retrieved ({!r})".format(name, err))
            continue

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing the columns to the constructor also works for zero rows
    return pd.DataFrame(venue_rows, columns=venue_columns)

In [None]:
# Fetch the venues around every outer-Toronto postal code (one API request per row)
outer_venues = getNearbyVenues(outer['Neighborhood'], outer['Latitude'], outer['Longitude'])

In [None]:
print(outer_venues.shape)
outer_venues.head(10)

Number of venues for each neighborhood

In [None]:
# Venue count per neighborhood, shown as a one-column table
venues_per_neighborhood = outer_venues.groupby('Neighborhood')["Venue"].count()
pd.DataFrame(venues_per_neighborhood)

In [None]:
# Number of distinct venue categories across all neighborhoods
category_count = len(outer_venues['Venue Category'].unique())
print('There are {} uniques categories.'.format(category_count))

#### Analyze Each Neighborhood

In [None]:
# one hot encoding: one indicator column per venue category
outer_onehot = pd.get_dummies(outer_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called "Neighborhood" would collide with the
# neighborhood-name column added below and be silently overwritten, so it is
# renamed first
if 'Neighborhood' in outer_onehot.columns:
    outer_onehot = outer_onehot.rename(columns = {'Neighborhood': 'Neighborhood (category)'})

# add the neighborhood name as the first column
outer_onehot.insert(0, 'Neighborhood', outer_venues['Neighborhood'])

print(outer_onehot.shape)
outer_onehot.head()

Group rows by neighborhood, taking the mean frequency of occurrence of each category

In [None]:
# Mean of each category indicator per neighborhood = share of the
# neighborhood's venues that fall into that category
category_means = outer_onehot.groupby('Neighborhood').mean()
outer_grouped = category_means.reset_index()
outer_grouped.head()

Some neighborhoods have only a couple of venues. Discard such neighborhoods

In [None]:
min_venues = 10

# Number of venues found for each neighborhood
venue_counts = outer_onehot.groupby('Neighborhood').size()

# Look the count up by neighborhood name rather than attaching it by position:
# this stays correct whatever the row order is, and the cell can be re-run
# after outer_grouped has already been filtered.
enough_venues = outer_grouped['Neighborhood'].map(venue_counts) >= min_venues
outer_grouped = outer_grouped.loc[enough_venues, :].reset_index(drop = True)

Drop all categories without an example

In [None]:
# Keep only the columns that hold at least one non-zero value
has_nonzero = (outer_grouped != 0).any(axis = 0)
outer_grouped = outer_grouped.loc[:, has_nonzero]

In [None]:
outer_grouped.shape

Get each neighborhood along with the top 10 most common venues

In [None]:
# Rank the venue categories of one neighborhood
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (callers pass rows whose first
    element is the neighborhood name); the remaining entries are sorted in
    descending order and the labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(num_top_venues).index.values

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# Column headers: "1st Most Common Venue", "2nd ...", "3rd ...", then "4th", "5th", ...
columns = ['Neighborhood']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Start from an empty frame that only carries the neighborhood names
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = outer_grouped['Neighborhood']

# Fill in the ranked categories one neighborhood at a time
for ind in np.arange(outer_grouped.shape[0]):
    top_categories = return_most_common_venues(outer_grouped.iloc[ind, :], num_top_venues)
    neighborhoods_venues_sorted.iloc[ind, 1:] = top_categories

neighborhoods_venues_sorted.head()

### Cluster neighborhoods

In [None]:
# set number of clusters
kclusters = 3

# Feature matrix: the category frequencies without the neighborhood name.
# drop() is called with the `columns` keyword because the positional axis
# argument (drop('Neighborhood', 1)) is no longer accepted by pandas 2.x.
outer_clustering = outer_grouped.drop(columns = 'Neighborhood')

# run k-means clustering (fixed random_state so the labels are reproducible)
kmeans = KMeans(n_clusters = kclusters, random_state = 0, n_init = 100).fit(outer_clustering)

In [None]:
# check cluster labels generated for each row in the dataframe
kmeans.labels_

Add cluster label to the top 10 venues for each neighborhood.

In [None]:
# add clustering labels; a 'Cluster' column left over from an earlier run of
# this cell is removed first, because insert() refuses duplicate column names
if 'Cluster' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns = 'Cluster')
neighborhoods_venues_sorted.insert(0, 'Cluster', kmeans.labels_)

# Neighborhoods with at least 10 venues
neighborhoods = neighborhoods_venues_sorted.Neighborhood.values

# The merge is more complicated than in the example notebook because:
# 1. Not all venues were kept for the clustering
# 2. Some neighborhoods span across multiple postal codes. Thus we take the mean coordinates
outer_merged = (
    outer
    # Filter for remaining neighborhoods
    .loc[outer.Neighborhood.isin(neighborhoods), :]
    # Take mean coordinates; the coordinate columns are selected explicitly
    # because averaging the text columns (PostalCode, Borough) raises in pandas 2.x
    .groupby('Neighborhood')[['Latitude', 'Longitude']].mean()
    .reset_index()
    .join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
)

outer_merged.head()

Visualize the resulting clusters

In [None]:
# create map
map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=10)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add markers to the map
cluster_rows = zip(outer_merged['Latitude'], outer_merged['Longitude'],
                   outer_merged['Neighborhood'], outer_merged['Cluster'])
for lat, lon, poi, cluster in cluster_rows:
    label = folium.Popup(str(poi) + ' (Cluster {})'.format(cluster), parse_html=True)
    # Cluster labels run from 0 to kclusters - 1, so they index the colour
    # list directly (no reliance on negative-index wraparound for cluster 0)
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Examine clusters

In [None]:
# Cluster 0: first column plus every column from position 4 onwards
cluster_view_columns = outer_merged.columns[[0] + list(range(4, outer_merged.shape[1]))]
outer_merged.loc[outer_merged['Cluster'] == 0, cluster_view_columns]

In [None]:
# Cluster 1: first column plus every column from position 4 onwards
cluster_view_columns = outer_merged.columns[[0] + list(range(4, outer_merged.shape[1]))]
outer_merged.loc[outer_merged['Cluster'] == 1, cluster_view_columns]

In [None]:
# Cluster 2: first column plus every column from position 4 onwards
cluster_view_columns = outer_merged.columns[[0] + list(range(4, outer_merged.shape[1]))]
outer_merged.loc[outer_merged['Cluster'] == 2, cluster_view_columns]

#### Conclusion

The clustering results are underwhelming: it is hard to find an interpretation for the resulting clusters (the last one could perhaps be described as 'Neighborhoods with pizza places and other fast food'). This is perhaps because there are relatively few venues in most neighborhoods of outer Toronto, so that the number of categories is much larger than the number of neighborhoods.

A more interesting outcome might be obtained by aggregating categories, e.g. by adding up restaurants, fast food places, shops, recreation venues, etc.