## Capstone Clustering Assignment: Neighborhoods in Toronto

In [12]:
# Import required libraries
import pandas as pd
import numpy as np
import io
from bs4 import BeautifulSoup
import requests

### Dataframe of the postal code of each neighborhood along with the borough name and neighborhood name

In [13]:
# Get Data
wikipage = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
result = requests.get(wikipage)

# Upon successful download parse the data into a BeautifulSoup object 
if result.status_code == 200:
    soup = BeautifulSoup(result.content, "html.parser")
    
# Find the object with HTML class wikitable sortable
table = soup.find('table',{'class':'wikitable sortable'})

# loop through all the rows and pull the text
new_table = []
for row in table.find_all('tr')[1:]:
    column_marker = 0
    columns = row.find_all('td')
    new_table.append([column.get_text().strip() for column in columns])

# Load data into a dataframe     
df = pd.DataFrame(new_table, columns=['Postcode', 'Borough', 'Neighbourhood'])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [14]:
# Exclude Not Assigned Borough
df1=df[df['Borough']!='Not assigned']
df1.head()


Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [48]:
#If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
df2 = df1.groupby(['Postcode','Borough'], as_index=False).agg(lambda x: ','.join(x))
find_na = df2['Neighbourhood'] == "Not assigned"
df2.loc[find_na, 'Neighbourhood'] = df2.loc[find_na, 'Borough']
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [49]:
df2.shape

(103, 3)

###  Latitude and the longitude coordinates of each neighborhood

In [50]:
# Download longitude Latitude data and load into a dataframe
url = "http://cocl.us/Geospatial_data"
ll= requests.get(url).text
df_ll = pd.read_csv(io.StringIO(ll))
df_ll.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [51]:
# Merge Neighbourhood and Longitude and Latitude data 
df_ll=df_ll.rename(columns={'Postal Code': 'Postcode'})
toronto_df=pd.merge(df2, df_ll, on='Postcode')
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [52]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(toronto_df['Borough'].unique()),
        toronto_df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


## Explore and cluster the neighborhoods in Toronto

In [5]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize  # tranform JSON file into a pandas dataframe


# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9f0ad1d_0         151 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    openssl-1.1.1g             |       h516909a_1         2.1 MB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.4.1               |             py_0          26 KB  conda-forge
    altair-4.1.0               |             py_1         614 KB  conda-forge
    ------------------------------------------------------------
                       

In [6]:
#Get longitude and Latitude of Toronto
address = "Toronto, ON"
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto city are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto city are 43.6534817, -79.3839347.


In [53]:
# create map of Toronto using latitude and longitude values and add markers
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# select only boroughs that contain the word Toronto 
toronto_df_only = toronto_df[toronto_df['Borough'].str.contains("Toronto")].reset_index(drop=True)

# add markers
for lat, lng, borough, neighborhood in zip(
        toronto_df_only['Latitude'], 
        toronto_df_only['Longitude'], 
        toronto_df_only['Borough'], 
        toronto_df_only['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  

map_toronto


## Explore the neighborhood using Foursquare

In [24]:
CLIENT_ID = 'NJVB2OJDMAEKWUCMCKKTWP53T35DSXPTQAKDKXQ4ZFU21MV0' # Foursquare ID
CLIENT_SECRET = 'NPEIBN5HUYOLOFBRPKXXQBEFF5XT22O3EGVT0NV4O3G3L5KO' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [32]:
neighborhood_latitude = toronto_df_only.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_df_only.loc[0, 'Longitude'] # neighborhood longitude value
neighborhood_name = toronto_df_only['Neighbourhood']
print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of 0                                           The Beaches
1                          The Danforth West, Riverdale
2                        India Bazaar, The Beaches West
3                                       Studio District
4                                         Lawrence Park
5                                      Davisville North
6                    North Toronto West,  Lawrence Park
7                                            Davisville
8                           Moore Park, Summerhill East
9     Summerhill West, Rathnelly, South Hill, Forest...
10                                             Rosedale
11                          St. James Town, Cabbagetown
12                                 Church and Wellesley
13                            Regent Park, Harbourfront
14                             Garden District, Ryerson
15                                       St. James Town
16                                          Berczy Park
17             

### Get the top 100 venues that are in The Beaches within a radius of 500 meters

In [33]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# get the result to a json file
results = requests.get(url).json()

In [34]:
# Function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [37]:
# Results in data frame
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869
4,Seaspray Restaurant,Asian Restaurant,43.678888,-79.298167
