## Applied Data Science Capstone - Week3 - Assignment: Segmenting and Clustering Neighborhoods in Toronto

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the postal codes data file from my Watson cloud storage
file_url = "https://cloud-object-storage-ul-cos-standard-97j.s3.us-east.cloud-object-storage.appdomain.cloud/Postal_codes_of_Canada.csv"

# Read the data file
df_orig = pd.read_csv(file_url)
df_orig.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


In [5]:
df_orig.shape

(180, 3)

In [3]:
# Delete rows if Borough has 'Not assigned'
df = df_orig[df_orig['Borough'] != 'Not assigned'].reset_index(drop=True)
df.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [4]:
df.shape

(103, 3)

In [8]:
# df.values

In [5]:
# groupby 'Postal Code' and combine Neighbourhood values
df.groupby(['Postal Code'], as_index = False).agg({'Neighbourhood': ', '.join})
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [6]:
df.shape

(103, 3)

#### Check if we have 'Not assigned' assigned to Neighbourhood column

In [7]:
# Check if we have 'Not assigned' assigned to Neighbourhood column
df_na = df[df['Neighbourhood'] == 'Not assigned']
df_na.shape

(0, 3)

#### Since there is no case where 'Not assigned' assigned to the Neighbourhood column, we do not need to run the below code.

In [84]:
# If Neighbourhood value is 'Not assigned', replace it with the value of Borough column
# df.loc[df['Neighbourhood'] == 'Not assigned', 'Neighbourhood'] = df['Borough']
# df.values

In [8]:
df.shape

(103, 3)

#### Get the location coordinates for each Postal Code in the dataset

In [None]:
!pip install geocoder  # Install Geocoder package

In [8]:
import geocoder

In [9]:
for index, row in df.iterrows():
    # initialize your variable to None
    lat_lng_coords = None
    
    print("Looking up a coordinate for {}...".format(row['Postal Code']))
    
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.google('{}, Toronto, Ontario'.format(row['Postal Code']))
        lat_lng_coords = g.latlng
    
    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    
    print("{} - ({}, {})".format(row['Postal Code'], latitude, longitude))

Looking up a coordinate for M3A...


KeyboardInterrupt: 

### Use the Geospatial_data instead.
[Geospatial_data](https://cocl.us/Geospatial_data)

In [9]:
# Read the geospatial data from the Watson cloud storage
geospatial_url = "https://cloud-object-storage-ul-cos-standard-97j.s3.us-east.cloud-object-storage.appdomain.cloud/Geospatial_Coordinates.csv"

df_geo = pd.read_csv(geospatial_url)
df_geo.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [10]:
# Join the dataframe with Latitude and Longitude
df_merged = pd.merge(left=df, right=df_geo, how='left', left_on='Postal Code', right_on='Postal Code')
df_merged.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Explore and cluster the neighborhoods in Toronto. 

In [11]:
# Make a subset with boroughs containing 'Toronto'
df_sub = df_merged[df_merged['Borough'].str.contains("Toronto")]
df_sub.head(10)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


In [12]:
df_sub.shape

(39, 5)

In [13]:
# Import packages
import json # library to handle JSON files
import requests # library to handle requests
from pandas import json_normalize

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

print('Packages imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Packages imported.


### Use geopy library to get the latitude and longitude values of Toronto.

In [14]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address, country_codes='CA')
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto is ({}, {}).'.format(latitude, longitude))

The geograpical coordinate of Toronto is (43.6534817, -79.3839347).


### Utilize the Foursquare API to explore the neighborhoods and segment them.

In [15]:
CLIENT_ID = '1NFFC0UOCFEGDLKX5XCTDHGYH0WMYW53HFMHGEB34T4I4ADQ' # your Foursquare ID
CLIENT_SECRET = '03ATI4IEN1ZZ1YC1M1HIS0SW5AFJ2PWUIWL1RZTRA0DPGUNT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

In [16]:
# Function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [17]:
# Function that call the Foursquare API and creats dataframe with venue information 
# such as name, lat & lng, and category_name.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
#         print(url)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']       
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [18]:
# Create a new dataframe with venues
radius = 500
toronto_venues = getNearbyVenues(df_sub['Neighbourhood'], df_sub['Latitude'], df_sub['Longitude'], radius)
print('Completed')

Completed


### Note that the orifinal dataframe uses 'Neighbourhood' while the new dataframe with venues uses 'Neighborhood'

In [19]:
print(toronto_venues.shape)
toronto_venues.head(10)

(1624, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant
5,"Regent Park, Harbourfront",43.65426,-79.360636,Corktown Common,43.655618,-79.356211,Park
6,"Regent Park, Harbourfront",43.65426,-79.360636,Dominion Pub and Kitchen,43.656919,-79.358967,Pub
7,"Regent Park, Harbourfront",43.65426,-79.360636,The Distillery Historic District,43.650244,-79.359323,Historic Site
8,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
9,"Regent Park, Harbourfront",43.65426,-79.360636,The Extension Room,43.653313,-79.359725,Gym / Fitness Center


### Let's check how many venues were returned for each neighborhood

In [20]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,55,55,55,55,55,55
"Brockton, Parkdale Village, Exhibition Place",23,23,23,23,23,23
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",16,16,16,16,16,16
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",16,16,16,16,16,16
Central Bay Street,68,68,68,68,68,68
Christie,16,16,16,16,16,16
Church and Wellesley,75,75,75,75,75,75
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
Davisville,33,33,33,33,33,33
Davisville North,9,9,9,9,9,9


### Let's find out how many unique categories can be curated from all the returned venues

In [21]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 237 uniques categories.


In [32]:
toronto_venues['Venue Category'].unique()

array(['Bakery', 'Coffee Shop', 'Distribution Center', 'Spa',
       'Restaurant', 'Park', 'Pub', 'Historic Site', 'Breakfast Spot',
       'Gym / Fitness Center', 'Farmers Market', 'Chocolate Shop',
       'Dessert Shop', 'Performing Arts Venue', 'Theater',
       'Mexican Restaurant', 'French Restaurant', 'Yoga Studio', 'Café',
       'Event Space', 'Shoe Store', 'Art Gallery', 'Electronics Store',
       'Brewery', 'Beer Store', 'Bank', 'Hotel', 'Antique Shop',
       'Italian Restaurant', 'Portuguese Restaurant', 'Beer Bar',
       'Creperie', 'Sushi Restaurant', 'Hobby Shop', 'Diner',
       'Fried Chicken Joint', 'Chinese Restaurant', 'Smoothie Shop',
       'Sandwich Place', 'Gym', 'College Auditorium', 'Bar',
       'College Cafeteria', 'Music Venue', 'Clothing Store', 'Comic Shop',
       'Pizza Place', 'Plaza', 'Burrito Place', 'Ramen Restaurant',
       'Burger Joint', 'Thai Restaurant', 'Sporting Goods Shop',
       'College Rec Center', 'Shopping Mall', 'Japanese Restauran

## Analyze Each Neighborhood

In [34]:
# One hot encoding
dummies = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Drop 'Neighborhood' column from dummies


# Concat Neighborhood column with the dummies
# toronto_onehot = pd.concat([toronto_venues['Neighborhood'], dummies], axis=1)
# toronto_onehot.head(10)

# toronto_onehot['Neighborhood']

Index(['Afghan Restaurant', 'Airport', 'Airport Food Court', 'Airport Gate',
       'Airport Lounge', 'Airport Service', 'Airport Terminal',
       'American Restaurant', 'Antique Shop', 'Aquarium',
       ...
       'Theme Restaurant', 'Toy / Game Store', 'Trail', 'Train Station',
       'Vegetarian / Vegan Restaurant', 'Video Game Store',
       'Vietnamese Restaurant', 'Wine Bar', 'Women's Store', 'Yoga Studio'],
      dtype='object', length=237)

In [23]:
# Examine the new dataframe size
toronto_onehot.shape

(1624, 238)

### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [26]:
toronto_onehot.groupby(['Neighborhood']).mean()
# toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

ValueError: Grouper for 'Neighborhood' not 1-dimensional