## Segmenting and Clustering Neighborhoods in Toronto
### Let's get started
import necessary packages


In [12]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation
#  http://beautiful-soup-4.readthedocs.io/en/latest/    # for more advanced web scraping  

import lxml
import html5lib

print('Libraries imported.')

Libraries imported.


Get postal code data from Wikipedias page List of "postal codes of Canada: M"  

Then turn it into a panda dataframe

In [13]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
df = pd.read_html(url, attrs={"class": "wikitable"})[0]   # 0 is for the 1st table in this particular page
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


Clean the data in the dataframe df

In [14]:
df['Borough']=df['Borough'].replace('Not assigned',np.NaN)   # replace 'Not assigned' with NaN 
df=df.dropna() 
df.reset_index(drop=True, inplace=True)
print("Dropped 'Not assigned' values in the 'Postal Code' column!")

# pd.set_option('display.max_rows', None)     # show all data in dataframe

# check for mispelled 'Not assigned' values in 'Borough' which has not been replaced
print("Any mispelles not assigned values in 'Borough'?  " + str(df['Borough'][df['Borough'].str.lower().str.contains('t as', regex=False)]) + ", " + str(df['Borough'][df['Borough'].str.lower().str.contains('not ', regex=False)]) )   

# find non unique (duplicate) postal codes 
dupli= "no"
for d in df['Postal Code'].duplicated().unique():
    if d == True:
        dupli = ""
print("There is " + dupli+" duplicate 'Postal Code' rows!")


# find neighborhoods with 'Not assigned' values
assigned="Cannot find any"
for a in df['Neighbourhood'].isin(['Not assigned']):
    if a == True:
        assigned="Found"
print(assigned+" 'Not assigned' values in the 'Neighbourhood' column!")

df.head()

Dropped 'Not assigned' values in the 'Postal Code' column!
Any mispelles not assigned values in 'Borough'?  Series([], Name: Borough, dtype: object), Series([], Name: Borough, dtype: object)
There is no duplicate 'Postal Code' rows!
Cannot find any 'Not assigned' values in the 'Neighbourhood' column!


Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Let's check the size of our dataframe

In [15]:
df.shape

(103, 3)


# Part 2

Import necessary packages: 

Google Maps Geocoding API started to cost therefore geocoder is used, however it is a bit unreliable it seems, thus we need to download an .CSV file with data of the coordinates


In [16]:
df_coordinates=pd.read_csv('../datafiles/Geospatial_Coordinates.csv')
df_coordinates

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [17]:
# pip install geopandas

In [18]:
import geocoder # import geocoder
import geopy
from geopy.geocoders import Nominatim

In [19]:
# https://towardsdatascience.com/geocode-with-python-161ec1e62b89

locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("Champ de Mars, Paris, France")

In [21]:

# Coordinate arrays
lat_list=np.array([])
long_list=np.array([])

for i, pcode in enumerate(df['Postal Code']):
    print(i, pcode)
    # initialize your variable to None
    lat_coords =None
    lng_coords =None
    

    locator = Nominatim(user_agent="myGeocoder")
    count = 0
    # loop until you get the coordinates
    while(lng_coords is None):
        location = locator.geocode('{}, Toronto, Ontario'.format(pcode))
        try:
            lat_coords = location.latitude
            lng_coords = location.longitude
            # print("Coordinates: "+ str(lat_coords) + ", "+str(lng_coords) )
        except:
            lat_coords =None
            lng_coords =None

        if count == 3:  # the limit
            # print(" Error 'Location' not found in Geocode! Switching to CSV file")
            lat_coords = df_coordinates.loc[ df_coordinates['Postal Code'] == pcode ]['Latitude'].to_list()[0]
            lng_coords = df_coordinates.loc[ df_coordinates['Postal Code'] == pcode ]['Longitude'].to_list()[0]
            # print("\nCoordinates: "+ str(lat_coords) + ", "+str(lng_coords) )
        # if count == 3:
        #     print("Error count = 3")
        #     break

        count = count + 1
    
    lat_list= np.append(lat_list, lat_coords)
    long_list= np.append(long_list, lng_coords)

lat_list

0 M3A
1 M4A
 Error 'Location' not found in Geocode! Switching to excel file
2 M5A
 Error 'Location' not found in Geocode! Switching to excel file
3 M6A
 Error 'Location' not found in Geocode! Switching to excel file
4 M7A
5 M9A
 Error 'Location' not found in Geocode! Switching to excel file
6 M1B
7 M3B
 Error 'Location' not found in Geocode! Switching to excel file
8 M4B
 Error 'Location' not found in Geocode! Switching to excel file
9 M5B
 Error 'Location' not found in Geocode! Switching to excel file
10 M6B
 Error 'Location' not found in Geocode! Switching to excel file
11 M9B
 Error 'Location' not found in Geocode! Switching to excel file
12 M1C
13 M3C
14 M4C
 Error 'Location' not found in Geocode! Switching to excel file
15 M5C
 Error 'Location' not found in Geocode! Switching to excel file
16 M6C
 Error 'Location' not found in Geocode! Switching to excel file
17 M9C
18 M1E
 Error 'Location' not found in Geocode! Switching to excel file
19 M4E
 Error 'Location' not found in Geocode

array([43.6534817 , 43.7258823 , 43.6542599 , 43.718518  , 43.6534817 ,
       43.6678556 , 43.6534817 , 43.7459058 , 43.7063972 , 43.6571618 ,
       43.709577  , 43.6509432 , 43.6534817 , 43.7328216 , 43.6953439 ,
       43.6514939 , 43.6937813 , 43.64410993, 43.7635726 , 43.6763574 ,
       43.6421064 , 43.6890256 , 43.76571677, 43.7090604 , 43.6579524 ,
       43.669542  , 43.773136  , 43.8037622 , 43.7543283 , 43.7053689 ,
       43.64990081, 43.6690051 , 43.7447342 , 43.7797719 , 43.7679803 ,
       43.685347  , 43.6392586 , 43.6522219 , 43.7279292 , 43.7869473 ,
       43.7374732 , 43.6795571 , 43.6471768 , 43.63709691, 43.7111117 ,
       43.7574902 , 43.7390146 , 43.6727601 , 43.6481985 , 43.7137562 ,
       43.7563033 , 43.716316  , 43.7859621 , 43.7284964 , 43.6595255 ,
       43.7332825 , 43.6911158 , 43.7247659 , 43.692657  , 43.77923857,
       43.7616313 , 43.7280205 , 43.7116948 , 43.67455325, 43.706876  ,
       43.7574096 , 43.7527583 , 43.7127511 , 43.6969476 , 43.66

In [22]:
lat_list.shape

(103,)

The shape is of the correct size

In [23]:
df["Latitude"]=lat_list
df["Longitude"]=long_list
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Part 3: Analysis

explore and cluster neighborhoods. Begin by importing necessary libraries.

In [24]:
import json # library to handle JSON files

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

The credentials for Foursquare is kept in another file, imported here.

In [25]:
# some_file.py
import sys
# insert at 1, 0 is the script path (or '' in REPL)
sys.path.insert(1, '../credentials/')

import config

Continue exploring data 

In [44]:
print('The dataframe of size {} has {} boroughs and {} neighborhoods.'.format(
        df.shape,
        len(df['Borough'].unique()),
        len(df['Neighbourhood'].unique())
    )
)

The dataframe of size (103, 5) has 10 boroughs and 99 neighborhoods.


### Use geopy library to get the latitude and longitude values of Toronto

In [45]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address)
to_latitude = location.latitude
to_longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(to_latitude, to_longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [46]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[to_latitude, to_longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Define Foursquare Credentials and Version

In [30]:
# the credentials for Foursquared is stored in another file
CLIENT_ID=config.client_id
CLIENT_SECRET=config.client_secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

### Explore the first neighborhood of Toronto

In [31]:
df.loc[0, 'Neighbourhood']

'Parkwoods'

In [33]:
neighborhood_latitude = df.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Parkwoods are 43.6534817, -79.3839347.


### Now, let's get the top 100 venues that are in 'Parkwoods' within 500 meters.

First, create the GET request URL

In [34]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

In [38]:
url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)

Send Get request and examine results

In [39]:
results = requests.get(url).json()

Use the function 'get_category_type' from below to help filter the JSON file

In [40]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [41]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Downtown Toronto,Neighborhood,43.653232,-79.385296
1,Nathan Phillips Square,Plaza,43.65227,-79.383516
2,Eggspectation Bell Trinity Square,Breakfast Spot,43.653144,-79.38198
3,Japango,Sushi Restaurant,43.655268,-79.385165
4,Indigo,Bookstore,43.653515,-79.380696


In [42]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

90 venues were returned by Foursquare.


## Explore all the neighborhoods in Toronto 

Create a function to repeat the same process for all neighborhoods in Toronto.

In [47]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

run the above function

In [49]:
Toronto_venues = getNearbyVenues(names=df['Neighbourhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [51]:
print(Toronto_venues.shape)
Toronto_venues.head()

(2598, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.653482,-79.383935,Downtown Toronto,43.653232,-79.385296,Neighborhood
1,Parkwoods,43.653482,-79.383935,Nathan Phillips Square,43.65227,-79.383516,Plaza
2,Parkwoods,43.653482,-79.383935,Eggspectation Bell Trinity Square,43.653144,-79.38198,Breakfast Spot
3,Parkwoods,43.653482,-79.383935,Japango,43.655268,-79.385165,Sushi Restaurant
4,Parkwoods,43.653482,-79.383935,Indigo,43.653515,-79.380696,Bookstore


In [52]:
print('There are {} uniques categories.'.format(len(Toronto_venues['Venue Category'].unique())))

There are 281 uniques categories.
