<h1> Segmenting and Clustering Neighborhoods in Toronto City </h1>

<h4>Importing dependencies</h4>

In [23]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
from geopy.geocoders import Nominatim
import folium
import json
from pandas.io.json import json_normalize

<h4>Requesting source page</h4>

In [3]:
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text

<h4>Using BeautifulSoup for scraping the data and geocoder for obtaining coordinates</h4>

In [4]:
soup = BeautifulSoup(source, 'lxml')
table = soup.find('tbody')
postcode = []
borough = []
neighborhood = []
latitudes = []
longitudes = []
for row in table.findAll('tr'):
    temp = []
    for each_data in row.findAll('td'):
        temp.append(each_data.text)
    if len(temp) and temp[1] != 'Not assigned': #Ignoring cells with a borough that is Not assigned
        postcode.append(temp[0])
        borough.append(temp[1])
        temp_2 = temp[2].rstrip('\n')
        if temp_2 == 'Not assigned': #If a cell has a Not assigned neighborhood
            temp2 = temp[1]          #then the neighborhood will be the same as the borough
        neighborhood.append(temp_2)
        
        #lat_lng_coords = None
        #while(lat_lng_coords is None):
            #g = geocoder.google('{}, Toronto, Ontario'.format(temp[0]))
            #lat_lng_coords = g.latlng

        #latitudes.append(lat_lng_coords[0])
        #longitudes.append(lat_lng_coords[1])
data = {'Postal Code': postcode, 'Borough': borough, 'Neighborhood': neighborhood} #The dataframe will consist of three columns:
                                                                                  #PostalCode, Borough, and Neighborhood

Given that this package can be very unreliable, in case you wish to get the geographical coordinates of the neighborhoods using the Geocoder package, uncomment the lines in the above code snippet

<h4>Storing the scraped data in pandas dataframe</h4>

In [5]:
df = pd.DataFrame(data)                                                           
df = df.groupby("Postal Code").agg(lambda x:','.join(set(x)))

<h4>The number of rows of the dataframe</h4>

In [6]:
df.shape

(103, 2)

<h4>Displaying any 10 data from the dataframe</h4>

In [7]:
df.sample(10)

Unnamed: 0_level_0,Borough,Neighborhood
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M9W,Etobicoke,Northwest
M6K,West Toronto,"Brockton,Parkdale Village,Exhibition Place"
M3M,North York,Downsview Central
M5T,Downtown Toronto,"Chinatown,Kensington Market,Grange Park"
M6B,North York,Glencairn
M3L,North York,Downsview West
M5S,Downtown Toronto,"Harbord,University of Toronto"
M6M,York,"Mount Dennis,Silverthorn,Del Ray,Keelesdale"
M5A,Downtown Toronto,"Regent Park,Harbourfront"
M6J,West Toronto,"Little Portugal,Trinity"


<h4>Get the latitude and the longitude coordinates of each neighborhood using geospatial data</h4>   
The http://cocl.us/Geospatial_data (csv file) contains geographical coordinates of each postal code of Toronto

In [8]:
df2 = pd.read_csv('Geospatial_Coordinates.csv')
df = pd.merge(df, df2, on = 'Postal Code')

<h4>Examing the resulting dataframe</h4> 

In [9]:
df.sample(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
92,M8Z,Etobicoke,"The Queensway West,South of Bloor,Mimico NW,Ki...",43.628841,-79.520999
7,M1L,Scarborough,"Golden Mile,Oakridge,Clairlea",43.711112,-79.284577
27,M3C,North York,"Don Mills South,Flemingdon Park",43.7259,-79.340923
62,M5M,North York,"Bedford Park,Lawrence Manor East",43.733283,-79.41975
8,M1M,Scarborough,"Cliffside,Scarborough Village West,Cliffcrest",43.716316,-79.239476
34,M4A,North York,Victoria Village,43.725882,-79.315572
98,M9N,York,Weston,43.706876,-79.518188
31,M3L,North York,Downsview West,43.739015,-79.506944
86,M7R,Mississauga,Canada Post Gateway Processing Centre,43.636966,-79.615819
21,M2M,North York,"Willowdale,Newtonbrook",43.789053,-79.408493


In [10]:
df.groupby('Borough')['Neighborhood'].count()

Borough
Central Toronto      9
Downtown Toronto    18
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Queen's Park         1
Scarborough         17
West Toronto         6
York                 5
Name: Neighborhood, dtype: int64

In [11]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


In [12]:
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653963, -79.387207.


<h4>Creating a map of Toronto with neighborhoods superimposed on top</h4>

In [13]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        fill_color='##3186cc',
        fill_opacity=0.5,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

<h4>Define Foursquare Credentials and Version</h4>

In [14]:
Creds = pd.read_csv('FourSquareCredentials.csv')
CLIENT_ID = Creds['CLIENT_ID'][0] # your Foursquare ID
CLIENT_SECRET = Creds['CLIENT_SECRET'][0] # your Foursquare Secret
VERSION = Creds['VERSION'][0] # Foursquare API version

In [26]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [27]:
toronto_venues = getNearbyVenues(names=df['Neighborhood'],
                                   latitudes=df['Latitude'],
                                   longitudes=df['Longitude']
                                  )

Rouge,Malvern
Port Union,Highland Creek,Rouge Hill
West Hill,Morningside,Guildwood
Woburn
Cedarbrae
Scarborough Village
Ionview,Kennedy Park,East Birchmount Park
Golden Mile,Oakridge,Clairlea
Cliffside,Scarborough Village West,Cliffcrest
Birch Cliff,Cliffside West
Wexford Heights,Dorset Park,Scarborough Town Centre
Maryvale,Wexford
Agincourt
Tam O'Shanter,Sullivan,Clarks Corners
Steeles East,Agincourt North,L'Amoreaux East,Milliken
L'Amoreaux West
Upper Rouge
Hillcrest Village
Oriole,Henry Farm,Fairview
Bayview Village
Silver Hills,York Mills
Willowdale,Newtonbrook
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Don Mills South,Flemingdon Park
Wilson Heights,Downsview North,Bathurst Manor
York University,Northwood Park
CFB Toronto,Downsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine Gardens,Parkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
Riverdale,The Danforth West
India Bazaar,The Beac

In [28]:
print(toronto_venues.shape)
toronto_venues.head()

(2250, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge,Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Port Union,Highland Creek,Rouge Hill",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,"Port Union,Highland Creek,Rouge Hill",43.784535,-79.160497,Scarborough Historical Society,43.788755,-79.162438,History Museum
3,"West Hill,Morningside,Guildwood",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"West Hill,Morningside,Guildwood",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store


In [29]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,Richmond,King",100,100,100,100,100,100
Agincourt,4,4,4,4,4,4
"Albion Gardens,Mount Olive,Silverstone,Jamestown,South Steeles,Beaumond Heights,Humbergate,Thistletown",9,9,9,9,9,9
Bayview Village,4,4,4,4,4,4
"Bedford Park,Lawrence Manor East",23,23,23,23,23,23
Berczy Park,56,56,56,56,56,56
"Birch Cliff,Cliffside West",4,4,4,4,4,4
"Brockton,Parkdale Village,Exhibition Place",23,23,23,23,23,23
Business Reply Mail Processing Centre 969 Eastern,18,18,18,18,18,18
"CFB Toronto,Downsview East",3,3,3,3,3,3


In [31]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 281 uniques categories.


In [32]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
