Importing necessary libraries

In [1]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup

Using BeautifulSoup to scrape the web page

In [2]:
# Download the Wikipedia page listing postal codes and parse it with the
# lxml parser. A timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as if it
# were the real article.
WIKI_POSTAL_CODES_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki_response = requests.get(WIKI_POSTAL_CODES_URL, timeout=30)
wiki_response.raise_for_status()
wiki_toronto = BeautifulSoup(wiki_response.content, 'lxml')

In [3]:
def clean(content):
    """Normalise a scraped text fragment.

    Every ' / ' separator becomes ', ', after which opening parentheses,
    closing parentheses and newline characters are removed. The
    substitutions are applied in that order.
    """
    substitutions = ((' / ', ', '), ('(', ''), (')', ''), ('\n', ''))
    for old, new in substitutions:
        content = content.replace(old, new)
    return content

# Build one (postal code, borough, neighbourhood) record per usable table row.
items = []
for item in wiki_toronto.table.find_all('tr')[1:]:  # [1:] skips the first row
    cells = item.find_all('td')
    if len(cells) < 2:
        # A row without both a postal-code cell and a borough cell cannot be
        # parsed; skip it instead of failing with IndexError.
        continue
    # Strip surrounding whitespace so a stray newline in the cell markup
    # neither leaks into the data nor defeats the "Not assigned" comparison.
    postal_code = cells[0].text.strip()
    if cells[1].text.strip() == "Not assigned":
        continue
    # `it not in "()"` is a substring test: it discards the fragments
    # "(", ")" and "()".
    parts = [it for it in item.stripped_strings if it not in "()"]
    borough = parts[1]
    neighborhood = ','.join(parts[2:])

    items.append((postal_code, clean(borough), clean(neighborhood)))

df = pd.DataFrame(items, columns=['PostalCode', 'Borough', 'Neighborhood'])
df.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
...,...,...,...
95,M2M,North York,Newtonbrook
96,M2M,North York,Willowdale
97,M3M,North York,Downsview Central
98,M4M,East Toronto,Studio District


Using groupby function along with 'apply' to aggregate neighborhoods with same Postal Code and Borough.

In [4]:
# Collapse rows that share a PostalCode and Borough into one row whose
# Neighborhood value is the comma-joined text of the whole group.
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.agg(','.join).reset_index()

In [5]:
# Preview the first rows of the aggregated table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# (rows, columns) of the aggregated table: one row per PostalCode/Borough pair.
df.shape

(103, 3)

Adding Latitudes and Longitudes to the dataframe

In [7]:
# Read the remote CSV of postal-code coordinates into a DataFrame.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
latlng = pd.read_csv(GEOSPATIAL_CSV_URL)

Concatenating df and latlng dataframes

In [8]:
# Line the two tables up side by side on their postal-code indexes. The index
# names differ ('PostalCode' vs 'Postal Code'), so reset_index() produces a
# column called 'index', which is renamed back to 'PostalCode'.
df_latlng = (
    pd.concat(
        [df.set_index('PostalCode'), latlng.set_index('Postal Code')],
        axis=1,
        sort=False,
    )
    .reset_index()
    .rename(columns={'index': 'PostalCode'})
)

In [9]:
# Display the combined table of neighbourhood and coordinate data.
df_latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437


In [10]:
# List the distinct values found in the Borough column.
df_latlng['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke', "Queen's Park"], dtype=object)

In [11]:
# Number of comma-separated names in each Neighborhood string
# (one more than the number of commas it contains).
df_latlng['neighborhood_count'] = df_latlng['Neighborhood'].map(
    lambda joined_names: joined_names.count(',') + 1
)

In [12]:
# Display the table again, now with the neighborhood_count column.
df_latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,neighborhood_count
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353,2
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497,3
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711,3
3,M1G,Scarborough,Woburn,43.770992,-79.216917,1
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,1
...,...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188,1
99,M9P,Etobicoke,Westmount,43.696319,-79.532242,1
100,M9R,Etobicoke,"Kingsview Village,Martin Grove Gardens,Richvie...",43.688905,-79.554724,4
101,M9V,Etobicoke,"Albion Gardens,Beaumond Heights,Humbergate,Jam...",43.739416,-79.588437,8


Analysing clusters

In [13]:
# Report how many distinct boroughs and distinct comma-separated
# neighbourhood names the data contains.
borough_total = len(df['Borough'].unique())
all_neighborhood_names = ','.join(df_latlng['Neighborhood']).split(',')
neighborhood_total = len(set(all_neighborhood_names))
print('There are {} Boroughs and {} Neighborhoods'.format(borough_total, neighborhood_total))

There are 11 Boroughs and 208 Neighborhoods


In [14]:
import folium

In [15]:
# Centre the map on the mean latitude/longitude of all rows.
map_center = [df_latlng['Latitude'].mean(), df_latlng['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=10)

In [16]:
# Add one circle marker per row; its popup reads "<neighbourhoods>, <borough>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in df_latlng[marker_columns].itertuples(index=False, name=None):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(m)

In [17]:
# Leaving the map object as the cell's last expression renders it as the output.
m

In [33]:
# Foursquare API settings.
#
# Credentials are read from the environment rather than being written into the
# notebook, so they do not end up in version control or shared copies. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# Any key that was ever committed in this cell should be treated as leaked and
# rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail until they are.')
VERSION = '20180605'  # sent as the `v` query parameter
LIMIT = 103  # sent as the `limit` query parameter (venues requested per location)
def getNearbyVenues(names, borough, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each location.

    Parameters
    ----------
    names, borough, latitudes, longitudes : iterable
        Parallel sequences consumed together with ``zip``; each position
        describes one location to query.
    radius : int, default 500
        Sent as the ``radius`` query parameter (presumably metres; confirm
        against the API documentation).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Borough', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when no venue is
        returned at all.

    Raises
    ------
    requests.HTTPError
        If the API answers a request with an error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT, and
    prints each location name as it is processed.
    """
    columns = ['Neighborhood',
               'Borough',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, bor, lat, lng in zip(names, borough, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of
        # formatting the URL (and the credentials) by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Fail with a clear HTTP error rather than a KeyError further down.
        response.raise_for_status()

        # A successful reply without any group is treated as "no venues".
        groups = response.json().get('response', {}).get('groups') or [{}]
        for v in groups[0].get('items', []):
            venue = v['venue']
            # Some venues may come back without a category; record None then.
            categories = venue.get('categories') or []
            rows.append((
                name,
                bor,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor keeps the schema intact
    # even when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return nearby_venues

In [39]:
# Keep the rows whose Neighborhood text contains 'Toronto', then fetch the
# venues around each of them.
# NOTE(review): this matches on the Neighborhood names, not on Borough, so
# boroughs such as 'Downtown Toronto' are only included when one of their
# neighbourhood names also contains 'Toronto' - confirm this is intended.
has_toronto = df_latlng['Neighborhood'].map(lambda joined_names: 'Toronto' in joined_names)
dfk = df_latlng[has_toronto]
toronto_venues = getNearbyVenues(
    names=dfk['Neighborhood'],
    borough=dfk['Borough'],
    latitudes=dfk['Latitude'],
    longitudes=dfk['Longitude'],
)

CFB Toronto,Downsview East
East Toronto
North Toronto West
Harbourfront East,Toronto Islands,Union Station
Design Exchange,Toronto Dominion Centre
Harbord,University of Toronto
Humber Bay Shores,Mimico South,New Toronto


In [41]:
# Display the venue table returned by getNearbyVenues.
toronto_venues

Unnamed: 0,Neighborhood,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"CFB Toronto,Downsview East",North York,43.737473,-79.464763,Toronto Downsview Airport (YZD),43.738883,-79.470111,Airport
1,"CFB Toronto,Downsview East",North York,43.737473,-79.464763,Ancaster Park,43.734706,-79.464777,Park
2,East Toronto,East York,43.685347,-79.338106,The Path,43.683923,-79.335007,Park
3,East Toronto,East York,43.685347,-79.338106,Sammon Convenience,43.686951,-79.335007,Convenience Store
4,East Toronto,East York,43.685347,-79.338106,The Red Rocket,43.688048,-79.333274,Coffee Shop
...,...,...,...,...,...,...,...,...
268,"Humber Bay Shores,Mimico South,New Toronto",Etobicoke,43.605647,-79.501321,Halibut House Fish and Chips Inc.,43.601960,-79.501147,Seafood Restaurant
269,"Humber Bay Shores,Mimico South,New Toronto",Etobicoke,43.605647,-79.501321,Pet Valu,43.602431,-79.498653,Pet Store
270,"Humber Bay Shores,Mimico South,New Toronto",Etobicoke,43.605647,-79.501321,Crossfit Colosseum,43.604816,-79.507024,Gym
271,"Humber Bay Shores,Mimico South,New Toronto",Etobicoke,43.605647,-79.501321,Big Guy's Little Coffee Shop,43.601359,-79.502480,Coffee Shop


In [42]:
# Count the non-null entries in every column for each Neighborhood group,
# i.e. how many venue rows each queried location produced.
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"CFB Toronto,Downsview East",2,2,2,2,2,2,2
"Design Exchange,Toronto Dominion Centre",100,100,100,100,100,100,100
East Toronto,3,3,3,3,3,3,3
"Harbord,University of Toronto",36,36,36,36,36,36,36
"Harbourfront East,Toronto Islands,Union Station",100,100,100,100,100,100,100
"Humber Bay Shores,Mimico South,New Toronto",13,13,13,13,13,13,13
North Toronto West,19,19,19,19,19,19,19


In [47]:
# Report how many distinct venue categories were returned.
category_total = len(toronto_venues['Venue Category'].unique())
print('There are {} unique venue categories'.format(category_total))

There are 96 unique venue categories


In [49]:
# One-hot encode the venue categories. With prefix=" " and an empty separator
# every generated column name is the category preceded by a single space
# (presumably so that no category can clash with a 'Neighborhood' column
# added later - confirm).
venue_categories = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(
    venue_categories,
    prefix=" ",
    prefix_sep="",
)

In [52]:
# Attach each venue's neighbourhood label to its one-hot row.
toronto_onehot = toronto_onehot.assign(Neighborhood=toronto_venues['Neighborhood'])

In [55]:
# Move the 'Neighborhood' column to the front, keeping the remaining columns
# in their existing order. Selecting it by name (rather than "whatever column
# is currently last") makes the cell safe to re-run: a second execution leaves
# the column order unchanged instead of rotating another column to the front.
fixed_columns = ['Neighborhood'] + [
    column for column in toronto_onehot.columns if column != 'Neighborhood'
]
toronto_onehot = toronto_onehot[fixed_columns]

In [56]:
# Preview the first rows of the one-hot table after the column reorder.
toronto_onehot.head()

Unnamed: 0,Neighborhood,Airport,American Restaurant,Aquarium,Art Gallery,Asian Restaurant,Bakery,Bar,Baseball Stadium,Basketball Stadium,...,Supermarket,Sushi Restaurant,Tailor Shop,Tea Room,Theater,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Wine Bar,Yoga Studio
0,"CFB Toronto,Downsview East",1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"CFB Toronto,Downsview East",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,East Toronto,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').reset