In [2]:
import os
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops an HTTP error page from being parsed as if it were the real table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
source = response.text

soup = BeautifulSoup(source,'lxml')

In [4]:
# Column layout shared by the scraped postal-code table.
column_names = [
    'PostalCode',
    'Borough',
    'Neighborhood',
]

# Empty frame that carries only the headers; rows are added by a later cell.
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


In [5]:
# Extract (postal code, borough, neighbourhood) rows from the wiki table.
postal_table = soup.find('table', class_='wikitable sortable')
if postal_table is None:
    raise ValueError("Could not find the 'wikitable sortable' table in the downloaded page")

# Skip the header row.
neigh_loc = postal_table.find_all('tr')[1:]

records = []
for row in neigh_loc:
    # get_text(strip=True) removes the trailing newline and stray whitespace,
    # so the comparisons below do not depend on where the markup puts "\n".
    cells = [cell.get_text(strip=True) for cell in row.find_all("td")]
    if len(cells) < 3:
        # Not a data row (e.g. a row made only of header cells).
        continue
    postal_code, borough, neighborhood = cells[:3]
    if borough == "Not assigned":
        continue
    if neighborhood == "Not assigned":
        # A borough without a named neighbourhood uses the borough name.
        neighborhood = borough
    records.append([postal_code, borough, neighborhood])

# Build the frame once instead of growing it one .loc[i] assignment at a time.
neighborhoods = pd.DataFrame(records, columns=column_names)

neighborhoods.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [6]:
# Collapse to one row per (postal code, borough), joining all of that
# pair's neighbourhood names into a single comma-separated string.
combined_neighbors = (
    neighborhoods
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

combined_neighbors.shape


(103, 3)

In [7]:
# Load the postal-code coordinate table from a remote CSV; the preview below
# shows 'Postal Code', 'Latitude' and 'Longitude' columns.
locations = pd.read_csv("https://cocl.us/Geospatial_data")

# Preview the first rows.
locations.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [8]:
# Attach coordinates to each postal code by matching on the postal-code key.
# A positional concat(axis=1) would only be correct if both frames happened to
# be sorted identically, and the transpose/drop_duplicates trick turns every
# column into object dtype (and would drop any column whose values duplicate
# another's). An inner merge keeps only postal codes present in both frames
# and preserves the numeric dtype of Latitude/Longitude.
neighbor_locations = (
    combined_neighbors
    .merge(locations, how='inner', left_on='PostalCode', right_on='Postal Code')
    .drop(columns='Postal Code')
)
neighbor_locations.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887
3,M1G,Scarborough,Woburn,43.771,-79.2169
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395
5,M1J,Scarborough,Scarborough Village,43.7447,-79.2395
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.7279,-79.262
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.7111,-79.2846
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.7163,-79.2395
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.6927,-79.2648


In [9]:
# Row/column count of the combined postal-code and coordinate frame.
neighbor_locations.shape

(103, 5)

In [10]:
import folium

toronto_map = folium.Map(location=[43.7869, -79.386], zoom_start=10)

# Draw one circle marker per row of neighbor_locations, with a popup that
# reads "<neighbourhoods>, <borough>".
marker_fields = neighbor_locations[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(toronto_map)

toronto_map

In [11]:
# Foursquare API settings.
# Credentials are read from the environment so that secrets never live in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel. NOTE(review): the key pair that used to be hard-coded
# here has been exposed and should be revoked and regenerated.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID', '')
client_sec = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
if not (client_id and client_sec):
    warnings.warn(
        "FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; "
        "Foursquare requests will be rejected until they are provided."
    )

# API version date sent as the `v` query parameter.
ver = '20180605'

# Value sent as the `limit` query parameter of each request.
limit = 100

In [43]:
import requests 
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "explore" endpoint around each coordinate.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences; one request is made per (name, lat, lng) triple.
    radius : int, default 500
        Value sent as the `radius` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Borough',
        'Borough Latitude', 'Borough Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame always has these
        seven columns, even when no venues were returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.

    Notes
    -----
    Reads the module-level `client_id`, `client_sec`, `ver` and `limit`.
    """
    column_labels = ['Borough',
                     'Borough Latitude',
                     'Borough Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and escape the query string.
        query = {
            'client_id': client_id,
            'client_secret': client_sec,
            'v': ver,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        # The timeout prevents one stalled request from hanging the whole
        # loop; raise_for_status() surfaces HTTP errors as such instead of a
        # KeyError on the error payload.
        response = requests.get(endpoint, params=query, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        for item in results:
            venue = item['venue']
            # Some venues carry no category; record None rather than failing
            # with IndexError on categories[0].
            categories = venue.get('categories') or []
            category_name = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category_name))

    # Passing the labels to the constructor keeps the schema stable when
    # venue_rows is empty (assigning .columns afterwards raises in that case).
    nearby_venues = pd.DataFrame(venue_rows, columns=column_labels)

    return(nearby_venues)

In [44]:
# Look up venues around every row's coordinates; each result row is labelled
# with that row's borough name.
toronto_venues = getNearbyVenues(
    names=neighbor_locations['Borough'],
    latitudes=neighbor_locations['Latitude'],
    longitudes=neighbor_locations['Longitude'],
)

In [45]:
# Show how many venue rows/columns were collected, then preview the first rows.
print(toronto_venues.shape)
toronto_venues.head()

(2237, 7)


Unnamed: 0,Borough,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Scarborough,43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,Scarborough,43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
2,Scarborough,43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
3,Scarborough,43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
4,Scarborough,43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant


In [46]:
# Number of non-null values per column for each borough label.
toronto_venues.groupby('Borough').count()

Unnamed: 0_level_0,Borough Latitude,Borough Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Central Toronto,113,113,113,113,113,113
Downtown Toronto,1281,1281,1281,1281,1281,1281
East Toronto,123,123,123,123,123,123
East York,75,75,75,75,75,75
Etobicoke,71,71,71,71,71,71
Mississauga,11,11,11,11,11,11
North York,235,235,235,235,235,235
Queen's Park,42,42,42,42,42,42
Scarborough,89,89,89,89,89,89
West Toronto,177,177,177,177,177,177


In [50]:
# One indicator column per venue category (no prefix on the column names).
toronto_oh = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# Append the borough label, then rotate that last column to the front.
toronto_oh['Borough'] = toronto_venues['Borough']
column_order = list(toronto_oh.columns)
toronto_oh = toronto_oh[column_order[-1:] + column_order[:-1]]
toronto_oh.head()

Unnamed: 0,Borough,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Scarborough,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
# Row/column count of the one-hot encoded venue frame.
toronto_oh.shape

(2237, 278)

In [52]:
# Average each indicator column per borough, i.e. the share of that
# borough's venue rows falling in each category.
toronto_gr = (
    toronto_oh
    .groupby('Borough')
    .mean()
    .reset_index()
)
toronto_gr.head()

Unnamed: 0,Borough,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Wine Bar,Wings Joint,Women's Store,Yoga Studio
0,Central Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.00885,0.0,0.0,0.00885,0.0,0.0,0.0,0.0,0.00885
1,Downtown Toronto,0.0,0.000781,0.000781,0.000781,0.000781,0.000781,0.001561,0.001561,0.001561,...,0.002342,0.007806,0.002342,0.000781,0.005464,0.0,0.006245,0.000781,0.001561,0.003123
2,East Toronto,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02439
3,East York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.013333,0.0,0.0,0.0,0.013333
4,Etobicoke,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.014085,0.0,0.0,0.0,0.014085,0.0,0.0


In [53]:
# Row/column count of the per-borough frequency frame.
toronto_gr.shape

(11, 278)

In [54]:
num_top_venues = 5

# Print the most frequent venue categories for each borough.
for hood in toronto_gr['Borough']:
    print("----"+hood+"----")
    # Transpose the borough's single row into (venue, freq) pairs.
    venue_freqs = toronto_gr[toronto_gr['Borough'] == hood].T.reset_index()
    venue_freqs.columns = ['venue','freq']
    # Drop the first entry (the 'Borough' label itself). assign() returns a new
    # frame, so the dtype conversion no longer writes into an iloc slice
    # (which can trigger SettingWithCopyWarning).
    venue_freqs = (
        venue_freqs.iloc[1:]
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
    )
    print(venue_freqs.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Central Toronto----
            venue  freq
0     Coffee Shop  0.08
1  Sandwich Place  0.07
2            Park  0.06
3     Pizza Place  0.05
4    Dessert Shop  0.04


----Downtown Toronto----
         venue  freq
0  Coffee Shop  0.09
1         Café  0.06
2   Restaurant  0.04
3       Bakery  0.03
4        Hotel  0.03


----East Toronto----
                venue  freq
0    Greek Restaurant  0.08
1         Coffee Shop  0.07
2      Ice Cream Shop  0.04
3  Italian Restaurant  0.04
4                Café  0.03


----East York----
                 venue  freq
0          Coffee Shop  0.07
1          Pizza Place  0.04
2  Sporting Goods Shop  0.04
3         Burger Joint  0.04
4        Grocery Store  0.04


----Etobicoke----
            venue  freq
0     Pizza Place  0.11
1  Sandwich Place  0.07
2        Pharmacy  0.06
3     Coffee Shop  0.06
4             Gym  0.04


----Mississauga----
                      venue  freq
0               Coffee Shop  0.18
1                     Hotel  0.18
2     

In [21]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of `row`.

    The first entry of `row` is skipped, the remaining entries are sorted in
    descending order, and the index labels of the first `num_top_venues` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [66]:
import numpy as np
num_top_venues = 10

# Ordinal suffixes for ranks 1-3; every later rank uses 'th'.
indicators = ['st', 'nd', 'rd']

# Column headers: 'Borough', '1st Most Common Venue', '2nd ...', ...
# An explicit bounds check replaces the previous bare `except:`, which used
# an IndexError as control flow and would also have hidden any other error.
columns = ['Borough']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# One row per borough: its name followed by its top venue categories.
# The frame is built in one go instead of being filled row by row via iloc.
top_venue_rows = [
    [borough_row['Borough']] + list(return_most_common_venues(borough_row, num_top_venues))
    for _, borough_row in toronto_gr.iterrows()
]
toronto_venues_sorted = pd.DataFrame(top_venue_rows, columns=columns, index=toronto_gr.index)

toronto_venues_sorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Central Toronto,Coffee Shop,Sandwich Place,Park,Pizza Place,Café,Dessert Shop,Sushi Restaurant,Clothing Store,Pub,Burger Joint
1,Downtown Toronto,Coffee Shop,Café,Restaurant,Hotel,Bakery,Italian Restaurant,Bar,Japanese Restaurant,American Restaurant,Park
2,East Toronto,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Pizza Place,Brewery,Café,Park,Pub,Yoga Studio
3,East York,Coffee Shop,Sporting Goods Shop,Pizza Place,Burger Joint,Pharmacy,Grocery Store,Park,Bank,Breakfast Spot,Sandwich Place
4,Etobicoke,Pizza Place,Sandwich Place,Pharmacy,Coffee Shop,Fast Food Restaurant,Grocery Store,Gym,Bakery,Park,Fried Chicken Joint


In [67]:
from sklearn.cluster import KMeans

# Number of clusters to fit.
k = 10

# Cluster on the venue-frequency columns only. `columns=` replaces the
# positional axis argument (`drop('Borough', 1)`), which pandas 2.x rejects.
toronto_knn = toronto_gr.drop(columns='Borough')

# Despite the variable name, this is k-means clustering. n_init is pinned
# explicitly because its default differs between scikit-learn releases.
knn = KMeans(n_clusters=k, n_init=10, random_state=0).fit(toronto_knn)

# Cluster label of the first ten rows.
knn.labels_[0:10]

array([2, 0, 8, 9, 3, 1, 6, 5, 7, 0], dtype=int32)

In [68]:
# Attach the cluster label as the first column. insert() raises if the column
# already exists, so overwrite it in that case; this keeps the cell re-runnable.
if 'Cluster Labels' in toronto_venues_sorted.columns:
    toronto_venues_sorted['Cluster Labels'] = knn.labels_
else:
    toronto_venues_sorted.insert(0,'Cluster Labels', knn.labels_)

# Keep only the rows whose borough has venue data.
toronto_m = neighbor_locations[neighbor_locations['Borough'].isin(toronto_venues_sorted['Borough'])]

# Add the cluster label and top-venue columns to each row by borough.
toronto_m = toronto_m.join(toronto_venues_sorted.set_index('Borough'), on='Borough')

print(toronto_m.shape)
toronto_m.head()

(103, 16)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M1B,Scarborough,"Rouge, Malvern",43.8067,-79.1944,7,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Breakfast Spot,Pizza Place,Sandwich Place,Bakery,Thai Restaurant,Playground,Bus Line
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.7845,-79.1605,7,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Breakfast Spot,Pizza Place,Sandwich Place,Bakery,Thai Restaurant,Playground,Bus Line
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.7636,-79.1887,7,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Breakfast Spot,Pizza Place,Sandwich Place,Bakery,Thai Restaurant,Playground,Bus Line
3,M1G,Scarborough,Woburn,43.771,-79.2169,7,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Breakfast Spot,Pizza Place,Sandwich Place,Bakery,Thai Restaurant,Playground,Bus Line
4,M1H,Scarborough,Cedarbrae,43.7731,-79.2395,7,Fast Food Restaurant,Coffee Shop,Chinese Restaurant,Breakfast Spot,Pizza Place,Sandwich Place,Bakery,Thai Restaurant,Playground,Bus Line


In [71]:
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[43.6869, -79.4563], zoom_start=10)

# One evenly spaced rainbow colour per cluster, as hex strings.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# Draw one marker per row, coloured by its cluster label. Labels run from
# 0 to k-1, so they index the palette directly (the previous `cluster-1`
# wrapped cluster 0 around to the last colour).
for lat, lon, poi, cluster in zip(toronto_m['Latitude'], toronto_m['Longitude'], toronto_m['Borough'], toronto_m['Cluster Labels']):
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [72]:
# Count rows for each (borough, top venue, second venue) combination.
toronto_m[['Borough', '1st Most Common Venue', '2nd Most Common Venue']].groupby(['Borough', '1st Most Common Venue', '2nd Most Common Venue']).size()

Borough           1st Most Common Venue  2nd Most Common Venue
Central Toronto   Coffee Shop            Sandwich Place            9
Downtown Toronto  Coffee Shop            Café                     18
East Toronto      Greek Restaurant       Coffee Shop               5
East York         Coffee Shop            Sporting Goods Shop       5
Etobicoke         Pizza Place            Sandwich Place           12
Mississauga       Coffee Shop            Hotel                     1
North York        Coffee Shop            Clothing Store           24
Queen's Park      Coffee Shop            Gym                       1
Scarborough       Fast Food Restaurant   Coffee Shop              17
West Toronto      Bar                    Café                      6
York              Park                   Coffee Shop               5
dtype: int64

# What I am going to do
I'm going to keep all boroughs and their related neighborhoods. I would have kept only the boroughs whose top venue was pizza or Chinese food, but the analysis shows there is only one such borough, which would have made future assignments too simple.

# Observations
1. Coffee Shop is the top venue in most boroughs (Canadians sure love their coffee); most boroughs that do not have it as their top venue have it as the second most common.
2. The boroughs (Downtown Toronto, Etobicoke, North York, Scarborough) have the most neighborhoods