# Part 1: Assignment for Web Scrapping using Beautiful Soup

In [1]:
# To run this, you can install BeautifulSoup
# https://pypi.python.org/pypi/beautifulsoup4

# Or download the file
# http://beautiful-soup-4
# and unzip it in the same directory as this file
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl
import csv

print('BeautifulSoup  & csv imported.')

BeautifulSoup  & csv imported.


In [2]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

print('SSL certificate errors ignored.')

SSL certificate errors ignored.


In [4]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

soup = BeautifulSoup(source, 'lxml')

#print(soup.prettify())
print('soup object created')

soup object created


In [5]:
table = soup.find('table',{'class':'wikitable sortable'})
#table
table_rows = table.find_all('tr')
#table_rows

In [6]:
import pandas as pd
data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])

df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]  # to filter out bad rows
df.head()
df.shape

(180, 3)

# Delete Boroughs which are not assigned.

In [7]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df.shape
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [8]:
df1 = df.reset_index()
df1.head()

Unnamed: 0,index,PostalCode,Borough,Neighbourhood
0,3,M3A,North York,Parkwoods
1,4,M4A,North York,Victoria Village
2,5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,6,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.

In [9]:
df2= df1.groupby('PostalCode').agg(lambda x: ','.join(x))
df2
#df2.shape

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [10]:
df2.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


# If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [11]:
df3 = df2.reset_index()
df3.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


# Remove duplicate borough

In [12]:
df3['Borough']= df3['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")

In [13]:
df3.shape

(103, 3)

# Part 2: latitude and the longitude coordinates of each neighborhood using python Geocoder package

In [50]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [14]:
from  geopy.geocoders import Nominatim
geolocator = Nominatim()
city ="Pune"
country ="India"
loc = geolocator.geocode(city+','+ country)
print("latitude is :-" ,loc.latitude,"\nlongtitude is:-" ,loc.longitude)

  


latitude is :- 18.521428 
longtitude is:- 73.8544541


# Reading csv file as the geocoder api is not working properly

In [54]:
pip install geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6
Note: you may need to restart the kernel to use updated packages.


In [25]:
dff = pd.read_csv('Geospatial_Coordinates.csv')
#Merge this with earlier dataframe to get Latitude, Longitude data.
result = pd.merge(df3, dff, left_on='PostalCode',right_on='Postal Code')
result = result.drop(['Postal Code'],axis=1)
result.head()


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Step 3: Explore and cluster the neighborhoods in Toronto

In [26]:
!conda install -c conda-forge folium=0.5.0 

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [27]:
import pandas as pd
import folium

print('imported pandas & folium')

imported pandas & folium


In [102]:
sorted_df = result.sort_values([ 'Neighbourhood', 'Latitude'], ascending=[True, True])
sorted_df.reset_index(inplace=True)
sorted_df = sorted_df.drop(['index'],axis=1)
sorted_df

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1S,Scarborough,Agincourt,43.7942,-79.262029
1,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
2,M3H,NorthYork,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
3,M2K,NorthYork,Bayview Village,43.786947,-79.385975
4,M5M,NorthYork,"Bedford Park, Lawrence Manor East",43.733283,-79.41975
5,M5E,DowntownToronto,Berczy Park,43.644771,-79.373306
6,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848
7,M6K,WestToronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191
8,M7Y,EastToronto,Business reply mail Processing Centre,43.662744,-79.321558
9,M5V,DowntownToronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442


In [103]:
sorted_df.to_csv('sorted_geoloc.csv')

# Build a test set

In [104]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [105]:
# library to handle JSON files

import pandas as pd

import json

sorted_df.to_json(path_or_buf='geo_toronto.json', orient='table')

In [106]:
import pandas as pd
import folium

#grab a random sample from df
subset_of_df = sorted_df.sample(n=11)
map_test = folium.Map(location=[subset_of_df['Latitude'].mean(), 
                                subset_of_df['Longitude'].mean()], 
                      zoom_start=10)
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in subset_of_df.itertuples(): #if you cannot 
    map_test.add_child(folium.Marker(location=[row.Latitude ,row.Longitude],
           popup=row.Borough))

    
#map_test

#open map_test.html in browser
map_test.save("map_test.html")

# if you cannot generate the maps open PGA_map_*.html from the zip file

In [107]:
with open('geo_toronto.json') as json_data:
    Toronto_data = json.load(json_data)

In [108]:
neighborhoods_data = Toronto_data['data']
neighborhoods_data[0]

{'index': 0,
 'PostalCode': 'M1S',
 'Borough': 'Scarborough',
 'Neighbourhood': 'Agincourt',
 'Latitude': 43.7942003,
 'Longitude': -79.2620294}

In [109]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(sorted_df['Borough'].unique()),
        sorted_df.shape[0]
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


In [112]:
sorted_dataframe = sorted_df
sorted_df.shape

(103, 5)

In [113]:
import pandas as pd
import folium

#grab a random sample from df
subset_of_df = sorted_dataframe.sample(n=11)
map_test = folium.Map(location=[subset_of_df['Latitude'].mean(), 
                                subset_of_df['Longitude'].mean()], 
                      zoom_start=10)
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in subset_of_df.itertuples():#if you cannot 
    map_test.add_child(folium.Marker(location=[row.Latitude ,row.Longitude],
           popup=row.Borough))

    
#map_test

#open map_test.html in browser
map_test.save("map_test.html")

# if you cannot generate the maps open PGA_map_*.html from the zip file

In [114]:
from folium.plugins import MarkerCluster
map_borough = folium.Map(location=[subset_of_df['Latitude'].mean(), 
 subset_of_df['Longitude'].mean()], 
 zoom_start=10)
mc = MarkerCluster()
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in subset_of_df.itertuples():
    mc.add_child(folium.Marker(location=[row.Latitude,  row.Longitude],
                 popup=row.Borough))
    map_borough.add_child(mc)


#map_borough

#open in map_borough.html browser 
map_borough.save("map_borough.html")

#if you cannot generate the maps open PGA_map_*.html from the zip file

In [115]:
import pandas as pd
import folium



#grab a random sample from df
toronto_n = sorted_dataframe.sample(n=20)
map_toronto = folium.Map(location=[toronto_n['Latitude'].mean(), 
                                toronto_n['Longitude'].mean()], 
                      zoom_start=10)
#creating a Marker for each point in df_sample. Each point will get a popup with their zip
for row in toronto_n.itertuples():
    map_toronto.add_child(folium.Marker(location=[row.Latitude ,row.Longitude],
           popup=row.Neighbourhood))

    
map_toronto 

#open map_toronto.html in browser

map_toronto.save("map_toronto20.html")

#if you cannot generate the maps open PGA_map_*.html from the zip file

In [116]:
address = 'Toronto, CA'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

  


The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [52]:
sorted_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1S,Scarborough,Agincourt,43.7942,-79.262029
1,M8W,Etobicoke,"Alderwood, Long Branch",43.602414,-79.543484
2,M3H,NorthYork,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259
3,M2K,NorthYork,Bayview Village,43.786947,-79.385975
4,M5M,NorthYork,"Bedford Park, Lawrence Manor East",43.733283,-79.41975


In [117]:
# create map of Toronto using latitude and longitude values
map_toronto_neighbourhoods = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighbourhood in zip(sorted_dataframe['Latitude'], sorted_dataframe['Longitude'], sorted_dataframe['Borough'], sorted_dataframe['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=2,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto_neighbourhoods)  
    
map_toronto_neighbourhoods

map_toronto_neighbourhoods.save("map_toronto_neighbourhoods.html")

#open map_toronto_neighbourhoods.html in browser
#if you cannot generate the maps open PGA_map_*.html from the zip file

In [118]:
address = 'York, Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of York, Toronto are {}, {}.'.format(latitude, longitude))

  This is separate from the ipykernel package so we can avoid doing imports until


The geograpical coordinates of York, Toronto are 43.67910515, -79.49118414007154.


In [122]:
sorted_dataframe['Borough'].str.contains('Toronto')

0      False
1      False
2      False
3      False
4      False
5       True
6      False
7       True
8       True
9       True
10     False
11     False
12     False
13      True
14      True
15      True
16     False
17     False
18      True
19      True
20      True
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29      True
30     False
31     False
32     False
33      True
34      True
35      True
36     False
37     False
38     False
39      True
40      True
41     False
42     False
43     False
44     False
45      True
46     False
47     False
48      True
49     False
50     False
51      True
52     False
53      True
54     False
55     False
56     False
57      True
58     False
59     False
60      True
61     False
62     False
63     False
64      True
65     False
66     False
67      True
68      True
69      True
70      True
71      True
72     False
73      True
74     False
75     False
76     False

# We are doing this for York, Toronto only.

In [124]:
toronto_data = sorted_dataframe[sorted_dataframe['Borough'].str.contains('Toronto')].reset_index(drop=True)
toronto_data.shape

(39, 5)

In [125]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(toronto_data['Latitude'], toronto_data['Longitude'], toronto_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_york_toronto)  
    
map_toronto

map_toronto.save("map_toronto.html")

#open map_york_toronto.html in browser
#if you cannot generate the maps open PGA_map_*.html from the zip file

In [126]:
neighbourhood_latitude = toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighbourhood_longitude = toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighbourhood_name = toronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighbourhood_name, 
                                                               neighbourhood_latitude, 
                                                               neighbourhood_longitude))

Latitude and longitude values of Berczy Park are 43.644770799999996, -79.3733064.


In [127]:
CLIENT_ID='WU2UX203XESJA0HFY55SV5RWZ1BBSKLI32QJYVRJSYA5LFBP'
CLIENT_SECRET='CRCZYN4G5KWRGZPK3BV4ELWGXR1BOY05S1S3U2RGZCMTTCOA'
VERSION='20180323'


In [128]:
LIMIT = 100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(

CLIENT_ID, 

CLIENT_SECRET, 

VERSION, 

neighbourhood_latitude, 

neighbourhood_longitude, 

radius, 

LIMIT)

url

'https://api.foursquare.com/v2/venues/explore?&client_id=WU2UX203XESJA0HFY55SV5RWZ1BBSKLI32QJYVRJSYA5LFBP&client_secret=CRCZYN4G5KWRGZPK3BV4ELWGXR1BOY05S1S3U2RGZCMTTCOA&v=20180323&ll=43.644770799999996,-79.3733064&radius=500&limit=100'

In [129]:
toronto_results = requests.get(url).json()
toronto_results

{'meta': {'code': 200, 'requestId': '5ebe5ce6edbcad001bb7beab'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Downtown Toronto',
  'headerFullLocation': 'Downtown Toronto, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 56,
  'suggestedBounds': {'ne': {'lat': 43.6492708045, 'lng': -79.36709938085544},
   'sw': {'lat': 43.640270795499994, 'lng': -79.37951341914457}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4aeb719af964a52020c221e3',
       'name': 'LCBO',
       'contact': {},
       'location': {'address': '2 Cooper St',
        'crossStreet': 'at Queens Quay E',
        'lat': 43.64294379917171,
        'lng': -79.37243989044406,
        'labe

In [130]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [132]:
toronto_venues = toronto_results['response']['groups'][0]['items']
    
toronto_nearby_venues = json_normalize(toronto_venues) # flatten JSON

# filter columns
toronto_filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
toronto_nearby_venues = toronto_nearby_venues.loc[:, toronto_filtered_columns]

# filter the category for each row
toronto_nearby_venues['venue.categories'] = toronto_nearby_venues.apply(get_category_type, axis=1)

# clean columns
toronto_nearby_venues.columns = [col.split(".")[-1] for col in toronto_nearby_venues.columns]

toronto_nearby_venues.head()

toronto_nearby_venues.shape

(56, 4)

# Explore neighbourhood in Yark.

In [133]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [134]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=york_data['Latitude'],
                                   longitudes=york_data['Longitude']
                                  )

Berczy Park
Brockton, Parkdale Village, Exhibition Place
Business reply mail Processing Centre
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Central Bay Street


In [136]:
toronto_venues.shape

(17, 7)

In [137]:
toronto_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Berczy Park,4,4,4,4,4,4
"Brockton, Parkdale Village, Exhibition Place",4,4,4,4,4,4
Business reply mail Processing Centre,3,3,3,3,3,3
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",4,4,4,4,4,4
Central Bay Street,2,2,2,2,2,2


In [138]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

There are 14 uniques categories.


# Now group neighbourhood to group into clusters.

In [139]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
toronto_fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[toronto_fixed_columns]

toronto_onehot.head()

Unnamed: 0,Neighbourhood,Breakfast Spot,Bus Line,Convenience Store,Discount Store,Field,Hockey Arena,Park,Pizza Place,Pool,Restaurant,Sandwich Place,Trail,Turkish Restaurant,Women's Store
0,Berczy Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1,Berczy Park,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,Berczy Park,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,Berczy Park,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,"Brockton, Parkdale Village, Exhibition Place",0,0,0,0,0,0,0,0,0,0,1,0,0,0


In [140]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighbourhood,Breakfast Spot,Bus Line,Convenience Store,Discount Store,Field,Hockey Arena,Park,Pizza Place,Pool,Restaurant,Sandwich Place,Trail,Turkish Restaurant,Women's Store
0,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.25
1,"Brockton, Parkdale Village, Exhibition Place",0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.25,0.25,0.0,0.25,0.0
2,Business reply mail Processing Centre,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0
3,"CN Tower, King and Spadina, Railway Lands, Har...",0.25,0.25,0.25,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [142]:
num_top_venues = 3

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    toronto_temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    toronto_temp.columns = ['venue','freq']
    toronto_temp = toronto_temp.iloc[1:]
    toronto_temp['freq'] = toronto_temp['freq'].astype(float)
    toronto_temp = toronto_temp.round({'freq': 2})
    print(toronto_temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Berczy Park----
           venue  freq
0           Park  0.50
1           Pool  0.25
2  Women's Store  0.25


----Brockton, Parkdale Village, Exhibition Place----
            venue  freq
0  Discount Store  0.25
1      Restaurant  0.25
2  Sandwich Place  0.25


----Business reply mail Processing Centre----
          venue  freq
0         Field  0.33
1  Hockey Arena  0.33
2         Trail  0.33


----CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport----
               venue  freq
0     Breakfast Spot  0.25
1           Bus Line  0.25
2  Convenience Store  0.25


----Central Bay Street----
            venue  freq
0            Park   1.0
1  Breakfast Spot   0.0
2        Bus Line   0.0




In [150]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [151]:
num_top_venues = 14

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
toronto_neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)

toronto_neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue


In [152]:
toronto_neighbourhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

toronto_neighbourhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
0,Berczy Park,,,,,,,,,,,,,,
1,"Brockton, Parkdale Village, Exhibition Place",,,,,,,,,,,,,,
2,Business reply mail Processing Centre,,,,,,,,,,,,,,
3,"CN Tower, King and Spadina, Railway Lands, Har...",,,,,,,,,,,,,,
4,Central Bay Street,,,,,,,,,,,,,,


In [153]:
print(toronto_grouped.shape)
for ind in np.arange(toronto_grouped.shape[0]):
    toronto_neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_neighbourhoods_venues_sorted.head()

(5, 15)


Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
0,Berczy Park,Park,Women's Store,Pool,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot
1,"Brockton, Parkdale Village, Exhibition Place",Turkish Restaurant,Sandwich Place,Restaurant,Discount Store,Women's Store,Trail,Pool,Pizza Place,Park,Hockey Arena,Field,Convenience Store,Bus Line,Breakfast Spot
2,Business reply mail Processing Centre,Trail,Hockey Arena,Field,Women's Store,Turkish Restaurant,Sandwich Place,Restaurant,Pool,Pizza Place,Park,Discount Store,Convenience Store,Bus Line,Breakfast Spot
3,"CN Tower, King and Spadina, Railway Lands, Har...",Pizza Place,Convenience Store,Bus Line,Breakfast Spot,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Park,Hockey Arena,Field,Discount Store
4,Central Bay Street,Park,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot


In [154]:
for ind in np.arange(toronto_grouped.shape[0]):
    toronto_neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_neighbourhoods_venues_sorted.head(2)

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
0,Berczy Park,Park,Women's Store,Pool,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot
1,"Brockton, Parkdale Village, Exhibition Place",Turkish Restaurant,Sandwich Place,Restaurant,Discount Store,Women's Store,Trail,Pool,Pizza Place,Park,Hockey Arena,Field,Convenience Store,Bus Line,Breakfast Spot


# Used the Folium library to generated maps to visualize neighborhoods on and how they cluster together

In [155]:
# set number of clusters
kclusters = 2

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
toronto_kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
toronto_kmeans.labels_[0:5] 

array([0, 1, 1, 1, 0])

In [156]:
# add clustering labels
toronto_neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', toronto_kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(toronto_neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
0,M5E,DowntownToronto,Berczy Park,43.644771,-79.373306,0.0,Park,Women's Store,Pool,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot
1,M6K,WestToronto,"Brockton, Parkdale Village, Exhibition Place",43.636847,-79.428191,1.0,Turkish Restaurant,Sandwich Place,Restaurant,Discount Store,Women's Store,Trail,Pool,Pizza Place,Park,Hockey Arena,Field,Convenience Store,Bus Line,Breakfast Spot
2,M7Y,EastToronto,Business reply mail Processing Centre,43.662744,-79.321558,1.0,Trail,Hockey Arena,Field,Women's Store,Turkish Restaurant,Sandwich Place,Restaurant,Pool,Pizza Place,Park,Discount Store,Convenience Store,Bus Line,Breakfast Spot
3,M5V,DowntownToronto,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,1.0,Pizza Place,Convenience Store,Bus Line,Breakfast Spot,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Park,Hockey Arena,Field,Discount Store
4,M5G,DowntownToronto,Central Bay Street,43.657952,-79.387383,0.0,Park,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot
5,M6G,DowntownToronto,Christie,43.669542,-79.422564,,,,,,,,,,,,,,,
6,M4Y,DowntownToronto,Church and Wellesley,43.66586,-79.38316,,,,,,,,,,,,,,,
7,M5L,DowntownToronto,"Commerce Court, Victoria Hotel",43.648198,-79.379817,,,,,,,,,,,,,,,
8,M4S,CentralToronto,Davisville,43.704324,-79.38879,,,,,,,,,,,,,,,
9,M4P,CentralToronto,Davisville North,43.712751,-79.390197,,,,,,,,,,,,,,,


In [157]:
# create map
import math
toronto_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    #print (cluster)
    if (not math.isnan(cluster)):
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[int(cluster)-1],
            fill=True,
            fill_color=rainbow[int(cluster)-1],
            fill_opacity=0.7).add_to(toronto_map_clusters)
       
toronto_map_clusters

toronto_map_clusters.save("toronto_map_clusters.html")

#open toronto_map_clusters.html in browser
#if you cannot generate the maps open PGA_map_*.html from the zip file

# Analyze 1st clusters 

In [161]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
0,M5E,DowntownToronto,0.0,Park,Women's Store,Pool,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot
4,M5G,DowntownToronto,0.0,Park,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Pizza Place,Hockey Arena,Field,Discount Store,Convenience Store,Bus Line,Breakfast Spot


# This cluster can be called Park and Women's Neighbourhood.

In [163]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[0,1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,PostalCode,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,11th Most Common Venue,12th Most Common Venue,13th Most Common Venue,14th Most Common Venue
1,M6K,WestToronto,1.0,Turkish Restaurant,Sandwich Place,Restaurant,Discount Store,Women's Store,Trail,Pool,Pizza Place,Park,Hockey Arena,Field,Convenience Store,Bus Line,Breakfast Spot
2,M7Y,EastToronto,1.0,Trail,Hockey Arena,Field,Women's Store,Turkish Restaurant,Sandwich Place,Restaurant,Pool,Pizza Place,Park,Discount Store,Convenience Store,Bus Line,Breakfast Spot
3,M5V,DowntownToronto,1.0,Pizza Place,Convenience Store,Bus Line,Breakfast Spot,Women's Store,Turkish Restaurant,Trail,Sandwich Place,Restaurant,Pool,Park,Hockey Arena,Field,Discount Store


# This cluster can be called Restaurent Neighbourhoods.