In [1]:
!conda install -c conda-forge beautifulsoup4 --yes

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    beautifulsoup4-4.6.3       |        py36_1000         141 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.2 MB

The following packages will be UPDATED:

    beautifulsoup4:  4.6.0-py36h49b8c8c_1             --> 4.6.3-py36_1000       conda-forge
    ca-certificates: 2018.8.24-ha4d7672_0 conda-forge --> 

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [7]:
!conda install -c conda-forge lxml --yes

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - lxml


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    lxml-4.2.5                 |   py36hc9114bc_0         6.0 MB  conda-forge

The following packages will be UPDATED:

    lxml: 4.2.1-py36h23eabaa_0 --> 4.2.5-py36hc9114bc_0 conda-forge


Downloading and Extracting Packages
lxml-4.2.5           | 6.0 MB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [13]:
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source,'lxml')
table=soup.find('table', class_='wikitable sortable')
df = pd.read_html(str(table))
df[0].columns=['Postcode','Borough','Neighbourhood']
df[0]
df_i=df[0][df[0].Borough!='Not assigned']
df_grouped=df_i.groupby('Postcode').agg({'Borough':'first','Neighbourhood': lambda x: ', '.join(x)}).reset_index()
def assign_neighbourhood(row):
    neighbourhood=row['Neighbourhood']
    if(neighbourhood=='Not assigned'):
        return row['Borough']
    else:
        return neighbourhood
    
df_grouped['Neighbourhood']=df_grouped.apply(assign_neighbourhood,axis=1)
df_grouped.rename(columns={'Postcode':'Postal Code'},inplace=True)

## Explanation

1. Installed Beautifulsoup package
2. Installed lxml package
3. Imported required libraries.
4. Scraped the wikipedia page using Beautifulsoup and lxml to get the table.
5. Converted the table to a dataframe using pandas read_html
6. Renamed the columns
7. Filtered rows where Borough is Not assigned
8. Grouped by Postcode and aggregated neighbourhoods using join
9. Defined a funtion to assign Borough value to neighbourhoods with value as Not assigned 
10. Modified the dataframe using apply

In [14]:
df_grouped.shape

(104, 3)

In [63]:
df_ll=pd.read_csv('https://cocl.us/Geospatial_data')
df_final=pd.merge(df_grouped,df_ll,on='Postal Code',how='inner')
df_toronto=df_final[df_final['Borough'].str.contains('Toronto')].reset_index()
df_toronto.drop(['index'],axis=1,inplace=True)
df_toronto.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [21]:
CLIENT_ID = 'G5MJSGEHGMC3PIAZT3GOI2AZ4JBJBN3PSV1R0HWB0RTLXJ3Q' # your Foursquare ID
CLIENT_SECRET = '0JCK4QBOBMLSEF1ESBHLFCNOKHM0NLQSTLUGQACASBPE4KRE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT=100

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
           
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [23]:
toronto_venues = getNearbyVenues(names=df_toronto['Neighbourhood'],
                                   latitudes=df_toronto['Latitude'],
                                   longitudes=df_toronto['Longitude']
                                  )

In [33]:
toronto_venues.shape

(1699, 7)

In [42]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighbourhood']=toronto_venues['Neighborhood']
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Neighbourhood,Accessories Store,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped.shape

(38, 233)

In [54]:
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')
    

----Adelaide, King, Richmond----
                 venue  freq
0          Coffee Shop  0.07
1                 Café  0.06
2  American Restaurant  0.04
3           Steakhouse  0.04
4      Thai Restaurant  0.04


----Berczy Park----
          venue  freq
0   Coffee Shop  0.09
1  Cocktail Bar  0.06
2    Restaurant  0.04
3    Steakhouse  0.04
4      Beer Bar  0.04


----Brockton, Exhibition Place, Parkdale Village----
               venue  freq
0        Coffee Shop  0.14
1               Café  0.10
2     Breakfast Spot  0.10
3  Convenience Store  0.05
4      Burrito Place  0.05


----Business reply mail Processing Centre969 Eastern----
                venue  freq
0  Light Rail Station  0.11
1         Yoga Studio  0.06
2       Auto Workshop  0.06
3          Comic Shop  0.06
4    Recording Studio  0.06


----CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara----
              venue  freq
0   Airport Service  0.14
1  Airport Terminal  0.14


In [60]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [64]:
import numpy as np
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,Steakhouse,American Restaurant,Thai Restaurant,Cosmetics Shop,Restaurant,Bar,Gym,Hotel
1,Berczy Park,Coffee Shop,Cocktail Bar,Cheese Shop,Beer Bar,Steakhouse,Seafood Restaurant,Farmers Market,Restaurant,Café,Bakery
2,"Brockton, Exhibition Place, Parkdale Village",Coffee Shop,Breakfast Spot,Café,Gym,Climbing Gym,Convenience Store,Bar,Burrito Place,Stadium,Caribbean Restaurant
3,Business reply mail Processing Centre969 Eastern,Light Rail Station,Yoga Studio,Auto Workshop,Pizza Place,Butcher,Recording Studio,Burrito Place,Restaurant,Brewery,Skate Park
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Plane
5,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Café,Bakery,Pizza Place,Italian Restaurant,Pub,Park,Indian Restaurant,Butcher
6,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Bar,Burger Joint,Japanese Restaurant,Ice Cream Shop,Salad Place
7,"Chinatown, Grange Park, Kensington Market",Café,Vegetarian / Vegan Restaurant,Bar,Chinese Restaurant,Mexican Restaurant,Vietnamese Restaurant,Bakery,Dumpling Restaurant,Coffee Shop,Comfort Food Restaurant
8,Christie,Grocery Store,Café,Park,Italian Restaurant,Coffee Shop,Nightclub,Restaurant,Diner,Baby Store,Athletics & Sports
9,Church and Wellesley,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Burger Joint,Restaurant,Café,Gastropub,Fast Food Restaurant,Bubble Tea Shop


In [76]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 4

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [77]:
toronto_merged = df_toronto

# add clustering labels
toronto_merged['Cluster Labels'] = kmeans.labels_

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Neighborhood,Coffee Shop,Pub,Music Venue,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Italian Restaurant,Yoga Studio,Spa,Indian Restaurant,Diner,Bakery
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Sandwich Place,Pet Store,Hotel,Brewery,Liquor Store,Burger Joint,Fast Food Restaurant,Burrito Place,Fish & Chips Shop,Italian Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Fish Market,Juice Bar,Latin American Restaurant,New American Restaurant
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,0,Bus Line,Park,Swim School,Dim Sum Restaurant,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store


In [68]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors


!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

# All requested packages already installed.



In [79]:
!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values


Solving environment: done

# All requested packages already installed.



In [86]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto



In [80]:
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Manhattan are {}, {}.'.format(latitude, longitude))
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighbourhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters



The geograpical coordinate of Manhattan are 43.653963, -79.387207.


In [81]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Toronto,0,Neighborhood,Coffee Shop,Pub,Music Venue,Dessert Shop,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant,Dumpling Restaurant
1,East Toronto,0,Greek Restaurant,Coffee Shop,Ice Cream Shop,Bookstore,Italian Restaurant,Yoga Studio,Spa,Indian Restaurant,Diner,Bakery
2,East Toronto,0,Sandwich Place,Pet Store,Hotel,Brewery,Liquor Store,Burger Joint,Fast Food Restaurant,Burrito Place,Fish & Chips Shop,Italian Restaurant
3,East Toronto,0,Café,Coffee Shop,Bakery,Italian Restaurant,Gastropub,American Restaurant,Fish Market,Juice Bar,Latin American Restaurant,New American Restaurant
4,Central Toronto,0,Bus Line,Park,Swim School,Dim Sum Restaurant,Diner,Farmers Market,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store
5,Central Toronto,0,Breakfast Spot,Clothing Store,Sandwich Place,Food & Drink Shop,Dance Studio,Hotel,Burger Joint,Park,Eastern European Restaurant,Dog Run
6,Central Toronto,0,Coffee Shop,Sporting Goods Shop,Clothing Store,Health & Beauty Service,Fast Food Restaurant,Metro Station,Mexican Restaurant,Diner,Dessert Shop,Park
7,Central Toronto,0,Pizza Place,Sandwich Place,Dessert Shop,Sushi Restaurant,Coffee Shop,Italian Restaurant,Café,Pharmacy,Seafood Restaurant,Toy / Game Store
8,Central Toronto,0,Playground,Park,Intersection,Yoga Studio,Dim Sum Restaurant,Falafel Restaurant,Event Space,Ethiopian Restaurant,Electronics Store,Eastern European Restaurant
9,Central Toronto,0,Coffee Shop,Pub,American Restaurant,Supermarket,Vietnamese Restaurant,Light Rail Station,Convenience Store,Pizza Place,Sushi Restaurant,Sports Bar


In [82]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
24,Central Toronto,1,Café,Sandwich Place,Coffee Shop,Pizza Place,BBQ Joint,Pub,French Restaurant,Indian Restaurant,Jewish Restaurant,Burger Joint
27,Downtown Toronto,1,Airport Lounge,Airport Service,Airport Terminal,Harbor / Marina,Boat or Ferry,Airport,Airport Food Court,Airport Gate,Sculpture Garden,Plane


In [83]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
28,Downtown Toronto,2,Coffee Shop,Café,Seafood Restaurant,Beer Bar,Hotel,Restaurant,Cocktail Bar,Cheese Shop,Farmers Market,Bakery


In [84]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
17,Downtown Toronto,3,Coffee Shop,Café,Italian Restaurant,Sandwich Place,Bubble Tea Shop,Bar,Burger Joint,Japanese Restaurant,Ice Cream Shop,Salad Place


<b>The exploration and clustering is limited to Toronto. </b>