In [1]:
import pandas as pd
import numpy as np
                        # import the library we use to open URLs
import urllib.request
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

                            # import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
import folium # map rendering library






Phase I:
Use the Notebook to build the code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe







In [2]:
# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

In [4]:
# parse the HTML from our URL into the BeautifulSoup parse tree format and #print(soup.prettify())  to look at html to see underlying our chosen webpage
soup = BeautifulSoup(page, "lxml")
#print(soup.prettify())

In [5]:
table=soup.find('table')
soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   List of postal codes of Canada: M - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"652df336-7a6c-4f10-8854-52123f8436fb","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_postal_codes_of_Canada:_M","wgTitle":"List of postal codes of Canada: M","wgCurRevisionId":960187814,"wgRevisionId":960187814,"wgArticleId":539066,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Communications in Ontario","Postal codes in Canad

In [6]:
#set up 3 empty lists to store our data in. Just because we have only 3
column_names=['postalcode','borough','neighbourhood']
df = pd.DataFrame(columns=column_names)

In [7]:
# Load the data from BeautifulSoup parse tree format into table with columns defined above
for tr_cell in table.find_all('tr'):
    row_data=[]
    for td_cell in tr_cell.find_all('td'):
        row_data.append(td_cell.text.strip())
    if len(row_data)==3:
        df.loc[len(df)] = row_data

In [8]:
df.head() 

Unnamed: 0,postalcode,borough,neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [9]:
#Disqualify neighbourhood with Not assigned data and drop borough as the datail is not needed anymore 
df=df[df['neighbourhood']!='Not assigned'] 
df=df.drop('borough', axis=1)

In [10]:
df.head()

Unnamed: 0,postalcode,neighbourhood
2,M3A,Parkwoods
3,M4A,Victoria Village
4,M5A,"Regent Park, Harbourfront"
5,M6A,"Lawrence Manor, Lawrence Heights"
6,M7A,"Queen's Park, Ontario Provincial Government"


Phase II:
We have got a list of neighbourhoods and their relative post code. Now that we have other CSV file 'http://cocl.us/Geospatial_data' of post code and geographical coordinates (latitude and longitude), putting those files together by matching postcode will help us get the latutide and longitude of eah  neighbourhood in town.

In [11]:
#Load geodata from VCS file 
geo_df=pd.read_csv('http://cocl.us/Geospatial_data')

In [12]:
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
#harmonise the columns names and merge 2 dataframes
geo_df.rename(columns={'Postal Code':'postalcode', 'Latitude':'latitude', 'Longitude':'longitude'}, inplace=True)
neighbourhoods = pd.merge(geo_df, df, on='postalcode')

In [14]:
# given that the postcode are not need anymore, let drop them
neighbourhoods=neighbourhoods[['neighbourhood', 'latitude', 'longitude']] 

In [15]:
#let remove or drop any duplicates and check the number of the rows and wiew few of them

print(neighbourhoods.shape)
neighbourhoods=neighbourhoods.drop_duplicates()
neighbourhoods.head()

(103, 3)


Unnamed: 0,neighbourhood,latitude,longitude
0,"Malvern, Rouge",43.806686,-79.194353
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Woburn,43.770992,-79.216917
4,Cedarbrae,43.773136,-79.239476


In [16]:
address = 'Toronto, ON, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of toronto are 43.6534817, -79.3839347.


In [17]:
#create the map of the town and mark it with its neighbourhood 
map_toronto= folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for latitude, longitude, neighbourhood in zip(neighbourhoods['latitude'], neighbourhoods['longitude'], neighbourhoods['neighbourhood']):
    label = '{}'.format(neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Phase III: 
At thi point we have all geocordinates (of each) neighbourhood in Toronto. we have also geocodinates of Toronto. Now we move forward to use FOURSQUARE and obtain details about different venues in town in relation to its different neighbourhood areas. Among the venues from FOURSQUARE search, we select only coffee shops and therefore present the most populated (by cofee shops) neighbourhoods which is the target of our work.  

In [18]:
CLIENT_ID = 'F2BNOO1203DBO5P3SVJ4NF3O3AHTVUEQYSWEA1GJO1KUU2GO' # Foursquare ID
CLIENT_SECRET = 'DSAQKGNWWJD2SE3TBMAYVN4TR5UDWTWHRMQTLCLYDUW3XHA0' # Foursquare Secret
VERSION = '20180604'

In [19]:
# assume I don't much Canada especially anything about Toronto at all. I decide to explore entire metropolitan, cluster and see what is there


# type your answer here

LIMIT = 200 
                                # limit of number of venues returned by Foursquare API
radius = 1000 
                                # define radius

                                # create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude,
    longitude, 
    radius, 
    LIMIT)
url 
                                # display URL




'https://api.foursquare.com/v2/venues/explore?&client_id=F2BNOO1203DBO5P3SVJ4NF3O3AHTVUEQYSWEA1GJO1KUU2GO&client_secret=DSAQKGNWWJD2SE3TBMAYVN4TR5UDWTWHRMQTLCLYDUW3XHA0&v=20180604&ll=43.706748299999994,-79.5940544&radius=1000&limit=200'

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=1000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:


neighbourhoods = getNearbyVenues(names=neighbourhoods['neighbourhood'],
                                   latitudes=neighbourhoods['latitude'],
                                   longitudes=neighbourhoods['longitude']
                                  )

Malvern, Rouge
Rouge Hill, Port Union, Highland Creek
Guildwood, Morningside, West Hill
Woburn
Cedarbrae
Scarborough Village
Kennedy Park, Ionview, East Birchmount Park
Golden Mile, Clairlea, Oakridge
Cliffside, Cliffcrest, Scarborough Village West
Birch Cliff, Cliffside West
Dorset Park, Wexford Heights, Scarborough Town Centre
Wexford, Maryvale
Agincourt
Clarks Corners, Tam O'Shanter, Sullivan
Milliken, Agincourt North, Steeles East, L'Amoreaux East
Steeles West, L'Amoreaux West
Upper Rouge
Hillcrest Village
Fairview, Henry Farm, Oriole
Bayview Village
York Mills, Silver Hills
Willowdale, Newtonbrook
Willowdale, Willowdale East
York Mills West
Willowdale, Willowdale West
Parkwoods
Don Mills
Don Mills
Bathurst Manor, Wilson Heights, Downsview North
Northwood Park, York University
Downsview
Downsview
Downsview
Downsview
Victoria Village
Parkview Hill, Woodbine Gardens
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto, Broadview North (Old East York)
The Danforth West, 

In [22]:
print(neighbourhoods.shape)
neighbourhoods.head()

(4932, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Malvern, Rouge",43.806686,-79.194353,Harvey's,43.80002,-79.198307,Restaurant
1,"Malvern, Rouge",43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
2,"Malvern, Rouge",43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
3,"Malvern, Rouge",43.806686,-79.194353,RBC Royal Bank,43.798782,-79.19709,Bank
4,"Malvern, Rouge",43.806686,-79.194353,Caribbean Wave,43.798558,-79.195777,Caribbean Restaurant


In [53]:
coffee_pop=neighbourhoods.drop(['Venue', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue Latitude', 'Venue Longitude'], axis=1)
coffee_pop=coffee_pop.loc[neighbourhoods['Venue Category']=='Coffee Shop']
coffee_pop.tail()

Unnamed: 0,Neighborhood,Venue Category
4867,Weston,Coffee Shop
4869,Weston,Coffee Shop
4882,Westmount,Coffee Shop
4910,"Kingsview Village, St. Phillips, Martin Grove ...",Coffee Shop
4930,"Northwest, West Humber - Clairville",Coffee Shop


In [54]:
coffee_pop=coffee_pop.groupby(['Neighborhood', 'Venue Category']).size().reset_index(name='number of coffeshops')
coffee_pop.rename(columns={'Neighborhood':'Neighborhood', 'number of coffeshops':'number of coffeshops'}, inplace=True)
coffee_pop=coffee_pop[coffee_pop['number of coffeshops']>5]
coffee_pop=coffee_pop.sort_values(by='number of coffeshops', ascending=False)


In [56]:
print(coffee_pop.shape)
coffee_pop.head()


(27, 3)


Unnamed: 0,Neighborhood,Venue Category,number of coffeshops
55,"Regent Park, Harbourfront",Coffee Shop,15
17,Davisville North,Coffee Shop,13
4,Berczy Park,Coffee Shop,12
65,Stn A PO Boxes,Coffee Shop,12
15,"Commerce Court, Victoria Hotel",Coffee Shop,10


In [58]:
coffee_pop.groupby('number of coffeshops').count()

Unnamed: 0_level_0,Neighborhood,Venue Category
number of coffeshops,Unnamed: 1_level_1,Unnamed: 2_level_1
6,7,7
7,6,6
8,3,3
9,6,6
10,1,1
12,2,2
13,1,1
15,1,1


What also can we get to supplement requested information. Though it is not part our assignment task, it is better to help the company be able to challenge their assumption that coffee shop is significant representative in town 

In [59]:
#Let's find out how many unique categories can be curated from all the returned venues

print('There are {} uniques categories.'.format(len(neighbourhoods['Venue Category'].unique())))

There are 335 uniques categories.


In [62]:
#Analyse each neighbourhoud
# one hot encoding
toronto_onehot = pd.get_dummies(neighbourhoods[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = neighbourhoods['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Zoo,Accessories Store,Afghan Restaurant,Airport,Airport Lounge,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Game Store,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
#Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()

#Let's print each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print()

----Agincourt----
                venue  freq
0  Chinese Restaurant  0.15
1       Shopping Mall  0.05
2              Bakery  0.05
3         Pizza Place  0.05
4      Sandwich Place  0.05

----Alderwood, Long Branch----
               venue  freq
0     Discount Store  0.12
1  Convenience Store  0.08
2        Pizza Place  0.08
3           Pharmacy  0.08
4               Park  0.08

----Bathurst Manor, Wilson Heights, Downsview North----
           venue  freq
0           Bank  0.07
1           Park  0.07
2    Coffee Shop  0.07
3  Grocery Store  0.03
4          Trail  0.03

----Bayview Village----
                 venue  freq
0                 Bank  0.12
1        Grocery Store  0.12
2  Japanese Restaurant  0.12
3          Gas Station  0.12
4   Chinese Restaurant  0.06

----Bedford Park, Lawrence Manor East----
                venue  freq
0  Italian Restaurant  0.08
1         Coffee Shop  0.08
2                Park  0.05
3          Restaurant  0.05
4      Sandwich Place  0.05

----Berczy Par