# In this notebook, we are going to use *BeautifulSoup* to do some web scraping. Follow me, it's very cool!

In [1]:
import os

import numpy as np
import pandas as pd
import requests  # this module helps us to download a web page
from bs4 import BeautifulSoup # this module helps in web scrapping.

In [2]:
# Download the Wikipedia page whose first table is scraped below.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail here, with the HTTP status, instead of later while parsing an error page.
response.raise_for_status()
data = response.text

In [3]:
# Parse the downloaded HTML text with the html5lib parser so it can be searched by tag.
soup = BeautifulSoup(data, 'html5lib')

In [5]:
# print(soup.prettify())

In [4]:
# Build one record per assigned postal code from the first table on the page.
table_contents = []
table = soup.find('table')
for td in table.findAll('td'):
    # Cells whose span reads exactly 'Not assigned' carry no borough data.
    if td.span.string == 'Not assigned':
        continue

    postal_code = td.p.text[:3]  # first three characters of the <p> text
    span_text = td.span.text     # appears to look like "Borough(Neighborhood / Neighborhood)"
    borough = span_text.split('(')[0]
    neighborhood = (
        span_text.split('(')[1]
        .strip(')')
        .replace(' /', ',')
        .replace(')', ' ')
        .strip(' ')
    )
    table_contents.append({
        'PostalCode': postal_code,
        'Borough': borough,
        'Neighborhood': neighborhood,
    })

df = pd.DataFrame(table_contents)

# Some span texts run the borough together with extra wording; map those exact
# strings to short, readable borough names.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

# Voilà! The scraped table is now a DataFrame with one row per assigned postal code.

In [5]:
# Preview the first rows of the scraped postal-code table.
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [6]:
# (rows, columns) of the scraped table.
df.shape

(103, 3)

# Let's try to get the coordinates with Geocoder.

In [7]:
# !pip install geocoder
# import geocoder # import geocoder

# latitude = []
# longitude = []

# for postal_code in df['PostalCode']:

# # initialize your variable to None
#     lat_lng_coords = None
#     postal_code = 'M3A'

# # loop until you get the coordinates
#     while(lat_lng_coords is None):
#              g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#              lat_lng_coords = g.latlng
#     latitude = latitude.append(lat_lng_coords[0])
#     longitude = longitude.append(lat_lng_coords[1])

# Oops, it did not work, so we have to load the coordinates from the CSV file instead.

In [8]:
# Load the coordinates file and rename its key column to 'PostalCode',
# the column name used by df.
df2 = pd.read_csv('../../Downloads/Geospatial_Coordinates.csv').rename(
    columns={'Postal Code': 'PostalCode'}
)

In [9]:
# Attach the coordinate columns to each postal code. how='right' keeps every
# postal code present in df2.
# NOTE(review): this rebinds df, so the cell is meant to run once per kernel session.
coords_by_code = df2.set_index('PostalCode')
df = df.join(coords_by_code, on='PostalCode', how='right').sort_index()

In [10]:
# Number of rows per borough, most frequent first.
df.value_counts('Borough')

Borough
North York                24
Scarborough               17
Downtown Toronto          17
Etobicoke                 11
Central Toronto            9
West Toronto               6
York                       5
East York                  4
East Toronto               4
Queen's Park               1
Mississauga                1
Etobicoke Northwest        1
East York/East Toronto     1
East Toronto Business      1
Downtown Toronto Stn A     1
dtype: int64

In [11]:
# (rows, columns) of df at this point in the notebook.
df.shape

(103, 5)

In [12]:
# Report the number of distinct boroughs and the number of rows in df.
# NOTE(review): the second figure is the row count (one row per postal code),
# which the message reports as "neighborhoods".
borough_count = len(df['Borough'].unique())
row_count = df.shape[0]
print('Toronto has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

Toronto has 15 boroughs and 103 neighborhoods.


# Let's get the coordinates of Toronto and explore the neighborhoods using the map.

In [13]:
!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Collecting geopy
  Downloading geopy-2.1.0-py3-none-any.whl (112 kB)
[K     |████████████████████████████████| 112 kB 11.7 MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.50-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-2.1.0
The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [14]:
!pip install folium
import folium # map rendering library
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

Collecting folium
  Downloading folium-0.12.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 4.2 MB/s  eta 0:00:01
Collecting branca>=0.3.0
  Downloading branca-0.4.2-py3-none-any.whl (24 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.4.2 folium-0.12.1


In [26]:
# Keep only the rows whose borough name contains 'Toronto', renumbered from 0.
in_toronto = df['Borough'].str.contains('Toronto')
Toronto_data = df[in_toronto].reset_index(drop=True)
Toronto_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
2,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
3,M4E,East Toronto,The Beaches,43.676357,-79.293031
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [27]:
# Geocode Downtown Toronto; latitude/longitude are rebound to this location.
address = 'Downtown Toronto, TO'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on the next line.
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
# The message names the place that was actually geocoded (it used to say "Manhattan").
print('The geograpical coordinate of Downtown Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Manhattan are 43.6541737, -79.38081162653639.


In [28]:
# Map centred on the current latitude/longitude, with one marker per Toronto_data row.
map_dt_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighborhood in zip(*(Toronto_data[col] for col in marker_columns)):
    # popup text is "neighborhood, borough"
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_dt_toronto)

map_dt_toronto

In [30]:
# Show the neighborhood stored in the row labelled 5 of Toronto_data.
Toronto_data.loc[5, 'Neighborhood']

'Central Bay Street'

In [32]:
# Pull the coordinates and name of one Toronto_data row into standalone variables.
row_label = 5  # the row inspected in this part of the notebook

neighborhood_latitude = Toronto_data.loc[row_label, 'Latitude']
neighborhood_longitude = Toronto_data.loc[row_label, 'Longitude']
neighborhood_name = Toronto_data.loc[row_label, 'Neighborhood']

message = 'Latitude and longitude values of {} are {}, {}.'
print(message.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of Central Bay Street are 43.6579524, -79.3873826.


In [33]:
# Foursquare credentials are read from the environment so that they are never
# stored in the notebook source or in its saved outputs. Set FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: a key pair that was previously saved here in plain text should be rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID'] # your Foursquare ID
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET'] # your Foursquare Secret
if not CLIENT_ID or not CLIENT_SECRET:
    raise ValueError('FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET must not be empty')
VERSION = '20210505' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)

# display the URL with the credentials masked, so the saved output is safe to share
url.replace(CLIENT_ID, 'REDACTED').replace(CLIENT_SECRET, 'REDACTED')

'https://api.foursquare.com/v2/venues/explore?&client_id=REDACTED&client_secret=REDACTED&v=20210505&ll=43.6579524,-79.3873826&radius=500&limit=100'

In [35]:
# Send the request built above and decode the JSON body of the reply.
response = requests.get(url)
results = response.json()

In [36]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of a venue's first category, or None when it has none.

    Args:
        row: A record indexable by string key (for example a dict or a pandas
            Series) holding the category list under 'categories' or, failing
            that, under 'venue.categories'.

    Returns:
        The 'name' value of the first category entry, or None when the
        category value is empty, None, or a float placeholder such as NaN.

    Raises:
        KeyError: If neither 'categories' nor 'venue.categories' is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key means "fall back to the flattened column name";
        # any other error is a real problem and must not be swallowed.
        categories_list = row['venue.categories']

    # A missing value may arrive as None or NaN (a float) rather than as an
    # empty list; len() would fail on those.
    if categories_list is None or isinstance(categories_list, float):
        return None
    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [38]:
# Venue items of the first group in the response.
venues = results['response']['groups'][0]['items']

# flatten the nested venue records into dotted column names
flat_venues = pd.json_normalize(venues)

# keep only the name, category and coordinate columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = flat_venues.loc[:, filtered_columns]

# reduce each row's category list to a single category name
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# keep only the last dotted component of each column name
nearby_venues.columns = [name.rsplit('.', 1)[-1] for name in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Jimmy's Coffee,Coffee Shop,43.658421,-79.385613
1,Somethin' 2 Talk About,Middle Eastern Restaurant,43.658395,-79.385338
2,Hailed Coffee,Coffee Shop,43.658833,-79.383684
3,Tim Hortons,Coffee Shop,43.65857,-79.385123
4,NEO COFFEE BAR,Coffee Shop,43.66013,-79.38583


In [39]:
# Report how many venue rows ended up in nearby_venues.
venue_count = nearby_venues.shape[0]
print('{} venues were returned by Foursquare.'.format(venue_count))

61 venues were returned by Foursquare.
