In [173]:
import requests
import lxml.html as lh
import pandas as pd

# First get the webpage and find the table (marked in html with /tr)

In [174]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

#Create a handle, page, to handle the contents of the website

page = requests.get(url)

#Store the contents of the website under doc

doc = lh.fromstring(page.content)

#Parse data that are stored between <tr>..</tr> of HTML

tr_elements = doc.xpath('//tr')

# Loop and import the table rows into a python list, find the column names

In [175]:
#Create empty list

col=[]
i=0

#For each row, store each first element (header) and an empty list

for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


# Iterate through the list and clean up the data

In [176]:
#Since the first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0 and t!="Not assigned":
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data)
        #Increment i for the next column
        i+=1

# Create a python dictionary from the clean data and make a pandas dataframe from that dictionary - shape the data as in the exercise instructions.

In [177]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

df.columns=['Postcode','Borough','Neighbourhood']

df.drop([0],axis=0,inplace=True)

df.reset_index()

# Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)

# More than one neighborhood can exist in one postal code area. 
# For example, in the table on the Wikipedia page, 
# you will notice that M5A is listed twice and has two neighborhoods: 
# Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods 
# separated with a comma as shown in row 11 in the above table.

df=df.groupby("Postcode").agg(lambda x:','.join(set(x)))

# If a cell has a borough but a Not assigned neighborhood, 
# then the neighborhood will be the same as the borough. 
# So for the 9th cell in the table on the Wikipedia page, 
# the value of the Borough and the Neighborhood columns will be Queen's Park.

df.loc[df['Neighbourhood']=="Not assigned",'Neighbourhood']=df.loc[df['Neighbourhood']=="Not assigned",'Borough']

df.shape


(103, 2)

# Then just for the heck of it print the first ten lines of the resulting dataframe.

In [178]:
df.head(10)

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern\n,Rouge\n"
M1C,Scarborough,"Highland Creek\n,Rouge Hill\n,Port Union\n"
M1E,Scarborough,"Guildwood\n,Morningside\n,West Hill\n"
M1G,Scarborough,Woburn\n
M1H,Scarborough,Cedarbrae\n
M1J,Scarborough,Scarborough Village\n
M1K,Scarborough,"Ionview\n,East Birchmount Park\n,Kennedy Park\n"
M1L,Scarborough,"Golden Mile\n,Oakridge\n,Clairlea\n"
M1M,Scarborough,"Cliffcrest\n,Cliffside\n,Scarborough Village W..."
M1N,Scarborough,"Birch Cliff\n,Cliffside West\n"


# Get the geodata .csv file

In [179]:
geo_data=pd.read_csv("https://cocl.us/Geospatial_data")
geo_data


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [180]:
df['Latitude']=geo_data['Latitude'].values
df['Longitude']=geo_data['Longitude'].values

df

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern\n,Rouge\n",43.806686,-79.194353
M1C,Scarborough,"Highland Creek\n,Rouge Hill\n,Port Union\n",43.784535,-79.160497
M1E,Scarborough,"Guildwood\n,Morningside\n,West Hill\n",43.763573,-79.188711
M1G,Scarborough,Woburn\n,43.770992,-79.216917
M1H,Scarborough,Cedarbrae\n,43.773136,-79.239476
M1J,Scarborough,Scarborough Village\n,43.744734,-79.239476
M1K,Scarborough,"Ionview\n,East Birchmount Park\n,Kennedy Park\n",43.727929,-79.262029
M1L,Scarborough,"Golden Mile\n,Oakridge\n,Clairlea\n",43.711112,-79.284577
M1M,Scarborough,"Cliffcrest\n,Cliffside\n,Scarborough Village W...",43.716316,-79.239476
M1N,Scarborough,"Birch Cliff\n,Cliffside West\n",43.692657,-79.264848


# Import the libraries needed for folium 

In [189]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
# uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata: ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: C:\Users\User\Anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.6.14               |           py37_0         2.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    geopy-1.19.0               |             py_0          53 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.2 MB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.49-py_0
  geopy              conda-forge/noarch::geopy-1.19.0-py_0

The following packages will be UPDATED:

  conda                      pkgs/main::conda-4.6.12-py37_1 --> conda-forge::cond



  current version: 4.6.12
  latest version: 4.6.14

Please update conda by running

    $ conda update -n base -c defaults conda


'pA' is not recognized as an internal or external command,
operable program or batch file.


Libraries imported.


# Check the number of boroughs and neighborhoods

In [184]:
print('The dataframe has {} boroughs and {} Neighbourhood.'.format(
        len(df['Borough'].unique()),
        df.shape[0]
    )
)

The dataframe has 11 boroughs and 103 Neighbourhood.


# Set up the geo coordinates to draw a map of Toronto and it's neighborhoods

In [194]:
address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinates of Toronto are 43.653963, -79.387207.


# Draw that map!

In [196]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Set up your Foursquare credentials

In [226]:
CLIENT_ID = 'Insert Client_ID here' # your Foursquare ID
CLIENT_SECRET = 'Insert Secret here' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: Insert Client_ID here
CLIENT_SECRET:Insert Secret here


# Pick a borough to see what neighborhoods are there...  Let's look at York

In [202]:
toronto_data = df[df['Borough'] == 'York'].reset_index(drop=True)
toronto_data.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,York,Humewood-Cedarvale\n,43.693781,-79.428191
1,York,Caledonia-Fairbanks\n,43.689026,-79.453512
2,York,"Silverthorn\n,Del Ray\n,Keelesdale\n,Mount Den...",43.691116,-79.476013
3,York,"The Junction North\n,Runnymede\n",43.673185,-79.487262
4,York,Weston\n,43.706876,-79.518188


# Pick a neighborhood in that borough - #1 is Caledonia/Fairbanks

In [215]:
toronto_data.loc[1, 'Neighbourhood']

'Caledonia-Fairbanks\n'

# Get the latitude and longitude for Caledonia/Fairbanks

In [225]:
neighborhood_latitude = toronto_data.loc[1, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto_data.loc[1, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto_data.loc[1, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Caledonia-Fairbanks
 are 43.6890256, -79.453512.


# Construct the Foursquare search URL

In [220]:
radius = 500
LIMIT = 100

url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, neighborhood_latitude, neighborhood_longitude, VERSION, radius, LIMIT)


# Now use that url to get the Foursquare json search results

In [221]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ce1604e9fb6b7757e1e6f7d'},
 'response': {'headerLocation': 'Toronto',
  'headerFullLocation': 'Toronto',
  'headerLocationGranularity': 'city',
  'totalResults': 6,
  'suggestedBounds': {'ne': {'lat': 43.6935256045, 'lng': -79.44730040297749},
   'sw': {'lat': 43.6845255955, 'lng': -79.45972359702252}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4b9ec940f964a520c70137e3',
       'name': 'Shoppers Drug Mart',
       'location': {'address': '2343 Eglinton Ave W',
        'lat': 43.690650720838846,
        'lng': -79.45631000555339,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.690650720838846,
          'lng': -79.45631000555339}],
        'distance': 288,
        'postalCode': 'M6E 2L6',
        'cc': 'CA',


# Define a function to extract the categories of each venue in the json data

In [222]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# Now use that function to construct a dataset of nearby venues

In [223]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Shoppers Drug Mart,Pharmacy,43.690651,-79.45631
1,KFC,Fast Food Restaurant,43.690647,-79.456326
2,Nairn Park,Park,43.690654,-79.4563
3,Maximum Woman,Women's Store,43.690651,-79.456333
4,Walmart,Market,43.69066,-79.456317


# Finally count the number of nearby venues returned for this neighborhood

In [224]:
print('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

6 venues were returned by Foursquare.
