# Coursera Capstone Project
## This project is to cluster and explore neighborhoods in Toronto

## Assignment 1: Scraping wikipedia page to create dataframe of Toronto neighborhoods by Postal Code
    

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import urllib.request

In [3]:
#creating url for toronto postal codes
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
toronto_data = urllib.request.urlopen(url)

In [5]:
soup = BeautifulSoup(toronto_data, "lxml")

In [6]:
toronto_wiki_table = soup.find('table', class_='wikitable sortable')
toronto_wiki_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3B
</td>
<td>

In [7]:
#get data from HTML page and store in list
toronto_wiki_tr = toronto_wiki_table.find_all('tr')
data = []
for row in toronto_wiki_tr:
    td = []
    for t in row.find_all('td'):
        td.append(t.text.strip())
    data.append(td)

In [8]:
#convert list into a raw panda dataframe (pre removal of 'Not assigned' boroughs and neighborhoods)
toronto_df=pd.DataFrame(data,columns=['PostalCode', 'Borough', 'Neighborhood'])
toronto_df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


In [9]:
toronto_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181 entries, 0 to 180
Data columns (total 3 columns):
PostalCode      180 non-null object
Borough         180 non-null object
Neighborhood    180 non-null object
dtypes: object(3)
memory usage: 4.3+ KB


## Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
## If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [10]:
toronto_df = toronto_df[~toronto_df['Borough'].isnull()]
toronto_df.drop(toronto_df[toronto_df.Borough == 'Not assigned'].index, inplace = True)
toronto_df.reset_index(drop=True, inplace=True)
toronto_df['Neighborhood'].replace('Not assigned',toronto_df['Borough'],inplace=True)
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [11]:
toronto_df = toronto_df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(lambda x: ','.join(x)).reset_index()
toronto_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [12]:
toronto_df.shape

(103, 3)

## RETRIEVING LATITUDE AND LONGITUDE COORDINATES FOR TORONTO NEIGHBORHOODS


In [13]:
latlng_coord_df =pd.read_csv('http://cocl.us/Geospatial_data')
latlng_coord_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
latlng_coord_df.rename(columns={'Postal Code':'PostalCode'},inplace=True)
latlng_coord_df.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
latlng_coord_df.shape

(103, 3)

## Latitude/Longitude coordinate dataframe for Toronto neighborhoods

In [16]:
#merging postal code dataframe with latitude/longitude dataframe
toronto_latlng = pd.merge(toronto_df, latlng_coord_df, on=["PostalCode"])
toronto_latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [17]:
toronto_latlng.shape

(103, 5)

## Create map of Toronto neighborhoods

In [18]:
!conda install -c conda-forge geopy --yes

import matplotlib.cm as cm
import matplotlib.colors as colors

!conda install -c conda-forge folium=0.5.0 --yes
import folium 

print('Libraries imported.')


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    openssl-1.1.1g             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.4.5.2 |       hecda079_0         147 KB  conda-forge
    certifi-2020.4.5.2         |   py36h9f0ad1d_0         152 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0           conda-forge
    geopy:          

In [19]:
from geopy.geocoders import Nominatim
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of Toronto are {}, {}.'.format(latitude, longitude))




The geographical coordinate of Toronto are 43.6534817, -79.3839347.


In [20]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_latlng['Latitude'], toronto_latlng['Longitude'], toronto_latlng['Borough'], toronto_latlng['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7, 
        parse_html=False).add_to(map_toronto)
map_toronto

In [22]:
bur_toronto_latlng = toronto_latlng[toronto_latlng['Borough'].str.contains('Toronto',regex=False)]
bur_toronto_latlng

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [23]:
bur_toronto_latlng.shape

(39, 5)

In [24]:
map_bur_toronto = folium.Map(location=[43.6534817, -79.3839347], zoom_start=10)

for lat, lng, borough, neighborhood in zip(bur_toronto_latlng['Latitude'], bur_toronto_latlng['Longitude'], bur_toronto_latlng['Borough'], bur_toronto_latlng['Neighborhood']):
    label = '{},{}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7, 
        parse_html=False).add_to(map_bur_toronto)
map_bur_toronto

In [25]:
CLIENT_ID = 'ZFWIFIVXOC5E3MJ4UVGWWEQZK0GJBLNEQD30B5GNLEJIZFF2'
CLIENT_SECRET ='SSADIVRHC5DCAZ3XSZTTEC2LAXXEDDV4UAOPFTMUXAPHGOLE'
VERSION = '20180605'

print ('Your credentials:')
print ('CLIENT_ID: ' + CLIENT_ID)
print ('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentials:
CLIENT_ID: ZFWIFIVXOC5E3MJ4UVGWWEQZK0GJBLNEQD30B5GNLEJIZFF2
CLIENT_SECRET:SSADIVRHC5DCAZ3XSZTTEC2LAXXEDDV4UAOPFTMUXAPHGOLE


In [26]:
bur_toronto_latlng.reset_index(drop=True, inplace=True)

In [27]:
bur_toronto_latlng.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [28]:
bur_toronto_latlng.loc[0,'Neighborhood']

'The Beaches'

In [29]:
neighborhood_latitude = bur_toronto_latlng.loc[0,'Latitude']
neighborhood_longitude = bur_toronto_latlng.loc[0,'Longitude']
neighborhood_name = bur_toronto_latlng.loc[0,'Neighborhood']
print ('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, neighborhood_latitude, neighborhood_longitude))

Latitude and longitude values of The Beaches are 43.67635739999999, -79.2930312.


In [30]:
LIMIT = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{},&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,neighborhood_latitude,
                                                                                                                         neighborhood_longitude,
                                                                                                                         radius, LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=ZFWIFIVXOC5E3MJ4UVGWWEQZK0GJBLNEQD30B5GNLEJIZFF2&client_secret=SSADIVRHC5DCAZ3XSZTTEC2LAXXEDDV4UAOPFTMUXAPHGOLE&v=20180605&ll=43.67635739999999,-79.2930312,&radius=500&limit=100'

In [31]:
import json
from pandas.io.json import json_normalize
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5ee3fc4c7a98810d7eaaf00c'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'The Beaches',
  'headerFullLocation': 'The Beaches, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 4,
  'suggestedBounds': {'ne': {'lat': 43.680857404499996,
    'lng': -79.28682091449052},
   'sw': {'lat': 43.67185739549999, 'lng': -79.29924148550948}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bd461bc77b29c74a07d9282',
       'name': 'Glen Manor Ravine',
       'location': {'address': 'Glen Manor',
        'crossStreet': 'Queen St.',
        'lat': 43.67682094413784,
        'lng': -79.29394208780985,
        'labeledLatLngs': [{'labe

In [32]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [33]:
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues)

filtered_columns = ['venue.name','venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()



Unnamed: 0,name,categories,lat,lng
0,Glen Manor Ravine,Trail,43.676821,-79.293942
1,The Big Carrot Natural Food Market,Health Food Store,43.678879,-79.297734
2,Grover Pub and Grub,Pub,43.679181,-79.297215
3,Upper Beaches,Neighborhood,43.680563,-79.292869


In [34]:
print ('{} venues were returned by Foursquare.'.format(nearby_venues.shape[0]))

4 venues were returned by Foursquare.


In [51]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print (name)
        
 #       url2 = 'https://api.foursquare.com/v2/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}+limit={}'.format(CLIENT_ID, CLIENT_SECRET, VERSION, lat, lng, radius, LIMIT)
  #      url2
     #   results = requests.get(url2).json()['response']['groups'][0]['items']
        
    #    venues_list.append([( 
   #         name,
  #          lat,
  #          lng,
  #          v['venue']['name'],
  #          v['venue']['location']['lat'],
  #          v['venue']['location']['lng'],
  #          v['venue']['categories'][0]['name']) for v in results])
        
 #   nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
 #   nearby_venues.columns = ['Neighborhood',
#                             'Neighborhood Latitude',
#                             'Neighborhood Longitude',
#                             'Venue',
#                             'Venue Latitude',
#                             'Venue Longitude',
#                             'Venue Category']
#    return(nearby_venues)

In [47]:
toronto_burough_venues = getNearbyVenues(names=bur_toronto_latlng['Neighborhood'],
                                        latitudes=bur_toronto_latlng['Latitude'],
                                        longitudes=bur_toronto_latlng['Longitude']
                                        )

The Beaches


KeyError: 'groups'

In [48]:
print(toronto_burough_venues.shape)
toronto_burough_venues.head()

NameError: name 'toronto_burough_venues' is not defined