## Neighborhood Segmentation and Clustering - Week 3

Scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M, in order to obtain the data that is in the table of postal codes and to transform the data into a pandas dataframe

In [1]:
# Libraries
import folium
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize

In [2]:
def scrape_site(url):
    headers = requests.utils.default_headers()
    headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    })
    r = requests.get(url, headers)
    raw_html = r.content
    soup = BeautifulSoup(raw_html, 'html.parser')
    return soup

In [3]:
def get_table(soup, class_name):
    information = []
    table = soup.find("table", class_=class_name)
    table_rows = table.find_all('tr')
    for row in table_rows:
        info = row.text.split('\n')[1:-1]
        information.append(info)
    return information

Create Dataframe from url

In [4]:
# Set info
url   = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
soup = scrape_site(url)
data_table = get_table(soup, 'wikitable')
data_table = pd.DataFrame(data_table[1:], columns=data_table[0])

# Fix name Neighbourhood to Neighborhood
data_table = data_table.rename(columns={data_table.columns[2]: "Neighborhood" })
data_table

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


Process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [5]:
clean_table = data_table[data_table.Borough != 'Not assigned']
clean_table.reset_index(drop=True)
clean_table

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


Merge Neighborhood and group by Borough

In [6]:
grouped_data = clean_table.groupby(['Postcode','Borough'])['Neighborhood'].apply(', '.join).reset_index()
grouped_data

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


 Print the number of rows of the dataframe

In [7]:
print('The DataFrame shape is:', grouped_data.shape)

The DataFrame shape is: (103, 3)


## Add Latitude and Longitude columns

In [8]:
geo_data = pd.read_csv("http://cocl.us/Geospatial_data")
geo_data.rename(columns={'Postal Code':'Postcode'}, inplace=True)
geo_data.set_index("Postcode")
grouped_data.set_index("Postcode")
toronto_df = pd.merge(grouped_data, geo_data)
toronto_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [9]:
address = 'Toronto, Canada'
geolocator = Nominatim(user_agent="Ontario_Clustering")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

print('The geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, ON, Canada are 43.653963, -79.387207.


In [10]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, lng, borough, neighborhood in zip(toronto_df['Latitude'], toronto_df['Longitude'], toronto_df['Borough'], toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='#373F51',
        fill=True,
        fill_color='#373F51',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

## Explore Toronto Neighbourhoods

In [11]:
# Filter Neighbourhoods
filter_df = toronto_df[toronto_df['Borough'].str.contains('Toronto')]
neighbourhoods_df = filter_df.reset_index(drop=True)
neighbourhoods_df.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


Select Studio District neighbourhood properties

In [12]:
neighbourhood = neighbourhoods_df.loc[1]
neighbourhood_latitude = neighbourhood['Latitude']
neighbourhood_longitude = neighbourhood['Longitude']
neighbourhood_name = neighbourhood['Neighborhood']

In [13]:
# Set Foursquare API config
CLIENT_ID = 'EXERCOQ420QGNLZ3JWZDOYYC0P2TSG3UW5VRSQMG40YJJAGL'
CLIENT_SECRET = 'DQJQABOZFJBWKBMWPTJFEUV5KNNKYFIBMOJBQRP5BSGTMHIQ'
VERSION = '20190419' # Set version name like yearmonthday string

In [14]:
def format_url(lat, lng, radius=500, limit=100):
    return 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, 
        lat, lng, VERSION, radius, limit)

def get_api_data(url):
    result = requests.get(url).json()
    return result

In [15]:
url = format_url(neighbourhood_latitude, neighbourhood_longitude)
api_response = get_api_data(url)
api_response

{'meta': {'code': 200, 'requestId': '5cba83d51ed2196d944066bf'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Greektown',
  'headerFullLocation': 'Greektown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 44,
  'suggestedBounds': {'ne': {'lat': 43.6840571045, 'lng': -79.34597738331301},
   'sw': {'lat': 43.675057095499994, 'lng': -79.35839861668698}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4dc72844d22dafda2fcf75e6',
       'name': 'Dolce Gelato',
       'location': {'address': '414 Danforth Ave',
        'lat': 43.677772998450614,
        'lng': -79.35118737317053,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.6777729984

In [16]:
venues = api_response['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) 
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]
nearby_venues

Unnamed: 0,venue.name,venue.categories,venue.location.lat,venue.location.lng
0,Dolce Gelato,"[{'id': '4bf58dd8d48988d1c9941735', 'name': 'I...",43.677773,-79.351187
1,Pantheon,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",43.677621,-79.351434
2,MenEssentials,"[{'id': '4bf58dd8d48988d10c951735', 'name': 'C...",43.67782,-79.351265
3,Mezes,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",43.677962,-79.350196
4,Messini Authentic Gyros,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",43.677827,-79.350569
5,Cafe Fiorentina,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",43.677743,-79.350115
6,Christina's On The Danforth,"[{'id': '4bf58dd8d48988d10e941735', 'name': 'G...",43.67824,-79.349185
7,Louis Cifer Brew Works,"[{'id': '50327c8591d4c4b30a586d5d', 'name': 'B...",43.677663,-79.351313
8,Moksha Yoga Danforth,"[{'id': '4bf58dd8d48988d102941735', 'name': 'Y...",43.677622,-79.352116
9,7 Numbers,"[{'id': '4bf58dd8d48988d110941735', 'name': 'I...",43.677062,-79.353934


In [17]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [18]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Dolce Gelato,Ice Cream Shop,43.677773,-79.351187
1,Pantheon,Greek Restaurant,43.677621,-79.351434
2,MenEssentials,Cosmetics Shop,43.67782,-79.351265
3,Mezes,Greek Restaurant,43.677962,-79.350196
4,Messini Authentic Gyros,Greek Restaurant,43.677827,-79.350569
