# Segmenting and Clustering Neighbourhoods in Toronto

A Coursera Data Science Capstone Assignment

In [2]:
#import necessary modules
import pandas as pd
import numpy as np

## 1. Web Scraping for Toronto Neighbourhood Data Set

The data will be scraped from Wikipedia at https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [3]:
#install needed packages
!pip install lxml html5lib beautifulsoup4



In [56]:
# Wikipedia page listing the Canadian postal codes that start with "M"
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# pd.read_html returns a list holding one DataFrame per HTML table it parsed from the page
dfs = pd.read_html(URL)

In [25]:
# Report how many tables pandas parsed from the page
table_count = len(dfs)
print('There are {} tables on the page'.format(table_count))

There are 3 tables on the page


In [57]:
# Inspection shows that our table of interest is the first one on the page.
# Work on a copy so the in-place edits made to `df` in later cells do not
# silently modify the scraped tables kept in `dfs`.
df = dfs[0].copy()
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Renaming column Postal Code to PostalCode

In [58]:
# Drop the space from the postal-code column name (renamed in place)
column_renames = {'Postal Code': 'PostalCode'}
df.rename(columns=column_renames, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [59]:
# Remove every row whose borough is 'Not assigned', then renumber the rows from 0
unassigned_rows = df.index[df['Borough'] == 'Not assigned']
df.drop(index=unassigned_rows, inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated by a comma, as shown in row 2 of the above table.

In [67]:
# Display only: no grouping is performed in this cell. The rows shown already carry
# comma-separated neighbourhoods for a single postal code (e.g. M5A).
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


#### If a cell has a borough but a "Not assigned" neighborhood, then the neighborhood will be the same as the borough.

In [64]:
# Rows whose neighbourhood is the placeholder 'Not assigned' (same spelling as the
# placeholder used in the Borough column) take the borough name of that same row.
# A row-aligned .loc assignment is used so each row receives its own borough value
# rather than the whole Borough column.
unassigned_neighbourhood = df['Neighbourhood'] == 'Not assigned'
df.loc[unassigned_neighbourhood, 'Neighbourhood'] = df.loc[unassigned_neighbourhood, 'Borough']
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


## Assumptions: 
- The website is available
- The table of interest is available as the first on the page - index 0
- The schema of the tables is assumed to be consistent as-is

In [65]:
df.shape

(103, 3)

## 2. Latitude and Longitude for each Neighbourhood

In [None]:
import os

# Foursquare API credentials are read from the environment so that secrets are
# never stored in, or printed by, the notebook. Set FOURSQUARE_CLIENT_ID,
# FOURSQUARE_CLIENT_SECRET and FOURSQUARE_ACCESS_TOKEN before starting Jupyter.
# NOTE(review): the keys that were previously committed in this cell should be
# treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '') # your FourSquare Access Token
VERSION = '20180604' # presumably the Foursquare API version date - confirm
LIMIT = 30 # presumably the maximum number of results per request - confirm

# Report only whether each credential is present; never echo the values themselves.
_missing = [
    name
    for name, value in (
        ('FOURSQUARE_CLIENT_ID', CLIENT_ID),
        ('FOURSQUARE_CLIENT_SECRET', CLIENT_SECRET),
        ('FOURSQUARE_ACCESS_TOKEN', ACCESS_TOKEN),
    )
    if not value
]
if _missing:
    print('Missing Foursquare credentials: ' + ', '.join(_missing))
else:
    print('Foursquare credentials loaded from the environment.')

In [80]:
!pip install geopy geocoder
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

Collecting geocoder
  Downloading geocoder-1.38.1-py2.py3-none-any.whl (98 kB)
[K     |████████████████████████████████| 98 kB 7.9 MB/s  eta 0:00:01
Collecting ratelim
  Downloading ratelim-0.1.6-py2.py3-none-any.whl (4.0 kB)
Installing collected packages: ratelim, geocoder
Successfully installed geocoder-1.38.1 ratelim-0.1.6


In [102]:
import geocoder # import geocoder
from geopy.exc import GeocoderServiceError

# Best-effort geocoding of every postal code with Nominatim.
# Each postal code is tried up to MAX_GEOCODE_ATTEMPTS times; codes that still
# cannot be resolved keep the placeholder coordinates 0 / 0.
MAX_GEOCODE_ATTEMPTS = 2

# The geocoder client does not depend on the row, so build it once.
geolocator = Nominatim(user_agent="foursquare_agent")

for index, row in df.iterrows():
    query = '{}, Toronto, Ontario'.format(row['PostalCode'])
    location = None
    for _ in range(MAX_GEOCODE_ATTEMPTS):
        try:
            location = geolocator.geocode(query)
        except GeocoderServiceError:
            # A timeout or service failure counts as a failed attempt instead
            # of aborting the whole loop.
            location = None
        if location is not None:
            break

    if location is not None:
        latitude = location.latitude
        longitude = location.longitude
    else:
        latitude = 0
        longitude = 0

    df.loc[index,'Latitude'] = latitude
    df.loc[index,'Longitude'] = longitude
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0.0,0.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0.0,0.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935


In [107]:
# Display the first rows of the frame, which now include the Latitude / Longitude columns
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.653482,-79.383935
1,M4A,North York,Victoria Village,0.0,0.0
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",0.0,0.0
3,M6A,North York,"Lawrence Manor, Lawrence Heights",0.0,0.0
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.653482,-79.383935


In [105]:
# CSV lookup table of coordinates per postal code
# (columns: 'Postal Code', 'Latitude', 'Longitude')
cvURI = 'https://cocl.us/Geospatial_data'
dfloc = pd.read_csv(cvURI)
dfloc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [122]:
# Overwrite the coordinates in `df` with the values from the `dfloc` lookup table.
# Filtering a DataFrame never yields None, so the previous `match is not None`
# check could not detect a missing postal code and `.values[0]` would raise
# IndexError. Here an unmatched postal code keeps whatever coordinate `df`
# already holds (NaN if the column does not exist yet).
coords_by_code = dfloc.drop_duplicates('Postal Code').set_index('Postal Code')  # first entry wins
for column in ('Latitude', 'Longitude'):
    looked_up = df['PostalCode'].map(coords_by_code[column])
    if column in df.columns:
        # fall back to the previously stored value where the lookup has no entry
        looked_up = looked_up.fillna(df[column])
    df[column] = looked_up
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
