# Applied Data Science Capstone Week 3

## Toronto WIKI Page Wrangling

In [33]:
import pandas as pd

In [34]:
# URL of the Wikipedia page listing the Canadian postal codes that start with "M"
URL = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"


In [35]:
import requests 
from bs4 import BeautifulSoup
# Download the page HTML. The timeout keeps a stalled connection from hanging
# the notebook, and raise_for_status() stops here on an HTTP error instead of
# letting an error page flow into the parser.
page = requests.get(URL, timeout=30)
page.raise_for_status()
response = page.text

In [36]:
# Parse the downloaded HTML text into a BeautifulSoup object using the lxml parser
toronto = BeautifulSoup(response, 'lxml')

In [37]:
# Locate the table carrying the 'wikitable sortable' class and collect its rows.
toronto_table = toronto.find('table', {'class': 'wikitable sortable'})
if toronto_table is None:
    # find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the find_all() call below.
    raise ValueError("no table with class 'wikitable sortable' found in the page")
toronto_table_rows = toronto_table.find_all('tr')

In [38]:
# Collect the stripped text of every <td> cell, one inner list per table row.
# A row that has no <td> cells contributes an empty list.
data = [
    [cell.text.strip() for cell in table_row.find_all('td')]
    for table_row in toronto_table_rows
]

In [39]:
# Build the frame from the scraped rows, then discard any row whose
# PostalCode is missing.
df = pd.DataFrame(
    data, columns=['PostalCode', 'Borough', 'Neighbourhood']
).dropna(subset=['PostalCode'])
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
10,M8A,Not assigned,Not assigned


In [40]:
# Drop the rows whose Borough is exactly 'Not assigned'. A plain comparison is
# used instead of str.contains, which interprets its argument as a regular
# expression and also matches substrings.
df = df[df['Borough'] != 'Not assigned']
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [41]:
# Collapse to one row per (PostalCode, Borough) pair, joining that pair's
# neighbourhood names into a single ', '-separated string.
df = (
    df.groupby(['PostalCode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [42]:
# Where the neighbourhood is 'Not assigned', use the borough name in its place.
is_unassigned = df['Neighbourhood'] == 'Not assigned'
new_neigh = df['Neighbourhood'].mask(is_unassigned, df['Borough'])
# Rebuild the frame from the postal code and borough columns plus the patched
# neighbourhood column.
df = df[['PostalCode', 'Borough']].assign(Neighbourhood=new_neigh)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [43]:
# Show the frame's dimensions as (rows, columns)
df.shape

(103, 3)

## Geocoding 

In [12]:
# Install the geocoder package from the conda-forge channel (-y answers the
# confirmation prompt automatically), then import it.
!conda install -c conda-forge geocoder -y
import geocoder

Solving environment: done

# All requested packages already installed.



In [44]:
#function to return lat/long for a Postal Code
#Google doesn't work very well, so we are using arcgis as the service

def get_lat_long(postal_code, max_attempts=5, retry_delay=1.0, geocode=None):
    """Return the (latitude, longitude) pair for a Toronto postal code.

    The lookup string is '<postal_code>, Toronto, Ontario'. A lookup that
    yields no coordinates is retried, but only a bounded number of times, so
    an unresolvable code or a service outage cannot hang the notebook forever.

    Parameters
    ----------
    postal_code : str
        Postal code to look up, e.g. 'M1B'.
    max_attempts : int, optional
        Maximum number of lookups before giving up (at least one is made).
    retry_delay : float, optional
        Seconds to wait between consecutive attempts.
    geocode : callable, optional
        Function that takes the lookup string and returns an object with a
        ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns
    -------
    tuple
        ``(latitude, longitude)``.

    Raises
    ------
    RuntimeError
        If none of the attempts returned coordinates.
    """
    import time

    if geocode is None:
        geocode = geocoder.arcgis

    query = '{}, Toronto, Ontario'.format(postal_code)
    attempts = max(1, max_attempts)

    for attempt in range(attempts):
        if attempt:
            # Pause before every retry (not before the first lookup) so a
            # struggling service is not hammered in a tight loop.
            time.sleep(retry_delay)

        lat_lng_coords = geocode(query).latlng
        # None and an empty result both mean "no coordinates yet".
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    raise RuntimeError(
        'could not geocode {!r} after {} attempt(s)'.format(query, attempts)
    )

In [45]:
# Geocode every postal code, then spread the resulting (latitude, longitude)
# pairs over two new columns, aligned on the frame's index.
coords = df['PostalCode'].apply(get_lat_long)
df[['Latitude', 'Longitude']] = pd.DataFrame(coords.tolist(), index=df.index)


In [47]:
# Preview the first ten rows of the geocoded frame
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.21759
4,M1H,Scarborough,Cedarbrae,43.769688,-79.23944
5,M1J,Scarborough,Scarborough Village,43.743125,-79.23175
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.726276,-79.263625
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.713054,-79.285055
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.724235,-79.227925
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.69677,-79.259967
