# Capstone Project - Web scraping

## 1. Imports

In [39]:
# standard computing libraries
import pandas as pd
import numpy as np

# webscraping
from bs4 import BeautifulSoup
import requests

# geocoder for retrieving coordinates of postcodes
import geocoder

# k-means clustering
from sklearn.cluster import KMeans

## 2. Scraping postal codes for Toronto from Wikipedia

In [19]:
# Download the Wikipedia page listing the Toronto ("M") postal codes.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page be parsed later as if it were the postcode table.
postcode = requests.get(
    "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M",
    timeout=30,
)
postcode.raise_for_status()

In [20]:
# Parse the downloaded HTML. The parser is named explicitly so the result does
# not depend on which optional parsers happen to be installed, and so
# BeautifulSoup does not have to guess (and warn about) which one to use.
postcodesoup = BeautifulSoup(postcode.content, "html.parser")

In [21]:
# Collect the cell texts of every table row, one list per <tr>.
# The [2:-5] slice is kept from the original scrape; it presumably skips the
# header rows at the top and unrelated table rows at the bottom of the page --
# TODO verify against the current page layout if the row count changes.
# Each row is materialised as a list rather than a lazy generator, so
# list_postcodes can be read more than once (e.g. when the DataFrame cell is
# re-run) without finding the rows already exhausted.
# The raw td.text is kept as-is because the cleaning step further down trims
# the Neighbourhood text itself.
list_postcodes = [
    [td.text for td in tr.find_all("td")]
    for tr in postcodesoup.find_all("tr")[2:-5]
]

## 3. Creating and preparing a pandas DataFrame

In [22]:
# Build the raw table: one DataFrame row per scraped table row, with the
# three cells labelled in the order they were collected.
df_raw = pd.DataFrame(
    data=list_postcodes,
    columns=["Postcode", "Borough", "Neighbourhood"],
)

### 3.1. Preparing helper functions

In [23]:
def concatNeighbourhood(row):
    """Collapse the rows of one postcode group into a single record.

    Parameters
    ----------
    row : pandas.DataFrame
        Despite the name, this is the group of rows handed over by
        ``groupby(...).apply``. It must contain the columns 'Borough' and
        'Neighbourhood'.

    Returns
    -------
    pandas.Series
        A Series with two entries: 'Borough' (the first distinct borough in
        the group) and 'Neighbourhood' (all neighbourhood names of the group
        joined with ', ').
    """
    first_borough = row['Borough'].unique()[0]
    joined_neighbourhoods = ', '.join(row['Neighbourhood'])

    return pd.Series({'Borough': first_borough,
                      'Neighbourhood': joined_neighbourhoods})

### 3.2 Cleaning and preparing DataFrame
The cleaning steps are written as a single pandas method chain to keep the code readable.

In [24]:
# Clean the scraped table and collapse it to one row per postcode:
#   1. drop rows whose borough is "Not assigned";
#   2. strip surrounding whitespace from the neighbourhood text (presumably a
#      trailing newline from the scraped cell; strip() only removes whitespace,
#      so it cannot cut off a real character the way slicing off the last
#      character could) and, where the neighbourhood itself is "Not assigned",
#      fall back to the borough of the same row;
#   3. group by postcode and merge the neighbourhoods via concatNeighbourhood.
# mask() does the row-wise fallback in step 2: Series.replace() with a scalar
# to_replace and a Series value is not a row-wise fill and is rejected with a
# ValueError by recent pandas versions.
df_post = (df_raw
            .query('Borough != "Not assigned"')
            .assign(Neighbourhood = lambda x: (x["Neighbourhood"]
                                               .str.strip()
                                               .mask(lambda s: s == "Not assigned", x["Borough"])))
            .groupby('Postcode').apply(concatNeighbourhood)
            .reset_index()
)

df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [25]:
# Sanity check: (rows, columns) of the cleaned table, which has one row per
# postcode after the groupby above.
df_post.shape

(103, 3)

## 4. Getting coordinates of postcodes

### 4.1 Testing geocoder package 

In [26]:
# NOTE: this experiment is intentionally left commented out. The while-loop
# below has no retry limit, so it never exits as long as g.latlng keeps coming
# back as None.
#postal_code = 'M5G'

# initialize your variable to None
#lat_lng_coords = None

# loop until you get the coordinates
#while(lat_lng_coords is None):
#  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#  lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

The loop never terminated for me: `geocoder.google` did not return coordinates for even a single postcode, so the code above is left commented out and the CSV file supplied in the course is used instead.

### 4.2 Read coordinates CSV to dataframe

In [27]:
# Load the course-supplied CSV with the coordinates of each postal code.
coordinates_csv = 'Geospatial_Coordinates.csv'
df_coord = pd.read_csv(coordinates_csv)

df_coord.head()

### 4.3 Add coordinates to dataframe containing neighbourhoods

The two dataframes are joined with `pd.merge()`; the duplicate `Postal Code` column is then dropped and the existing `Postcode` column is renamed to `PostalCode` to match the example column in the course.

In [32]:
# Attach latitude/longitude to each postcode row. The two frames name their
# postcode column differently, so each side's key is given explicitly; the
# redundant right-hand key is then removed and the left-hand one renamed.
df_post_coord = (
    pd.merge(left=df_post, right=df_coord,
             left_on="Postcode", right_on="Postal Code")
    .drop(columns=["Postal Code"])
    .rename(columns={"Postcode": "PostalCode"})
)

df_post_coord.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## 5. Cluster the neighbourhoods

### 5.1 Pick neighbourhoods with borough containing 'Toronto'

In [36]:
# Keep only the rows whose borough name contains the text 'Toronto'.
is_toronto_borough = df_post_coord['Borough'].str.contains('Toronto')
df_toronto = df_post_coord.loc[is_toronto_borough]

df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


(38, 5)