# Capstone Project - Web scraping

## 1. Imports

In [3]:
import pandas as pd
import numpy as np

from bs4 import BeautifulSoup
import requests

## 2. Scraping postal codes for Toronto from Wikipedia

In [4]:
postcode = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")

In [5]:
postcodesoup = BeautifulSoup(postcode.content) 

In [6]:
list_postcodes = list()

for tr in postcodesoup.find_all("tr")[2:-5]:
    
    row = (td.text for td in tr.find_all("td"))
    
    list_postcodes.append(row)

## 3. Creating and preparing a pandas DataFrame

In [7]:
df_raw = pd.DataFrame(list_postcodes, columns=["Postcode","Borough","Neighbourhood"])

### 3.1. Preparing helper functions

In [8]:
def concatNeighbourhood(row):
    
    return pd.Series(dict(Borough = row['Borough'].unique()[0],
                        Neighbourhood = "%s" % ', '.join(row['Neighbourhood'])))

### 3.2 Cleaning and preparing DataFrame
I used modern possibilities of method chaining to keep my code clean. 

In [9]:
df_post = (df_raw
            .query('Borough != "Not assigned"')
            .assign(Neighbourhood = lambda x: x["Neighbourhood"].str[:-1].replace('Not assigned', x["Borough"]))
            .groupby('Postcode').apply(concatNeighbourhood)
            .reset_index()
)

df_post.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [10]:
df_post.shape()

TypeError: 'tuple' object is not callable