# Week 3 - Applied Data Science Capstone - Peer Assignment

## Problem 1

### Introduction

In this assignment, the table of postal codes published in the Wikipedia article "List of postal codes of Canada: M" is scraped and converted into a pandas DataFrame.

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analysis

from bs4 import BeautifulSoup # library for website scraping
import requests

### 1. Scrape the table from the Wikipedia webpage

In [2]:
# Download the Wikipedia article. A timeout keeps the cell from hanging forever
# and raise_for_status() stops us from parsing an HTTP error page as if it were
# the article.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text

soup   = BeautifulSoup(source, 'lxml') # Parse the page source
tables = soup.find_all('table', class_='sortable') # All sortable tables on the page

# Pick the first sortable table whose leading headers are the expected ones.
# The for/else makes a missing table an explicit error instead of silently
# continuing with whichever table happened to be inspected last.
for table in tables:
    ths = table.find_all('th')
    headings = [th.text.strip() for th in ths]
    if headings[:3] == ['Postcode', 'Borough', 'Neighbourhood']:
        break
else:
    raise ValueError(
        "No sortable table with headers 'Postcode', 'Borough', 'Neighbourhood' "
        "was found on the page"
    )

# Collect all rows first and build the DataFrame once; assigning cell by cell
# with df.loc re-allocates the frame on every new row.
records = []
row_ids = []
for i, tr in enumerate(table.find_all('tr')):
    tds = tr.find_all('td')
    if not tds: # header rows contain only <th> cells
        continue
    postcode, borough, neighbourhood = [td.text.strip() for td in tds[:3]]
    records.append({
        'Postcode': postcode,
        'Borough': borough,
        'Neighbourhood': neighbourhood,
    })
    row_ids.append(i) # keep the <tr> position as the row label

df = pd.DataFrame(records, index=row_ids, columns=headings)

df.head()
    

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### 2. Only process the cells that have an assigned borough

In [3]:
# Turn the 'Not assigned' placeholder into a missing value, then keep only the
# rows that still have a borough. Written as one chain so the cell rebinds
# `df` instead of mutating it in place.
df = (
    df.replace('Not assigned', np.nan, regex=True)
      .dropna(subset=['Borough'])
)

### 3. Combine the neighbourhoods of each postcode into one comma-separated row, and give __Not assigned__ neighbourhoods the name of their borough

In [4]:
def to_comma_sep_row(df):
    """Collapse the neighbourhoods of one group of rows into a single value.

    Parameters
    ----------
    df : pandas.DataFrame
        The rows of one group. Must contain the columns 'Borough' and
        'Neighbourhood'; missing neighbourhoods are expected to be NaN.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns in which every row's
        'Neighbourhood' is either

        * the row's own 'Borough' when the group has no neighbourhood at all, or
        * the distinct non-missing neighbourhoods of the group, joined with
          ', ' in order of first appearance.

        The input frame is left unmodified.
    """
    # Drop NaN first: str.join raises TypeError on a float NaN, which happens
    # when a group mixes missing and named neighbourhoods.
    named = df['Neighbourhood'].dropna()
    if named.size == 0:
        return df.assign(Neighbourhood=df['Borough'])
    return df.assign(Neighbourhood=', '.join(named.unique().tolist()))
    
# Combine the rows of every (Postcode, Borough) pair and keep one row per pair.
#
# The groups are iterated explicitly instead of going through groupby.apply:
# what apply does with a frame-returning function (whether group keys are
# prepended to the index, the resulting row order, and whether the grouping
# columns are passed to the function at all) differs between pandas releases.
# Iterating always yields the complete group, so the result does not depend on
# the installed version.
combined_groups = [
    to_comma_sep_row(group.copy()) # copy: the helper may modify its argument
    for _, group in df.groupby(['Postcode', 'Borough'])
]
if combined_groups: # pd.concat raises on an empty list; leave an empty df as is
    df = (
        pd.concat(combined_groups)
          .sort_index() # back to the original table order
          .drop_duplicates()
    )

In [5]:
# Renumber the rows from 0 and discard the old labels, then preview the result.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### 4. Show the shape (number of rows and columns) of the dataframe

In [6]:
# (number of rows, number of columns)
(len(df.index), len(df.columns))

(103, 3)

### 5. Write DataFrame to CSV-file

In [7]:
# Persist the cleaned table as a bz2-compressed CSV, without the row index.
df.to_csv(
    'w3_p1.csv.bz2',
    index=False,
    compression='bz2',
)