# Step One - Web Scraping of Canadian Postal Code Data from wikipedia.org

In [16]:
### install beautiful soup v4

!conda install -c conda-forge beautifulsoup4 


Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.8.0       |           py36_0         144 KB  conda-forge

The following packages will be UPDATED:

    beautifulsoup4: 4.7.1-py36_1 --> 4.8.0-py36_0 conda-forge


Downloading and Extracting Packages
beautifulsoup4-4.8.0 | 144 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done


In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### address of the Wikipedia page to scrape
### (despite its name, html_doc holds the URL string, not the downloaded HTML)
html_doc = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'


In [2]:
### retrieve the web page for html_doc
### timeout: give up after 30 seconds instead of waiting indefinitely on an unresponsive server
### raise_for_status: stop here on an HTTP error status rather than parsing an error page

page = requests.get(html_doc, timeout=30)
page.raise_for_status()


In [3]:
### parse the body of the web page response into a BeautifulSoup tree using the html5lib parser

soup = BeautifulSoup(markup=page.content, features='html5lib')


In [4]:
### capture the table of Canadian postal codes from the web page

table = soup.find('table', attrs={'class': 'wikitable sortable'})

### soup.find returns None when nothing matches; fail here with a clear message
### instead of an AttributeError on table.find_all in the next cell
if table is None:
    raise ValueError("no table with class 'wikitable sortable' found in the downloaded page")


In [5]:
### collect every <tr> element found inside the table
table_rows = table.find_all(name='tr')


In [6]:
###print(table_rows)

In [48]:
### build one list of <td> cell texts per table row
### a row that contains no <td> cells contributes an empty list
postal_codes = [
    [cell.text for cell in table_row.find_all('td')]
    for table_row in table_rows
]


In [49]:
### turn the scraped rows into a DataFrame with named columns

df_postal_codes = pd.DataFrame(
    data=postal_codes,
    columns=["Postcode", "Borough", "Neighbourhood"],
)


In [50]:
### preview the first five rows of the freshly built DataFrame
df_postal_codes.head(n=5)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [51]:
### drop the all-empty first row produced when the list was converted to a DF
### dropna(how='all') removes only rows in which every column is missing, so this
### cell can be re-run safely (drop(0) raises KeyError once label 0 is already gone)
df_postal_codes = df_postal_codes.dropna(how='all')

In [52]:
### remove rows where Borough = Not assigned
### .copy() makes the result an independent DataFrame, so any later write to it
### is never treated as a write to a slice of df_postal_codes
df_postal_codes_refined = df_postal_codes[df_postal_codes['Borough'] != "Not assigned"].copy()


In [53]:
### remove \n from the Neighbourhood column
### if the Neighbourhood = "Not assigned" then set it to the Borough column value
###
### Done column-wise instead of writing into the frame while iterating over it with
### iterrows(): the rows iterrows() yields may be copies, in which case a check on
### row['Neighbourhood'] still sees the value with its trailing "\n" and the
### "Not assigned" fallback never fires. Here the comparison is made on the
### already-stripped values.

neighbourhood = df_postal_codes_refined['Neighbourhood'].str.strip('\n')

### keep the stripped name unless it is "Not assigned"; in that case use the row's Borough
df_postal_codes_refined = df_postal_codes_refined.assign(
    Neighbourhood=neighbourhood.where(
        neighbourhood != "Not assigned",
        df_postal_codes_refined['Borough'],
    )
)



In [54]:
### don't need to set an index for now
###df_postal_codes_refined.set_index(['Postcode'], inplace=True)


In [66]:
### show the first rows of the cleaned frame, then its row count on the next line
print(df_postal_codes_refined.head(), len(df_postal_codes_refined), sep='\n')

  Postcode           Borough     Neighbourhood
3      M3A        North York         Parkwoods
4      M4A        North York  Victoria Village
5      M5A  Downtown Toronto      Harbourfront
6      M5A  Downtown Toronto       Regent Park
7      M6A        North York  Lawrence Heights
211


In [67]:
### join the neighbourhoods that share a Postcode/Borough pair into one comma-separated string
neighbourhoods_by_code = df_postal_codes_refined.groupby(['Postcode', 'Borough'])['Neighbourhood']
df_postal_codes_final = neighbourhoods_by_code.apply(','.join).reset_index()


In [68]:

### show the first five grouped rows
print(df_postal_codes_final.head(n=5))

  Postcode      Borough                         Neighbourhood
0      M1B  Scarborough                         Rouge,Malvern
1      M1C  Scarborough  Highland Creek,Rouge Hill,Port Union
2      M1E  Scarborough       Guildwood,Morningside,West Hill
3      M1G  Scarborough                                Woburn
4      M1H  Scarborough                             Cedarbrae


In [69]:
### (rows, columns) of the grouped frame: one row per Postcode/Borough pair
df_postal_codes_final.shape

(103, 3)