# Wikipedia Scraping

This notebook scrapes the Wikipedia page https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M to extract the table of postal codes and load it into a pandas DataFrame.

### Import Libraries

In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
from urllib.request import urlopen

def BeautifulTablesFromPage(article):
    """Extract every sortable HTML table found in *article* as a DataFrame.

    Args:
        article: HTML markup of the page, as accepted by ``BeautifulSoup``.

    Returns:
        A list with one pandas ``DataFrame`` per ``<table>`` element whose
        class includes ``sortable``, in document order. The list is empty
        when the page contains no such table. Column labels are the stripped
        text of the table's ``<th>`` cells; each row holds the stripped text
        of the ``<td>`` cells of one ``<tr>``.

    Raises:
        ValueError: raised by pandas when a table has data rows whose width
            does not match the number of ``<th>`` cells.
    """
    from pandas import DataFrame
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(article, 'html.parser')
    tables = soup.find_all('table', class_='sortable')

    all_tables_content = []
    for table in tables:
        table_headings = [th.text.strip() for th in table.find_all('th')]

        # Rows without any <td> cell (e.g. the header row) carry no data.
        table_content = [
            [td.text.strip() for td in tds]
            for tds in (tr.find_all('td') for tr in table.find_all('tr'))
            if tds
        ]

        # Passing the headings to the constructor also handles a table that
        # has headers but no data rows (it yields an empty, labelled frame).
        all_tables_content.append(
            DataFrame(table_content, columns=table_headings)
        )

    # Return only after *all* tables were processed; returning from inside
    # the loop would hand back the first table alone.
    return all_tables_content

### Scraping

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page; the context manager closes the HTTP response as soon
# as the body has been read.
with urlopen(url) as response:
    html = response.read().decode('utf-8')

# save the file locally
# The encoding is explicit so the UTF-8 text is written (and read back below)
# the same way regardless of the platform's default encoding.
with open('List_of_postal_codes_of_Canada:_M', 'w', encoding='utf-8') as art:
    art.write(html)

# Load article
with open('List_of_postal_codes_of_Canada:_M', encoding='utf-8') as art:
    article = art.read()

all_tables = BeautifulTablesFromPage(article)

# Work on a copy of the first sortable table so later edits do not touch
# the parsed original kept in all_tables.
postal_codes_of_Canada = all_tables[0].copy()

### Data Manipulation

In [4]:
# Sanity check: number of distinct postal codes. Compare it with the row
# count to confirm that there are no duplicate postal codes.
len(postal_codes_of_Canada['Postal Code'].unique())


# If Borough is 'Not assigned' then drop the row.
# reset_index returns a fresh frame, so the later assignment does not operate
# on a filtered slice of the original.
postal_codes_of_Canada = (
    postal_codes_of_Canada[postal_codes_of_Canada['Borough'] != 'Not assigned']
    .reset_index(drop=True)
)

# If Neighbourhood is 'Not assigned' then replace it with Borough.
# A single .loc[rows, column] assignment writes into the frame itself;
# chained indexing (.loc[i]['Neighbourhood'] = ...) would only modify a
# temporary copy and silently leave the data unchanged.
unassigned = postal_codes_of_Canada['Neighbourhood'] == 'Not assigned'
postal_codes_of_Canada.loc[unassigned, 'Neighbourhood'] = (
    postal_codes_of_Canada.loc[unassigned, 'Borough']
)

### Result

In [5]:
# Report the (rows, columns) dimensions of the cleaned table.
print(postal_codes_of_Canada.shape)

(103, 3)


In [6]:
# Preview the first rows of the cleaned table.
postal_codes_of_Canada.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
