## Import Libraries

In [344]:
import pandas as pd # well I'm not using arrays anymore...progress 
import requests # for accesing Wiki URL
from bs4 import BeautifulSoup # Using BeautifulSoup for HTML Scrapping
import lxml # lxml supports BeautifulSoup

## Getting the 'Soup' of the WikiPage

In [345]:
# Download the Wikipedia page named in the URL below.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page.
response = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
response.raise_for_status()
wikiPage = response.text  # the page's HTML as a string
soup = BeautifulSoup(wikiPage, 'lxml')  # parse the HTML using the lxml parser

In [346]:
# Locate the table by the class attribute written on its <table> tag in the page's HTML.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the next line.
    raise ValueError("No <table class='wikitable sortable'> found on the page; the page layout may have changed")
rows = table.find_all('tr')  # list of every table row (<tr>); find_all is the current name of findAll

## Filtering the 'Soup' until we get the sweet Table we desire

In [347]:
# Each row's text has a '\n' before and after it, so we strip the string first;
# splitting on '\n' then yields the [postcode, borough, neighbourhood] pieces,
# giving one sub-list per table row.
text = [row.get_text().strip().split('\n') for row in rows]
    

#### The table is ready; now we separate the columns to feed to pandas.

In [348]:
# Build one list per column: within each row's sub-list, position 0 is the
# postcode, position 1 the borough and position 2 the neighbourhood.
postcode = [fields[0] for fields in text]
borough = [fields[1] for fields in text]
neighbourhood = [fields[2] for fields in text]

## Create DataFrame using pandas

In [349]:
# Pair the three column lists up row by row and load them into a DataFrame
# with the final column names.
records = list(zip(postcode, borough, neighbourhood))
column_names = ['PostalCode', 'Borough', 'Neighborhood']
# The row labelled 0 holds the table's header strings ('postcode', 'borough',
# 'neighbourhood') rather than data, so it is dropped.
df = pd.DataFrame(records, columns=column_names).drop(index=0)

#### Condition 1.<br>
<p>"Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned."</p>

In [350]:
# Condition 1: keep only the rows whose borough is something other than 'Not assigned'.
has_borough = df['Borough'] != 'Not assigned'
df = df[has_borough]

#### Condition 2.<br>
<p>"More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table."</p>

In [351]:
# Condition 2: rows that share a PostalCode and Borough are collapsed into one row,
# with the strings of the remaining column(s) joined by ','. sort=False leaves the
# groups unsorted. The two group keys are then moved out of the index and back
# into ordinary columns.
newdf = (
    df.groupby(['PostalCode', 'Borough'], sort=False)
    .agg(','.join)
    .reset_index(level=['PostalCode', 'Borough'])
)

#### Condition 3.<br>
<p>"If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough. So for the 9th cell in the table on the Wikipedia page, the value of the Borough and the Neighborhood columns will be Queen's Park."</p>

In [352]:
# Condition 3: wherever the neighbourhood is 'Not assigned', copy that row's borough into it.
unassigned = newdf['Neighborhood'] == 'Not assigned'
newdf.loc[unassigned, 'Neighborhood'] = newdf.loc[unassigned, 'Borough']

## The Final DataFrame and its Shape:

In [355]:
# A notebook cell renders only its last expression automatically, so the preview
# must be displayed explicitly; otherwise the head() result is computed and discarded.
display(newdf.head())
newdf.shape  # (rows, columns) of the final frame

(103, 3)