# **Creating a Dataframe out of a table on a Website** 
Applied Data Science Capstone Project <br>
by Collins Opoku-Baah<br> 
March 8th, 2019

#### Import all the necessary libraries

In [63]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

#### Get the table from the wikipedia

In [64]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [65]:
soup = BeautifulSoup(r.text, 'html.parser')

__Convert the table into a list that contains information with the tag 'td'<br>
     Separate the info into Postcode, Borough and Neighborhood__

In [66]:
webtable = soup.table

Postcode = []
Borough = []
Neighborhood = []

for num, tabElmt in enumerate(webtable.find_all('td')):
    
    if num%3 == 0:
        Postcode.append(tabElmt.text.rstrip())
        
    elif num%3 == 1:
        Borough.append(tabElmt.text.rstrip())
        
    elif num%3 == 2:
        Neighborhood.append(tabElmt.text.rstrip())
            

__Then, create an empty dataframe with the column headings as below__

In [67]:
newtable = pd.DataFrame(columns = ['PostalCode', 'Borough', 'Neighborhood'])
newtable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood


__We will populate the dataframe with data from the separated lists <br>
The first step is to find the distinct elements in Postcode, and then <br>
for each element, find the associated Borough and Neighborhoods.__ 

In [68]:
UniPost = list(set(Postcode))

for i in UniPost:
    post = i
    neigh = ""
    for num, j in enumerate(Postcode): 
        if j == i:
            borough = Borough[num]
            neigh += Neighborhood[num] + ', '
            
    neigh = neigh[:-2]
    newtable = newtable.append({'PostalCode': post,
                               'Borough': borough,
                               'Neighborhood': neigh}, ignore_index = True)
newtable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9W,Etobicoke,Northwest
1,M3Y,Not assigned,Not assigned
2,M7P,Not assigned,Not assigned
3,M5S,Downtown Toronto,"Harbord, University of Toronto"
4,M3J,North York,"Northwood Park, York University"


__Drop any row without an assigned Borough name__

In [69]:
newtable.drop(index = newtable.index[newtable['Borough']=='Not assigned'], inplace = True)
newtable.reset_index(drop = True, inplace = True)
newtable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9W,Etobicoke,Northwest
1,M5S,Downtown Toronto,"Harbord, University of Toronto"
2,M3J,North York,"Northwood Park, York University"
3,M2H,North York,Hillcrest Village
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."


__For each unassigned Neighborhood, replace with the Borough Name__

In [70]:
for num, i in enumerate(newtable['Neighborhood']):
    if i == 'Not assigned':
        newtable.loc[num, 'Neighborhood'] = newtable.loc[num, 'Borough']
        
newtable.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M9W,Etobicoke,Northwest
1,M5S,Downtown Toronto,"Harbord, University of Toronto"
2,M3J,North York,"Northwood Park, York University"
3,M2H,North York,Hillcrest Village
4,M9C,Etobicoke,"Bloordale Gardens, Eringate, Markland Wood, Ol..."


__Determine the size of the Dataframe__

In [71]:
r, c = newtable.shape
print('The number of rows of the dataframe is ', r)

The number of rows of the dataframe is  103
