## Applied Data Science Capstone - Week 3 - First Link

### Website scraping with BeautifulSoup

In [205]:

# Beautiful Soup
from bs4 import BeautifulSoup
import requests

# Page to scrape: Wikipedia's list of Canadian postal codes starting with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch the page. Without a timeout requests can wait on a stalled connection
# indefinitely, and without raise_for_status() an HTTP error page (4xx/5xx)
# would be handed to the parser as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Extracting the source code of the page.
data = response.text

# Passing the source code to BeautifulSoup to create a BeautifulSoup object for it.
soup = BeautifulSoup(data, 'lxml')

# Find the table. The class value is matched as the exact attribute string
# 'wikitable sortable', so a page that reorders or extends the class list will
# not match; fail here with a clear message instead of letting a None leak
# into the cells that follow.
My_table = soup.find('table', {'class': 'wikitable sortable'})
if My_table is None:
    raise ValueError(
        "No <table class='wikitable sortable'> found at " + url
        + "; the page layout may have changed."
    )

### Create the DataFrame

In [206]:
# Import pandas and lift its display limits for this notebook.
import pandas as pd # library for data analysis
# Passing None removes the cap, so frames are rendered in full instead of
# being truncated with "..." (applies to every frame displayed afterwards).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Column labels, in the same order the cells are read from each table row.
column_names = ['PostalCode', 'Borough', 'Neighborhood']

# Build one [postcode, borough, neighborhood] record per usable table row.
rows_list = []
for tr in My_table.find_all('tr'):
    tds = tr.find_all('td')
    # Rows without <td> cells (e.g. a header row) have nothing to read, and a
    # row with only one or two cells would make the three-way unpack below
    # raise ValueError, so both are skipped.
    if len(tds) < 3:
        continue
    # Only the first three cells are used; surrounding whitespace/newlines
    # from the HTML are stripped.
    postcode, borough, neighborhood = (td.text.strip() for td in tds[:3])
    rows_list.append([postcode, borough, neighborhood])

# Raw (uncleaned) frame; 'Not assigned' placeholders are still present here.
postalcodes = pd.DataFrame(rows_list, columns=column_names)

### Clean the DataFrame and show the first 5 rows of the resulting DataFrame

In [207]:
# Keep only the rows whose 'Borough' is assigned, renumbering them from 0.
has_borough = postalcodes['Borough'] != 'Not assigned'
postalcodes = postalcodes.loc[has_borough].reset_index(drop=True)

# Where 'Neighborhood' is 'Not assigned', fall back to the row's 'Borough'.
unnamed = postalcodes['Neighborhood'] == 'Not assigned'
postalcodes['Neighborhood'] = postalcodes['Neighborhood'].mask(unnamed, postalcodes['Borough'])

# Collapse rows sharing a PostalCode and Borough into one row whose
# Neighborhood is the comma-separated list of the originals. sort=False keeps
# the groups in first-appearance order.
postalcodes_group = (
    postalcodes
    .groupby(['PostalCode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Show the first 5 rows from the new DataFrame
postalcodes_group.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


### Print the number of rows of the resulting DataFrame.

In [208]:
# Count the rows of the grouped frame and report the total.
number_of_rows = len(postalcodes_group)
print('The number of rows is ', number_of_rows)

The number of rows is  103
