In [5]:
from bs4 import BeautifulSoup
import json

The following script fetches data from a locally saved copy of a web page using the BeautifulSoup library. The data is taken from Wikipedia and represents the population and geographic values of NYC's boroughs.

Page: https://en.wikipedia.org/wiki/Boroughs_of_New_York_City

In [3]:
def load_html_from_memory(file_name="data/Boroughs_of_New_York_City_Wikipedia.html"):
    """Parse a locally saved HTML page into a BeautifulSoup document.

    Despite the name, the page is read from disk, not from memory.

    Args:
        file_name: path of the saved HTML file. Defaults to the saved copy of
            the "Boroughs of New York City" Wikipedia page.

    Returns:
        A BeautifulSoup object built with the 'html.parser' parser.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    # Decode explicitly instead of relying on the platform's locale default,
    # which differs between systems (e.g. cp1252 on Windows).
    # NOTE(review): assumes the saved page is UTF-8 encoded - confirm for the local file.
    with open(file_name, encoding="utf-8") as f:
        return BeautifulSoup(f, 'html.parser')

# parse the locally saved Wikipedia page; everything below reads from this soup
stock_soup = load_html_from_memory()

# get boroughs table from html file
table_name = "wikitable sortable jquery-tablesorter" # class attribute of the table, found manually
boroughs_table = stock_soup.find("table", {"class":table_name})
if boroughs_table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # instead of an AttributeError on the find_all() call below
    raise ValueError('no <table class="' + table_name + '"> found in the saved HTML page')

# get boroughs table tags (every tag nested inside the table)
boroughs_table_tags = boroughs_table.find_all()

# get boroughs table row tags

# table_row_tags_dict: dictionary containing information about the five boroughs of NYC.
# the keys of the dictionary are the names of the boroughs, and the values are lists of
# floats containing the values found in the table of the Wikipedia page. Index 4 (0-based)
# of the list is the Land area of the borough in square km.
table_row_tags_dict = {}

table_row_values = []
number_of_boroughs_scrapped = 0
max_number_of_boroughs = 5

for tag in boroughs_table_tags:

    # only the table body holds the borough values
    if tag.name != "tbody":
        continue

    # gets every tag nested in the table body (links, cells, ...), in document order
    table_row_tags = tag.find_all()

    # initialize borough name, used as key for adding values to dictionary
    current_borough_name = ""

    for row_tag in table_row_tags:

        # a link marks the start of a new borough row
        if "href" in row_tag.attrs:
            # restarts boroughs values
            table_row_values = []
            # updates current_borough_name, used as key for the dictionary of boroughs
            current_borough_name = row_tag["title"]
            # increase number of boroughs that have been scraped
            number_of_boroughs_scrapped += 1
            # once a link beyond the first 5 boroughs is reached, stops iterating
            if number_of_boroughs_scrapped > max_number_of_boroughs:
                break

        # attribute-less <td> cells are the ones carrying the numeric values
        if row_tag.name == "td" and len(row_tag.attrs.keys()) == 0:
            # converts value from string to float
            #    - strip() removes surrounding whitespace such as the trailing "\n";
            #      unlike slicing with [:-1], it never drops a digit when the
            #      newline is absent
            #    - replace(",", "") is used to remove commas from the string
            #    - dots (".") in the string are not removed. They serve as decimal indicators
            formatted_value = float(row_tag.text.strip().replace(",", ""))
            # adds formatted value to borough list
            table_row_values.append(formatted_value)
            # adds/updates borough list to borough dictionary (where borough name is the key)
            table_row_tags_dict[current_borough_name] = table_row_values

    # the break above only leaves the inner loop; stop scanning the remaining
    # table tags as well once all boroughs have been collected
    if number_of_boroughs_scrapped > max_number_of_boroughs:
        break

print(table_row_tags_dict)

{'The Bronx': [1471160.0, 28.787, 19570.0, 42.1, 109.04, 34653.0, 13231.0], 'Brooklyn': [2648771.0, 63.303, 23900.0, 70.82, 183.42, 37137.0, 14649.0], 'Manhattan': [1664727.0, 629.682, 378250.0, 22.83, 59.13, 72033.0, 27826.0], 'Queens': [2358582.0, 73.842, 31310.0, 108.53, 281.09, 21460.0, 8354.0], 'Staten Island': [479458.0, 11.249, 23460.0, 58.37, 151.18, 8112.0, 3132.0]}


In [7]:
# write the scraped borough dictionary to disk as a JSON document
with open('data/borough_data.json', 'w') as json_file:
    json.dump(table_row_tags_dict, json_file)