# Web Scraping of Data.gov Dataset Catalog Using Python and BeautifulSoup
### David Lowe
### January 31, 2020

SUMMARY: The purpose of this project is to practice web scraping by extracting specific pieces of information from a website. The Python web scraping code leverages the BeautifulSoup module.

INTRODUCTION: Data.gov is a government data repository website managed and hosted by the U.S. General Services Administration. The purpose of this exercise is to practice web scraping by gathering the dataset entries from Data.gov’s web pages. This iteration of the script automatically traverses the web pages to capture all dataset entries and store all captured information in a JSON output file.

Starting URLs: https://catalog.data.gov/dataset

## Section 1. Prepare Environment

In [1]:
import os
import shutil
import smtplib
import sys
import pandas as pd
from email.message import EmailMessage
from datetime import datetime
from random import randint
from time import sleep
import requests
from requests.exceptions import HTTPError
from requests.exceptions import ConnectionError
from bs4 import BeautifulSoup
# from selenium import webdriver
# from selenium.webdriver.firefox.options import Options

In [2]:
# Record when processing started so the total run time can be reported at the end
startTimeScript = datetime.now()

# Diagnostic output switches (True prints the detailed verbose/debug messages)
debug = False
verbose = False

# Feature switches -- set a flag to True to turn the corresponding behavior on
notifyStatus = True       # send the status emails
writeToJSON = True        # write the output to a JSON document
mountStorage = False      # mount G Drive for storing files
executeDownload = False   # download files

In [3]:
# Colab-only setup: attach Google Drive so downloaded files can be stored on it.
# The import stays inside the branch so google.colab is only needed when mounting.
if mountStorage:
    from google.colab import drive
    drive.mount('/content/gdrive')

In [4]:
# Set up the email notification function
def email_notify(msg_text):
    """Send msg_text as a plain-text status email through an SMTP gateway.

    The sender, receiver, gateway host, and SMTP credentials are read from the
    MAIL_SENDER, MAIL_RECEIVER, SMTP_GATEWAY, SMTP_USERNAME, and SMTP_PASSWORD
    environment variables. If any of them is unset, the script is terminated
    through sys.exit() (raises SystemExit).
    """
    sender = os.environ.get('MAIL_SENDER')
    receiver = os.environ.get('MAIL_RECEIVER')
    gateway = os.environ.get('SMTP_GATEWAY')
    smtpuser = os.environ.get('SMTP_USERNAME')
    password = os.environ.get('SMTP_PASSWORD')
    if any(setting is None for setting in (sender, receiver, gateway, smtpuser, password)):
        sys.exit("Incomplete email setup info. Script Processing Aborted!!!")

    msg = EmailMessage()
    msg.set_content(msg_text)
    msg['Subject'] = 'Notification from Python Web Scraping Script'
    msg['From'] = sender
    msg['To'] = receiver

    # The context manager issues QUIT and closes the socket even when
    # starttls/login/send_message raises, so the connection is never leaked.
    with smtplib.SMTP(gateway, 587) as server:
        server.starttls()
        server.login(smtpuser, password)
        server.send_message(msg)

In [5]:
def access_url_bsoup(url, timeout=60):
    """Fetch url after a short random delay and return it parsed by BeautifulSoup.

    Parameters:
        url: address of the web page to retrieve.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A BeautifulSoup object built with the 'lxml' parser, or None when the
        response status is neither an error nor OK.

    The script is terminated through sys.exit() when the server answers with
    an HTTP error status, cannot be reached, or times out.
    """
    # Creating an html document from the URL
    uastring = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"
    headers = {'User-Agent': uastring}
    # Adding random wait time so we do not hammer the website needlessly
    waitTime = randint(3,6)
    print("Waiting " + str(waitTime) + " seconds to finish retrieving the URL.")
    sleep(waitTime)
    print('Attempting to access the web page: ' + url)
    try:
        # Closing the session releases its pooled connection after every call
        with requests.Session() as s:
            resp = s.get(url, headers=headers, timeout=timeout)
        if (debug): print(resp.text)
        # requests only raises HTTPError for 4xx/5xx answers when asked to,
        # so without this call the handler below could never run
        resp.raise_for_status()
    except HTTPError as e:
        print('The server could not serve up the web page!')
        sys.exit("Script processing cannot continue!!!")
    except (ConnectionError, requests.exceptions.Timeout) as e:
        print('The server could not be reached due to connection issues!')
        sys.exit("Script processing cannot continue!!!")

    if resp.status_code != requests.codes.ok:
        print('Unexpected HTTP status ' + str(resp.status_code) + ' for the web page: ' + url)
        return None

    print('Successfully accessed the web page: ' + url)
    bsoup_obj = BeautifulSoup(resp.text, 'lxml')
    return(bsoup_obj)

In [6]:
def access_url_sel(url):
    """Load url in headless Firefox and return the rendered body parsed by BeautifulSoup.

    NOTE: this function uses `webdriver` and `Options` from selenium, whose
    import lines are commented out at the top of this script; they must be
    re-enabled before this function is called.
    """
    wait_time = 5
    firefox_options = Options()
    firefox_options.headless = True
    print('Attempting to access the web page: ' + url)
    browser = webdriver.Firefox(options=firefox_options)
    try:
        browser.get(url)
        # The original message referenced an undefined name (waitTime) and raised NameError
        print("Waiting " + str(wait_time) + " seconds to finish retrieving the URL.")
        sleep(wait_time)
        innerHTML = browser.execute_script("return document.body.innerHTML")
        sleep(wait_time)
        bsoup_obj = BeautifulSoup(innerHTML, 'lxml')
        if (debug): print(bsoup_obj.prettify())
    finally:
        # Always shut the browser down so a failed page load does not leave it running
        browser.quit()
    return bsoup_obj

In [7]:
def assign_resource(resource_element):
    """Return the (url, format) pair described by the first <a> inside resource_element.

    The URL is the anchor's 'href' attribute and the format is its
    'data-format' attribute. '[Not Found]' is returned for a value whose
    attribute is absent, and for both values when the element has no anchor,
    so one malformed resource entry does not abort the whole run.
    """
    if (debug): print(resource_element)
    anchor = resource_element.a
    if anchor is None:
        return('[Not Found]', '[Not Found]')
    resource_url = anchor.get('href', '[Not Found]')
    if (verbose): print("Found resource URL:", resource_url)
    resource_format = anchor.get('data-format', '[Not Found]')
    if (verbose): print("Found resource format:", resource_format)
    return(resource_url, resource_format)

In [8]:
def download_to_local(doc_path):
    """Download the file at URL doc_path into the current directory.

    The local file name is the last '/'-separated segment of doc_path. A file
    that already exists locally is skipped. When the server does not answer
    with an OK status, a message is printed and nothing is saved.
    """
    local_file = doc_path.split('/')[-1]
    # Check first: an existing file needs no request, so no politeness delay either
    if os.path.isfile(local_file):
        print('Skipped existing file: ' + local_file)
        return

    # Adding random wait time so we do not hammer the website needlessly
    waitTime = randint(2,5)
    print("Waiting " + str(waitTime) + " seconds to retrieve " + doc_path)
    sleep(waitTime)

    # Write to a temporary name so an interrupted transfer never leaves a
    # truncated file that a later run would skip as "existing"
    partial_file = local_file + '.part'
    with requests.get(doc_path, stream=True) as r:
        if r.status_code != requests.codes.ok:
            # Do not store an error page under the document's name
            print('Failed to download ' + doc_path + ' (HTTP status ' + str(r.status_code) + ')')
            return
        # The raw stream is not content-decoded unless requested; without this
        # a gzip/deflate-encoded response would be written still compressed
        r.raw.decode_content = True
        with open(partial_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    os.replace(partial_file, local_file)
    print('Downladed file: ' + local_file)

In [9]:
def download_to_gdrive(doc_path):
    """Download the file at URL doc_path into the Colab_Downloads folder on the mounted drive.

    The destination name is the last '/'-separated segment of doc_path placed
    under '/content/gdrive/My Drive/Colab_Downloads/'; an existing file of the
    same name is overwritten. When the server does not answer with an OK
    status, a message is printed and nothing is saved.
    """
    # Adding random wait time so we do not hammer the website needlessly
    waitTime = randint(2,5)
    print("Waiting " + str(waitTime) + " seconds to retrieve " + doc_path)
    sleep(waitTime)
    local_file = doc_path.split('/')[-1]
    gdrivePrefix = '/content/gdrive/My Drive/Colab_Downloads/'
    dest_file = gdrivePrefix + local_file
    with requests.get(doc_path, stream=True) as r:
        if r.status_code != requests.codes.ok:
            # Do not store an error page under the document's name
            print('Failed to download ' + doc_path + ' (HTTP status ' + str(r.status_code) + ')')
            return
        # The raw stream is not content-decoded unless requested; without this
        # a gzip/deflate-encoded response would be written still compressed
        r.raw.decode_content = True
        with open(dest_file, 'wb') as f:
            shutil.copyfileobj(r.raw, f)
    print('Downladed file: ' + dest_file)

In [10]:
# Announce the end of the environment setup phase, stamped with the current time
if notifyStatus:
    email_notify(
        "Phase 1 Prepare Environment completed! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

## Section 2. Perform the Scraping and Processing

In [11]:
# Announce the start of the scraping phase, stamped with the current time
if notifyStatus:
    email_notify(
        "Phase 2 Perform the Scraping and Processing has begun! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

In [12]:
# Empty dataframe that captures one row per dataset entry: five descriptive
# columns followed by a url/format column pair for each of six resource slots
df = pd.DataFrame(columns=[
    'dataset_title', 'org_type', 'agency_name', 'dataset_url', 'description',
    *('resource{}_{}'.format(slot, field)
      for slot in range(1, 7)
      for field in ('url', 'format')),
])

In [13]:
# Operating parameters for the crawl
done = False      # loop flag; the scraping loop runs while this is False
num_entries = 0   # running count of dataset records captured

# Starting and ending page numbers to visit
# (the original trailing notes read "1" and "13086" -- presumably the values
#  for a full-catalog run; confirm before changing)
num_pages, max_pages = 1, 500

In [14]:
# Base address of the website to be scraped, and the URL of the first
# catalog listing page to visit (built from the starting page number)
website_url = "https://catalog.data.gov"
dataset_page_url = "".join([website_url, "/dataset?page=", str(num_pages)])

In [15]:
# Visit the catalog listing pages one at a time, adding one dataframe row per
# dataset entry, until max_pages is reached or the site has nothing more to list
while not done:
    # Retrieve the current listing page; stop cleanly when no parsed page came
    # back so the rows captured so far are kept
    home_page_browser = access_url_bsoup(dataset_page_url)
    if home_page_browser is None:
        print('No content retrieved from ' + dataset_page_url + '. Stopping the scraping loop.')
        break

    # Gather all dataset entries from the listing page
    main_page_container = home_page_browser.find("section", class_="module")
    if main_page_container is None:
        collection = []
    else:
        collection = main_page_container.find_all("li", class_="dataset-item")

    for item in collection:
        if (debug): print(item)
        org_type_element = item.find("span", class_="organization-type")
        if org_type_element is None:
            org_type = '[Not Found]'
        else:
            org_type = org_type_element['data-organization-type']
            if (verbose): print("Found organization type:", org_type)

        data_heading_element = item.find("h3", class_="dataset-heading")
        if data_heading_element is None:
            dataset_title = '[Not Found]'
            dataset_url = '[Not Found]'
        else:
            dataset_title = data_heading_element.a.text
            if (verbose): print("Found dataset title:", dataset_title)
            dataset_url = website_url + data_heading_element.a['href']
            if (verbose): print("Found dataset url:", dataset_url)

        org_name_element = item.find("p", class_="dataset-organization")
        if org_name_element is None:
            agency_name = '[Not Found]'
        else:
            agency_name = org_name_element.text.rstrip(" —")
            if (verbose): print("Found agency name:", agency_name)

        # An entry without a notes block must not raise on the .div lookup
        notes_element = item.find("div", class_="notes")
        description_element = None if notes_element is None else notes_element.div
        if description_element is None:
            description = '[Not Found]'
        else:
            description = description_element.text
            if (verbose): print("Found dataset description:", description)

        # Six (url, format) slots, flattened in column order; only the first
        # six resources of an entry are recorded, the rest are ignored
        resource_fields = ['[Not Applicable]'] * 12
        data_resources_list = item.find("ul", class_="dataset-resources")
        if data_resources_list is not None:
            data_resources_element = data_resources_list.find_all("li")
            if (debug): print(data_resources_element)
            for resource_index, individual_res in enumerate(data_resources_element[:6]):
                resource_url, resource_format = assign_resource(individual_res)
                resource_fields[2 * resource_index] = resource_url
                resource_fields[2 * resource_index + 1] = resource_format

        df.loc[num_entries] = [dataset_title, org_type, agency_name, dataset_url, description] + resource_fields
        num_entries = num_entries + 1

    # Progress emails honor the notifyStatus flag like every other notification
    if notifyStatus and (num_pages % 100) == 0:
        email_notify("Finished parsing page number " + str(num_pages) + " at " + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p'))

    pagination_css = home_page_browser.find("div", class_="pagination")
    active_element = None if pagination_css is None else pagination_css.find("li", class_="active")
    if not collection or active_element is None:
        # No entries or no pagination marker: there is nothing further to visit
        done = True
    else:
        active_page = int(active_element.text)
        # ">=" rather than "==" so the loop still ends if the page counter
        # ever moves past max_pages
        if (active_page >= max_pages):
            done = True
        else:
            num_pages = num_pages + 1
            dataset_page_url = website_url + "/dataset?page=" + str(num_pages)

Waiting 3 seconds to finish retrieving the URL.
Attempting to access the web page: https://catalog.data.gov/dataset?page=1
Successfully accessed the web page: https://catalog.data.gov/dataset?page=1
Waiting 5 seconds to finish retrieving the URL.
Attempting to access the web page: https://catalog.data.gov/dataset?page=2
Successfully accessed the web page: https://catalog.data.gov/dataset?page=2
Waiting 6 seconds to finish retrieving the URL.
Attempting to access the web page: https://catalog.data.gov/dataset?page=3
Successfully accessed the web page: https://catalog.data.gov/dataset?page=3
Waiting 5 seconds to finish retrieving the URL.
Attempting to access the web page: https://catalog.data.gov/dataset?page=4
Successfully accessed the web page: https://catalog.data.gov/dataset?page=4
Waiting 6 seconds to finish retrieving the URL.
Attempting to access the web page: https://catalog.data.gov/dataset?page=5
Successfully accessed the web page: https://catalog.data.gov/dataset?page=5
Waiti

In [16]:
# Summarize the crawl: last page number reached and number of rows captured
print(
    'Finished finding all available web pages:',
    num_pages,
)
print(
    'Total datasets processed:',
    num_entries,
)

Finished finding all available web pages: 500
Total datasets processed: 9865


In [17]:
# When enabled, serialize the dataframe as a JSON array of record objects
# and save it next to the script
if writeToJSON:
    out_file = df.to_json(orient='records')
    with open('web-scraping-py-bsoup-datagov-datasets.json', 'w') as f:
        f.write(out_file)

In [18]:
# Announce the end of the scraping phase, stamped with the current time
if notifyStatus:
    email_notify(
        "Phase 2 Perform the Scraping and Processing completed! "
        + datetime.now().strftime('%a %B %d, %Y %I:%M:%S %p')
    )

In [19]:
# Report the time elapsed since startTimeScript was recorded
print('Total time for the script:', datetime.now() - startTimeScript)

Total time for the script: 0:57:17.814986
