In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import json

## Extracting each type of content from UMD webpages ##

Data Format

In [3]:
# Blank record that every extractor copies and fills in; all four fields start
# as empty strings.
template_data = dict.fromkeys(("Link", "Site_Title", "Header", "Content"), "")

Getting main content

In [4]:
def get_main_content(soup, page_template):
  """Split the page's main editor content into one record per heading.

  Looks up the element with id ``main-content`` and, inside it, the first
  element with class ``editor-content``. Its direct children are walked in
  order: every h1/h2/h3 closes the record being built and opens a new one
  whose ``Header`` is the heading text, while ``<p>`` text and ``<li>`` text
  (from ``<ul>`` children) is appended to the current record's ``Content``.

  Args:
    soup: Parsed page (or any object with a compatible ``find``).
    page_template: Dict copied for every record; it is not modified.

  Returns:
    List of record dicts, or an empty list when either container is missing.
    The record holding whatever precedes the first heading is always included,
    even if it is empty.
  """
  sections = []

  container = soup.find(id='main-content')
  if not container:
    return sections

  editor = container.find(class_='editor-content')
  if not editor:
    return sections

  current = page_template.copy()
  for node in editor.children:
    tag = node.name  # None for bare text nodes, which are ignored
    if tag in ('h1', 'h2', 'h3'):
      # A heading finishes the previous record and starts the next one.
      sections.append(current)
      current = page_template.copy()
      current['Header'] = node.get_text()
    elif tag == 'p':
      current['Content'] += node.get_text(strip=True) + " "
    elif tag == 'ul':
      for item in node.find_all('li'):
        current['Content'] += item.get_text() + " "

  if current:
    sections.append(current)

  return sections

Getting text

In [5]:
def get_text_content(soup, page_template):
  """Extract records from the text-like page sections.

  Every ``editor-content`` element found inside a feature, text, or
  image-with-text section produces:

  * one record per ``<li>`` of its first ``<ul>`` (if any), and
  * one record holding the text of all its ``<p>`` elements, or, when it has
    no ``<p>``, the text of all its ``<span>`` elements.

  All records of a block share the block's header: the text of its first
  h1, falling back to h2, h3, then h4.

  Args:
    soup: Parsed page (or any object with a compatible ``find_all``).
    page_template: Dict copied for every record; it is not modified.

  Returns:
    List of record dicts, list-item records first within each block.
  """
  section_classes = (
      'page-section-ut_feature',
      'section-ut_feature',
      'page-section-ut_text',
      'section-ut_text',
      'page-section-ut_image_with_text',
      'section-ut_image_with_text',
  )

  # Collect editor-content blocks. The same element can be reached through
  # more than one matching ancestor (e.g. a "page-section-*" wrapper around a
  # "section-*" element), so track identity to avoid emitting it twice.
  editor_blocks = []
  seen_ids = set()
  for section_class in section_classes:
    for section in soup.find_all(class_=section_class):
      for block in section.find_all(class_='editor-content'):
        if id(block) not in seen_ids:
          seen_ids.add(id(block))
          editor_blocks.append(block)

  text_data = []
  for block in editor_blocks:
    record = page_template.copy()

    # Resolve the header first so the list-item records below inherit it.
    for heading_tag in ('h1', 'h2', 'h3', 'h4'):
      heading = block.find(heading_tag)
      if heading:
        record['Header'] = heading.get_text()
        break

    first_list = block.find('ul')
    if first_list:
      for item in first_list.find_all('li'):
        item_record = record.copy()
        item_record['Content'] = item.get_text()
        text_data.append(item_record)

    paragraphs = block.find_all('p')
    if paragraphs:
      for paragraph in paragraphs:
        record['Content'] += paragraph.get_text() + " "
    else:
      # Fall back to spans; accumulate them all, as is done for paragraphs.
      for span in block.find_all('span'):
        record['Content'] += span.get_text() + " "

    text_data.append(record)

  return text_data

Getting accordion content

In [6]:
def get_accordion_content(soup, page_template):
  """Build one record per card found inside any ``accordion`` element.

  The card's ``card-header`` text becomes ``Header`` and its ``card-body``
  text becomes ``Content`` (both whitespace-stripped). A missing part leaves
  the template's value for that field untouched.

  Args:
    soup: Parsed page (or any object with a compatible ``find_all``).
    page_template: Dict copied for every record; it is not modified.

  Returns:
    List of record dicts in document order.
  """
  def card_to_record(card):
    record = page_template.copy()

    header = card.find(class_='card-header')
    if header:
      record['Header'] = header.get_text(strip=True)

    body = card.find(class_='card-body')
    if body:
      record['Content'] = body.get_text(strip=True)

    return record

  return [
      card_to_record(card)
      for accordion in soup.find_all(class_='accordion')
      for card in accordion.find_all(class_='card')
  ]

Getting Card Groups

In [7]:
def get_card_groups(soup, page_template):
  """Build one record per ``card-wrap`` found inside any ``card-group``.

  ``Header`` comes from the wrap's ``card-title`` and ``Content`` from its
  ``card-text``, both whitespace-stripped; a missing part keeps the template
  value.

  Args:
    soup: Parsed page (or any object with a compatible ``find_all``).
    page_template: Dict copied for every record; it is not modified.

  Returns:
    List of record dicts in document order.
  """
  # (record field, CSS class that supplies it), filled in this order.
  field_sources = (('Header', 'card-title'), ('Content', 'card-text'))

  records = []
  for group in soup.find_all(class_='card-group'):
    for wrap in group.find_all(class_='card-wrap'):
      record = page_template.copy()
      for field, css_class in field_sources:
        element = wrap.find(class_=css_class)
        if element:
          record[field] = element.get_text(strip=True)
      records.append(record)

  return records

Getting Slideshows

In [8]:
def get_slideshow_data(soup, page_template):
  """Build one record per ``slideshow-item`` inside slideshow sections.

  Sections with class ``section-ut_slideshow`` are processed before those
  with class ``page-section-ut_slideshow``. For each item, the
  ``slideshow-caption-title`` text becomes ``Header`` and the
  ``slideshow-caption-content`` text becomes ``Content`` (text is taken as-is,
  without stripping); a missing part keeps the template value.

  Args:
    soup: Parsed page (or any object with a compatible ``find_all``).
    page_template: Dict copied for every record; it is not modified.

  Returns:
    List of record dicts.
  """
  containers = [
      container
      for css_class in ('section-ut_slideshow', 'page-section-ut_slideshow')
      for container in soup.find_all(class_=css_class)
  ]

  slides = []
  for container in containers:
    for item in container.find_all(class_='slideshow-item'):
      record = page_template.copy()

      caption_title = item.find(class_='slideshow-caption-title')
      if caption_title:
        record['Header'] = caption_title.get_text()

      caption_body = item.find(class_='slideshow-caption-content')
      if caption_body:
        record['Content'] = caption_body.get_text()

      slides.append(record)

  return slides

Cleaning the scraped content

In [9]:
def clean_contents(data, min_length=50):
  """Drop records with too little content and normalise non-breaking spaces.

  Args:
    data: Iterable of record dicts, each with a string ``'Content'`` entry.
    min_length: Records whose ``Content`` is shorter than this many characters
      are discarded. Defaults to 50.

  Returns:
    A new list of new dicts (the input records are left untouched) in which
    every ``"\\xa0"`` in ``Content`` has been replaced by a regular space.
  """
  cleaned_data = []
  for chunk in data:
    content = chunk['Content']
    if len(content) < min_length:
      continue
    # Copy instead of editing in place so callers' records are never altered.
    cleaned_data.append({**chunk, 'Content': content.replace("\xa0", " ")})

  return cleaned_data

## Class to scrape UMD-style websites ##

In [10]:
class UMDWebScraper:
  """Crawl one site depth-first and collect content records from every page.

  Attributes:
    url: Root URL of the site; only links starting with it are followed. The
      uploaded-files filter appends ``"sites/default/files"`` directly to this
      value, so it is expected to end with ``"/"``.
    visited_links: Set of URLs already attempted (including failed fetches).
    data: Accumulated, cleaned record dicts from all scraped pages.
  """

  def __init__(self, url):
    self.url = url
    self.visited_links = set()
    self.data = []

  def fetch_page(self, url):
    """Download ``url`` and parse it; return ``None`` (after logging) on failure."""
    try:
      response = requests.get(url, timeout=10)
      response.raise_for_status()
      return BeautifulSoup(response.text, "html.parser")
    except requests.RequestException as e:
      print(f"Failed to fetch {url}: {e}")
      return None

  def extract_links(self, soup, base_url):
    """Return the set of not-yet-visited, in-site links found on a page.

    Links are resolved against ``base_url``. A link is kept only if it starts
    with ``self.url``, has not been visited, does not point into
    ``sites/default/files``, and contains neither ``#`` nor ``?``.
    """
    files_prefix = self.url + "sites/default/files"
    links = set()
    for a_tag in soup.find_all("a", href=True):
      link = urljoin(base_url, a_tag["href"])
      if (link.startswith(self.url)
          and link not in self.visited_links
          and not link.startswith(files_prefix)
          and '#' not in link and '?' not in link):
        links.add(link)
    return links

  def extract_content(self, soup, page_template):
    """Run every extractor on the page and return the cleaned, combined records."""
    main_content = get_main_content(soup, page_template)
    text_content = get_text_content(soup, page_template)
    accordion_content = get_accordion_content(soup, page_template)
    card_group_content = get_card_groups(soup, page_template)
    slideshow_content = get_slideshow_data(soup, page_template)

    content = main_content + text_content + accordion_content + card_group_content + slideshow_content

    return clean_contents(content)

  @staticmethod
  def _page_title(soup):
    """Return the page's <title> text as a plain ``str`` ("" when unavailable)."""
    title_tag = soup.title
    title = title_tag.string if title_tag is not None else None
    # str() detaches the text from the parse tree so stored records do not
    # keep every parsed page alive.
    return str(title) if title is not None else ""

  def _scrape_page(self, url):
    """Scrape a single page; return the new links it exposes (possibly empty)."""
    if url in self.visited_links:
      return set()

    print(f"Scraping: {url}")
    self.visited_links.add(url)

    soup = self.fetch_page(url)
    if not soup:
      return set()

    page_template = template_data.copy()
    page_template['Link'] = url
    page_template['Site_Title'] = self._page_title(soup)

    self.data += self.extract_content(soup, page_template)

    return self.extract_links(soup, url)

  def scrape(self, url):
    """Crawl depth-first from ``url``, appending each page's records to ``self.data``.

    Uses an explicit stack of link iterators rather than recursion, so long
    chains of pages cannot exhaust Python's recursion limit. Pages are visited
    in the same order a recursive depth-first walk would use.
    """
    pending = [iter((url,))]
    while pending:
      link = next(pending[-1], None)
      if link is None:
        # Every link discovered at this depth has been handled.
        pending.pop()
        continue
      new_links = self._scrape_page(link)
      if new_links:
        pending.append(iter(new_links))

  def save_data(self, filename="umd_sustainability_data.json"):
    """Write ``self.data`` to ``filename`` as indented UTF-8 JSON."""
    with open(filename, "w", encoding="utf-8") as f:
      json.dump(self.data, f, indent=4, ensure_ascii=False)
    print(f"Data saved to {filename}")

Creating Scraper Objects

In [1]:
# Root URLs of the sites to crawl, one entry per site.
sites = [
    "https://sustainability.umd.edu/",
    "https://sustainingprogress.umd.edu/",
]

In [11]:
import re

# Compiled once. The dots are escaped so only a literal ".umd.edu" host matches,
# and the subdomain must be non-empty (letters, digits, hyphens).
_UMD_SITE_RE = re.compile(r'https://([A-Za-z0-9-]+)\.umd\.edu/')

def get_site_name(site):
  """Return the subdomain of a UMD root URL.

  Args:
    site: URL of the exact form ``https://<subdomain>.umd.edu/`` (https scheme,
      trailing slash, no path).

  Returns:
    The ``<subdomain>`` part, e.g. ``"sustainability"``.

  Raises:
    ValueError: If ``site`` does not have that form.
  """
  matched = _UMD_SITE_RE.fullmatch(site)
  if matched:
    return matched.group(1)
  raise ValueError("Not good site name (not umd)")

In [14]:
# Running total of records kept across all sites.
data_count = 0

for site in sites:
  # Resolve (and thereby validate) the site name before crawling, so a
  # malformed URL fails immediately instead of after a full crawl whose
  # results would then never be saved.
  site_name = get_site_name(site)

  scraper = UMDWebScraper(site)
  scraper.scrape(site)

  print(f"Total for {site_name} site: {len(scraper.data)}")
  data_count += len(scraper.data)
  scraper.save_data(f"umd_{site_name}_data.json")

Scraping: https://sustainability.umd.edu/
Scraping: https://sustainability.umd.edu/transportation
Scraping: https://sustainability.umd.edu/buildings
Scraping: https://sustainability.umd.edu/food
Scraping: https://sustainability.umd.edu/progress/reports-other-resources
Scraping: https://sustainability.umd.edu/topics
Scraping: https://sustainability.umd.edu/waste
Scraping: https://sustainability.umd.edu/progress-commitments
Scraping: https://sustainability.umd.edu/education-and-research/majors-minors-graduate-programs
Scraping: https://sustainability.umd.edu/sustainability-grants/other-funding-sources
Scraping: https://sustainability.umd.edu/education-and-research/sustainability-teaching-fellows
Failed to fetch https://sustainability.umd.edu/education-and-research/sustainability-teaching-fellows: 403 Client Error: Forbidden for url: https://sustainability.umd.edu/education-and-research/archive-sustainability-teaching-fellows
Scraping: https://sustainability.umd.edu/about/contact-us
Scrap

In [15]:
# Total number of records kept across all scraped sites.
data_count

1620