### UChicago MS-ADS program web scraping

In [1]:
import time
import json
import re
from urllib.parse import urljoin, urldefrag
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup, Tag
import html
from collections import defaultdict

In [7]:
BASE_URL = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
visited = set() #Queue of links to visit
urls_to_visit = [BASE_URL]
documents = []

def setup_selenium(): #use Selenium to render pages
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    driver = webdriver.Chrome(options=options)
    return driver

def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return html.unescape(text)

def extract_tabs_accordions(soup, url, page_title): #Extract information stored in tabs and accordions such as courses
    entries = []
    tabs = soup.select('ul.tabs > li.tabs-title > a')
    for tab in tabs:
        tab_name = tab.get_text(strip=True)
        tab_id = tab.get('href', '').lstrip('#')
        tab_panel = soup.find('div', id=tab_id)
        if not tab_panel:
            continue

        accordion_items = tab_panel.select('ul.accordion li.accordion__item')
        for item in accordion_items:
            title_tag = item.select_one('a.accordion-title')
            desc_tag = item.select_one('div.accordion__content div.textblock')
            if title_tag and desc_tag:
                title = clean_text(title_tag.get_text())
                description = clean_text(desc_tag.get_text())
                entries.append({
                    "url": url,
                    "section": tab_name,
                    "title": title,
                    "text": f"{description}"
                })
    return entries

def extract_text_sections(soup, url, page_title): #Extract main content in sections
    for tab_block in soup.select(".tabs-content"):
        tab_block.decompose()

    sections = []
    current_heading = None
    current_text = []

    root = soup.find("main") or soup.body

    for tag in root.descendants:
        if isinstance(tag, Tag):
            if tag.name in ['h2', 'h3']:
                if current_heading or current_text:
                    section_text = clean_text(' '.join(current_text))
                    if section_text:
                        sections.append({
                            "url": url,
                            "section": page_title,
                            "title":current_heading,
                            "text": f"{section_text}"
                        })
                    current_text = []
                current_heading = clean_text(tag.get_text())

            elif tag.name == 'p':
                text = clean_text(tag.get_text())
                if text:
                    current_text.append(text)

    if current_heading or current_text:
        section_text = clean_text(' '.join(current_text))
        if section_text:
            sections.append({
                "url": url,
                "section": page_title,
                "title":current_heading,
                "text": f"{section_text}"
            })

    return sections
    
def extract_people_profiles(soup, url, page_title): #Extract faculty profiles that are stored in gridder list
    people = []
    person_blocks = soup.select('div.gridder-content')

    for person in person_blocks:
        person_id = person.get('id', '')
        name = ' '.join(part.capitalize() for part in person_id.replace('-', ' ').split()) if person_id else "N/A"

        bio = person.select("div.textblock p")
        description = ' '.join(clean_text(p.get_text()) for p in bio if p.get_text(strip=True))

        if description:
            people.append({
                "url": url,
                "section":page_title,
                "title":name,
                "text": f"{description}"
            })
    return people

def main():
    driver = setup_selenium()

    while urls_to_visit:
        url = urls_to_visit.pop(0)
        if url in visited:
            continue

        print(f"Scraping {url}")
        try:
            driver.get(url)
            time.sleep(2)

            page_html = driver.page_source
            soup = BeautifulSoup(page_html, 'html.parser')

            visited.add(url)

            page_title = soup.title.string.strip() if soup.title else 'Untitled Page'
            #Extract information from different structures
            tab_data = extract_tabs_accordions(soup, url, page_title)
            people_data = extract_people_profiles(soup, url, page_title)
            section_data = extract_text_sections(soup, url, page_title)

            documents.extend(tab_data)
            documents.extend(section_data)
            documents.extend(people_data)

            # Get more sublinks to visit
            for a_tag in soup.find_all("a", href=True):
                href = a_tag['href']
                full_url = urljoin(url, href)
                full_url, _ = urldefrag(full_url)  # Remove #fragments to avoid redundant urls

                if full_url.startswith(BASE_URL) and full_url not in visited and full_url not in urls_to_visit: #Only look at univisited sublinks for the program
                    urls_to_visit.append(full_url)

        except Exception as e:
            print(f"Error scraping {url}: {e}")

    driver.quit()
    section_title_map = defaultdict(list)

    for doc in documents:
        section = doc["section"]
        title = doc["title"]
        if title:
            section_title_map[section].append(title)

    # Create one summary row per section
    summary_rows = []
    for section, titles in section_title_map.items():
        summary_text = "\n".join(f", {t}" for t in titles)
        summary_rows.append({
            "url": "",  # Optional: set this to a representative URL if needed
            "section": section,
            "title": section,
            "text": f"{summary_text}"
        })

    # Add summaries to the documents list
    documents.extend(summary_rows)

    # Save data to JSON
    with open("uchicago_msads_data.json", "w", encoding="utf-8") as f:
        json.dump(documents, f, indent=2, ensure_ascii=False)
    print(f"Saved {len(documents)} text chunks.")

if __name__ == "__main__":
    main()

Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/%20
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/capstone-projects/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/course-progressions/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/how-to-apply/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/events-deadlines/
Scraping https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/our-stude