In [21]:
# import requests
# from bs4 import BeautifulSoup

# url = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
# response = requests.get(url)
# soup = BeautifulSoup(response.text, 'html.parser')

# # Extract from section blocks 
# print("== Section Blocks ==")
# sections = soup.find_all('section')

# seen_links = set() #get all unique links

# for section in sections:
#     h2 = section.find('h2')
#     if h2:
#         section_title = h2.get_text(strip=True)
#         print(f"\n--- {section_title} ---")
    
#     links = section.find_all('a', href=True)
#     for link in links:
#         href = link['href'].strip()
#         # Skip mailto, anchors, and empty
#         if href.startswith("#") or href.startswith("mailto:") or not href:
#             continue
        
#         # Normalize and de-duplicate
#         if href.startswith('/'):
#             href = urljoin(url, href)
#         elif not href.startswith("http"):
#             href = urljoin(url, href)
        
#         if href not in seen_links:
#             seen_links.add(href)
#             print(f"Link Text: {link.get_text(strip=True) or 'No text'}")
#             print(f"URL: {href}")


== Section Blocks ==
Link Text: How to Apply
URL: https://datascience.uchicago.edu/how-to-apply/

--- Programs ---
Link Text: Apply today!
URL: https://apply-psd.uchicago.edu/apply/
Link Text: No text
URL: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/
Link Text: No text
URL: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/%20
Link Text: No text
URL: https://www.chicagobooth.edu/mba/joint-degree/mba-ms-applied-data-science?sc_lang=en

--- Apply Today! ---

--- How to Apply ---

--- Industry Leading Faculty ---
Link Text: Learn More
URL: https://datascience.uchicago.edu/about-us/

--- Data in Action - Capstone Projects ---
Link Text: Sample Capstone Projects
URL: https://datascience.uchicago.edu/capstone-projects/

--- Start Your Application ---
Link Text: Online
URL: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/

--- R

In [25]:
# import re
# for link in seen_links:
#     print(f"\n===== 📄 Scraping content from: {link} =====\n")
#     try:
#         sub_response = requests.get(link, timeout=10)
#         sub_soup = BeautifulSoup(sub_response.text, 'html.parser')

#         # Remove non-content elements
#         for tag in sub_soup(['script', 'style', 'noscript']):
#             tag.decompose()

#         # Extract text
#         all_text = sub_soup.get_text(separator='\n')
#         clean_text = re.sub(r'\n\s*\n+', '\n\n', all_text.strip())  # Remove excess newlines
#         print(clean_text)

#     except Exception as e:
#         print(f"❌ Error scraping {link}: {e}")


===== 📄 Scraping content from: https://datascience.uchicago.edu/events/grad-school-qa-ms-in-applied-data-science-for-uchicago-undergraduates/ =====

Grad School Q&A: MS in Applied Data Science (for UChicago Undergraduates) – DSI

Skip to main content

About

About the Data Science Institute

The Data Science Institute (DSI) executes the University of Chicago’s bold, innovative vision of Data Science as a new discipline. 

Jobs & Opportunities

Open faculty, postdoctoral, staff, and student roles with the UChicago Data Science Institute and our partners.

Visiting DSI @ UChicago

Contact

Research

				Initiatives			

AI + Science

A new paradigm of transformational AI-enabled scientific discovery across the physical and biological sciences.

Data & Democracy

Protecting democracy in the digital age through cross-disciplinary research and convening key stakeholders.

Internet Equity

Measuring and analyzing Internet performance and reliability to address inequity in U.S. communities.



In [27]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import json

# Base page
base_url = "https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/"
response = requests.get(base_url)
soup = BeautifulSoup(response.text, 'html.parser')

# Step 1: Collect all sub-links with their link texts
link_info_list = []

for section in soup.find_all('section'):
    section_title_tag = section.find('h2')
    section_title = section_title_tag.get_text(strip=True) if section_title_tag else "No Section Title"

    links = section.find_all('a', href=True)
    for link in links:
        href = link['href'].strip()
        link_text = link.get_text(strip=True) or "No Link Text"
        if href.startswith('#') or href.startswith('mailto:') or not href:
            continue
        full_url = urljoin(base_url, href)

        link_info_list.append({
            "section_title": section_title,
            "link_text": link_text,
            "url": full_url
        })

# Step 2: Visit each link and add the scraped text
scraped_data = []

for info in link_info_list:
    url = info["url"]
    print(f"\n===== 📄 Scraping content from: {url} =====\n")
    try:
        sub_response = requests.get(url, timeout=10)
        sub_soup = BeautifulSoup(sub_response.text, 'html.parser')

        # Remove unwanted tags
        for tag in sub_soup(['script', 'style', 'noscript']):
            tag.decompose()

        # Get raw text
        all_text = sub_soup.get_text(separator='\n')
        raw_text = all_text.strip()

        # Save the result
        scraped_data.append({
            "section_title": info["section_title"],
            "link_text": info["link_text"],
            "url": url,
            "content": raw_text
        })

    except Exception as e:
        print(f"❌ Error scraping {url}: {e}")
        scraped_data.append({
            "section_title": info["section_title"],
            "link_text": info["link_text"],
            "url": url,
            "content": f"Error scraping: {str(e)}"
        })





===== 📄 Scraping content from: https://datascience.uchicago.edu/how-to-apply/ =====


===== 📄 Scraping content from: https://apply-psd.uchicago.edu/apply/ =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/ =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/ =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/in-person-program/ =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/%20 =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-science/online-program/%20 =====


===== 📄 Scraping content from: https://datascience.uchicago.edu/education/masters-programs/ms-in-applied-data-s

In [30]:
# Step 3: Export the collected data
# Export to CSV
import pandas as pd
df = pd.DataFrame(scraped_data)

def clean_text(text):
    if not text:
        return ""
    
    # 1. Replace multiple \n into one space
    text = re.sub(r'\n+', ' ', text)

    # 2. Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # 3. Remove "boilerplate" sections (optional for now, you can fine-tune later)
    boilerplate_keywords = [
        "Skip to main content", "Get Updated", "Newsletter Signup", 
        "Follow", "Accessibility", "Nondiscrimination Statement", 
        "Physical Sciences Division", "University of Chicago ©", "Copyright"
    ]
    for keyword in boilerplate_keywords:
        text = text.replace(keyword, "")

    # 4. Strip leading/trailing spaces
    return text.strip()

df["content_cleaned"] = df["content"].apply(clean_text)

In [34]:
df.iloc[0][4]

  df.iloc[0][4]


"How to Apply – DSI  MS in Applied Data Science – Autumn 2025 3rd Priority Deadline 5/6 close About About the Data Science Institute The Data Science Institute (DSI) executes the University of Chicago’s bold, innovative vision of Data Science as a new discipline. Jobs & Opportunities Open faculty, postdoctoral, staff, and student roles with the UChicago Data Science Institute and our partners. Visiting DSI @ UChicago Contact Research Initiatives AI + Science A new paradigm of transformational AI-enabled scientific discovery across the physical and biological sciences. Data & Democracy Protecting democracy in the digital age through cross-disciplinary research and convening key stakeholders. Internet Equity Measuring and analyzing Internet performance and reliability to address inequity in U.S. communities. AICE: AI for Climate Inter-discplinary integration of AI with fundamental domain knowledge to accelerate and transform climate research with a focus on both scientific advances and s

In [35]:
df[["section_title", "link_text", "url", "content_cleaned"]].to_csv("ms_applied_data_science_cleaned.csv", index=False)