In [1]:
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment

# 1. Retrieve the Maryland tourism web page and print HTTP status code
url_md = "https://www.visitmaryland.org/"
try:
    resp_md = requests.get(url_md, timeout=20)
    print("1) HTTP status code for visitmaryland.org:", resp_md.status_code)
except requests.exceptions.RequestException as e:
    print("1) Error fetching visitmaryland.org:", e)

# 2. Extract visible text from the Maryland main page (ignore script/style)
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def visible_text_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    texts = soup.find_all(text=True)
    visible_texts = filter(tag_visible, texts)
    # join and strip extra whitespace
    return " ".join(t.strip() for t in visible_texts if t.strip())

if "resp_md" in locals() and isinstance(resp_md, requests.Response):
    text_md = visible_text_from_html(resp_md.text)
    print("\n2) First 1000 characters of visible text from visitmaryland.org:\n")
    print(text_md[:1000])  # just show a sample so itâ€™s not huge

# 3. Fetch Wikipedia page for Natural Language Processing and extract headings
url_nlp = "https://en.wikipedia.org/wiki/Natural_language_processing"
try:
    resp_nlp = requests.get(url_nlp, timeout=20)
    resp_nlp.raise_for_status()
    soup_nlp = BeautifulSoup(resp_nlp.text, "html.parser")

    headings = []
    for level in ["h1", "h2", "h3"]:
        for h in soup_nlp.find_all(level):
            headings.append((level, h.get_text(strip=True)))

    print("\n3) Headings (h1, h2, h3) from NLP Wikipedia page:\n")
    for level, text in headings:
        print(f"{level}: {text}")
except requests.exceptions.RequestException as e:
    print("Error fetching NLP Wikipedia page:", e)

# 4. Extract and print all URLs (href values) from the NLP Wikipedia page
if "soup_nlp" in locals():
    links = []
    for a in soup_nlp.find_all("a", href=True):
        links.append(a["href"])

    print("\n4) First 100 URLs from NLP Wikipedia page:\n")
    for href in links[:100]:  # limit to first 100 so output is manageable
        print(href)
    print(f"\nTotal links found: {len(links)}")

# 5. Extract the first paragraph and save to nlp_intro.txt
if "soup_nlp" in locals():
    first_p = soup_nlp.find("p")
    if first_p:
        first_p_text = first_p.get_text(strip=True)
        with open("nlp_intro.txt", "w", encoding="utf-8") as f:
            f.write(first_p_text)
        print("\n5) Saved first paragraph to nlp_intro.txt")
        print("First paragraph preview:\n")
        print(first_p_text)
    else:
        print("No <p> tag found on the NLP page.")


1) HTTP status code for visitmaryland.org: 403

2) First 1000 characters of visible text from visitmaryland.org:

Enable JavaScript and cookies to continue
Error fetching NLP Wikipedia page: 403 Client Error: Forbidden for url: https://en.wikipedia.org/wiki/Natural_language_processing


  texts = soup.find_all(text=True)
