In [22]:
import requests
from bs4 import BeautifulSoup
import json

# Listing page whose teaser blocks link to the individual news articles
url = "https://about.bgov.com/news/"

# Seconds to wait for the listing page; without a timeout requests can wait
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Initialised before the request so the de-duplication below still works (on
# an empty list) when the request fails, instead of raising NameError.
urls = []

# Send a GET request to the URL
response = requests.get(url, timeout=REQUEST_TIMEOUT)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all teaser news containers
    teaser_divs = soup.find_all('div', class_='teaser news ft-container')

    # Collect the href of every <a> tag inside each teaser
    for teaser in teaser_divs:
        for a in teaser.find_all('a', href=True):
            urls.append(a['href'])
else:
    print(f"Failed to retrieve content, status code: {response.status_code}")


# Remove duplicate URLs while keeping the order in which they appear on the
# page (set() would return them in an arbitrary order that changes per run).
unique_urls = list(dict.fromkeys(urls))
print(unique_urls)


# List to store the extracted content
articles = []

# Process each unique URL.
# NOTE: fetch_html and extract_content are defined in later cells of this
# notebook; those cells must have been executed before this one runs.
# The loop variable is deliberately not called `url`, so the listing-page URL
# above is not overwritten.
for article_url in unique_urls:
    html_content = fetch_html(article_url)
    article_content = extract_content(html_content)
    if article_content:
        articles.append({
            'url': article_url,
            **article_content
        })

# Write the content to a JSON file
with open('extracted_articles.json', 'w', encoding='utf-8') as f:
    json.dump(articles, f, ensure_ascii=False, indent=4)

['https://about.bgov.com/news/biden-battles-covid-calls-to-quit-what-to-know-in-washington/', 'https://about.bgov.com/news/harris-working-to-wrangle-democrats-what-to-know-in-washington/', 'https://about.bgov.com/news/harris-now-dems-presumptive-nominee-what-to-know-in-washington/', 'https://about.bgov.com/news/trump-speech-cements-control-of-gop-what-to-know-in-washington/', 'https://about.bgov.com/news/what-to-know-in-washington-harris-sets-eyes-on-three-vp-picks/', 'https://about.bgov.com/news/obamas-endorse-harris-in-video-what-to-know-in-washington/', 'https://about.bgov.com/news/harris-looks-to-expand-electoral-map-what-to-know-in-washington/', 'https://about.bgov.com/news/trump-talks-taxes-tariffs-taiwan-what-to-know-in-washington/', 'https://about.bgov.com/news/harris-rise-spurs-downballot-query-what-to-know-in-washington/', 'https://about.bgov.com/news/peek-into-harris-network-what-to-know-in-washington/']


In [16]:
import requests
from bs4 import BeautifulSoup

In [17]:
# Function to fetch HTML content from a URL
def fetch_html(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the raw response body.

    Args:
        url: Address to request.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the caller indefinitely.

    Returns:
        ``response.content`` (the undecoded body) on success, or ``None`` when
        the request raises ``requests.RequestException`` — which includes the
        HTTP error raised by ``raise_for_status()``. The error is printed, not
        re-raised, so a caller looping over many URLs can keep going.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None
    return response.content

In [18]:
def _normalized_text(tag):
    """Return the text of ``tag`` with every whitespace run collapsed to one space."""
    # get_text(strip=True) strips each text fragment separately and joins them
    # with an empty separator, which glues words around inline tags together
    # ("the <a>bill</a> passed" -> "thebillpassed"). Taking the full text and
    # normalising whitespace afterwards keeps the word boundaries.
    return ' '.join(tag.get_text().split())


# Function to extract content from HTML
def extract_content(html):
    """Pull the title, date and body text out of an article page.

    Args:
        html: Markup of the page (anything ``BeautifulSoup`` accepts). A falsy
            value such as ``None`` or an empty string short-circuits.

    Returns:
        ``None`` when ``html`` is falsy; otherwise a dict with the keys
        ``'title'``, ``'date'`` and ``'content'``. Missing title or date
        elements yield the placeholder strings ``'No title found'`` and
        ``'No date found'``; a missing body container yields an empty
        ``'content'`` string.
    """
    if not html:
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # Extract the title
    title_tag = soup.find('h1', class_='news__header__title')
    title = _normalized_text(title_tag) if title_tag else 'No title found'

    # Extract the date (first <time> element on the page)
    time_tag = soup.find('time')
    date = _normalized_text(time_tag) if time_tag else 'No date found'

    # Extract the article body: one output line per <p> inside the container
    content_div = soup.find('div', class_='news__content')
    paragraphs = content_div.find_all('p') if content_div else []
    content = '\n'.join(_normalized_text(p) for p in paragraphs)

    return {
        'title': title,
        'date': date,
        'content': content
    }
