In [None]:
import time  # For adding a delay between requests
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def extract_article_content(article_url, timeout=10):
    """
    Download one article page and return its body text.

    Args:
        article_url: Absolute URL of the article page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, `requests.get` may block indefinitely on a stalled
            connection.

    Returns:
        The text of every matching paragraph joined by single spaces (an
        empty string when no paragraph matches), or None when the page could
        not be retrieved (connection error, timeout, or non-200 status).
    """
    try:
        response = requests.get(article_url, timeout=timeout)
    except requests.RequestException:
        # Same "None means failure" contract as a non-200 response, so one
        # unreachable article does not abort the whole scrape.
        return None

    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): these class names look auto-generated and may change when
    # the site is redeployed - confirm the selector still matches.
    content_paragraphs = soup.find_all('p', class_='sc-77igqf-0 fnnahv')
    return " ".join(p.text for p in content_paragraphs)

def updated_scrape_the_onion():
    """
    Scrape the headlines linked from The Onion's front page.

    Fetches the front page, finds every headline element, follows the link of
    its enclosing anchor and downloads the article body with
    `extract_article_content`.

    Returns:
        A list of dicts with the keys "title", "link" and "content".
        "content" is whatever `extract_article_content` returned for the
        link (None when that article could not be fetched). An empty list is
        returned when the front page itself cannot be retrieved.
    """
    base_url = 'https://www.theonion.com/'
    try:
        response = requests.get(base_url, timeout=10)
    except requests.RequestException:
        print("Failed to retrieve the webpage.")
        return []

    # Check if the request was successful
    if response.status_code != 200:
        print("Failed to retrieve the webpage.")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all article elements using the tag and class for titles
    # NOTE(review): class names look auto-generated - confirm they still match.
    articles = soup.find_all('h4', class_='sc-1qoge05-0 jqRtko')

    scraped_data = []
    for article in articles:
        # A headline that is not wrapped in a link (or whose link has no
        # href) cannot be followed; skip it instead of crashing the loop.
        anchor = article.find_parent('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue

        # Resolve relative hrefs against the site root; absolute URLs pass
        # through unchanged.
        link = urljoin(base_url, href)

        # Wait 2 seconds before every article request to be respectful to the
        # website's server. Sleeping before the request (rather than after)
        # spaces it from the front-page fetch and avoids a useless wait after
        # the final article.
        time.sleep(2)

        scraped_data.append({
            "title": article.text,
            "link": link,
            "content": extract_article_content(link)
        })

    return scraped_data

# Smoke test: scrape the front page and dump every article to stdout
data = updated_scrape_the_onion()
for article in data:
    print(
        f'{article["title"]} -> {article["link"]}',
        f'Content: {article["content"]}',
        "----------------------------------------------------------",
        sep="\n",
    )


Senator Dianne Feinstein, Trailblazer In Being Old, Dead At 90 -> https://www.theonion.com/senator-dianne-feinstein-trailblazer-in-being-old-dea-1850887265
Content: WASHINGTON—Having been alive as far back as 1933, Sen. Dianne Feinstein of California, a trailblazer in being old, died Thursday night at age 90. “Sen. Feinstein, born Dianne Goldman, started off young but through hard work and dedication rose through the ranks to become very, very old,” said the late Democrat’s chief of staff, James Sauls, who in prepared remarks to reporters cited Feinstein’s many career highlights, such as turning 88 years old in 2021, turning 89 years old in 2022, and, just this year, turning 90. “Her remarkable persistence as the oldest sitting U.S. senator made her a role model to nonagenarians everywhere. Across the nation tonight, little girls will find inspiration in her story, knowing that they, too, can one day grow up to be incredibly old.” Sauls added that while Feinstein did not fulfill her lo

In [None]:
import os

def get_starting_count(output_dir):
    """
    Get the next file index based on existing files in the directory.

    A directory entry contributes a number when the part of its name before
    the first '.' consists only of decimal digits (e.g. "0007.txt" -> 7).

    Args:
        output_dir: Path of the directory holding the numbered files.

    Returns:
        One more than the largest number found, or 1 when the directory does
        not exist, is empty, or contains no numbered entries.
    """
    if not os.path.isdir(output_dir):
        # Nothing has been written yet, so numbering starts from the beginning.
        return 1

    counts = []
    for file in os.listdir(output_dir):
        stem = file.split('.')[0]
        # isdecimal(), not isdigit(): isdigit() is also true for characters
        # such as superscript digits, which int() rejects with ValueError.
        if stem.isdecimal():
            counts.append(int(stem))

    return max(counts, default=0) + 1

def save_to_txt(data, output_dir="theonion_articles"):
    """
    Save each article dictionary in the list to its own txt file.

    Files are named with a zero-padded running number ("0001.txt",
    "0002.txt", ...) that continues after the highest number already present
    in the directory, so repeated calls append instead of overwriting.

    Args:
        data: Iterable of dicts with a "title" string and a "content" entry.
            "content" may be None (the scraper stores None for articles it
            failed to download); such articles are saved with an empty body.
        output_dir: Directory to write into; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    first_index = get_starting_count(output_dir)
    for count, article in enumerate(data, start=first_index):
        # Use the count as the filename, formatted with leading zeros
        filepath = os.path.join(output_dir, f"{count:04}.txt")

        # f.write(None) raises TypeError, so fall back to an empty body.
        content = article["content"] or ""

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(article["title"] + "\n\n")
            f.write(content)

In [None]:
# Example usage
# `data` is the list of article dicts assigned from updated_scrape_the_onion()
# in an earlier cell; uncomment the next line to scrape again before saving.
# data = updated_scrape_the_onion()
save_to_txt(data)