In [22]:
import requests                         # For making HTTP requests to fetch web content
from bs4 import BeautifulSoup          # For parsing HTML (though less used here as feedparser/newspaper3k handle much of it)
import feedparser                       # Specifically for parsing RSS/Atom feeds
from newspaper import Article as NewspaperArticle, ArticleException # For extracting main content from news article URLs
from datetime import datetime, timezone  # For handling and standardizing dates and times
import time                             # For adding polite delays between requests
import pandas as pd                     # For displaying data structures like DataFrames in the notebook

# --- Scraping Configuration ---
# Define the RSS feeds to be scraped.
# Ensure these URLs are active and relevant to the project's analysis topic.
# Note: It's good practice to verify feed validity regularly as sources can change.
NEWS_RSS_FEEDS = {
    "Guardian Environment": "https://www.theguardian.com/environment/rss", # Known good, relevant feed
    "Ars Technica": "http://feeds.arstechnica.com/arstechnica/index/"     # General tech news, good for testing scraper
    # The Reuters Environment feed previously considered was found to be inactive.
    # Add other verified RSS feeds relevant to "Advancements in Renewable Energy Technologies".
}

# Define the Reddit subreddits to be scraped.
REDDIT_SUBREDDITS = {
    "RenewableEnergySub": "RenewableEnergy"  # Internal key mapping to the actual subreddit name
}
# Limit the number of recent posts fetched from each Reddit source per cycle.
# Kept small for efficient testing within the notebook; can be increased for more data.
REDDIT_POST_LIMIT = 5

print("Imports for web scraping are complete.")
print("Scraping configurations (RSS feeds, Reddit targets, post limits) are loaded.")
print(f"Target RSS Feeds for this session: {list(NEWS_RSS_FEEDS.keys())}")
print(f"Target Subreddits for this session: {list(REDDIT_SUBREDDITS.values())} (Limit: {REDDIT_POST_LIMIT} posts each)")

Imports for web scraping are complete.
Scraping configurations (RSS feeds, Reddit targets, post limits) are loaded.
Target RSS Feeds for this session: ['Guardian Environment', 'Ars Technica']
Target Subreddits for this session: ['RenewableEnergy'] (Limit: 5 posts each)


In [23]:
from datetime import datetime, timezone # Ensure these are imported if not already in a previous cell of this notebook
import time                         # Required for time.mktime

def parse_datetime(date_string):
    """
    Parses various common date string formats and returns a timezone-aware 
    datetime object, standardized to UTC.
    
    Handles formats commonly found in RSS feeds (RFC 822, RFC 3339 variations), 
    ISO 8601 timestamps, and Unix timestamps (e.g., from Reddit).
    """
    if not date_string:
        return None # Return None for empty or None input

    # Prioritized list of common date formats to attempt parsing
    common_formats = [
        '%a, %d %b %Y %H:%M:%S %z',        # e.g., 'Tue, 21 May 2025 10:00:00 +0000'
        '%a, %d %b %Y %H:%M:%S %Z',        # e.g., 'Tue, 21 May 2025 10:00:00 GMT'
        '%Y-%m-%dT%H:%M:%S%z',            # e.g., '2025-05-21T10:00:00+00:00' (ISO 8601)
        '%Y-%m-%dT%H:%M:%S.%f%z',        # e.g., '2025-05-21T10:00:00.123456+00:00' (ISO 8601 with microseconds)
        '%Y-%m-%d %H:%M:%S',              # e.g., '2025-05-21 10:00:00' (Assumed UTC if no timezone)
        '%d %b %Y %H:%M:%S %Z',            # Less common RSS variant
    ]
    
    dt_object = None

    # Attempt to parse if input is a Unix timestamp (integer or float)
    if isinstance(date_string, (int, float)):
        try:
            return datetime.fromtimestamp(date_string, timezone.utc)
        except (ValueError, TypeError):
            pass # If conversion fails, proceed to string parsing methods

    # Attempt to parse if input is a string
    if isinstance(date_string, str):
        for fmt in common_formats:
            try:
                dt_object = datetime.strptime(date_string, fmt)
                # Standardize to UTC: If datetime object is naive (no timezone info), assume UTC.
                if dt_object.tzinfo is None or dt_object.tzinfo.utcoffset(dt_object) is None:
                    dt_object = dt_object.replace(tzinfo=timezone.utc)
                return dt_object
            except ValueError:
                continue # If current format fails, try the next one
    
    # Attempt to parse if input is a feedparser.struct_time object
    elif hasattr(date_string, 'tm_year'): 
         try:
            # Convert struct_time to Unix timestamp, then to datetime object
            return datetime.fromtimestamp(time.mktime(date_string), timezone.utc)
         except (ValueError, TypeError):
            pass

    # Fallback: If all parsing attempts fail, log a warning (optional) and return a default.
    # For critical applications, raising an error or returning None might be preferred.
    # print(f"Warning: Date string '{date_string}' could not be parsed into a known format. Defaulting to current UTC time.")
    return datetime.now(timezone.utc) 

print("Date parsing utility function 'parse_datetime' defined.")

# --- Test Cases for parse_datetime ---
print("\n--- Testing 'parse_datetime' with various date formats: ---")
test_dates_and_types = [
    ("Tue, 21 May 2025 10:00:00 +0000", "Standard RSS with offset"),
    ("Tue, 21 May 2025 10:00:00 GMT",   "Standard RSS with timezone name"),
    ("2025-05-21T10:00:00+00:00",       "ISO 8601 with offset"),
    (1679400000,                        "Unix timestamp (represents 2023-03-21 12:00:00 UTC)"),
    (None,                              "None input"),
    ("Invalid Date String",             "Unparseable string"),
    (time.gmtime(1679400000),           "feedparser.struct_time object (simulated)") # Test for struct_time
]

results = []
for original_value, description in test_dates_and_types:
    parsed_datetime = parse_datetime(original_value)
    results.append({
        "Description": description,
        "Original Value": str(original_value), # Convert all to string for consistent display
        "Parsed Datetime": parsed_datetime,
        "Output Type": str(type(parsed_datetime))
    })

# Display results in a more structured way if pandas is available and preferred
try:
    import pandas as pd
    results_df = pd.DataFrame(results)
    display(results_df) # display() is a Jupyter-specific function for rich display
except ImportError:
    for res in results:
        print(f"Description: {res['Description']}")
        print(f"  Original: {res['Original Value']}")
        print(f"  Parsed:   {res['Parsed Datetime']} (Type: {res['Output Type']})")

Date parsing utility function 'parse_datetime' defined.

--- Testing 'parse_datetime' with various date formats: ---


Unnamed: 0,Description,Original Value,Parsed Datetime,Output Type
0,Standard RSS with offset,"Tue, 21 May 2025 10:00:00 +0000",2025-05-21 10:00:00+00:00,<class 'datetime.datetime'>
1,Standard RSS with timezone name,"Tue, 21 May 2025 10:00:00 GMT",2025-05-21 10:00:00+00:00,<class 'datetime.datetime'>
2,ISO 8601 with offset,2025-05-21T10:00:00+00:00,2025-05-21 10:00:00+00:00,<class 'datetime.datetime'>
3,Unix timestamp (represents 2023-03-21 12:00:00...,1679400000,2023-03-21 12:00:00+00:00,<class 'datetime.datetime'>
4,None input,,NaT,<class 'NoneType'>
5,Unparseable string,Invalid Date String,2025-05-23 05:19:30.092392+00:00,<class 'datetime.datetime'>
6,feedparser.struct_time object (simulated),"time.struct_time(tm_year=2023, tm_mon=3, tm_md...",2023-03-21 06:30:00+00:00,<class 'datetime.datetime'>


In [24]:
from newspaper import Article as NewspaperArticle, ArticleException # Ensure this import is at the top of the cell or notebook
from datetime import datetime, timezone # Ensure datetime and timezone are available

def fetch_article_content(url):
    """
    Fetches, parses, and extracts the main textual content and publication date 
    from a news article at the given URL using the newspaper3k library.

    Args:
        url (str): The URL of the news article.

    Returns:
        tuple: (text, pub_date)
                 - text (str): The extracted main text of the article.
                 - pub_date (datetime.datetime): The extracted publication date, standardized to UTC.
                 Returns (None, None) if fetching or parsing fails, or if the URL is invalid.
    """
    if not url or not isinstance(url, str) or not url.startswith(('http://', 'https://')):
        # print(f"Debug: Invalid or empty URL provided: {url}")
        return None, None
    
    try:
        # Initialize the Article object from newspaper3k.
        # fetch_images=False: Disables downloading of images, speeding up the process.
        # memoize_articles=False: Disables caching of articles to disk. Useful if content might change
        #                         or to avoid using disk cache during development across sessions.
        article_obj = NewspaperArticle(url, fetch_images=False, memoize_articles=False)
        
        # Download the HTML content of the article.
        article_obj.download()
        
        # Parse the downloaded HTML to extract data.
        article_obj.parse()
        
        text = article_obj.text
        
        # Extract and standardize the publication date.
        pub_date = article_obj.publish_date
        if pub_date:
            # If the extracted date is naive (no timezone info), assume it's UTC.
            if pub_date.tzinfo is None or pub_date.tzinfo.utcoffset(pub_date) is None:
                pub_date = pub_date.replace(tzinfo=timezone.utc)
        
        return text, pub_date
        
    except ArticleException as e:
        # newspaper3k specific exception for issues during download or parsing.
        # print(f"Debug: Newspaper3k ArticleException for URL '{url}': {e}")
        return None, None
    except Exception as e:
        # Catch any other unexpected errors during the process.
        # print(f"Debug: Unexpected error fetching/parsing article URL '{url}': {e}")
        return None, None

print("Article content extraction function 'fetch_article_content' defined.")

# --- Test Cases for fetch_article_content ---
print("\n--- Testing 'fetch_article_content' with various URLs: ---")
# Note: The success of these tests depends on the current accessibility and structure 
# of these specific URLs, which can change over time.
# Using a known, stable news article URL is best for repeatable tests if available.

test_article_urls_for_content = [
    ("Valid Article (Example - The Guardian, if still accessible)", "https://www.theguardian.com/environment/2023/oct/26/hope-for-coral-great-barrier-reef-shows-tentative-signs-of-recovery"),
    ("Section Page (Reuters Technology - newspaper3k might struggle)", "https://www.reuters.com/technology/"),
    ("Non-Existent Page (example.com)", "https://www.example.com/this-page-does-not-exist-123"),
    ("Invalid URL (None)", None),
    ("Invalid URL (Empty String)", "")
]

results_fetch_content = []
for description, test_url in test_article_urls_for_content:
    print(f"\nAttempting to fetch: {description} - URL: {test_url}")
    content, pub_date = fetch_article_content(test_url)
    
    result_entry = {
        "Description": description,
        "URL": str(test_url),
        "Publication Date": pub_date,
        "Content Extracted": bool(content and content.strip()),
        "Content Preview (first 70 chars)": content[:70].replace('\n', ' ') + "..." if content and content.strip() else "N/A"
    }
    results_fetch_content.append(result_entry)
    
    if content and content.strip():
        print(f"  Successfully extracted content.")
        print(f"  Publication Date: {pub_date}")
    else:
        print(f"  Could not retrieve valid content.")

# Display results in a structured way using Pandas DataFrame
try:
    import pandas as pd
    results_df_fetch = pd.DataFrame(results_fetch_content)
    display(results_df_fetch)
except ImportError:
    print("\n(Pandas not available, printing raw results)")
    for res in results_fetch_content:
        print(res)

Article content extraction function 'fetch_article_content' defined.

--- Testing 'fetch_article_content' with various URLs: ---

Attempting to fetch: Valid Article (Example - The Guardian, if still accessible) - URL: https://www.theguardian.com/environment/2023/oct/26/hope-for-coral-great-barrier-reef-shows-tentative-signs-of-recovery
  Could not retrieve valid content.

Attempting to fetch: Section Page (Reuters Technology - newspaper3k might struggle) - URL: https://www.reuters.com/technology/
  Could not retrieve valid content.

Attempting to fetch: Non-Existent Page (example.com) - URL: https://www.example.com/this-page-does-not-exist-123
  Could not retrieve valid content.

Attempting to fetch: Invalid URL (None) - URL: None
  Could not retrieve valid content.

Attempting to fetch: Invalid URL (Empty String) - URL: 
  Could not retrieve valid content.


Unnamed: 0,Description,URL,Publication Date,Content Extracted,Content Preview (first 70 chars)
0,"Valid Article (Example - The Guardian, if stil...",https://www.theguardian.com/environment/2023/o...,,False,
1,Section Page (Reuters Technology - newspaper3k...,https://www.reuters.com/technology/,,False,
2,Non-Existent Page (example.com),https://www.example.com/this-page-does-not-exi...,,False,
3,Invalid URL (None),,,False,
4,Invalid URL (Empty String),,,False,


In [25]:
# Assumes fetch_article_content and parse_datetime are defined in previous cells
# Assumes REDDIT_POST_LIMIT (used here for consistent demo item limit) and NEWS_RSS_FEEDS are defined in Cell 1

def scrape_rss_feed(source_name, rss_url):
    """
    Scrapes articles from a given RSS feed URL.
    It first fetches the feed content using 'requests' for better control over headers,
    then parses it with 'feedparser'. For each feed item, it attempts to fetch the
    full article content using the 'fetch_article_content' function (which utilizes newspaper3k).

    Args:
        source_name (str): A descriptive name for the RSS feed source.
        rss_url (str): The URL of the RSS feed.

    Returns:
        list: A list of dictionaries, where each dictionary represents a scraped article
              and includes its title, URL, publication date, full content, and source name.
              Returns an empty list if scraping fails or no articles are processed.
    """
    # print(f"Attempting to scrape RSS feed: {source_name} ({rss_url})") # Kept for verbosity if desired
    articles = []
    feed_data = None # Initialize to handle potential errors before assignment

    # Standard headers to mimic a browser request, can improve success rate
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/xml, text/xml, application/rss+xml, application/atom+xml, */*' # Be explicit about accepted types
    }
    
    try:
        # Step 1: Fetch the raw RSS feed content using requests
        response = requests.get(rss_url, headers=headers, timeout=15) # Added a timeout
        response.raise_for_status() # Raises an HTTPError for bad responses (4XX or 5XX)
        
        feed_content = response.content # Use .content for binary data; feedparser handles decoding
        
        # Step 2: Parse the fetched content using feedparser
        feed_data = feedparser.parse(feed_content)
        
        # Using REDDIT_POST_LIMIT here just to limit items for consistent notebook testing speed
        entries_to_process = feed_data.entries[:REDDIT_POST_LIMIT] 
        # print(f"Found {len(feed_data.entries)} entries in '{source_name}' RSS feed, processing up to {len(entries_to_process)}.")

        if not feed_data.entries:
            # print(f"Warning: No entries found in the feed for {source_name} at {rss_url}.")
            if feed_data.bozo: # Check if feedparser encountered issues (e.g., malformed XML)
                # print(f"  Feedparser 'bozo' flag is set, indicating potential parsing problems. Exception: {feed_data.bozo_exception}")
                pass # Silencing for cleaner notebook output, uncomment for debugging

        # Step 3: Process each entry from the parsed feed
        for entry in entries_to_process:
            title = entry.get('title')
            url = entry.get('link') # The direct link to the article
            
            # Extract and parse publication date from the feed entry
            pub_date_parsed_struct = entry.get('published_parsed') or entry.get('updated_parsed')
            publication_date_from_feed = parse_datetime(pub_date_parsed_struct) if pub_date_parsed_struct else None

            if not (title and url):
                # print(f"Skipping entry from '{source_name}' due to missing title or URL.")
                continue

            # Step 4: Fetch full article content using the extracted URL
            # print(f"  Fetching content for: '{title[:50]}...' from {url}")
            full_content, publication_date_from_article = fetch_article_content(url)
            
            # Determine the most reliable publication date
            final_publication_date = publication_date_from_feed
            if not final_publication_date and publication_date_from_article:
                final_publication_date = publication_date_from_article
            elif not final_publication_date: # If both are None, fallback to current time
                final_publication_date = datetime.now(timezone.utc)

            # Store the article data, using a placeholder if full content extraction failed
            content_to_store = full_content if full_content and full_content.strip() else f"Content not retrievable - {title}"
            articles.append({
                'title': title,
                'url': url,
                'publication_date': final_publication_date,
                'full_content': content_to_store,
                'source_name': source_name
            })
            
            time.sleep(0.2) # Polite delay between fetching full articles to avoid overwhelming servers

    except requests.exceptions.RequestException as req_err:
        print(f"Network Error: Could not fetch RSS feed '{rss_url}'. Reason: {req_err}")
    except Exception as e:
        print(f"Error during RSS scraping for '{source_name}' ({rss_url}): {e}")
        # import traceback # Uncomment for detailed error traceback during debugging
        # traceback.print_exc()
        
    # Final check and reporting for this source
    if feed_data and feed_data.entries and not articles and entries_to_process:
        # This condition means entries were found in the feed, but none were successfully processed 
        # (e.g., fetch_article_content failed for all of them).
        print(f"Info: Found {len(feed_data.entries)} entries in '{source_name}', but article content extraction might have failed for all processed entries.")
    elif not articles:
        # This means either no entries were found in the feed initially, or an early error occurred.
        # print(f"Info: No articles were successfully processed from '{source_name}'.")
        if feed_data is None:
            # print("  (Reason: Feed data could not be fetched or parsed, likely due to a network error or malformed feed).")
            pass
        elif not feed_data.entries:
            # print("  (Reason: The RSS feed was parsed but contained no entries).")
            pass

    return articles

print("'scrape_rss_feed' function has been defined.")

# --- Test Execution for 'scrape_rss_feed' ---
# This section demonstrates how to call the function and view its output.
# Ensure NEWS_RSS_FEEDS is defined (usually in Cell 1 of this notebook).
print("\n--- Testing 'scrape_rss_feed' with configured RSS feeds: ---")

test_results_rss = []
if NEWS_RSS_FEEDS:
    for feed_name, feed_url in NEWS_RSS_FEEDS.items():
        print(f"\nAttempting to process feed: {feed_name} ({feed_url})")
        scraped_articles_from_feed = scrape_rss_feed(feed_name, feed_url)
        if scraped_articles_from_feed:
            print(f"  Successfully processed {len(scraped_articles_from_feed)} entries from '{feed_name}'.")
            test_results_rss.extend(scraped_articles_from_feed) # Add to combined list for display
            # Display a sample from this specific feed
            display(pd.DataFrame(scraped_articles_from_feed).head(2)[['title', 'publication_date', 'source_name']])
        else:
            print(f"  No articles were processed from '{feed_name}'.")
else:
    print("No RSS feeds are configured in NEWS_RSS_FEEDS for testing.")

# Display a combined sample if multiple feeds were processed
if test_results_rss:
    print("\n--- Combined RSS Scraping Test Summary (Sample) ---")
    all_rss_df = pd.DataFrame(test_results_rss)
    display(all_rss_df.head()[['title', 'publication_date', 'source_name']])
    print(f"Total articles processed from all tested RSS feeds: {len(all_rss_df)}")
else:
    print("\nNo articles were processed from any RSS feeds during this test.")

'scrape_rss_feed' function has been defined.

--- Testing 'scrape_rss_feed' with configured RSS feeds: ---

Attempting to process feed: Guardian Environment (https://www.theguardian.com/environment/rss)
  Successfully processed 5 entries from 'Guardian Environment'.


Unnamed: 0,title,publication_date,source_name
0,Revealed: three tonnes of uranium legally dump...,2025-05-22 08:30:33+00:00,Guardian Environment
1,‘Unprecedented’ marine heatwave hits waters ar...,2025-05-22 07:57:36+00:00,Guardian Environment



Attempting to process feed: Ars Technica (http://feeds.arstechnica.com/arstechnica/index/)
  Successfully processed 5 entries from 'Ars Technica'.


Unnamed: 0,title,publication_date,source_name
0,"In 3.5 years, Notepad.exe has gone from “barel...",2025-05-22 17:16:32+00:00,Ars Technica
1,The Pentagon seems to be fed up with ULA’s roc...,2025-05-22 17:02:48+00:00,Ars Technica



--- Combined RSS Scraping Test Summary (Sample) ---


Unnamed: 0,title,publication_date,source_name
0,Revealed: three tonnes of uranium legally dump...,2025-05-22 08:30:33+00:00,Guardian Environment
1,‘Unprecedented’ marine heatwave hits waters ar...,2025-05-22 07:57:36+00:00,Guardian Environment
2,How an idealistic tree-planting project turned...,2025-05-21 23:30:49+00:00,Guardian Environment
3,‘Waste collection is green work’: how a pro-po...,2025-05-22 08:10:29+00:00,Guardian Environment
4,"Trump’s tax bill to cost 830,000 jobs and driv...",2025-05-22 05:30:09+00:00,Guardian Environment


Total articles processed from all tested RSS feeds: 10


In [26]:
# Assumes fetch_article_content and parse_datetime are defined in previous cells
# Assumes REDDIT_POST_LIMIT and REDDIT_SUBREDDITS are defined in Cell 1 of this notebook

def scrape_reddit_forum(source_name_prefix, subreddit_name, limit=5):
    """
    Scrapes recent posts from a specified subreddit using its public JSON endpoint.
    For posts that are links to external articles, it attempts to fetch the full
    article content using the 'fetch_article_content' function.

    Args:
        source_name_prefix (str): A prefix for the source name (e.g., 'RedditTopic').
        subreddit_name (str): The name of the subreddit to scrape (e.g., 'RenewableEnergy').
        limit (int): The maximum number of recent posts to fetch.

    Returns:
        list: A list of dictionaries, where each dictionary represents a scraped post/article.
              Returns an empty list if scraping fails or no posts are processed.
    """
    # print(f"Attempting to scrape Reddit: r/{subreddit_name}, limit={limit}") # Kept for verbosity if desired
    articles = []
    
    # It's good practice to set a custom User-Agent for web scraping to be identifiable
    # and to avoid being blocked as a generic bot. Replace 'yourcontact@example.com'
    # with your actual contact info if you were running this extensively.
    headers = {
        'User-agent': f'Mozilla/5.0 (compatible; {source_name_prefix}_TrendAnalyzerBot/0.2; +yourcontact@example.com)'
    }
    
    try:
        # Construct the URL for accessing the subreddit's 'new' posts via its public JSON API.
        url = f"https://www.reddit.com/r/{subreddit_name}/new.json?limit={limit}"
        response = requests.get(url, headers=headers, timeout=15) # Timeout for the request
        response.raise_for_status() # Will raise an HTTPError for bad status codes (4xx or 5xx)
        
        data = response.json() # Parse the JSON response

        # Validate the structure of the received JSON data
        if 'data' not in data or 'children' not in data['data']:
            print(f"  Error: Unexpected JSON structure from r/{subreddit_name}. Missing 'data' or 'children' field.")
            return articles # Return empty list if structure is not as expected

        posts_to_process = data['data']['children']
        # print(f"Found {len(posts_to_process)} posts in r/{subreddit_name} JSON response to process.")

        for post in posts_to_process:
            post_data = post['data']
            title = post_data.get('title')
            permalink = post_data.get('permalink')
            
            if not (title and permalink):
                # print(f"  Skipping post from r/{subreddit_name} due to missing title or permalink (ID: {post_data.get('id')}).")
                continue
                
            full_url = f"https://www.reddit.com{permalink}" # URL of the Reddit post itself
            created_utc = post_data.get('created_utc')    # Timestamp of post creation
            publication_date = parse_datetime(created_utc) # Standardize to datetime object
            
            content = ""
            source_display_name = f"{source_name_prefix} r/{subreddit_name}" # Consistent source naming

            # Differentiate between self-posts (text directly on Reddit) and link posts
            if post_data.get('is_self', False): # Check if it's a self-post
                content = post_data.get('selftext', '')
                # print(f"  Processing self-post from r/{subreddit_name}: '{title[:50]}...'")
            elif 'url_overridden_by_dest' in post_data: # Check if it's a link post
                external_url = post_data['url_overridden_by_dest']
                # print(f"  Processing link post from r/{subreddit_name}: '{title[:50]}...' to {external_url[:50]}")
                fetched_content, _ = fetch_article_content(external_url) # Attempt to get external article content
                if fetched_content and fetched_content.strip():
                    content = fetched_content
                else:
                    # print(f"    Could not fetch external content for {external_url}, using post title as fallback content.")
                    content = title # Fallback if external content fetching fails or is empty
            else: # Other post types (e.g., direct image/video links not pointing to an article)
                # print(f"  Processing other post type from r/{subreddit_name} (e.g., direct media): '{title[:50]}...'")
                content = title # Fallback to using post title as content

            # Add to list, using a placeholder if content is still empty after attempts
            content_to_store = content if content and content.strip() else f"Content not retrievable or empty - {title}"
            articles.append({
                'title': title,
                'url': full_url, 
                'publication_date': publication_date,
                'full_content': content_to_store,
                'source_name': source_display_name
            })
            
            time.sleep(0.5) # Polite delay to respect Reddit's servers when accessing public JSON

    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP Error encountered while scraping Reddit r/{subreddit_name}: {http_err}")
        # It can be helpful to see the response text if an HTTP error occurs (e.g., 403 Forbidden, 404 Not Found)
        # Be cautious with printing response.text if it might be very large or contain sensitive info in a real app.
        # if hasattr(http_err, 'response') and http_err.response is not None:
            # print(f"  Response content sample: {http_err.response.text[:500]}")
    except requests.exceptions.RequestException as req_err:
        print(f"Network Error (requests): Could not scrape Reddit r/{subreddit_name}. Reason: {req_err}")
    except Exception as e:
        print(f"An unexpected error occurred during Reddit scraping for r/{subreddit_name}: {e}")
        # import traceback # Uncomment for detailed traceback during debugging
        # traceback.print_exc()
        
    # Final reporting for this subreddit scrape attempt
    # Using 'posts_to_process' which is defined inside the try block.
    # Initialize it before the try block for safer access in this final reporting.
    num_posts_found_in_json = len(data['data']['children']) if 'data' in locals() and data and 'data' in data and 'children' in data['data'] else 0
    
    if num_posts_found_in_json > 0 and not articles:
        print(f"Info: Found {num_posts_found_in_json} posts in r/{subreddit_name} JSON, but no articles were successfully processed (content fetching might have failed for all).")
    elif not articles:
         print(f"Info: No articles were successfully processed from r/{subreddit_name}.")

    return articles

print("'scrape_reddit_forum' function has been defined.")

# --- Test Execution for 'scrape_reddit_forum' ---
# This section demonstrates how to call the function and view its output.
# Ensure REDDIT_SUBREDDITS and REDDIT_POST_LIMIT are defined (usually in Cell 1).
print("\n--- Testing 'scrape_reddit_forum' with configured subreddits: ---")

test_results_reddit = []
if REDDIT_SUBREDDITS:
    for key, sub_name in REDDIT_SUBREDDITS.items(): # key is e.g., "RenewableEnergySub", sub_name is "RenewableEnergy"
        print(f"\nAttempting to process subreddit: r/{sub_name} (using key: '{key}')")
        scraped_posts_from_sub = scrape_reddit_forum(key, sub_name, limit=REDDIT_POST_LIMIT)
        if scraped_posts_from_sub:
            print(f"  Successfully processed {len(scraped_posts_from_sub)} posts from r/{sub_name}.")
            test_results_reddit.extend(scraped_posts_from_sub)
             # Display a sample from this specific subreddit
            display(pd.DataFrame(scraped_posts_from_sub).head(2)[['title', 'publication_date', 'source_name']])
        else:
            print(f"  No posts were processed from r/{sub_name}.")
else:
    print("No subreddits are configured in REDDIT_SUBREDDITS for testing.")

# Display a combined sample if multiple subreddits were processed
if test_results_reddit:
    print("\n--- Combined Reddit Scraping Test Summary (Sample) ---")
    all_reddit_df = pd.DataFrame(test_results_reddit)
    display(all_reddit_df.head()[['title', 'publication_date', 'source_name']])
    print(f"Total posts processed from all tested subreddits: {len(all_reddit_df)}")
else:
    print("\nNo posts were processed from any subreddits during this test.")

'scrape_reddit_forum' function has been defined.

--- Testing 'scrape_reddit_forum' with configured subreddits: ---

Attempting to process subreddit: r/RenewableEnergy (using key: 'RenewableEnergySub')
  Successfully processed 5 posts from r/RenewableEnergy.


Unnamed: 0,title,publication_date,source_name
0,Solar shines as Germany's top electricity sour...,2025-05-23 04:21:26+00:00,RenewableEnergySub r/RenewableEnergy
1,House GOP moves to slash renewable energy tax ...,2025-05-22 20:01:50+00:00,RenewableEnergySub r/RenewableEnergy



--- Combined Reddit Scraping Test Summary (Sample) ---


Unnamed: 0,title,publication_date,source_name
0,Solar shines as Germany's top electricity sour...,2025-05-23 04:21:26+00:00,RenewableEnergySub r/RenewableEnergy
1,House GOP moves to slash renewable energy tax ...,2025-05-22 20:01:50+00:00,RenewableEnergySub r/RenewableEnergy
2,Alabama enacts ‘all-of-the-above’ energy plan,2025-05-22 18:39:16+00:00,RenewableEnergySub r/RenewableEnergy
3,Fluence just took a big step to make grid batt...,2025-05-22 12:01:57+00:00,RenewableEnergySub r/RenewableEnergy
4,New York awards contracts for 26 large-scale r...,2025-05-22 10:59:12+00:00,RenewableEnergySub r/RenewableEnergy


Total posts processed from all tested subreddits: 5


In [27]:
def run_scrapers_notebook():
    """
    Runs all configured scrapers (RSS and Reddit) and aggregates their results.
    Returns a Pandas DataFrame of all scraped articles.
    """
    all_articles = []
    
    # Scrape RSS Feeds
    if NEWS_RSS_FEEDS:
        print("\n--- Scraping RSS Feeds ---")
        for name, url in NEWS_RSS_FEEDS.items():
            print(f"Processing RSS: {name}...")
            rss_results = scrape_rss_feed(name, url) # This function already prints its own progress
            if rss_results:
                all_articles.extend(rss_results)
            print(f"Finished processing RSS: {name}. Found {len(rss_results)} articles.")
    else:
        print("No RSS feeds configured to scrape.")

    # Scrape Reddit Subreddits
    if REDDIT_SUBREDDITS:
        print("\n--- Scraping Reddit Subreddits ---")
        for key, sub_name in REDDIT_SUBREDDITS.items(): # key is like "RenewableEnergySub", sub_name is "RenewableEnergy"
            print(f"Processing Reddit: r/{sub_name}...")
            reddit_results = scrape_reddit_forum(key, sub_name, limit=REDDIT_POST_LIMIT) # Uses key as source_name_prefix
            if reddit_results:
                all_articles.extend(reddit_results)
            print(f"Finished processing Reddit: r/{sub_name}. Found {len(reddit_results)} posts.")
    else:
        print("No Subreddits configured to scrape.")
    
    print(f"\n--- Total articles/posts gathered from all sources: {len(all_articles)} ---")
    
    if all_articles:
        # Convert to DataFrame for easier handling and viewing
        df_all_articles = pd.DataFrame(all_articles)
        # Standardize publication_date to datetime objects if not already (should be handled by individual scrapers)
        if 'publication_date' in df_all_articles.columns:
            df_all_articles['publication_date'] = pd.to_datetime(df_all_articles['publication_date'], errors='coerce', utc=True)
        return df_all_articles
    
    return pd.DataFrame() # Return empty DataFrame if nothing scraped

print("run_scrapers_notebook function defined.")

run_scrapers_notebook function defined.


In [28]:
print("Running all configured scrapers via run_scrapers_notebook()...")
print(f"This will use REDDIT_POST_LIMIT = {REDDIT_POST_LIMIT} for all sources in this notebook context.")

# Before running, ensure your NEWS_RSS_FEEDS in Cell 1 contains valid and relevant URLs
# that worked in your earlier tests.
# If NEWS_RSS_FEEDS is empty or has non-working URLs, it will only scrape Reddit (if configured).

start_time_combined = time.time()
scraped_data_df = run_scrapers_notebook()
end_time_combined = time.time()

print(f"\nCombined scraping process took {end_time_combined - start_time_combined:.2f} seconds.")

if not scraped_data_df.empty:
    print("\n--- Sample of all scraped data (first 5 rows): ---")
    display(scraped_data_df.head())
    
    print("\n--- Dataframe Info: ---")
    scraped_data_df.info()
    
    print("\n--- Value Counts for 'source_name': ---")
    display(scraped_data_df['source_name'].value_counts())
    
    # Optional: Save all scraped data to a CSV for use in the next notebook
    # This can be helpful so you don't have to re-scrape every time.
    try:
        scraped_data_df.to_csv("scraped_articles_combined_notebook.csv", index=False)
        print("\nSuccessfully saved all scraped data to 'scraped_articles_combined_notebook.csv'")
    except Exception as e:
        print(f"\nError saving to CSV: {e}")
else:
    print("\nNo articles were scraped in the combined run from any source.")

Running all configured scrapers via run_scrapers_notebook()...
This will use REDDIT_POST_LIMIT = 5 for all sources in this notebook context.

--- Scraping RSS Feeds ---
Processing RSS: Guardian Environment...
Finished processing RSS: Guardian Environment. Found 5 articles.
Processing RSS: Ars Technica...
Finished processing RSS: Ars Technica. Found 5 articles.

--- Scraping Reddit Subreddits ---
Processing Reddit: r/RenewableEnergy...
Finished processing Reddit: r/RenewableEnergy. Found 5 posts.

--- Total articles/posts gathered from all sources: 15 ---

Combined scraping process took 21.90 seconds.

--- Sample of all scraped data (first 5 rows): ---


Unnamed: 0,title,url,publication_date,full_content,source_name
0,Revealed: three tonnes of uranium legally dump...,https://www.theguardian.com/environment/2025/m...,2025-05-22 08:30:33+00:00,The Environment Agency has allowed a firm to d...,Guardian Environment
1,‘Unprecedented’ marine heatwave hits waters ar...,https://www.theguardian.com/environment/2025/m...,2025-05-22 07:57:36+00:00,The sea off the coast of the UK and Ireland is...,Guardian Environment
2,How an idealistic tree-planting project turned...,https://www.theguardian.com/environment/2025/m...,2025-05-21 23:30:49+00:00,"For his entire life, John Lmakato has lived in...",Guardian Environment
3,‘Waste collection is green work’: how a pro-po...,https://www.theguardian.com/environment/2025/m...,2025-05-22 08:10:29+00:00,"Three decades ago, Rajabai Sawant used to pick...",Guardian Environment
4,"Trump’s tax bill to cost 830,000 jobs and driv...",https://www.theguardian.com/environment/2025/m...,2025-05-22 05:30:09+00:00,A Republican push to dismantle clean energy in...,Guardian Environment



--- Dataframe Info: ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   title             15 non-null     object             
 1   url               15 non-null     object             
 2   publication_date  15 non-null     datetime64[ns, UTC]
 3   full_content      15 non-null     object             
 4   source_name       15 non-null     object             
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 728.0+ bytes

--- Value Counts for 'source_name': ---


Guardian Environment                    5
Ars Technica                            5
RenewableEnergySub r/RenewableEnergy    5
Name: source_name, dtype: int64


Successfully saved all scraped data to 'scraped_articles_combined_notebook.csv'
