In [1]:
import feedparser
import csv
import json
import schedule
import time
from datetime import datetime
import os

# Function to fetch articles from RSS feeds
def fetch_rss_feeds(feed_urls):
    all_articles = []
    for url in feed_urls:
        print(f"Fetching from: {url}")
        feed = feedparser.parse(url)
        
        if feed.bozo:  # Check if there's a problem parsing the feed
            print(f"Error parsing feed: {url}")
            continue
        
        print(f"Found {len(feed.entries)} articles in {url}")
        
        # Extract articles
        for entry in feed.entries:
            all_articles.append({
                "source": feed.feed.title if 'title' in feed.feed else 'Unknown',
                "title": entry.title,
                "link": entry.link,
                "summary": entry.summary if 'summary' in entry else '',
                "published": entry.published if 'published' in entry else 'Unknown'
            })
    return all_articles

# Function to remove duplicates based on article link
def remove_duplicates(new_articles, existing_articles):
    existing_links = set(article['link'] for article in existing_articles)
    unique_articles = [article for article in new_articles if article['link'] not in existing_links]
    return unique_articles

# Function to save articles to CSV
def save_to_csv(articles, filename="news_data.csv"):
    if not articles:
        print("No new articles to save!")
        return
    
    # Check if the file already exists
    if os.path.exists(filename):
        # Read existing data
        with open(filename, 'r', encoding='utf-8') as f:
            existing_articles = list(csv.DictReader(f))
    else:
        # If the file doesn't exist, start fresh
        existing_articles = []

    # Remove duplicates
    unique_articles = remove_duplicates(articles, existing_articles)

    # Save new unique articles
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=articles[0].keys())
        # Write header only if the file is new
        if not existing_articles:
            writer.writeheader()
        writer.writerows(unique_articles)
    
    print(f"Saved {len(unique_articles)} new articles to {filename}")

# Function to fetch and save articles every hour
def fetch_and_save():
    print(f"Fetching news articles at {datetime.now()}")
    rss_feed_urls = [
        "http://feeds.bbci.co.uk/news/rss.xml",
        "http://rss.cnn.com/rss/cnn_topstories.rss",
        "https://www.theguardian.com/uk/rss",
        "https://rss.nytimes.com/services/xml/rss/nyt/HomePage.xml"
    ]
    
    # Fetch new articles
    news_articles = fetch_rss_feeds(rss_feed_urls)
    
    # Save to CSV and remove duplicates
    save_to_csv(news_articles)

# Schedule the task every hour
schedule.every(1).hour.do(fetch_and_save)

# Run the scheduler
if __name__ == "__main__":
    while True:
        schedule.run_pending()
        time.sleep(1)


KeyboardInterrupt: 