In [90]:
# import requests
# from defusedxml import ElementTree as ET

# # Fetching content
# # url = input("paste url: ")
# r = requests.get("https://hnrss.org/frontpage")
# # r.status_code == requests.codes.ok
# content = r.text

# # # Parsing data
# root = ET.fromstring(content)
# for title in root.iter('title'):
#     print(title.text)

In [135]:
import io

import requests
from defusedxml import ElementTree as ET

# Fetch the XML content
def fetch_feed(url, headers=None, timeout=10):
    """Download a feed and return its body as text.

    Args:
        url: Address of the RSS/Atom feed to download.
        headers: Optional dict of extra HTTP headers (for example a
            ``User-Agent``) to send with the request.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without one a stalled server
            would block the notebook cell indefinitely.

    Returns:
        The response body as a ``str``, or ``None`` when the request fails
        (connection error, timeout, or an HTTP error status). The failure
        reason is printed.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the same
        # failure path as network errors.
        response.raise_for_status()
        return response.text
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch feed: {e}")
        return None
    
# Extract namespaces for dynamic namespaces handling
def extract_namespaces(xml_text):
    """Return the namespace declarations made on the document's root element.

    ElementTree consumes ``xmlns`` declarations while parsing, so they never
    show up in ``root.attrib``. The declarations are instead collected from
    the parser's ``start-ns`` events.

    Args:
        xml_text: The XML document as ``str`` or ``bytes``.

    Returns:
        A dict mapping prefix to namespace URI, suitable for the
        ``namespaces=`` argument of ``find``/``findall``. The default
        (prefix-less) namespace is stored under the key ``'atom'``.

    Raises:
        ParseError: If the document is not well-formed XML.
    """
    if isinstance(xml_text, bytes):
        source = io.BytesIO(xml_text)
    else:
        source = io.StringIO(xml_text)

    ns = {}
    root_started = False
    # The 'start-ns' events of an element are delivered before its 'start'
    # event, so everything seen before the first 'start' belongs to the root.
    # The loop still runs to the end of the document so that malformed XML
    # raises, exactly as a full parse would.
    for event, payload in ET.iterparse(source, events=('start-ns', 'start')):
        if event == 'start':
            root_started = True
        elif not root_started:
            prefix, uri = payload
            # An empty prefix is the default namespace
            ns[prefix or 'atom'] = uri
    return ns

# Detect type of feed for correct parsing
def detect_feed_type(root):
    """Classify a parsed feed by the tag of its root element.

    Args:
        root: Root element of the parsed document; only its ``tag`` string
            is inspected.

    Returns:
        ``'atom'`` when the tag ends with ``'feed'`` (which also covers
        namespace-qualified tags of the form ``'{uri}feed'``), ``'rss'`` when
        the tag is exactly ``'rss'``, and ``'unknown'`` otherwise.
    """
    root_tag = root.tag
    if root_tag.endswith('feed'):
        return 'atom'
    return 'rss' if root_tag == 'rss' else 'unknown'

# Title extraction
# (for practice only: a real application would need more than
# just the title).

def get_feed_title(xml_text):
    """Return the feed-level title of an RSS or Atom document.

    Args:
        xml_text: The feed XML as ``str`` or ``bytes``. ``None`` or an empty
            value (for example after a failed download) is accepted.

    Returns:
        The text of the first matching title element, or the string
        ``"No title found"`` when the input is empty or has no title.

    Raises:
        ParseError: If ``xml_text`` is non-empty but not well-formed XML.
    """
    if not xml_text:
        return "No title found"

    root = ET.fromstring(xml_text)
    atom = '{http://www.w3.org/2005/Atom}'

    # Most specific location first: RSS keeps the feed title under <channel>,
    # Atom keeps it as a namespaced direct child of <feed>. The final path is
    # a catch-all for any un-namespaced <title> in the document.
    for path in ('channel/title', atom + 'title', './/title'):
        title = root.find(path)
        # Compare against None explicitly: an Element without children is
        # falsy, so "a or b" would wrongly discard a perfectly good <title>.
        if title is not None:
            return title.text
    return "No title found"

# Parse the root element
def parse_items(xml_text):
    """Extract the entries of an Atom feed or the items of an RSS feed.

    Atom entries are tried first; the RSS branch only runs when no Atom
    entry was collected. An entry/item is skipped unless its title, its
    body text and its link are all present.

    Args:
        xml_text: The feed XML as ``str`` or ``bytes``. ``None`` or an empty
            value (for example after a failed download) yields ``[]``.

    Returns:
        A list of dicts. Atom entries have the keys ``'title'``,
        ``'content'`` and ``'link'``; RSS items have ``'title'``,
        ``'description'`` and ``'link'``.

    Raises:
        ParseError: If ``xml_text`` is non-empty but not well-formed XML.
    """
    if not xml_text:
        return []

    root = ET.fromstring(xml_text)
    # Atom elements always live in this namespace, whatever prefix the
    # document itself uses, so fully-qualified tags are the reliable lookup.
    atom = '{http://www.w3.org/2005/Atom}'

    items = []
    # Atom format
    for entry in root.iter(atom + 'entry'):
        title = entry.find(atom + 'title')
        content = entry.find(atom + 'content')
        if content is None:
            # Many Atom feeds only publish a <summary>
            content = entry.find(atom + 'summary')
        links = entry.findall(atom + 'link[@href]')
        # Prefer the "alternate" link (the default when rel is omitted) over
        # self/enclosure links; otherwise fall back to the first link.
        link = next(
            (candidate for candidate in links
             if candidate.get('rel', 'alternate') == 'alternate'),
            links[0] if links else None,
        )
        if title is not None and content is not None and link is not None:
            items.append({
                'title': title.text,
                'content': content.text,
                'link': link.get('href'),
            })

    # RSS Format
    if not items:
        for item in root.iter('item'):
            title = item.find('title')
            description = item.find('description')
            link = item.find('link')
            if title is not None and description is not None and link is not None:
                items.append({
                    'title': title.text,
                    'description': description.text,
                    'link': link.text,
                })

    return items


In [137]:
# test_feeds = [
#     ("BBC (RSS 2.0)", "http://feeds.bbci.co.uk/news/rss.xml"),
#     ("Reddit (Atom)", "https://www.reddit.com/r/python/.rss"),
#     ("NASA (Atom)", "https://www.nasa.gov/feeds/iotd-feed/")
# ]

# for name, url in test_feeds:
#     parser = RSSParser(url)
#     if parser.fetch():
#         data = parser.parse()
#         print(f"\n{name}:")
#         print(f"Title: {data['title']}")
#         print(f"First item: {data['items'][0]['title'][:50]}...")

In [145]:
def debug(url="https://www.nasa.gov/rss/dyn/breaking_news.rss"):
    """Smoke-test the fetch and parse helpers against a live feed.

    Prints the feed title followed by the title and link of the first parsed
    item. Returns early instead of raising when the download fails or when
    no items could be parsed.

    Args:
        url: Feed to fetch; defaults to the NASA breaking-news feed.
    """
    xml_text = fetch_feed(url)
    if xml_text is None:
        # fetch_feed has already printed why the download failed
        return

    # Check title (the recorded run against the default URL printed "NASA")
    print("Feed:", get_feed_title(xml_text))

    # Check items
    items = parse_items(xml_text)
    if not items:
        print("No items found")
        return
    print(f"Title:, {items[0]['title']}, Link: {items[0]['link']}" )

# Run the smoke check against the live feed.
debug()

Feed: NASA
Title:, How to Fly NASA’s Orion Spacecraft, Link: https://www.nasa.gov/missions/artemis/how-to-fly-nasas-orion-spacecraft/


In [146]:
# Peek at the start of the raw Reddit feed. fetch_feed returns None on failure,
# and str() turns that into the text "None" so the slice below cannot raise.
reddit_data = str(fetch_feed("https://www.reddit.com/r/python/.rss"))
print(reddit_data[:100])  # First 100 characters

<?xml version="1.0" encoding="UTF-8"?><feed xmlns="http://www.w3.org/2005/Atom" xmlns:media="http://


In [147]:
# Test with the GitHub Blog feed.
# NOTE(review): the variable is still named reddit_data although the URL is
# github.blog, and the recorded output below looks like an RSS-style item
# rather than an Atom entry — confirm which format this feed really uses.
reddit_data = fetch_feed("https://github.blog/feed/")
print(get_feed_title(reddit_data))  # Recorded output: "The GitHub Blog"
print(parse_items(reddit_data)[0])  # First parsed item

The GitHub Blog
{'title': 'Considerations for making a tree view component accessible', 'desciption': '<p>A deep dive on the work that went into making the component that powers repository and pull request file trees.</p>\n<p>The post <a href="https://github.blog/engineering/user-experience/considerations-for-making-a-tree-view-component-accessible/">Considerations for making a tree view component accessible</a> appeared first on <a href="https://github.blog">The GitHub Blog</a>.</p>\n', 'link': 'https://github.blog/engineering/user-experience/considerations-for-making-a-tree-view-component-accessible/'}


In [148]:
# Test with RSS 2.0
bbc_data = fetch_feed("http://feeds.bbci.co.uk/news/rss.xml")
print(get_feed_title(bbc_data))  # Recorded output: "BBC News"
print(parse_items(bbc_data)[0])  # First parsed item

BBC News
{'title': "Woman arrested for second time over girls' school crash deaths", 'desciption': 'Nuria Sajjad and Selena Lau were at a school tea party when a Land Rover crashed into the playground.', 'link': 'https://www.bbc.com/news/articles/cm27dg7e7ddo'}


In [76]:
# feed_data = fetch_feed("https://hnrss.org/frontpage")
# if feed_data:
#     print(extract_namespaces(feed_data))

In [77]:
# feed_data = fetch_feed("https://www.reddit.com/r/python/.rss")
# if feed_data:
#     print(feed_data[:200])

In [75]:
# # test
# feed_data = fetch_feed("https://www.reddit.com/r/python/.rss")
# if feed_data:
#     print(get_feed_title(feed_data))