In [5]:
# Robust crypto RSS collector with real User-Agent and per-feed diagnostics

import requests, feedparser, pandas as pd
from pathlib import Path
from datetime import datetime, timezone

FEEDS = [
    # News
    "https://www.coindesk.com/arc/outboundfeeds/rss/?outputType=xml",
    "https://cointelegraph.com/rss",
    "https://www.theblock.co/rss",
    "https://decrypt.co/feed",
    "https://bitcoinmagazine.com/.rss/full/",
    # Exchange / research blogs (often good sentiment/context)
    "https://www.binance.com/en/feed/rss",          # sometimes needs UA
    "https://blog.kraken.com/feed",
    "https://www.okx.com/learn/rss",
]

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
    )
}

def parse_entry_dt(e):
    for k in ("published", "updated"):
        if k in e:
            try:
                return pd.to_datetime(e[k], utc=True, errors="coerce")
            except Exception:
                pass
    for k in ("published_parsed", "updated_parsed"):
        t = getattr(e, k, None)
        if t:
            try:
                return pd.Timestamp(datetime(*t[:6], tzinfo=timezone.utc))
            except Exception:
                pass
    return pd.NaT

def fetch_and_parse(url, timeout=25):
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        # Some feeds return bytes/xml; pass content to feedparser
        return feedparser.parse(resp.content)
    except Exception as ex:
        print(f"✗ {url} -> fetch error: {ex}")
        return None

rows = []
print("Fetching feeds...")
for url in FEEDS:
    feed = fetch_and_parse(url)
    if not feed or not getattr(feed, "entries", None):
        title = getattr(feed, "feed", {}).get("title") if feed else None
        print(f"• {title or url}: 0 entries")
        continue

    source = feed.feed.get("title", url)
    count = 0
    for e in feed.entries:
        rows.append({
            "source": source,
            "title": e.get("title"),
            "link": e.get("link"),
            "published": parse_entry_dt(e),
            "summary": e.get("summary"),
        })
        count += 1
    print(f"• {source}: {count} entries")

df = pd.DataFrame(rows)

if df.empty:
    print("\nNo entries parsed. If this persists, your network/DNS may be blocking RSS or some feeds. "
          "Try running again later, or test a single feed in the browser to confirm it loads.")
else:
    if "published" not in df.columns:
        df["published"] = pd.NaT
    df = df.drop_duplicates(subset=["link"]).sort_values("published", na_position="last")
    out_dir = Path("crypto_rss/crypto_rss_data")
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / "crypto_news_rss.csv", index=False)
    print(f"\nSaved {len(df)} items -> {out_dir/'crypto_news_rss.csv'}")

df.tail(5) if not df.empty else df

Fetching feeds...
• CoinDesk: Bitcoin, Ethereum, Crypto News and Price Data: 25 entries
• Cointelegraph.com News: 31 entries
✗ https://www.theblock.co/rss -> fetch error: 404 Client Error: Not Found for url: https://www.theblock.co/rss
• https://www.theblock.co/rss: 0 entries
• Decrypt: 57 entries
• Bitcoin Magazine: 10 entries
• https://www.binance.com/en/feed/rss: 0 entries
• Kraken Blog: 10 entries
✗ https://www.okx.com/learn/rss -> fetch error: 404 Client Error: Not Found for url: https://www.okx.com/learn/rss
• https://www.okx.com/learn/rss: 0 entries

Saved 133 items -> crypto_rss/crypto_rss_data/crypto_news_rss.csv


Unnamed: 0,source,title,link,published,summary
3,"CoinDesk: Bitcoin, Ethereum, Crypto News and P...","Top Crypto Traders Flip Bearish on BTC, ETH in...",https://www.coindesk.com/markets/2025/08/18/to...,2025-08-18 14:28:38+00:00,
2,"CoinDesk: Bitcoin, Ethereum, Crypto News and P...","BTCS to Pay First-Ever Ether Dividend, Loyalty...",https://www.coindesk.com/business/2025/08/18/b...,2025-08-18 14:41:06+00:00,
1,"CoinDesk: Bitcoin, Ethereum, Crypto News and P...",Core Scientific Faces Valuation Disconnect; PT...,https://www.coindesk.com/markets/2025/08/18/co...,2025-08-18 14:47:39+00:00,
0,"CoinDesk: Bitcoin, Ethereum, Crypto News and P...",Insurance Against Price Slides in BlackRock's ...,https://www.coindesk.com/markets/2025/08/18/in...,2025-08-18 14:49:13+00:00,
56,Decrypt,Famed Short Seller Warns Strategy’s $51M Bitco...,https://decrypt.co/335607/short-seller-strateg...,2025-08-18 15:02:01+00:00,Short-seller James Chanos calls Strategy's lat...
