In [97]:
pip install beautifulsoup4 lxml

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
import json
import requests
import feedparser
from datetime import datetime
from pathlib import Path

In [7]:
# ========== CONFIG ==========
# The NewsAPI key is read from the NEWSAPI_KEY environment variable so the
# secret is never stored in the notebook. Export it before starting the kernel;
# when it is unset the value is an empty string.
# NOTE: a key that was ever committed in this cell should be treated as exposed
# and rotated.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY", "")
QUERY = "fintech OR lending OR investment OR credit OR platform OR loan"
LANGUAGE = "en"
PAGE_SIZE = 100  # sent as the "pageSize" request parameter

# Bloomberg RSS feeds; the key is the section label embedded in each
# article's "source" field by fetch_bloomberg_rss().
RSS_FEEDS = {
    "Markets": "https://feeds.bloomberg.com/markets/news.rss",
    "Politics": "https://feeds.bloomberg.com/politics/news.rss",
    "Business": "https://feeds.bloomberg.com/business/news.rss",
    "Technology": "https://feeds.bloomberg.com/technology/news.rss",
    "Economics": "https://feeds.bloomberg.com/economics/news.rss",
    "Industries": "https://feeds.bloomberg.com/industries/news.rss"
}

In [5]:
# ========== NEWSAPI FETCH ==========
def fetch_newsapi():
    """Fetch articles matching QUERY from NewsAPI's /v2/everything endpoint.

    Uses the module-level QUERY, LANGUAGE, PAGE_SIZE and NEWSAPI_KEY settings
    and asks for results sorted by publication time.

    Returns:
        list[dict]: one record per article with the keys "source", "url",
        "title", "published_at", "content" and "platforms_mentioned"
        (always an empty list here). An empty list is returned when the
        request cannot be completed or the response status is not 200.
    """
    print("Fetching from NewsAPI...")
    url = "https://newsapi.org/v2/everything"
    params = {
        "q": QUERY,
        "language": LANGUAGE,
        "pageSize": PAGE_SIZE,
        "sortBy": "publishedAt",
        "apiKey": NEWSAPI_KEY,
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"NewsAPI error: request failed - {exc}")
        return []
    if response.status_code != 200:
        print(f"NewsAPI error: {response.status_code} - {response.text}")
        return []

    articles = response.json().get("articles", [])
    print(f"→ NewsAPI: {len(articles)} articles fetched.")

    # .get() lookups keep one malformed article from aborting the whole batch.
    return [
        {
            "source": f"{(a.get('source') or {}).get('name', 'N/A')} [NewsAPI]",
            "url": a.get("url", ""),
            "title": a.get("title", ""),
            "published_at": a.get("publishedAt", ""),
            "content": a.get("content", ""),
            "platforms_mentioned": [],
        }
        for a in articles
    ]

In [1]:
# ========== BLOOMBERG RSS FETCH ==========
def fetch_bloomberg_rss():
    """Parse every feed in RSS_FEEDS and return all entries as article dicts.

    Each dict carries "source" (tagged with the feed's section label), "url",
    "title", "published_at" (empty string when the entry has none), "content"
    and an empty "platforms_mentioned" list.
    """
    print("Fetching Bloomberg RSS feeds...")
    collected = []
    for section, rss_url in RSS_FEEDS.items():
        parsed = feedparser.parse(rss_url)
        collected.extend(
            {
                "source":           f"Bloomberg - {section} [RSS]",
                "url":              item.link,
                "title":            item.title,
                "published_at":     item.published if "published" in item else "",
                # summary attribute first, otherwise the description key
                "content":          getattr(item, 'summary', item.get('description', "")),
                "platforms_mentioned": [],
            }
            for item in parsed.entries
        )
    print(f"→ Bloomberg RSS: {len(collected)} articles fetched.")
    return collected

In [1]:
# ========== SEC FETCH ==========
def fetch_sec_press_releases():
    """Parse the SEC press-release RSS feed into a list of article dicts.

    Every dict has the keys "source", "url", "title", "published_at",
    "content" and "platforms_mentioned"; a missing publish date or summary
    becomes an empty string.
    """
    sec_feed = feedparser.parse("https://www.sec.gov/news/pressreleases.rss")

    releases = [
        {
            "source": "SEC Press Releases [RSS]",
            "url": item.link,
            "title": item.title,
            "published_at": getattr(item, "published", ""),
            "content": item.get("summary", ""),
            "platforms_mentioned": [],
        }
        for item in sec_feed.entries
    ]

    print(f"→ SEC Press Releases: {len(releases)} fetched.")
    return releases

In [11]:
# ========== GNEWS FETCH ==========
def fetch_gnews_financial_times():
    """Search GNews for English-language, US articles about lending or credit.

    Despite the function name, the query is not restricted to the Financial
    Times: it searches titles and descriptions for "lending OR credit".

    The API token is read from the GNEWS_API_KEY environment variable so the
    secret is not stored in the notebook.

    Returns:
        list[dict]: one record per article with the keys "source", "url",
        "title", "published_at", "content" (the article description) and
        "platforms_mentioned" (always an empty list here). An empty list is
        returned when the token is missing, the request cannot be completed,
        or the response status is not 200.
    """
    print("Fetching from GNews (lending and credit)...")
    api_key = os.environ.get("GNEWS_API_KEY", "")
    if not api_key:
        print("GNews error: GNEWS_API_KEY environment variable is not set")
        return []

    url = "https://gnews.io/api/v4/search"
    params = {
        "q": "lending OR credit",
        "in": "title,description",
        "lang": "en",
        "country": "us",
        "max": 100,
        "token": api_key
    }

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        print(f"GNews error: request failed - {exc}")
        return []
    if response.status_code != 200:
        print(f"GNews error: {response.status_code} - {response.text}")
        return []

    articles = response.json().get("articles", [])
    print(f"→ GNews: {len(articles)} articles fetched.")

    return [
        {
            "source": f"{a.get('source', {}).get('name', 'N/A')} [GNews]",
            "url": a.get("url", ""),
            "title": a.get("title", ""),
            "published_at": a.get("publishedAt", ""),
            "content": a.get("description", ""),
            "platforms_mentioned": [],
        }
        for a in articles
    ]

In [13]:
# ========== INVESTING.COM RSS FETCH ==========
def fetch_investing_rss():
    """Parse the two Investing.com RSS feeds and return their entries.

    The feed label is stored as each article's "source"; "published_at" and
    "content" fall back to an empty string when the entry lacks them.
    """
    print("Fetching Investing.com RSS feeds...")
    feed_sources = (
        ("Investing.com (English) [RSS]", "https://www.investing.com/rss/news_25.rss?limit=20"),
        ("Investing.com (German) [RSS]", "https://de.investing.com/rss/news_95.rss"),
    )
    collected = []
    for source_label, rss_url in feed_sources:
        parsed = feedparser.parse(rss_url)
        collected.extend(
            {
                "source": source_label,
                "url": item.link,
                "title": item.title,
                "published_at": item.published if "published" in item else "",
                "content": item.get("summary", ""),
                "platforms_mentioned": [],
            }
            for item in parsed.entries
        )
    print(f"→ Investing.com RSS: {len(collected)} articles fetched.")
    return collected

In [3]:
# ========== CRUNCHBASE FETCH ==========
import requests, json
from bs4 import BeautifulSoup
from dateutil import parser

def _find_news_article(data):
    """Return the first JSON-LD object typed "NewsArticle" in `data`, else None.

    `data` may be a single decoded JSON-LD value or a list of them; anything
    that is not a dict is ignored instead of raising.
    """
    candidates = data if isinstance(data, list) else [data]
    for candidate in candidates:
        if isinstance(candidate, dict) and candidate.get("@type") == "NewsArticle":
            return candidate
    return None


def _published_date(raw):
    """Reduce a JSON-LD date value to its YYYY-MM-DD date part.

    Falls back to the text before "T" (or the raw string) when the value is
    not parseable as ISO 8601; non-string values yield an empty string.
    """
    if not isinstance(raw, str):
        return ""
    try:
        return parser.isoparse(raw).date().isoformat()
    except (ValueError, OverflowError):
        return raw.split("T")[0] if "T" in raw else raw


def fetch_crunchbase_sections():
    """
    Scrape three Crunchbase News sections and deep-fetch each
    article's JSON-LD to extract a proper published_at and content.

    For every <h2> on a section page that contains a link, the linked article
    page is downloaded and its "application/ld+json" scripts are searched for
    a NewsArticle object. The publish date is reduced to YYYY-MM-DD; the
    content is the JSON-LD description, else articleBody, else the <p> that
    follows the <h2> on the section page. Sections that define keywords keep
    only articles whose lower-cased title + content contains one of them.

    Returns:
        list[dict]: records with the keys "source", "url", "title",
        "published_at", "content" and "platforms_mentioned" (empty list).

    Raises:
        requests.HTTPError: a section or article page answered with an error
            status (via raise_for_status()).
        requests.RequestException: the request itself failed, including
            exceeding the timeout.
    """
    BASE_URL = "https://news.crunchbase.com"
    REQUEST_TIMEOUT = 30  # seconds; keeps a stalled server from hanging the run
    sections = [
        {
            "label": "Crunchbase News – Fintech [Scrape]",
            "url": f"{BASE_URL}/sections/fintech-ecommerce/",
            "keywords": {"lending", "credit", "finance", "regulation", "regulations"},
        },
        {
            "label": "Crunchbase News – IPO [Scrape]",
            "url": f"{BASE_URL}/sections/public/ipo/",
            "keywords": None,
        },
        {
            "label": "Crunchbase News – Seed Funding [Scrape]",
            "url": f"{BASE_URL}/sections/seed/",
            "keywords": None,
        },
    ]

    headers = {"User-Agent": "Mozilla/5.0"}
    articles = []

    for sec in sections:
        section_resp = requests.get(sec["url"], headers=headers, timeout=REQUEST_TIMEOUT)
        section_resp.raise_for_status()
        soup = BeautifulSoup(section_resp.text, "lxml")

        # each H2 with a link is one article teaser on the section page
        for h2 in soup.find_all("h2"):
            link_tag = h2.find("a", href=True)
            if not link_tag:
                continue

            title = link_tag.get_text(strip=True)
            href  = link_tag["href"]
            url   = href if href.startswith("http") else (BASE_URL + href)

            # now deep-fetch the article page
            art = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            art.raise_for_status()
            art_soup = BeautifulSoup(art.text, "lxml")

            # find the JSON-LD with "@type": "NewsArticle"
            published_iso = ""
            content_snip = ""
            for script in art_soup.find_all("script", type="application/ld+json"):
                try:
                    data = json.loads(script.string)
                except (TypeError, ValueError):
                    # empty <script> (string is None) or invalid JSON
                    continue

                article = _find_news_article(data)
                if article is None:
                    continue

                # extract publish date, keeping only the calendar date
                dp = article.get("datePublished") or article.get("uploadDate")
                if dp:
                    published_iso = _published_date(dp)
                # extract a snippet: description is the summary, articleBody the full text
                content_snip = article.get("description") or article.get("articleBody", "")
                break  # stop after first NewsArticle

            # if JSON-LD gave no text, fall back to the section-page teaser
            if not content_snip:
                p = h2.find_next_sibling("p")
                content_snip = p.get_text(strip=True) if p else ""

            # keyword filter applies only to sections that define keywords
            if sec["keywords"]:
                txt = (title + " " + content_snip).lower()
                if not any(k in txt for k in sec["keywords"]):
                    continue

            articles.append({
                "source":    sec["label"],
                "url":       url,
                "title":     title,
                "published_at": published_iso,
                "content":     content_snip,
                "platforms_mentioned": [],
            })

    print(f"→ Crunchbase News (all sections): {len(articles)} fetched.")
    return articles

In [11]:
# ========== SAVE ==========
# ── Output location: a "data" folder under the current working directory ───────
# The path follows wherever the kernel (or a CI job) was started from.
BASE_DIR = Path.cwd()
SAVE_DIR = BASE_DIR.joinpath("data")
SAVE_DIR.mkdir(parents=True, exist_ok=True)
# ────────────────────────────────────────────────────────────────────────────────

def save_articles(articles):
    """Dump `articles` as indented JSON to SAVE_DIR/news_<YYYY-MM-DD>.json.

    The date in the file name is today's date from datetime.now(); the file
    is opened in "w" mode, so an existing file for the same day is replaced.
    """
    stamp = f"{datetime.now():%Y-%m-%d}"
    filepath = SAVE_DIR / ("news_" + stamp + ".json")

    with filepath.open("w", encoding="utf-8") as out_file:
        json.dump(articles, out_file, indent=2)
    print(f"✅ Saved {len(articles)} articles to {filepath}")

In [57]:
# ========== RUN ==========
# Query each source in turn; every fetcher returns a list of article dicts.
newsapi_articles = fetch_newsapi()
rss_articles = fetch_bloomberg_rss()
gnews_articles = fetch_gnews_financial_times()
investing_articles = fetch_investing_rss()
sec_articles = fetch_sec_press_releases()
crunchbase_articles = fetch_crunchbase_sections()

# Merge the per-source lists into one, in the order they are written to disk.
all_articles = [
    *newsapi_articles,
    *rss_articles,
    *gnews_articles,
    *investing_articles,
    *sec_articles,
    *crunchbase_articles,
]

# Nothing is saved when every source came back empty.
if len(all_articles) > 0:
    save_articles(all_articles)

Fetching from NewsAPI...
→ NewsAPI: 99 articles fetched.
Fetching Bloomberg RSS feeds...
→ Bloomberg RSS: 180 articles fetched.
Fetching from GNews (lending and credit)...
→ GNews: 10 articles fetched.
Fetching Investing.com RSS feeds...
→ Investing.com RSS: 20 articles fetched.
→ SEC Press Releases: 25 fetched.
→ Crunchbase News (all sections): 24 fetched.
✅ Saved 358 articles to /Users/florianterne/Documents/M.Sc DMBA/Consulting Project/exaloan_news_tracker/data/news_2025-04-22.json
