### Block 1 - Setup and imports

In [1]:
import re
import time
from datetime import datetime
from typing import Optional, Tuple
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [4]:
# --- ECB endpoints ---------------------------------------------------------
# Site root, and the press-conference index page derived from it.
ECB_BASE = "https://www.ecb.europa.eu"
INDEX_URL = f"{ECB_BASE}/press/pressconf/html/index.en.html"

# Headers sent along with the HTTP requests issued by this notebook.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; AcademicReplicationBot/1.0; +https://www.ecb.europa.eu/)"
}

# --- NLTK resources --------------------------------------------------------
# Quietly download the "stopwords" and "punkt" NLTK packages; the stop-word
# corpus must be present before it is read into `stops` below.
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)

# Shared text-processing helpers: a Porter stemmer and the English stop-word set.
porter = PorterStemmer()
stops = set(stopwords.words("english"))

### Block 2 - Collect the statement URLs from the ECB index page

- Downloads the ECB press conference index page.

- Extracts all links.

- Keeps only the English monetary-policy statement pages, i.e. links whose path contains `press_conference/monetary-policy-statement` and ends in `.en.html`.

In [6]:
def get_statement_links(index_url: str = INDEX_URL) -> pd.DataFrame:
    """Collect the English monetary-policy statement links found on an index page.

    The page at ``index_url`` is downloaded with the module-level ``HEADERS``,
    every ``<a href=...>`` is read, and only hrefs that contain
    ``press_conference/monetary-policy-statement`` and end in ``.en.html``
    are kept. Duplicates are removed and the result is sorted.

    Parameters
    ----------
    index_url : str
        URL of the page to scan. Defaults to ``INDEX_URL``.

    Returns
    -------
    pd.DataFrame
        One row per matching link with two columns: ``link`` (the href as it
        appears in the page, whitespace-stripped) and ``url`` (the href
        resolved against ``ECB_BASE``). Both columns are present even when
        nothing matches.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a 4xx/5xx response.

    Notes
    -----
    NOTE(review): the recorded run of this notebook found only one candidate
    page. Only anchors present in the downloaded HTML can be seen here, so if
    the index page fills in its listing after load, those links are missed.
    TODO confirm.
    """
    resp = requests.get(index_url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # The set comprehension de-duplicates while filtering; sorting afterwards
    # keeps the row order deterministic between runs.
    links = sorted(
        {
            href
            for href in (a["href"].strip() for a in soup.find_all("a", href=True))
            if "press_conference/monetary-policy-statement" in href
            and href.endswith(".en.html")
        }
    )

    # urljoin gives the same result as plain prefixing for root-relative hrefs
    # ("/press/..."), but leaves already-absolute hrefs intact instead of
    # producing "https://www.ecb.europa.euhttps://...".
    urls = [urljoin(ECB_BASE, link) for link in links]

    # Building both columns from lists avoids str + Series arithmetic, which
    # can fail on an empty, non-string column when no link matches.
    return pd.DataFrame({"link": links, "url": urls})

# Scrape the default index page, then report how many links passed the filter.
df_links = get_statement_links()
print(f"Found {len(df_links)} candidate statement pages.")

# Preview the first rows of the result.
df_links.head(n=5)

Found 1 candidate statement pages.


Unnamed: 0,link,url
0,/press/press_conference/monetary-policy-statem...,https://www.ecb.europa.eu/press/press_conferen...
