### Block 1 - Setup and imports

In [13]:
import re
import time
from datetime import datetime
from typing import Optional, Tuple
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

### Block 2 - Selenium scrape the index page and build the link table

In [12]:
# Scrape the ECB monetary policy statement index page with Selenium and build
# `df`, a one-column table ("link") of the English statement pages it links to.

# Instantiate the driver -> replace with your browser's driver if needed.
driver = webdriver.Chrome()
try:
    # Go directly to the monetary policy statement index page (EN).
    driver.get("https://www.ecb.europa.eu/press/press_conference/monetary-policy-statement/html/index.en.html")

    # Scroll down repeatedly so the page can load all of its content.
    # scrollBy is relative, so each step scrolls further than the previous one.
    for x in range(0, 10000, 200):
        driver.execute_script("window.scrollBy(0, " + str(x) + ");")
        time.sleep(0.5)

    # Grab the rendered page source while the browser session is still alive.
    html = driver.page_source
finally:
    # Always close the browser, even if loading or scrolling failed; otherwise
    # every run of this cell leaves a Chrome process behind.
    driver.quit()

soup = BeautifulSoup(html, "html.parser")

# Collect every distinct href on the page (sorted for a reproducible order).
r = sorted({a["href"] for a in soup.find_all("a", href=True)})

# Construct the link table.
df = pd.DataFrame(r, columns=["link"]).sort_values("link")

# Keep only monetary policy statement documents (EN HTML), excluding the index
# itself. The index file name is matched literally (regex=False) so its dots
# are not interpreted as regex wildcards.
df = df.loc[
    (df["link"].str.contains("press_conference/monetary-policy-statement", na=False)) &
    (df["link"].str.contains(r"\.en\.html$", na=False)) &
    (~df["link"].str.contains("index.en.html", na=False, regex=False))
].reset_index(drop=True)

print(f"TOTAL DOCUMENTS : {len(df)}")
df.head()

TOTAL DOCUMENTS : 292


Unnamed: 0,link
0,/press/press_conference/monetary-policy-statem...
1,/press/press_conference/monetary-policy-statem...
2,/press/press_conference/monetary-policy-statem...
3,/press/press_conference/monetary-policy-statem...
4,/press/press_conference/monetary-policy-statem...


### Block 3 - Download each statement page and extract `content`, `title` and `date`

In [None]:
# Download every statement page listed in `df` and extract its body text,
# title and publication date. The three lists below stay aligned with the rows
# of `df`: exactly one entry is appended to each list per row, whether the
# page was parsed successfully or not.
content = []
date = []
title = []

HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; AcademicReplication/1.0)"}

print(f"TOTAL DOCUMENTS : {len(df)}")

for _, row in tqdm(df.iterrows(), total=df.shape[0]):

    url = "https://www.ecb.europa.eu" + row["link"]

    try:
        req = requests.get(url, headers=HEADERS, timeout=30)
        req.raise_for_status()
        soup = BeautifulSoup(req.content, "html.parser")

        # Body text: every non-empty <h2>/<p> inside a <div class="section">.
        c = []
        for section in soup.find_all("div", {"class": "section"}):
            for text_container in section.find_all(["h2", "p"]):
                txt = text_container.get_text(strip=True)
                if txt:
                    c.append(txt)

        # Fallback if the page structure differs and we captured nothing:
        # take the text of <main>/<article>, or of the whole page.
        if len(c) == 0:
            main = soup.find("main") or soup.find("article")
            fallback_text = main.get_text(" ", strip=True) if main else soup.get_text(" ", strip=True)
            c = [fallback_text] if fallback_text else []

        doc_content = "\n".join(c)

        # Title: the <h1> text, kept only when it looks like a statement or
        # press conference heading; otherwise None so the row is dropped below.
        doc_title = None
        h1_tag = soup.find("h1")
        if h1_tag:
            h1_text = h1_tag.get_text(strip=True)
            h1_low = h1_text.lower()
            if (
                ("introductory statement" in h1_low) or
                ("monetary policy statement" in h1_low) or
                ("press conference" in h1_low)
            ):
                doc_title = h1_text

        # Date: prefer the article:published_time meta tag, then fall back to
        # the datetime attribute of the first <time> element.
        doc_date = None
        meta_date = soup.find("meta", {"property": "article:published_time"})
        if meta_date and meta_date.get("content"):
            doc_date = meta_date["content"].strip()
        else:
            time_tag = soup.find("time")
            if time_tag and time_tag.get("datetime"):
                doc_date = time_tag["datetime"].strip()

    except Exception as e:
        # Best effort: report the failure and record an empty placeholder row.
        print(f"Error on URL {url}: {e}")
        doc_content, doc_title, doc_date = "", None, None

    # Append once per row, outside the try block. Appending inside it could
    # leave the lists with different lengths when an error occurred part-way
    # through parsing, which would break the column assignments below.
    content.append(doc_content)
    title.append(doc_title)
    date.append(doc_date)

df["content"] = content
df["date"] = date
df["title"] = title

# Drop rows that failed title/date extraction
df = df.dropna(subset=["title", "date"]).reset_index(drop=True)

print(f"DOCUMENTS AFTER FILTERING : {len(df)}")
df.head()


TOTAL DOCUMENTS : 292


 69%|██████▉   | 202/292 [03:56<02:03,  1.37s/it]

### Block 4 - Convert dates, create year, remove unwanted rows, and validate counts

In [7]:
# Step 1: parse the raw date strings (unparseable values become NaT), discard
# rows without a usable date, and derive the calendar year from the date.
parsed_dates = pd.to_datetime(df["date"], errors="coerce")
df = (
    df.assign(date=parsed_dates)
    .dropna(subset=["date"])
    .assign(year=lambda frame: frame["date"].dt.year)
)

# Step 2: leave out everything dated 1998.
df = df.loc[df["year"].ne(1998)].copy()

# Step 3: remove links that are known not to belong in the sample.
intruder_links = [
    "/press/press_conference/monetary-policy-statement/2000/html/is000330.en.html",
    "/press/press_conference/monetary-policy-statement/2000/html/is001019.en.html",
    "/press/press_conference/monetary-policy-statement/2001/html/is011213.en.html",
    "/press/press_conference/monetary-policy-statement/2002/html/is020103_2.en.html",
    "/press/press_conference/monetary-policy-statement/2003/html/is030917.en.html",
    "/press/press_conference/monetary-policy-statement/2003/html/is031013.en.html",
    "/press/press_conference/monetary-policy-statement/2005/html/is050120.en.html",
    "/press/press_conference/monetary-policy-statement/2005/html/is050120_1.en.html",
    "/press/press_conference/monetary-policy-statement/2014/html/is141026.en.html",
    "/press/press_conference/monetary-policy-statement/2021/html/ecb.sp210708~ab68c3bd9d.en.html",
    "/press/press_conference/monetary-policy-statement/html/index.en.html",
]

is_intruder = df["link"].isin(intruder_links)
df = df.loc[~is_intruder].copy()

# Step 4: order chronologically with a fresh 0..n-1 index, then report how
# many documents remain per year and in total.
df = df.sort_values(by="date", ignore_index=True)

counts_by_year = df["year"].value_counts().sort_index()
print("Number of documents by year:")
print(counts_by_year)
print(f"\nTotal documents: {len(df)}")

df.loc[:, ["date", "year", "title", "link"]].head(10)


Downloading and parsing 1 pages...
Parsed statements: 1
Errors: 0


Unnamed: 0,link,url,date,title,content
0,/press/press_conference/monetary-policy-statem...,https://www.ecb.europa.eu/press/press_conferen...,2025-12-18,Monetary policy statements,ANYTIME PAST MONTH PAST YEAR Search Options Im...


In [None]:
# Every remaining link must still point at a monetary policy statement page.
statement_link_mask = df["link"].str.contains("monetary-policy-statement", na=False)
assert statement_link_mask.all()

# The earlier sort by date should leave the dates in non-decreasing order.
dates_in_order = df["date"].is_monotonic_increasing
assert dates_in_order
