In [1]:
import json
from bs4 import BeautifulSoup
import requests

In [2]:
HEADERS = {
    "Accept-Encoding": "gzip, deflate, sdch",
    "Accept-Language": "en-US,en;q=0.8",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
}

# main page
jubilee_house = "https://presidency.gov.gh/press-releases/"

In [3]:
response = requests.get(jubilee_house, headers=HEADERS)
soup = BeautifulSoup(response.text, "html.parser")

# extract article URLs
urls = [a["href"] for a in soup.select("div.article-i-button a.button-custom")]

print(f"Found {len(urls)} articles")

Found 12 articles


In [4]:
results = []

def extract_image_urls(img_tag):
    urls = set()
    if img_tag.get("src"):
        urls.add(img_tag["src"])
    if img_tag.get("srcset"):
        for p in img_tag["srcset"].split(","):
            url = p.strip().split(" ")[0]
            urls.add(url)
    return list(urls)

# process each article page
for url in urls:
    r = requests.get(url, headers=HEADERS)
    if r.status_code != 200:
        print(f"Failed: {url}")
        continue

    soup_url = BeautifulSoup(r.text, "html.parser")

    # extract data
    title = soup_url.find("h1", class_="h2").get_text(strip=True)
    content_div = soup_url.find("div", class_="content")
    content = content_div.get_text(separator="\n", strip=True)

    # Extract date if available
    date_tag = soup_url.find("div", class_="article-date")

    date_text = ""
    if date_tag:
        date_text = date_tag.get_text(strip=True)

    # extract all images in article content
    image_urls = []
    for img in content_div.find_all("img"):
        image_urls.extend(extract_image_urls(img))

    results.append({
        "title": title,
        "date": date_text,
        "content": content,
        "link": url,
        "images": image_urls,
    })

In [5]:
# print results cleanly

print(json.dumps(results, indent=4))


[
    {
        "title": "Ghana and Colombia strengthen ties as Vice Presidents meet in Accra.",
        "date": "12 December 2025",
        "content": "Vice President Jane Naana Opoku-Agyemang on Thursday welcomed her Colombian counterpart, Francia Elena M\u00e1rquez Mina, to Accra, marking the second visit by Colombia\u2019s vice president since she took office and signalling a deepening partnership between the two nations.\nReceiving Her Excellency M\u00e1rquez Mina, Professor Opoku-Agyemang described the visit as a reaffirmation of \u201cdeep, longstanding, and mutually respectful relations\u201d built on shared values and common aspirations.\nThe vice president praised M\u00e1rquez Mina\u2019s record as a champion of racial equity, human dignity, and reparative justice, which are causes that resonate strongly across Latin America and beyond.\nShe also noted a convergence between Colombia\u2019s efforts to secure reparations for communities affected by slavery and systemic marginal

In [6]:
# Save results as JSON
data_path = "data/press_releases.jsonl"

with open(data_path, "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("\nSaved to press_releases.jsonl")


Saved to press_releases.jsonl
