In [13]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

## Extract URLs from a page

In [26]:
def _hrefs(url, timeout=30):
    """Return the site-rooted links found on the page at ``url``.

    The page is downloaded, parsed with ``html.parser`` and every ``<a>`` tag
    is inspected. Only ``href`` values that begin with ``/`` are kept; each
    one is resolved against ``url`` with ``urljoin``. Anchors without an
    ``href`` and hrefs that do not begin with ``/`` (document-relative paths,
    ``http(s)://`` links, ``#fragment`` links, ...) are skipped.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of resolved URL strings, in the order the anchors are
        encountered. Duplicates are kept.

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    hrefs = []
    for link in soup.find_all("a"):
        href = link.get("href")
        # Keep only hrefs rooted at the site ("/..."); everything else is dropped.
        if href and href.startswith("/"):
            hrefs.append(urljoin(url, href))
    return hrefs

In [15]:
# Collect the "/"-rooted links found on the 1869 Commons index page.
hrefs_list = _hrefs("https://api.parliament.uk/historic-hansard/commons/1869/")

In [16]:
hrefs_list

['https://api.parliament.uk/historic-hansard/search',
 'https://api.parliament.uk/historic-hansard/',
 'https://api.parliament.uk/historic-hansard/sittings/1860s',
 'https://api.parliament.uk/historic-hansard/commons/1860s',
 'https://api.parliament.uk/historic-hansard/commons/1868',
 'https://api.parliament.uk/historic-hansard/commons/1869/feb',
 'https://api.parliament.uk/historic-hansard/commons/1869/mar',
 'https://api.parliament.uk/historic-hansard/commons/1869/apr',
 'https://api.parliament.uk/historic-hansard/commons/1869/may',
 'https://api.parliament.uk/historic-hansard/commons/1869/jun',
 'https://api.parliament.uk/historic-hansard/commons/1869/jul',
 'https://api.parliament.uk/historic-hansard/commons/1869/aug',
 'https://api.parliament.uk/historic-hansard/commons/1870',
 'https://api.parliament.uk/historic-hansard/commons/1869/feb',
 'https://api.parliament.uk/historic-hansard/commons/1869/mar',
 'https://api.parliament.uk/historic-hansard/commons/1869/apr',
 'https://api.p

Extract Text

In [18]:
def _extract(url, timeout=30, newline_replacement=""):
    """Download the page at ``url`` and return its text as a single line.

    The raw response bytes are handed to BeautifulSoup (``html.parser``),
    the text of the whole document is taken with ``get_text()``, leading and
    trailing whitespace is stripped, and every remaining newline is replaced
    by ``newline_replacement``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
        newline_replacement: String substituted for each newline. The default
            (empty string) removes newlines outright, which concatenates
            pieces of text that were separated only by a line break; pass
            ``" "`` to keep them apart.

    Returns:
        The page text as one string without newline characters (unless
        ``newline_replacement`` itself contains one).

    Raises:
        requests.exceptions.RequestException: if the request fails or the
            server does not answer within ``timeout`` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(url, timeout=timeout)

    # Bytes (not response.text) are passed on so the parser works from the
    # undecoded payload.
    soup = BeautifulSoup(response.content, "html.parser")

    text = soup.get_text().strip()
    return text.replace("\n", newline_replacement)


In [19]:
# Fetch a single Hansard page and flatten it to one line of plain text.
text = _extract('https://api.parliament.uk/historic-hansard/commons/1869/aug/10/question')


In [20]:
text

"QUESTION. (Hansard, 10 August 1869)Search HelpHANSARD 1803–2005        →                          1860s                 →                  1869                 →                  August 1869                 →                  10 August 1869                →                          Commons Sitting                 →        ITALY—ARREST OF MR. NATHAN.QUESTION.HC Deb 10 August 1869 vol 198 c15261526 §MR. P. A. TAYLOR            said, he wished to ask the Under Secretary of State for Foreign Affairs, Whether his attention has been called to the arrest of a young man, Mr. Nathan, a British subject, at Milan in April last, and to his continued incarceration to this time without trial, bail being refused; and, whether he will state what steps Her Majesty's Government have taken in the matter?           MR. OTWAY            said, he had to say, in answer to the Question of his hon. Friend, that his attention had been called to the arrest of this young gentleman, Mr. Nathan, at Milan. It appea

In [23]:
def get_all_hrefs(base_url, year, timeout=30):
    """Collect the links found one level below an index page.

    The index page at ``urljoin(base_url, f"{year}/")`` is downloaded, every
    anchor whose ``href`` ends with ``/`` is followed, and the links that
    ``_hrefs`` reports for each followed page are gathered into one list.

    Args:
        base_url: Root the index path is appended to. It should end with
            ``/``; otherwise ``urljoin`` replaces its last path segment
            instead of appending to it.
        year: Path segment(s) of the index page relative to ``base_url``;
            a trailing ``/`` is added automatically.
        timeout: Seconds to wait for the index page before giving up.

    Returns:
        A flat list of URL strings, in crawl order. Duplicates are kept.

    Raises:
        requests.exceptions.RequestException: if fetching the index page
            fails or times out.
    """
    year_url = urljoin(base_url, f"{year}/")

    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never answers.
    response = requests.get(year_url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): only hrefs ending in "/" are followed. The output recorded
    # in this notebook for year="1869/aug" lists site-wide "sittings/<decade>"
    # links, which suggests the filter may be matching a navigation link
    # rather than the sitting pages -- confirm against the page markup.
    article_urls = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if href and href.endswith("/"):
            # Resolve against the page the link was found on, so that
            # document-relative hrefs land under year_url rather than
            # under base_url.
            article_urls.append(urljoin(year_url, href))

    all_hrefs = []
    for article_url in article_urls:
        # _hrefs already returns resolved URLs; no second urljoin is needed.
        all_hrefs.extend(_hrefs(article_url))

    return all_hrefs

In [24]:
# The second argument is named `year` but carries a month segment here, so the
# crawl starts from .../commons/1869/aug/.
listone = get_all_hrefs("https://api.parliament.uk/historic-hansard/commons/","1869/aug")

In [25]:
listone

['https://api.parliament.uk/historic-hansard/sittings/1800s',
 'https://api.parliament.uk/historic-hansard/sittings/1810s',
 'https://api.parliament.uk/historic-hansard/sittings/1820s',
 'https://api.parliament.uk/historic-hansard/sittings/1830s',
 'https://api.parliament.uk/historic-hansard/sittings/1840s',
 'https://api.parliament.uk/historic-hansard/sittings/1850s',
 'https://api.parliament.uk/historic-hansard/sittings/1860s',
 'https://api.parliament.uk/historic-hansard/sittings/1870s',
 'https://api.parliament.uk/historic-hansard/sittings/1880s',
 'https://api.parliament.uk/historic-hansard/sittings/1890s',
 'https://api.parliament.uk/historic-hansard/sittings/C20',
 'https://api.parliament.uk/historic-hansard/sittings/1800s',
 'https://api.parliament.uk/historic-hansard/sittings/1810s',
 'https://api.parliament.uk/historic-hansard/sittings/1820s',
 'https://api.parliament.uk/historic-hansard/sittings/1830s',
 'https://api.parliament.uk/historic-hansard/sittings/1840s',
 'https://