In [8]:
import pandas as pd
from tqdm import tqdm

# Hansard scraper from TheyWorkForYou archive


See [Ludovic Rheault's GitHub](https://github.com/lrheault/emotion/blob/master/webscrape_hansard.py) for example code.

We want to scrape files from [TheyWorkForYou](https://www.theyworkforyou.com), where each debate is stored as an XML file with its own URL.


In [9]:
def get_list_of_debate_urls(timeout=60):
    """Return the URLs of every debate XML file listed in the archive index.

    Downloads the directory listing at
    ``https://www.theyworkforyou.com/pwdata/scrapedxml/debates/``, parses it
    and collects each anchor whose ``href`` contains ``.xml``.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Returns
    -------
    list of str
        Absolute URLs, in the order the links appear in the index page.

    Raises
    ------
    requests.HTTPError
        If the index page is answered with an HTTP error status.
    """
    # Resolve hrefs against the index URL rather than concatenating strings
    from urllib.parse import urljoin

    # requests to look up www.theyworkforyou.com
    import requests
    # ... and BeautifulSoup to parse the resulting index page
    from bs4 import BeautifulSoup

    url = "https://www.theyworkforyou.com/pwdata/scrapedxml/debates/"
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were the index
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    soup = BeautifulSoup(response.content, 'html.parser')

    debate_urls = []
    for link in soup.find_all('a'):
        # Anchors without an href (e.g. named anchors) are skipped
        href = link.get('href')
        if href and '.xml' in href:  # We only want .xml links - not .txt or others
            debate_urls.append(urljoin(url, href))

    return debate_urls

In [10]:
# Look up every debate URL once, then keep a copy on disk as a CSV whose
# values column is named 'url', so the list can be inspected later.
debate_urls = get_list_of_debate_urls()
pd.Series(data=debate_urls, name='url').to_csv('debate_urls.csv')

In [11]:
import os
import requests


def save_debate_xml_to_disk(url, filename=None, timeout=60):
    """Download one debate XML file and write it to disk as UTF-8.

    Parameters
    ----------
    url : str
        Address of the XML file to download.
    filename : str, optional
        Destination path. Defaults to ``'debates_xml/'`` followed by the last
        path segment of ``url``. Missing parent directories are created.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 60).

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status; nothing is written
        in that case, so error pages never end up stored as debate XML.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Round-trip through str so the file is always valid UTF-8: any byte that
    # cannot be decoded is swapped for the replacement character U+FFFD.
    decoded_response = response.content.decode(errors='replace')
    encoded_response = decoded_response.encode('utf-8')

    # Set filename to write to disk
    if filename is None:
        filename = 'debates_xml/' + url.split('/')[-1]

    # Create the target directory if needed. A bare filename has no directory
    # part (dirname is ''), and os.makedirs('') would raise, so skip it then.
    directory = os.path.dirname(filename)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(filename, 'wb') as f:
        f.write(encoded_response)

In [12]:
# Fetch each listed debate in turn and store it under its default filename;
# tqdm wraps the list so the cell shows a progress bar while it runs.
for url in tqdm(debate_urls):
    save_debate_xml_to_disk(url=url)

100%|██████████| 18877/18877 [1:03:00<00:00,  4.99it/s]
