In [115]:
import csv
import os
import re
from collections import namedtuple

import requests
from bs4 import BeautifulSoup
from lxml import etree

In [48]:
# Lightweight record for one proceedings page: the year it belongs to,
# the date label of its link, and the absolute URL of the page.
Proceedings = namedtuple(typename='Proceedings',
                         field_names=('year', 'date', 'url'))

def get_decade_urls():
    """Return the URLs of the per-decade index pages.

    Downloads the sessions-papers browse page and collects the ``href`` of
    every ``<a>`` element that is a direct child of a
    ``<div class="decadeList">``.

    Returns:
        list[str]: One URL per decade link, in document order. Each URL is
        the site root concatenated with the link's ``href`` attribute.
    """
    # Get the main page
    response = requests.get('https://www.oldbaileyonline.org/browse.jsp?dir=sessionsPapers')
    # Decode the HTML to unicode.
    html = response.content.decode('utf-8')
    # Parse the HTML
    main_page = etree.HTML(html)
    # Get all the decade urls, looping through the decade list on the page.
    base_url = 'https://www.oldbaileyonline.org/'
    decade_urls = []
    # The parsed tree is `main_page`; the previous code iterated an undefined
    # name here and raised NameError on a fresh kernel.
    for link in main_page.xpath('//div[@class="decadeList"]/a'):
        full_url = base_url + link.attrib['href']
        decade_urls.append(full_url)
    return decade_urls

In [49]:
def get_proceedings_pages(decade_url):
    """Get the links to the proceedings for a given decade.

    Downloads ``decade_url`` and walks the rows of the
    ``<table class="dateTable">`` on that page. Every ``<a>`` found anywhere
    inside a row becomes one ``Proceedings`` record.

    Args:
        decade_url: URL of a decade index page (as returned by
            ``get_decade_urls``).

    Returns:
        list[Proceedings]: One record per link, where ``year`` is the text of
        the row's first child element, ``date`` is the link text and ``url``
        is the site root concatenated with the link's ``href``.
    """
    base_url = 'https://www.oldbaileyonline.org/'
    proceedings_pages = []
    page_for_decade = requests.get(decade_url)
    # Parse the response we just fetched. The previous code decoded an
    # unrelated, undefined global instead of `page_for_decade`.
    current_page = etree.HTML(page_for_decade.content.decode('utf-8'))
    rows = current_page.xpath('//table[@class="dateTable"]/tr')
    for row in rows:
        for link in row.xpath('.//a'):
            # row[0] is the first child element of the <tr>; its text is
            # used as the year for every link in that row.
            proc_page = Proceedings(year=row[0].text,
                                    date=link.text,
                                    url=base_url + link.attrib['href'])
            proceedings_pages.append(proc_page)
    return proceedings_pages

In [111]:
def get_items_from_page(page):
    """Download a proceedings page and split it into its numbered items.

    The text of the first ``<div class="sessionsPaper">`` is extracted,
    "See original" markers and non-breaking spaces are removed, and the
    result is split on the literal ``"Reference Number: "``. The chunk before
    the first marker (the page title) is discarded.

    Args:
        page: Object with a ``url`` attribute, e.g. a ``Proceedings`` record.

    Returns:
        dict[str, str]: Maps each reference number (the characters up to the
        first space after the marker) to the whitespace-normalized text that
        follows it. If a reference number occurs more than once, the last
        occurrence wins.

    Raises:
        IndexError: If the page contains no ``sessionsPaper`` div.
    """
    # Get the page URL
    page_data = requests.get(page.url)
    # Load the HTML.
    current_page = etree.HTML(page_data.content.decode('utf-8'))
    # Find the sessionsPaper div, where all the data is stored.
    sessionsPaper = current_page.xpath('//div[@class="sessionsPaper"]')[0]
    # Turn the div into a string.
    sp_as_string = etree.tostring(sessionsPaper)
    # Load that into BeautifulSoup, because BS has a nice text method.
    soup = BeautifulSoup(sp_as_string, 'lxml')
    # Get and clean the text.
    cleaned = soup.text.replace('See original\xa0', '').replace('\xa0', '')
    # Drop the title; each remaining string starts with its own reference number.
    _, *rest = cleaned.split("Reference Number: ")
    item_dict = dict()
    for item in rest:
        # Everything before the first space is the reference number, the
        # remainder is the item text (empty if there is no space at all).
        number, _sep, text = item.partition(' ')
        # Normalize the whitespace in the text. Raw string: '\s' in a plain
        # literal is an invalid escape sequence and triggers a warning.
        text = re.sub(r'\s+', ' ', text).strip()
        item_dict[number] = text
    return item_dict

In [118]:
def save_page(page, directory='./', filename=None):
    """Fetch the items of a proceedings page and save them as a TSV file.

    Each row of the output file holds a reference number and its text,
    separated by a tab, as returned by ``get_items_from_page``.

    Args:
        page: Record with ``url``, ``year`` and ``date`` attributes, e.g. a
            ``Proceedings`` namedtuple.
        directory: Directory to write into. A trailing path separator is
            optional; the directory must already exist.
        filename: Name of the output file. When omitted or empty, the name is
            ``<year>_<date>.tsv`` with spaces in the date replaced by
            underscores.

    Returns:
        None. The file is written as a side effect.
    """
    data = get_items_from_page(page)
    # Make a suitable filename
    if not filename:
        date_nospace = page.date.replace(' ', '_')
        filename = ''.join([page.year, '_', date_nospace, '.tsv'])
    # Join with os.path.join so a directory without a trailing separator
    # still yields "<directory>/<filename>" rather than a fused name.
    path = os.path.join(directory, filename)
    # Write out to a file. newline='' lets the csv module control line
    # endings itself, avoiding blank rows on platforms that translate "\n".
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(data.items())

In [112]:
# Run this cell to collect the proceedings pages of the FIRST decade only:
# fetch every decade URL, keep the first one, then list that decade's pages.
decade_urls = get_decade_urls()
durl = decade_urls[0]
pages = get_proceedings_pages(durl)
# Each entry is a Proceedings namedtuple: it behaves like a tuple, but its
# fields can also be read by name (first_page.year, first_page.date, first_page.url).
first_page = pages[0]

In [None]:
# To get the items on a page as a dictionary (reference number -> text),
# use this. Note that it downloads the page.
items = get_items_from_page(first_page)

In [119]:
# To save the items of a page to disk, use this. With the default arguments
# the file goes to the current directory and is named <year>_<date>.tsv.
save_page(first_page)