# Scraping the Courts and Tribunals Judiciary Website for Prevent Future Death Reports 

In [1]:
from requests import get
from requests import ConnectionError
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
import csv

try:
    get_ipython
    from tqdm import tqdm_notebook as tqdm
except NameError:
    from tqdm import tqdm
    
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    
def get_url(url):
    response = get(url, verify = False)
    html = response.content
    soup = BeautifulSoup(html, "html.parser")
    return soup

# Scraper starts here - this was run on Friday, Jan 3 2020 at 9.20 pm.

In [2]:
#Your second value in "range" will be one more than the number of pages that exist on the wesbite
pages = list(range(1,297))

#This loops through all the pages to get the URLs to individual records
page_string = 'https://www.judiciary.uk/subject/prevention-of-future-deaths/page/{}/'
record_urls = []
for page in tqdm(pages):
    soup = get_url(page_string.format(str(page)))
    h5s = soup.find_all('h5', {'class': 'entry-title'})
    for h5 in h5s:
        record_urls.append(h5.a.get('href'))

HBox(children=(IntProgress(value=0, max=296), HTML(value='')))




Here we check how many records (i.e. cases) were pulled from the urls & the first and last case

In [3]:
len(record_urls)

2951

In [4]:
record_urls[0]

'https://www.judiciary.uk/publications/doris-clark/'

In [5]:
record_urls[-1]

'https://www.judiciary.uk/publications/phillip-pratt/'

Here is my second loop. This will go through the lists of URLs I just created above to visit each individual record and pull out and store the text data (info on the decreased/case) and the PDF URL I will use later

In [6]:
reg_exp = re.compile(r"’s\s|s\s|'s\s")
text_cats = ['Date of report', 'Ref', 'Deceased name', 'Coroner name', 'Coroner Area', 'Category', "This report is being sent to"]
#First, I create two lists, one for the PDFs and one for the text data
record_text = []
pdf_urls = []
ref_list = []
#I want to loop through each URL & pull out the death information and pdf link for downloading 
for record_url in tqdm(record_urls):
    #This is just a way to retry reaching the website incase there is a temporary error in reaching a specific page
    #If it fails more than 3 times, then you will get an error, but this protects against a temporary blip ruining 
    #the scrape
    try:
        tries = 3
        for i in range(tries):
            try:
                soup = get_url(record_url)
            except (ConnectionError, SSLError):
                if i < tries - 1:
                    sleep(2)
                    continue
                else:
                    raise
        #This gets all the text fields from the website to work with
        death_info = soup.find('div', {'class':'entry-content'}).find_all('p')
        #Our dictionary that will hold all of the text information that we will eventually append to "record_text"
        blankdict = {}
        #This is to handle 1 annoying record with messed up html tags
        if record_url == 'https://www.judiciary.uk/publications/roadsafety/':
            strong = death_info[0].find_all('strong')
            heads = ['date_of_report', 'ref', 'deceased_name', 'coroner_name', 'coroner_area', 'category']
            for st, h in zip(strong,heads):
                blankdict[h] = st.next_sibling.replace(':',"").strip()
        #looping through all of the text categories for handling
        for p in death_info:
            #This checks for blank fields and if there is nothing, it skips it
            if p.text.strip() == '':
                pass
            #This checks for our "Normal" case in which a colon exists and the category is one of the ones we 
            #pre-specified above in the "text_cats" list
            #We also need to account here for one strange record for "Rebecca Evans" which has a weird text error
            #That we manually correct for
            elif ':' in p.text and p.text.split(':')[0] in text_cats and not 'Rebecca-EvansR.pdf' in p.text:
                #Simply assigning the key and value from strings on either side of the colon, making everything 
                #lower case and replacing spaces with underscores and also removing any stray semi-colons
                text_list = p.text.split(':')
                blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','')
                    
            elif 'Rebecca-EvansR.pdf' in p.text:
                #This deals with that singular odd record that currently exists as of 8 Nov 2019
                blankdict['category'] = p.text.split(':')[1].strip().replace('\n','')
            elif ':' not in p.text:
                #If the string don't have a colon, we can't split on it so have to get it into dictionary format
                #Using an alternate method that counts the length of the thing
                if any(x in p.text for x in text_cats):
                    t = [x for x in text_cats if x in p.text][0]
                    l = len(t)
                    blankdict[t.replace(' ','_').lower()] = p.text[l+1:].replace('\n','')
                elif 'Coroners Area' in p.text:
                    blankdict['coroner_area'] = p.text[13:].strip().replace('\n','')
                else:
                    print("Something we haven't accounted for has happened")
            
            elif p.text.strip().count(":") == 2:
                #This corrects for one odd record in which there are 2 colons but should generalize to fix it for
                #any time this could happen, so long as it happens in the same way
                text_list = p.text.split(':')
                new_string = text_list[0] + text_list[1]
                new_name = re.sub(reg_exp, ' ', new_string).strip()
                blankdict[new_name.replace(' ','_').lower()] = text_list[2].strip().replace('\n','')
            
            elif ':' in p.text and p.text.split(':')[0] not in text_cats:
                #Some field names are in the form of "name_of_decesased" or "name_of_coroner" or are plural/
                #possessive so this smashes those into our preferred naming formats
                if 'Name of' in p.text:
                    all_text = p.text.split(':')
                    key_name = all_text[0].split(' ')
                    blankdict[key_name[2].strip() + '_name'] = all_text[-1].strip()
                else:    
                    new_name = re.sub(reg_exp, ' ', p.text)
                    text_list = new_name.split(':')
                    blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','')
        blankdict['url'] = record_url
        try:
            if not blankdict['ref']:
                pass
            elif blankdict['ref'] in ref_list:
                blankdict['ref'] = blankdict['ref'] + 'A'
            ref_list.append(blankdict['ref'])
        except KeyError:
            pass
        #This appends the final dict to the list
        record_text.append(blankdict)
        #this is a seperate process to get the PDF URLs (no matter how many there are) and adds them to their own list   
        urls = soup.find_all('li', {'class':'pdf'})
        pdf_list = []
        for url in urls:
            pdf_list.append(url.findNext('a').get('href'))
        pdf_urls.append(pdf_list)
    except Exception as e:
        #This is an exception catcher to give useful feedback for debugging
        import sys
        raise type(e)(str(e) + '\n' + 'Error for Record: {}, Field: {}'.format(record_url, p)).with_traceback(sys.exc_info()[2])

HBox(children=(IntProgress(value=0, max=2951), HTML(value='')))




Here is the third loop to save the PDFs using the deceased Ref as the file name

In [7]:
#This is the final scrape to actually get the URLs and change the name (when possible) to the refs
for r_t, p_u in zip(tqdm(record_text), pdf_urls):
    if len(p_u) == 0:
        #If there is no pdf at all, we skip it.
        pass
    else:
        #All this does is gets the PDF and downloads it and names it after the reg
        #It looks scary and complicated but all it is doing is varying the name in the case of multiple PDFs
        #Or naming it for the deceased person if there is no Ref value
        #If there is a pdf but no ref or deceased name, this will throw an error and we can adjust.
        try:
            counter = 0
            if len(p_u) > 1:
                for p in p_u:
                    if counter == 0:
                        myfile = get(p)
                        if r_t['ref']:
                            with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}.pdf'.format(r_t['ref']), 'wb') as d:
                                d.write(myfile.content)
                            counter +=1
                        else:
                            with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}.pdf'.format(r_t['deceased_name']), 'wb') as d:
                                d.write(myfile.content)
                            counter +=1
                    else:
                        myfile = get(p)
                        if r_t['ref']:
                            with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}_{}.pdf'.format(r_t['ref'], str(counter)), 'wb') as d:
                                d.write(myfile.content)
                            counter +=1
                        else:
                            with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}_{}.pdf'.format(r_t['deceased_name'], str(counter)), 'wb') as d:
                                d.write(myfile.content)
                            counter +=1
            else:
                myfile = get(p_u[0])
                if r_t['ref']:
                    with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}.pdf'.format(r_t['ref']), 'wb') as d:
                        d.write(myfile.content)
                else:
                    with open('/Users/georgiarichards/Desktop/Python stuff/PFDs opioids/All_PDFs3/{}.pdf'.format(r_t['deceased_name']), 'wb') as d:
                        d.write(myfile.content)
        except Exception as e:
            import sys
            if r_t['ref']:
                raise type(e)(str(e) + '\n' + 'Error for Record: {}'.format(r_t['ref'])).with_traceback(sys.exc_info()[2])
            else:
                raise type(e)(str(e) + '\n' + 'Error for Record: {}'.format(r_t['deceased_name'])).with_traceback(sys.exc_info()[2])

HBox(children=(IntProgress(value=0, max=2951), HTML(value='')))




This is my final step that puts the text data (info on the decreased/case) into a csv file

In [8]:
from datetime import date

headers = ['date_of_report', 'ref', 'deceased_name', 'coroner_name', 'coroner_area', 'category', "this_report_is_being_sent_to", "url"]


with open('death_info_{}.csv'.format(date.today()), 'w', newline='', encoding='utf-8') as deaths_csv:
    writer = csv.DictWriter(deaths_csv, fieldnames=headers)
    writer.writeheader()
    for record in record_text:
        if record == {}:
            pass
        else:
            writer.writerow(record)

This is an addition few steps to check what differences there are from the Dec month records 

In [9]:
import os

pdfs2 = os.listdir('All_PDFs2')
pdfs3 = os.listdir('All_PDFs3')

new_not_old = set(pdfs3).difference(pdfs2)
old_not_new = set(pdfs2).difference(pdfs3)

new_not_old_list = list(new_not_old)
new_not_old_list.sort()
new_not_old_list

['.DS_Store',
 '2013-0229A.pdf',
 '2013-0239A.pdf',
 '2013-0239A_1.pdf',
 '2013-0265A.pdf',
 '2013-0265A_1.pdf',
 '2013-0290A.pdf',
 '2013-0347A.pdf',
 '2013-0360A.pdf',
 '2013-0361A.pdf',
 '2013-0361A_1.pdf',
 '2013-0362A.pdf',
 '2013-0366A.pdf',
 '2014-0017A.pdf',
 '2014-0232A.pdf',
 '2014-0301A.pdf',
 '2014-0301A_1.pdf',
 '2014-0332A.pdf',
 '2014-0332A_1.pdf',
 '2014-0336A.pdf',
 '2014-0386A.pdf',
 '2014-0386A_1.pdf',
 '2014-0424A.pdf',
 '2014-0462A.pdf',
 '2014-0520A.pdf',
 '2014-0520A_1.pdf',
 '2014-0543A.pdf',
 '2014-0549A.pdf',
 '2014-0549A_1.pdf',
 '2014-0556A.pdf',
 '2014-0556A_1.pdf',
 '2014-0558A.pdf',
 '2014-0560A.pdf',
 '2014-0560A_1.pdf',
 '2015-0020A.pdf',
 '2015-0020A_1.pdf',
 '2015-0049A.pdf',
 '2015-0100A.pdf',
 '2015-0102A.pdf',
 '2015-0102A_1.pdf',
 '2015-0117A.pdf',
 '2015-0117A_1.pdf',
 '2015-0164A.pdf',
 '2015-0164A_1.pdf',
 '2015-0204A.pdf',
 '2015-0204A_1.pdf',
 '2015-0228A.pdf',
 '2015-0228A_1.pdf',
 '2015-0243A.pdf',
 '2015-0261A.pdf',
 '2015-0315A.pdf',
 '20

In [10]:
len(new_not_old_list)

206

In [11]:
import pandas as pd

dec = pd.read_csv('death_info_2019-12-02.csv')
jan = pd.read_csv('death_info_2020-01-03.csv')

In [12]:
cols = list(jan.columns)
merged = jan.merge(dec, on=cols, how='left', indicator=True)

In [13]:
l_only = merged[merged['_merge'] == 'left_only']

In [17]:
len(l_only)

172

In [15]:
l_only

Unnamed: 0,date_of_report,ref,deceased_name,coroner_name,coroner_area,category,this_report_is_being_sent_to,url,_merge
0,19 December 2019,2019-0444,Doris Clark,Nadia Persaud,London (East),Hospital Death (Clinical Procedures and medica...,"Barking, Havering & Redbridge University Hospi...",https://www.judiciary.uk/publications/doris-cl...,left_only
1,20 December 2019,2019-0443,Samantha Brousas,Joanna Lees,North Wale (East and Central),Emergency services related deaths (2019 onward...,Welsh Ambulance Service NHS Trust,https://www.judiciary.uk/publications/samantha...,left_only
2,16 December 2019,2019-0442,Alice Sloman,Simon Fox QC,Avon,Child Death (from 2015); Community health care...,University Hospitals Bristol; Torbay and South...,https://www.judiciary.uk/publications/alice-sl...,left_only
3,18 December 2019,2019-0441,Suzanne Roberts,Jame Healy-Pratt,West Sussex,Hospital Death (Clinical Procedures and medica...,NHS England,https://www.judiciary.uk/publications/suzanne-...,left_only
4,17 December 2019,2019-0400,Barry Liffen,Fiona Wilcox,London Inner (West),Care Home Health related deaths,Glebelands Care Team,https://www.judiciary.uk/publications/barry-li...,left_only
5,17 December 2019,2019-0439,Eugeniusz Malek,Fiona Malek,London Inner (West),Accident at Work and Health and Safety related...,HSE,https://www.judiciary.uk/publications/eugenius...,left_only
6,16 December 2019,2019-0438,Henry Campbell-Byatt,Fiona Wilcox,London Inner (West),Other related deaths,The Peligoni Club,https://www.judiciary.uk/publications/henry-ca...,left_only
7,18 December 2019,2019-0437,Katherine Stamp,Jame Healy-Pratt,West Sussex,Hospital Death (Clinical Procedures and medica...,NHS England,https://www.judiciary.uk/publications/katherin...,left_only
8,17 December 2019,2019-0436,Constance Robinson,Alan Walsh,Manchester (West),Hospital Death (Clinical Procedures and medica...,Greater Manchester Stroke Operational Delivery...,https://www.judiciary.uk/publications/constanc...,left_only
9,1 November 2019,2019-0458,Joshua Hoole,Louise Hunt,Birmingham and Solihull,Service Personnel related deaths,MOD,https://www.judiciary.uk/publications/joshua-h...,left_only


In [16]:
l_only.to_csv(r'death_info_new.csv')