# Scraping the Courts and Tribunals Judiciary website to collect Prevention of Future Deaths (PFD) reports

In [1]:
from requests import get
from requests import ConnectionError
from bs4 import BeautifulSoup
import re
from time import sleep
from time import time
import csv
import pandas as pd

from tqdm.auto import tqdm
    
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

import concurrent

from time import sleep, time

def get_url(url):
    """Download ``url`` and return its body parsed into a BeautifulSoup tree.

    The request is made with ``verify=False``, i.e. without TLS certificate
    verification, and the body is parsed with the built-in "html.parser".
    """
    response = get(url, verify=False)
    return BeautifulSoup(response.content, "html.parser")

def retries(record_url, tries=3):
    """Fetch and parse ``record_url``, retrying on connection problems.

    Args:
        record_url: address of the page to download.
        tries: maximum number of attempts (default 3).

    Returns:
        The parsed page as returned by ``get_url``.

    Raises:
        ConnectionError: when every attempt failed. The last underlying
            error is attached as ``__cause__``.
    """
    # Imported here because the notebook's import cell never brings SSLError
    # into scope; without it the ``except`` clause below raised NameError the
    # first time any request failed.
    from requests.exceptions import SSLError

    last_error = None
    for attempt in range(tries):
        try:
            return get_url(record_url)
        except (ConnectionError, SSLError) as error:
            last_error = error
            # Pause before the next attempt, but not after the final one.
            if attempt < tries - 1:
                sleep(2)
    raise ConnectionError("Connection error") from last_error

# Scraper starts here - last run on Monday, 06 Sept 2021, 9.06 am.

Here we find the number of pages containing PFD reports.

In [2]:
# Listing page from which all PFD report pages are discovered.
prefix = "https://www.judiciary.uk/subject/prevention-of-future-deaths/"

# The page count is read from the link in the last pagination entry: it is the
# second-to-last "/"-separated piece of that link's href.
number_of_pages = int(
    get_url(prefix)
    .find("div", "pagination")
    .find_all("li")[-1]
    .find("a")["href"]
    .split("/")[-2]
)

In [3]:
def fetch_urls(page):
    """Return the report URLs listed on one page of the PFD index.

    Args:
        page: zero-based page index; listing page ``page + 1`` is requested.

    Returns:
        A list with the ``href`` of the anchor inside every
        ``<h5 class="entry-title">`` on that page. Headings without a link
        are skipped.

    Raises:
        ConnectionError: propagated from ``retries`` when the page cannot be
            fetched.
    """
    # ``prefix`` already ends with "/"; strip it so the request URL does not
    # contain a double slash.
    listing_url = "{}/page/{}".format(prefix.rstrip("/"), page + 1)
    soup = retries(listing_url)
    headings = soup.find_all('h5', {'class': 'entry-title'})
    return [
        h5.a.get('href')
        for h5 in headings
        if h5.a is not None and h5.a.get('href')
    ]

In [4]:
# ``import concurrent`` on its own does not load the ``futures`` submodule.
import concurrent.futures

record_urls = []

# Download every listing page concurrently. Threads are used rather than
# processes: the work is network-bound, and a process pool cannot reliably run
# functions defined interactively in a notebook.
with tqdm(total = number_of_pages) as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        page_futures = [executor.submit(fetch_urls, i)
                        for i in range(number_of_pages)]
        for future in concurrent.futures.as_completed(page_futures):
            pbar.update(1)

# Flatten in submission order so the URLs keep the order of the listing pages.
# ``result()`` re-raises any exception that occurred while fetching a page.
record_urls = [link for future in page_futures for link in future.result()]

  0%|          | 0/371 [00:00<?, ?it/s]

Here we check how many records (i.e. cases) were pulled from the urls & the first and last case

In [141]:
# Module-level list of scraping problems.
errors = []


class MissingRecordsError(Exception):
    """A report page had no paragraphs to process."""


class MissingFieldError(Exception):
    """An expected field is absent from a report page."""


class UnreadableFieldError(Exception):
    """A paragraph could not be matched to any known column."""


class SpecialCaseUnaccountedForError(Exception):
    """A URL was routed to special-case handling that has no rule for it."""


# Pages whose markup needs hand-written parsing.
special_cases = [
    "https://www.judiciary.uk/publications/roadsafety/",
    "https://www.judiciary.uk/publications/helen-sheath/",
    "https://www.judiciary.uk/publications/rebecca-evans/"
]

# Recognised field headers, stored in lower case.
columns = [
    name.lower()
    for name in (
        'Date of report',
        'Ref',
        'Deceased name',
        'Coroner name',
        'Coroner Area',
        'Coroners Area', #NB to be merged later; if both come up there'll be trouble
        'Category',
        "This report is being sent to",
    )
]

# Accumulators filled while scraping.
records = []
refs = []
pdflinks = []

# Matches a plural/possessive ending ("’s", "s" or "'s") followed by whitespace.
plurals_possessives = re.compile(r"’s\s|s\s|'s\s")

In [150]:
class page_scrape(object):
    """Scrape one PFD report page into ``self.extracted``, a flat dict of fields.

    All the work happens on construction: the page is downloaded, every
    paragraph inside ``div.entry-content`` is turned into a ``header -> text``
    entry (or handled by a hand-written rule for the URLs in
    ``special_cases``), and the links in ``div.download-box`` are counted.

    Side effects on module-level lists: the page's download links are appended
    to ``pdflinks`` and a non-empty reference is appended to ``refs``.

    Attributes:
        url: the page address.
        soup: the parsed page.
        information: the ``<p>`` elements found in ``div.entry-content``.
        extracted: dict of scraped values; always contains "url", "ref",
            "number of links" and "number of responses".

    Raises:
        ConnectionError: propagated from ``retries`` when the page cannot be
            downloaded.
        MissingRecordsError: the page has no paragraphs to process.
        UnreadableFieldError: a paragraph could not be matched to a column.
        SpecialCaseUnaccountedForError: the URL is in ``special_cases`` but no
            rule exists for it.
    """

    def __init__(self, url):
        self.url = url
        self.soup = retries(url, tries=5)  # raises ConnectionError once every attempt failed

        content = self.soup.find("div", {"class": "entry-content"})
        self.information = content.find_all("p") if content is not None else []

        # nothing between <p> tags to process
        if not self.information:
            raise MissingRecordsError(url)

        self.extracted = {"url": url}

        if url in special_cases:
            self._process_special_case(url)
        else:
            for field in self.information:
                self._store_field(field)

        # A page without a download box simply has no links.
        download_box = self.soup.find("div", "download-box")
        links = download_box.find_all('a', href=True) if download_box is not None else []
        responses = len([s for s in links if "response" in s.text.lower()])

        # The URL is the "href" attribute; ``link.href`` would look for a child
        # tag named <href> and yield None.
        pdflinks.append([link["href"] for link in links])

        self.extracted["number of links"] = len(links)
        self.extracted["number of responses"] = responses

        # Recipients are separated by ";", so their number is separators + 1.
        recipients = self.extracted.get("this report is being sent to")
        if recipients is not None:
            self.extracted["number of recipients"] = recipients.count(";") + 1

        # De-duplicate references: keep appending "-bis" until the value is
        # unused. Missing or empty references are stored as "" and not tracked.
        ref = self.extracted.get("ref")
        if not ref:
            self.extracted["ref"] = ""
        else:
            while ref in refs:
                ref += "-bis"
            self.extracted["ref"] = ref
            refs.append(ref)

    def _store_field(self, field):
        """Parse one paragraph and record it in ``self.extracted`` (blank ones are skipped)."""
        extraction = self.strip_field(field)
        if extraction.header:
            self.extracted[extraction.header] = extraction.text

    def _process_special_case(self, url):
        """Apply the hand-written parsing rule for a URL listed in ``special_cases``.

        Raises:
            SpecialCaseUnaccountedForError: no rule matches ``url``.
        """
        if url == 'https://www.judiciary.uk/publications/roadsafety/':
            # Values follow each <strong> label inside the first paragraph.
            fields = self.information[0].find_all('strong')
            heads = ['date of report',
                     'ref',
                     'deceased name',
                     'coroner name',
                     'coroner area',
                     'category']
            for field, h in zip(fields, heads):
                self.extracted[h] = field.next_sibling.replace(':', '').replace('Ref', '').strip()
        elif url == 'https://www.judiciary.uk/publications/helen-sheath/':
            # All fields sit in one paragraph, one "Header: value" per line.
            for line in self.information[0].text.split('\n'):
                key, separator, value = line.partition(':')
                if not separator:
                    continue  # line without a colon carries no field
                key = key.strip()
                # Keys use spaces, like every other key stored in ``extracted``.
                if key == "Coroners name":
                    alt = "coroner name"
                elif key == "Coroners Area":
                    alt = "coroner area"
                else:
                    alt = key.lower()
                self.extracted[alt] = value.strip().replace('\n', '')
        elif url == "https://www.judiciary.uk/publications/rebecca-evans/":
            for field in self.information:
                if "Rebecca-EvansR" in field.text:
                    self.extracted["category"] = field.text.split(':')[1].strip().replace('\n', '')
                else:
                    self._store_field(field)
        else:
            raise SpecialCaseUnaccountedForError(url)

    class strip_field(object):
        """Split one paragraph into a normalised ``header`` and its ``text``.

        Attributes:
            field_text: the paragraph text, stripped.
            pre_colon / post_colon: the text before / after the first colon
                (``post_colon`` is "" when there is no colon).
            header: lower-case column name, or None for a blank paragraph.
            text: the cleaned value, or None for a blank paragraph.

        Raises:
            UnreadableFieldError: the paragraph has no value after a colon and
                does not start with any known column name.
        """

        def __init__(self, field):
            self.field_text = field.text.strip()
            self.pre_colon = self.post_colon = ""
            self.header = None
            self.text = None

            if self.field_text == "":
                return

            # partition never fails, even when the text contains no colon
            self.pre_colon, _, self.post_colon = self.field_text.partition(":")
            self.header = self.pre_colon.strip().lower()

            if self.header in columns:  # Normal case: colon separates text
                self.text = self._clean(self.post_colon)
            elif self.post_colon.strip() == "":  # no colon separator (or nothing after it)
                self.header, self.text = self._no_colon_separator()
            elif self.field_text.count(":") == 2:  # two colons
                self.header, self.text = self._two_colons()
            else:  # plurals and possessives
                self.header, self.text = self._plurals_and_possessives()

        @staticmethod
        def _clean(value):
            """Trim ``value`` and remove newlines and non-breaking spaces."""
            return value.strip().replace('\n', '').replace('\xa0', '')

        def _no_colon_separator(self):
            """Handle "Header value" paragraphs: match a known column at the start of the text."""
            lowered = self.pre_colon.lower()  # ``columns`` holds lower-case names
            for column in columns:
                if lowered.startswith(column):
                    # skip the column name and the character separating it from the value
                    return (column, self._clean(self.pre_colon[len(column) + 1:]))
            raise UnreadableFieldError(self.field_text)

        def _two_colons(self):
            """Handle "Part one: part two: value": the first two pieces form the header."""
            split_by_colon = self.field_text.split(':')
            header = re.sub(plurals_possessives,
                            ' ',
                            split_by_colon[0] + split_by_colon[1]) \
                       .strip() \
                       .lower()
            text = self._clean(split_by_colon[2])
            return (header, text)

        def _plurals_and_possessives(self):
            """Normalise headers such as "Name of deceased" or "Coroner's name"."""
            words = self.pre_colon.split()
            if 'Name of' in self.pre_colon and len(words) >= 3:
                # "Name of X" becomes "x name"
                header = words[2].lower() + " name"
            else:
                header = re.sub(plurals_possessives, ' ', self.pre_colon).strip().lower()
            text = self._clean(self.post_colon)
            return (header, text)



In [151]:
# ``import concurrent`` on its own does not load the ``futures`` submodule.
import concurrent.futures

# clear old errors
errors = []

# NOTE(review): ``scrape_from_page`` is not defined in the visible notebook;
# presumably a wrapper around ``page_scrape`` that logs failures - confirm it
# exists before running this cell on a fresh kernel.
urls_to_scrape = record_urls[:100]

# Threads share this kernel's module-level lists (``errors``, ``refs``,
# ``pdflinks``), so anything a worker appends is visible here; with a process
# pool those appends would happen in the child processes and be lost.
with tqdm(total = len(urls_to_scrape)) as pbar:
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        records = [executor.submit(scrape_from_page, url) for url in urls_to_scrape]
        for future in concurrent.futures.as_completed(records):
            pbar.update(1)


name 'field_text' is not defined
name 'field_text' is not defined
name 'field_text' is not defined
name 'field_text' is not defined
name 'field_text' is not defined
name 'field_text' is not defined
name 'field_text' is not defined


TypeError: __init__() should return None, not 'dict'

In [None]:
reg_exp = re.compile(r"’s\s|s\s|'s\s")
text_cats = ['Date of report', 'Ref', 'Deceased name', 'Coroner name', 'Coroner Area', 'Category', "This report is being sent to"]
#First, I create two lists, one for the PDFs and one for the text data
record_text = []
pdf_urls = []
ref_list = []

#Every problem is logged as a dict in error_catching, the list that is turned into a DataFrame further down.
#`errors` is bound to the same list because this cell previously appended to both names.
error_catching = []
errors = error_catching

record_count = 0
#NOTE(review): only the first five URLs are scraped here - presumably a debugging limit; confirm before a full run
for record_url in tqdm(record_urls[:5]):
    #Reset for every record so the error message at the bottom never reports a paragraph left over from an earlier record
    p = None
    try:

        try:
            soup = retries(record_url, tries=5)

        except ConnectionError:
            print(f"{record_url} could not connect")
            error_catching.append({'record': record_count, 'url': record_url, 'error': 'Connection Error'})
            record_count += 1
            continue

        #This gets all the text fields from the website to work with
        content = soup.find('div', {'class':'entry-content'})
        death_info = content.find_all('p') if content is not None else []

        if not death_info:
            print(f"{record_url} produced no data")
            error_catching.append({'record': record_count, 'url': record_url, 'error': 'No Text Loaded'})
            record_count += 1
            continue

        #Our dictionary that will hold all of the text information that we will eventually append to "record_text"
        blankdict = {}

        #This is to handle 1 annoying record with messed up html tags
        if record_url == 'https://www.judiciary.uk/publications/roadsafety/':
            strong = death_info[0].find_all('strong')
            heads = ['date_of_report', 'ref', 'deceased_name', 'coroner_name', 'coroner_area', 'category']
            for st, h in zip(strong, heads):
                blankdict[h] = st.next_sibling.replace(':','').replace('Ref','').strip()
        #And another record with wonky html
        elif record_url == 'https://www.judiciary.uk/publications/helen-sheath/':
            for b in death_info[0].text.split('\n'):
                v = b.split(':')
                if len(v) < 2:
                    #A line without a colon has no value to store
                    continue
                if v[0] == "Coroners name":
                    blankdict["coroner_name"] = v[1].strip().replace('\n','')
                elif v[0] == "Coroners Area":
                    blankdict["coroner_area"] = v[1].strip().replace('\n','')
                else:
                    blankdict[v[0].strip().replace(' ','_').lower()] = v[1].strip().replace('\n','')
        else:
            #looping through all of the text categories for handling
            for p in death_info:
                #This checks for blank fields and if there is nothing, it skips it
                if p.text.strip() == '':
                    continue
                #This checks for our "Normal" case in which a colon exists and the category is one of the ones we
                #pre-specified above in the "text_cats" list
                #We also need to account here for one strange record for "Rebecca Evans" which has a weird text error
                #That we manually correct for
                elif ':' in p.text and p.text.split(':')[0] in text_cats and not 'Rebecca-EvansR.pdf' in p.text:
                    #Simply assigning the key and value from strings on either side of the colon, making everything
                    #lower case and replacing spaces with underscores and also removing any stray semi-colons
                    text_list = p.text.split(':')
                    blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')

                elif 'Rebecca-EvansR.pdf' in p.text:
                    #This deals with that singular odd record that currently exists as of 8 Nov 2019
                    blankdict['category'] = p.text.split(':')[1].strip().replace('\n','')

                elif ':' not in p.text:
                    #If the string doesn't have a colon, we can't split on it so have to get it into dictionary format
                    #Using an alternate method that counts the length of the thing
                    matching_cats = [x for x in text_cats if x in p.text]
                    if matching_cats:
                        t = matching_cats[0]
                        l = len(t)
                        blankdict[t.replace(' ','_').lower()] = p.text[l+1:].replace('\n','').replace('\xa0','')
                    elif 'Coroners Area' in p.text:
                        blankdict['coroner_area'] = p.text[13:].strip().replace('\n','').replace('\xa0','')
                    else:
                        print("Something we haven't accounted for has happened")

                elif p.text.strip().count(":") == 2:
                    #This corrects for one odd record in which there are 2 colons but should generalize to fix it for
                    #any time this could happen, so long as it happens in the same way
                    text_list = p.text.split(':')
                    new_string = text_list[0] + text_list[1]
                    new_name = re.sub(reg_exp, ' ', new_string).strip()
                    blankdict[new_name.replace(' ','_').lower()] = text_list[2].strip().replace('\n','').replace('\xa0','')

                elif ':' in p.text and p.text.split(':')[0] not in text_cats:
                    #Some field names are in the form of "name_of_decesased" or "name_of_coroner" or are plural/
                    #possessive so this smashes those into our preferred naming formats
                    if 'Name of' in p.text:
                        all_text = p.text.split(':')
                        key_name = all_text[0].split(' ')
                        blankdict[key_name[2].strip() + '_name'] = all_text[-1].strip()
                    else:
                        new_name = re.sub(reg_exp, ' ', p.text)
                        text_list = new_name.split(':')
                        blankdict[text_list[0].strip().replace(' ','_').lower()] = text_list[1].strip().replace('\n','').replace('\xa0','')
        blankdict['url'] = record_url

        #A small little check for duplicated ref names
        try:
            if not blankdict['ref']:
                pass
            elif blankdict['ref'] in ref_list:
                blankdict['ref'] = blankdict['ref'] + 'A'
            ref_list.append(blankdict['ref'])
        except KeyError:
            blankdict['ref'] = ''

        #this is a separate process to get the PDF URLs (no matter how many there are)
        #They are collected before anything is stored so that record_text and pdf_urls always grow together
        #(the download cell pairs the two lists up with zip)
        urls = soup.find_all('li', {'class':'pdf'})
        pdf_list = [url.findNext('a').get('href') for url in urls]

        #This appends the final dict and its PDF URLs to the lists
        record_text.append(blankdict)
        pdf_urls.append(pdf_list)

        record_count += 1

    except Exception as e:
        error_desc = f"{str(e)} occurred for {record_url} when trying to work with {p}"
        print(error_desc)
        error_catching.append({'record': record_count, 'url': record_url, 'error': error_desc})
        #Keep the numbering in step with the position in record_urls even when a record fails
        record_count += 1

        #Saving this in case we don't like the error catching.
        #import sys
        #raise type(e)(str(e) + '\n' + 'Error for Record: {}, Field: {}'.format(record_url, p)).with_traceback(sys.exc_info()[2])

Here is the third loop to save the PDFs using the deceased Ref as the file name

In [None]:
#Any errors should print out above, but you can also check the error_catching list
#Here we just turn it into a dataframe quickly to easily view
#NOTE(review): error_catching must already exist in the kernel (it is filled by the scraping loop) - confirm it is defined on a fresh run

error_df = pd.DataFrame(error_catching)
error_df

In [None]:
def save_file(path_string, name_string, content=None):
    """Write downloaded bytes to ``path_string.format(name_string)``.

    Args:
        path_string: path template containing one ``{}`` placeholder.
        name_string: value substituted into the placeholder. Path separators
            ("/" and "\\") in it are replaced with "-" so that values such as
            a date ("12/03/2019") cannot point into another directory.
        content: bytes to write. When omitted, the body of the module-level
            response ``myfile`` is used, which is how the download loop has
            always called this function.
    """
    if content is None:
        content = myfile.content  # legacy behaviour: latest response left in the global by the caller
    safe_name = re.sub(r'[\\/]', '-', str(name_string))
    with open(path_string.format(safe_name), 'wb') as d:
        d.write(content)

# NOTE(review): absolute, machine-specific path; a later cell lists the
# relative directory 'All_PDFs8' - confirm both point at the same folder.
save_path = '/Users/georgiarichards/Desktop/Python/PFDs opioids/All_PDFs8/{}.pdf'

# Fields tried, in this order, as the base of the PDF file name.
potential_names = ['ref', 'deceased_name', 'date_of_report']

record_count = 0
#This is the final scrape to actually get the URLs and change the name (when possible) to the refs
for r_t, p_u in zip(tqdm(record_text), pdf_urls):
    if not p_u:
        #If there is no pdf at all, we skip it.
        continue

    #The file is named after the first non-empty value among potential_names;
    #when none is available a placeholder name is used so the record can be checked by hand.
    base_name = next((r_t[x] for x in potential_names if r_t.get(x)), None)
    if base_name is None:
        base_name = 'check_record_{}'.format(record_count)

    try:
        #The first PDF gets the bare name; every further PDF of the record gets "_<n>" appended
        for counter, p in enumerate(p_u):
            myfile = get(p)  # kept as a module-level name: save_file reads it
            #Fail loudly on an HTTP error instead of saving the error page as a PDF
            myfile.raise_for_status()
            file_name = base_name if counter == 0 else '{}_{}'.format(base_name, counter)
            save_file(save_path, file_name)
    except Exception as e:
        #Chain the original exception rather than rebuilding it with type(e)(...),
        #which fails for exception classes whose constructor needs more than a message
        if r_t.get('ref'):
            raise RuntimeError(str(e) + '\n' + 'Error for Record: {}'.format(r_t['ref'])) from e
        raise RuntimeError(str(e) + '\n' + 'Error for Record Number: {}'.format(record_count)) from e

    record_count += 1

This is my final step that puts the text data (info on the deceased/case) into a csv file & adds the date it was pulled

In [None]:
from datetime import date

# Column order of the output file; DictWriter only accepts record keys listed here.
headers = [
    'date_of_report', 'date_of_reports',
    'ref',
    'deceased_name', 'deceased_names',
    'coroner_name', 'coroner_area',
    'category',
    'this_report_is_being_sent_to', 'these_report_are_being_sent_to',
    'url',
]

# One CSV per run, stamped with today's date; empty records are left out.
with open('death_info_{}.csv'.format(date.today()), 'w', newline='', encoding='utf-8') as deaths_csv:
    writer = csv.DictWriter(deaths_csv, fieldnames=headers)
    writer.writeheader()
    writer.writerows(record for record in record_text if record != {})

These are a few additional steps to check what has changed since the June 2021 records

In [None]:
import os

# File names found in each download folder.
pdfs7 = os.listdir('All_PDFs7')
pdfs8 = os.listdir('All_PDFs8')

# Names present in All_PDFs8 but not in All_PDFs7, shown in sorted order.
new_not_old = set(pdfs8) - set(pdfs7)
new_not_old_list = sorted(new_not_old)
new_not_old_list

In [None]:
# Number of file names present in All_PDFs8 but not in All_PDFs7.
len(new_not_old_list)

In [None]:
# Load the two dated extracts that are compared below.
# NOTE(review): file names are hard-coded; both files must exist in the working directory.
sep21 = pd.read_csv('death_info_2021-09-07.csv')
jun21 = pd.read_csv('death_info_2021-06-28.csv')

In [None]:
# Left-join the September extract onto the June one using every June column as
# a key; the added "_merge" column records where each row was found.
cols = jun21.columns.tolist()
merged = pd.merge(sep21, jun21, how='left', on=cols, indicator=True)

In [None]:
# Rows that exist only in the left (September) table.
l_only = merged.loc[merged['_merge'].eq('left_only')]

In [None]:
# Number of rows found only in the September extract.
len(l_only)

In [None]:
# Display the rows found only in the September extract.
l_only

In [None]:
# Save the rows found only in the September extract.
l_only.to_csv(r'death_info_newsep21.csv')

Data processing for website 

In [None]:
# Reload the September extract as the starting point for the processing below.
sep_names = pd.read_csv('death_info_2021-09-07.csv')
sep_names.head()

In [None]:
# Replace missing names with "" so the string operations in the next steps work on every row.
sep_names['deceased_name'] = sep_names['deceased_name'].fillna("")

In [None]:
# Reduce each name to the first character of every whitespace-separated word.
sep_names['deceased_name'] = sep_names['deceased_name'].map(
    lambda full_name: ''.join(word[0] for word in full_name.split())
)

In [None]:
# Remove every non-word character. The pattern is a raw string and regex=True is
# explicit: when the pattern is treated as a literal, '\W' matches nothing and
# the column is left unchanged.
sep_names['deceased_name'] = sep_names['deceased_name'].str.replace(r'\W', '', regex=True)

In [None]:
# Preview the table after the name column has been processed.
sep_names.head()

In [None]:
# Save the processed table.
sep_names.to_csv('death_info_2021-09-07_processed.csv')