In [1]:
# üêç Standard library
import os
import io
import time
import glob
import json
import pickle
from datetime import datetime
import urllib.request

# üåê Third-party packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
import pdfplumber
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdf2image import convert_from_path
import pytesseract
import numpy as np

def fetch_all_pfd_pdf_links(base_url="https://www.judiciary.uk/page/{}/?s&pfd_report_type&post_type=pfd&order=date",
                             delay=1.0,
                             verbose=True,
                             cache_file="cached_pdf_links.json"):
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_links = json.load(f)
            if verbose:
                print(f"[CACHE] Loaded {len(cached_links)} PDF links from {cache_file}")
            return cached_links

    page_num = 0
    people_urls = []

    while True:
        url = base_url.format(page_num)
        response = requests.get(url)
        if response.status_code == 404:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', class_='card__link') if a.has_attr('href')]
        people_urls.extend(links)

        if verbose:
            print(f"[Page {page_num}] Fetched {len(links)} links")

        page_num += 1
        time.sleep(delay)

    pdf_urls = []
    for report_url in people_urls:
        try:
            print(f"[FETCHING PDF LINK] {report_url}")
            res = requests.get(report_url, timeout=15)
            soup = BeautifulSoup(res.content, 'html.parser')
            links = soup.find_all('a')
            found_pdf = False

            for link in links:
                href = link.get('href')
                if href and href.endswith('.pdf'):
                    pdf_urls.append(href)
                    found_pdf = True

            if not found_pdf:
                print(f"[WARNING] No PDF found on page: {report_url}")

        except requests.exceptions.Timeout:
            print(f"[TIMEOUT] Skipped {report_url} due to timeout.")
        except Exception as e:
            print(f"[ERROR] Failed to fetch PDF from {report_url}: {e}")

        time.sleep(delay)

    pdf_urls = list(set(pdf_urls))  # deduplicate
    pd.DataFrame({'urls': people_urls}).drop_duplicates().to_csv("urls.csv", index=False)
    pd.DataFrame({'pdf_urls': pdf_urls}).drop_duplicates().to_csv("pdf_urls.csv", index=False)

    with open(cache_file, "w") as f:
        json.dump(pdf_urls, f, indent=2)
        if verbose:
            print(f"[CACHE] Saved {len(pdf_urls)} PDF links to {cache_file}")

    return pdf_urls


def download_pdfs(pdf_urls, download_dir="downloads"):
    os.makedirs(download_dir, exist_ok=True)
    for link in pdf_urls:
        link = link.strip()
        filename = os.path.basename(link)
        filepath = os.path.join(download_dir, filename)
        if os.path.isfile(filepath):
            print(f"[SKIP] {filename} already exists")
            continue
        try:
            print(f"[DOWNLOADING] {filename}")
            urllib.request.urlretrieve(link, filepath)
        except Exception as e:
            print(f"[ERROR] Failed to download {link}: {e}")

def extract_text_smart(pdf_path, min_length=200, min_space_ratio=0.05):
    def space_ratio(text):
        if not text:
            return 0
        return text.count(" ") / len(text)

    # Step 1: Try pdfminer
    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=False):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()

        converter.close()
        fake_file_handle.close()

        if text and len(text.strip()) >= min_length:
            ratio = space_ratio(text)
            if ratio >= min_space_ratio:
                return text, 'pdfminer'
            else:
                print(f"[LOW SPACE RATIO: {ratio:.3f}] Falling back to pdfplumber")
    except Exception as e:
        print(f"[PDFMINER ERROR] {e}")

    # Step 2: Try pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
        if text and len(text.strip()) >= min_length:
            return text, 'pdfplumber'
    except Exception as e:
        print(f"[PDFPLUMBER ERROR] {e}")

    return "", "ocr"

            
    
    
    
def extract_text_from_pdfs(download_dir="downloads", text_dir="texts",
                           log_path="extraction_source.json"):
    os.makedirs(text_dir, exist_ok=True)

    # Load existing extraction log
    if os.path.exists(log_path):
        with open(log_path, "r") as f:
            source_log = json.load(f)
    else:
        source_log = {}

    for filepath in glob.glob(os.path.join(download_dir, '*.pdf')):
        filename = os.path.basename(filepath)
        output_path = os.path.join(text_dir, filename + '.txt')

        # Skip if already processed
        if filename in source_log and os.path.exists(output_path):
            print(f"[SKIP] Already extracted ({source_log[filename].upper()}): {filename}")
            continue

        try:
            print(f"[EXTRACTING] {filename}")
            text, source = extract_text_smart(filepath)

            if text:
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(text)
                source_log[filename] = source
                print(f"[SAVED] from {source.upper()}")
            else:
                print(f"[NEEDS OCR] {filename}")
                source_log[filename] = "ocr"

            # üíæ Save after every file so progress is retained
            with open(log_path, "w") as f:
                json.dump(source_log, f, indent=2)

        except Exception as e:
            print(f"[ERROR] Failed to extract {filename}: {e}")


    # Save updated log  
    source_log[filename] = source
    with open(log_path, "w") as f:
        json.dump(source_log, f, indent=2)
 

    print(f"\nüìò Updated extraction log saved to {log_path}")

           
def extract_ocr_text_for_missing(text_dir="texts", download_dir="downloads",
                                 extraction_log="extraction_source.json"):
    if not os.path.exists(extraction_log):
        print("[INFO] No extraction source log found. OCR step skipped.")
        return

    with open(extraction_log, "r") as f:
        extraction_source = json.load(f)

    for pdf_file, method in extraction_source.items():
        if method != "ocr":
            continue

        pdf_path = os.path.join(download_dir, pdf_file)
        txt_path = os.path.join(text_dir, pdf_file + ".txt")

        # Skip if already has decent text
        if os.path.exists(txt_path) and os.path.getsize(txt_path) > 50:
            print(f"[SKIP] OCR already done for {pdf_file}")
            continue

        try:
            print(f"[OCR] Running OCR for {pdf_file}")
            images = convert_from_path(pdf_path)

            ocr_text_chunks = []
            for i, img in enumerate(images):
                chunk = pytesseract.image_to_string(img)
                ocr_text_chunks.append(chunk)
                if i == 0:  # only print first page for debugging
                    print(f"[OCR DEBUG] Page 1 text preview:\n{chunk.strip()[:300]}\n")

            full_text = "\n".join(ocr_text_chunks)

            if full_text.strip():
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(full_text)
                print(f"[‚úÖ OCR SAVED] {txt_path}")
            else:
                print(f"[‚ùå OCR TEXT EMPTY] {pdf_file}")

        except Exception as e:
            print(f"[ERROR] OCR failed for {pdf_file}: {e}")
            
def extract_metadata_from_urls(url_csv="urls.csv", pickle_path="people_data.pickle", json_path="people_data.json"):
    import re
    import pandas as pd
    import requests
    from bs4 import BeautifulSoup
    import os
    import json
    import pickle

    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            people = pickle.load(f)
    else:
        people = {}

    df = pd.read_csv(url_csv)
    url_col = df.columns[0]
    urls = list(df[url_col])

    for url in urls:
        try:
            page = requests.get(url, timeout=15)
            soup = BeautifulSoup(page.content, 'html.parser')

            # Find PDF link to extract filename (used as the key)
            pdf_link = next((a['href'] for a in soup.find_all('a') if a.get('href', '').endswith('.pdf')), None)
            if not pdf_link:
                print(f"[WARNING] No PDF link found at {url}")
                continue

            filename = os.path.basename(pdf_link)
            if filename in people:
                continue

            # Extract metadata paragraphs
            paragraphs = [str(p) for p in soup.find_all('p')]

            people[filename] = paragraphs
            print(f"[SCRAPED] {filename}")

        except Exception as e:
            print(f"[ERROR] Failed to get metadata from {url}: {e}")

    with open(pickle_path, 'wb') as f:
        pickle.dump(people, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(people, f, indent=2)

    return people

import os
import json
import glob
import re

from bs4 import BeautifulSoup

def combine_text_and_metadata_to_json(
    text_dir="texts",
    metadata_path="people_data.json",
    output_path="data.json",
    extraction_log="extraction_source.json"
):
    def clean(text):
        return re.sub(r"[\xa0\u200b\n\r]+", " ", text).strip()

    def extract_fields(paragraphs):
        fields = {
            "date_of_report": "",
            "ref": "",
            "name_of_deceased": "",
            "coroner_name": "",
            "coroner_area": "",
            "category": "",
        }

        for html in paragraphs:
            soup = BeautifulSoup(html, "html.parser")
            text = clean(soup.get_text())
            text_lower = text.lower()

            if "date of report" in text_lower:
                fields["date_of_report"] = text
            elif text_lower.startswith("ref"):
                fields["ref"] = text.replace("Ref:", "").strip()
            elif "deceased name" in text_lower:
                fields["name_of_deceased"] = text
            elif "coroners name" in text_lower:
                fields["coroner_name"] = text
            elif "coroners area" in text_lower:
                fields["coroner_area"] = text
            elif "category" in text_lower:
                fields["category"] = text

        return fields

    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata_dict = json.load(f)

    if os.path.exists(extraction_log):
        with open(extraction_log, 'r') as f:
            extraction_source = json.load(f)
    else:
        extraction_source = {}

    data = []
    for text_file in glob.glob(os.path.join(text_dir, '*.txt')):
        filename = os.path.basename(text_file).replace('.txt', '')
        pdf_filename = filename.replace('.pdf', '') + '.pdf'

        with open(text_file, 'r', encoding='utf-8') as f:
            content = f.read()

        raw_meta = metadata_dict.get(pdf_filename, [])
        fields = extract_fields(raw_meta)
        year_match = re.search(r"\b(20\d{2})\b", fields.get("date_of_report", ""))
        year = int(year_match.group(1)) if year_match else ""

        base_slug = os.path.splitext(pdf_filename)[0].lower().replace(" ", "-")
        clean_slug = re.sub(r'-\d{4}-\d{4}(_.*)?$', '', base_slug)
        entry = {
            "person": base_slug,
            **fields,
            "filename": filename,
            "text": content,
            "url": f"https://www.judiciary.uk/prevention-of-future-death-reports/{clean_slug}/",
            "year_of_report": year,
            "source": extraction_source.get(pdf_filename, "unknown")
}
        entry.pop("source", None)  # üî• Strip out 'source' for Tantivy
        entry.pop("year_of_report", None)# üî• Strip out 'year_of_report' for Tantivy
        data.append(entry)
        
    with open(output_path, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    print(f"[DONE] Combined {len(data)} entries into {output_path}")

from datetime import datetime
import time as t

def run_full_pipeline(checkpoint_file="pipeline_checkpoint.json"):
    stages = [
        "fetch_pdf_links",
        "download_pdfs",
        "extract_text",
        "extract_ocr",
        "extract_metadata",
        "combine_json"
    ]

    # Load checkpoint if it exists
    checkpoint = {}
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            checkpoint = json.load(f)

    def mark_done(stage):
        checkpoint["stage"] = stage
        checkpoint["last_completed"] = datetime.now().isoformat()
        with open(checkpoint_file, "w") as f:
            json.dump(checkpoint, f, indent=2)

    def should_run(stage):
        return checkpoint.get("stage") is None or stages.index(checkpoint["stage"]) < stages.index(stage)

    # Stage 1: Fetch PDF links
    if should_run("fetch_pdf_links"):
        print("üîé Fetching PDF links...")
        start = t.time()
        pdf_urls = fetch_all_pfd_pdf_links()
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("fetch_pdf_links")
    else:
        with open("cached_pdf_links.json") as f:
            pdf_urls = json.load(f)
        print("‚úÖ Skipping PDF link fetch (already done)")

    # Stage 2: Download PDFs
    if should_run("download_pdfs"):
        print("\n‚¨áÔ∏è Downloading PDFs...")
        start = t.time()
        download_pdfs(pdf_urls)
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("download_pdfs")
    else:
        print("‚úÖ Skipping PDF download (already done)")

    # Stage 3: Extract text
    if should_run("extract_text"):
        print("\nüìÑ Extracting text from PDFs...")
        start = t.time()
        extract_text_from_pdfs()
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("extract_text")
    else:
        print("‚úÖ Skipping text extraction (already done)")

    # Stage 4: Run OCR
    if should_run("extract_ocr"):
        print("\nüß† Running OCR on PDFs that need it...")
        start = t.time()
        extract_ocr_text_for_missing()
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("extract_ocr")
    else:
        print("‚úÖ Skipping OCR (already done)")

    # Stage 5: Extract metadata
    if should_run("extract_metadata"):
        print("\nüóÇÔ∏è Extracting metadata from report pages...")
        start = t.time()
        extract_metadata_from_urls()
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("extract_metadata")
    else:
        print("‚úÖ Skipping metadata extraction (already done)")

    # Stage 6: Combine into JSON
    if should_run("combine_json"):
        print("\nüì¶ Combining text and metadata into JSON...")
        start = t.time()
        combine_text_and_metadata_to_json()
        print(f"‚úÖ Done in {t.time() - start:.1f}s")
        mark_done("combine_json")
    else:
        print("‚úÖ Skipping final JSON build (already done)")

    print("\nüèÅ Pipeline complete.")

  from cryptography.hazmat.backends import default_backend


In [2]:
# test case need pdfplumber

import unittest
import os
import json
import shutil

class TestPdfPlumberFallback(unittest.TestCase):
    def setUp(self):
        self.test_dir = "test_pdfplumber"
        self.download_dir = os.path.join(self.test_dir, "downloads")
        self.text_dir = os.path.join(self.test_dir, "texts")
        self.meta_json = os.path.join(self.test_dir, "people_data.json")
        self.output_json = os.path.join(self.test_dir, "data.json")
        self.extract_log = os.path.join(self.test_dir, "extraction_source.json")

        os.makedirs(self.download_dir, exist_ok=True)
        os.makedirs(self.text_dir, exist_ok=True)

        # ‚úÖ Use your known-good local file
        self.filename = "Thomas-Ratchford-2018-0147_Redacted.pdf"
        self.local_pdf_path = os.path.join("downloads", self.filename)

        assert os.path.exists(self.local_pdf_path), "Missing local test PDF in downloads/"
        shutil.copy(self.local_pdf_path, os.path.join(self.download_dir, self.filename))

        # ‚úÖ Create minimal dummy metadata
        dummy_slug = "Thomas-Ratchford-2018-0147_Redacted"
        dummy_data = {dummy_slug: ["<p>Dummy metadata paragraph</p>"]}
        with open(self.meta_json, "w") as f:
            json.dump(dummy_data, f, indent=2)

    def test_extraction_prefers_pdfplumber(self):
        extract_text_from_pdfs(download_dir=self.download_dir,
                               text_dir=self.text_dir,
                               log_path=self.extract_log)

        with open(self.extract_log) as f:
            log = json.load(f)

        self.assertIn(self.filename, log)
        self.assertEqual(log[self.filename], "pdfplumber", "Expected fallback to pdfplumber")

        combine_text_and_metadata_to_json(text_dir=self.text_dir,
                                          metadata_path=self.meta_json,
                                          output_path=self.output_json,
                                          extraction_log=self.extract_log)

        with open(self.output_json) as f:
            data = json.load(f)

        self.assertEqual(len(data), 1)
        self.assertIn("text", data[0])
        self.assertIn("metadata", data[0])
        self.assertEqual(data[0]["source"], "pdfplumber")
        self.assertGreater(len(data[0]["text"]), 100)

        # ‚úÖ Show a preview of the extracted text
        print("\n‚úÖ Extracted text preview:\n")
        print(data[0]["text"][:500])  # Show first 500 characters of the extracted text

    def tearDown(self):
        shutil.rmtree(self.test_dir)

unittest.main(argv=[''], verbosity=2, exit=False)


test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback) ... 

[EXTRACTING] Thomas-Ratchford-2018-0147_Redacted.pdf
[LOW SPACE RATIO: 0.000] Falling back to pdfplumber
[SAVED] from PDFPLUMBER

üìò Updated extraction log saved to test_pdfplumber/extraction_source.json
[DONE] Combined 1 entries into test_pdfplumber/data.json


FAIL

FAIL: test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-2-d2678b3ba447>", line 52, in test_extraction_prefers_pdfplumber
    self.assertEqual(len(data), 1)
AssertionError: 10 != 1

----------------------------------------------------------------------
Ran 1 test in 0.589s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7fdfb0fdc1d0>

In [3]:
# test case need ocr

import unittest
import os
import json
import shutil

class TestPdfMinerPreferred(unittest.TestCase):
    def setUp(self):
        self.test_dir = "test_pdfminer"
        self.download_dir = os.path.join(self.test_dir, "downloads")
        self.text_dir = os.path.join(self.test_dir, "texts")
        self.meta_json = os.path.join(self.test_dir, "people_data.json")
        self.output_json = os.path.join(self.test_dir, "data.json")
        self.extract_log = os.path.join(self.test_dir, "extraction_source.json")

        os.makedirs(self.download_dir, exist_ok=True)
        os.makedirs(self.text_dir, exist_ok=True)

        self.filename = "2014-0061-Response-from-Care-UK.pdf"
        self.local_pdf_path = os.path.join("downloads", self.filename)

        assert os.path.exists(self.local_pdf_path), "Missing test PDF for pdfminer in downloads/"
        shutil.copy(self.local_pdf_path, os.path.join(self.download_dir, self.filename))

        dummy_slug = "2014-0061-Response-from-Care-UK"
        dummy_data = {dummy_slug: ["<p>Dummy metadata</p>"]}
        with open(self.meta_json, "w") as f:
            json.dump(dummy_data, f, indent=2)

    def test_extraction_uses_pdfminer(self):
        extract_text_from_pdfs(download_dir=self.download_dir,
                               text_dir=self.text_dir,
                               log_path=self.extract_log)

        with open(self.extract_log) as f:
            log = json.load(f)

        self.assertIn(self.filename, log)
        self.assertEqual(log[self.filename], "pdfminer")

        combine_text_and_metadata_to_json(text_dir=self.text_dir,
                                          metadata_path=self.meta_json,
                                          output_path=self.output_json,
                                          extraction_log=self.extract_log)

        with open(self.output_json) as f:
            data = json.load(f)

        self.assertEqual(len(data), 1)
        self.assertIn("text", data[0])
        self.assertIn("metadata", data[0])
        self.assertEqual(data[0]["source"], "pdfminer")
        self.assertGreater(len(data[0]["text"]), 100)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

unittest.main(argv=[''], verbosity=2, exit=False)


test_extraction_uses_pdfminer (__main__.TestPdfMinerPreferred) ... 

[EXTRACTING] 2014-0061-Response-from-Care-UK.pdf


FAIL
test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback) ... 

[SAVED] from PDFMINER

üìò Updated extraction log saved to test_pdfminer/extraction_source.json
[DONE] Combined 1 entries into test_pdfminer/data.json
[EXTRACTING] Thomas-Ratchford-2018-0147_Redacted.pdf
[LOW SPACE RATIO: 0.000] Falling back to pdfplumber
[SAVED] from PDFPLUMBER

üìò Updated extraction log saved to test_pdfplumber/extraction_source.json
[DONE] Combined 1 entries into test_pdfplumber/data.json


FAIL

FAIL: test_extraction_uses_pdfminer (__main__.TestPdfMinerPreferred)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-3-f0b9fdb8f872>", line 50, in test_extraction_uses_pdfminer
    self.assertEqual(len(data), 1)
AssertionError: 10 != 1

FAIL: test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-2-d2678b3ba447>", line 52, in test_extraction_prefers_pdfplumber
    self.assertEqual(len(data), 1)
AssertionError: 10 != 1

----------------------------------------------------------------------
Ran 2 tests in 0.786s

FAILED (failures=2)


<unittest.main.TestProgram at 0x7fdfb0fdc160>

In [4]:
run_full_pipeline()

‚úÖ Skipping PDF link fetch (already done)
‚úÖ Skipping PDF download (already done)
‚úÖ Skipping text extraction (already done)
‚úÖ Skipping OCR (already done)
‚úÖ Skipping metadata extraction (already done)
‚úÖ Skipping final JSON build (already done)

üèÅ Pipeline complete.


In [5]:
# need a urls.csv containing a single reference

def run_test_pipeline_single(url_csv="urls.csv", test_dir="test_run"):
    import shutil

    # Create a test directory
    os.makedirs(test_dir, exist_ok=True)

    # Clean test outputs inside test_dir
    for f in ["cached_pdf_links.json", "people_data.json", "people_data.pickle",
              "extraction_source.json", "data.json"]:
        path = os.path.join(test_dir, f)
        if os.path.exists(path):
            os.remove(path)
    shutil.rmtree(os.path.join(test_dir, "downloads"), ignore_errors=True)
    shutil.rmtree(os.path.join(test_dir, "texts"), ignore_errors=True)

    # Fetch PDF link from the single report URL
    print("\nüîó Fetching PDF URL from page...")
    df = pd.read_csv(url_csv)
    url = df.iloc[0, 0]
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    pdf_links = [a["href"] for a in soup.find_all("a") if a.get("href", "").endswith(".pdf")]
    if not pdf_links:
        print("‚ùå No PDF link found.")
        return
    pdf_url = pdf_links[0]
    print(f"‚úÖ Found PDF: {pdf_url}")

    # Save cached_pdf_links.json manually
    with open(os.path.join(test_dir, "cached_pdf_links.json"), "w") as f:
        json.dump([pdf_url], f)

    # Functions redirected to test_dir
    print("\n‚¨áÔ∏è Downloading...")
    download_pdfs([pdf_url], download_dir=os.path.join(test_dir, "downloads"))

    print("\nüìÑ Extracting text...")
    extract_text_from_pdfs(download_dir=os.path.join(test_dir, "downloads"),
                           text_dir=os.path.join(test_dir, "texts"),
                           log_path=os.path.join(test_dir, "extraction_source.json"))

    print("\nüß† Running OCR if needed...")
    extract_ocr_text_for_missing(text_dir=os.path.join(test_dir, "texts"),
                                 download_dir=os.path.join(test_dir, "downloads"),
                                 extraction_log=os.path.join(test_dir, "extraction_source.json"))

    print("\nüóÇÔ∏è Extracting metadata...")
    extract_metadata_from_urls(url_csv=url_csv,
                               pickle_path=os.path.join(test_dir, "people_data.pickle"),
                               json_path=os.path.join(test_dir, "people_data.json"))

    print("\nüì¶ Building final data.json...")
    combine_text_and_metadata_to_json(text_dir=os.path.join(test_dir, "texts"),
                                      metadata_path=os.path.join(test_dir, "people_data.json"),
                                      output_path=os.path.join(test_dir, "data.json"),
                                      extraction_log=os.path.join(test_dir, "extraction_source.json"))

    print("\nüîç Preview of data.json:")
    with open(os.path.join(test_dir, "data.json")) as f:
        one = json.loads(f.readline())
        for k in one:
            print(f"{k}: {str(one[k])[:100]}{'...' if len(str(one[k])) > 100 else ''}")
        

In [6]:
# run_test_pipeline_single("./test_run/urls.csv") breaks metadata?? prob not, should test

In [7]:
# fill in ref where it's missing

# Load newline-delimited JSON file (each line is a separate JSON object)
df = pd.read_json("data.json", lines=True)

# Treat empty strings as missing
df['ref'].replace("", pd.NA, inplace=True)

# Fill 'ref' from the start of 'filename' where it's missing
missing_ref = df['ref'].isna()
df.loc[missing_ref, 'ref'] = df.loc[missing_ref, 'filename'].str.extract(r"^(\d{4}-\d{4})", expand=False)

# Fill any remaining missing 'ref' with the 'url' since not all cases appear to have a ref
df['ref'].fillna(df['url'], inplace=True)

In [8]:
# have issues with broken urls and missing refs still

In [9]:
def check_url(url):
    try:
        if "judiciary.uk" in url:
            r = requests.get(url, allow_redirects=True, timeout=5)
            content = r.text.lower()
            broken = (
                r.status_code != 200 or
                "page not found" in content or
                "sorry, we can‚Äôt find" in content or
                "no results found" in content
            )
            return url, not broken
        else:
            r = requests.head(url, allow_redirects=True, timeout=5)
            return url, r.status_code == 200
    except requests.RequestException:
        return url, False

In [10]:
# Test case
def test_judiciary_soft_404():
    url = "https://www.judiciary.uk/prevention-of-future-death-reports/2016-0368-barnsley-hospital-nhs-trust/"
    checked_url, ok = check_url(url)
    if ok is False:
        print("‚úÖ Test passed: URL correctly marked as broken.")
    else:
        print("‚ùå Test failed: URL incorrectly marked as working.")

In [11]:
test_judiciary_soft_404()

‚úÖ Test passed: URL correctly marked as broken.


In [12]:
# most complete row happens to have correct url (I think), this is v hacky

# Define a completeness score that excludes nulls and empty strings/whitespace
def completeness_score(row):
    return sum(x not in [None, np.nan] and str(x).strip() != '' for x in row)

# Apply completeness score
df['completeness'] = df.apply(completeness_score, axis=1)

# Get the most complete row per ref (that has a non-null URL)
best_urls = (
    df.dropna(subset=['ref', 'url'])
      .sort_values('completeness', ascending=False)
      .drop_duplicates('ref')[['ref', 'url']]
)

# Build mapping and update
ref_to_best_url = best_urls.set_index('ref')['url']
df['url'] = df['ref'].map(ref_to_best_url).fillna(df['url'])

# Clean up
df.drop(columns='completeness', inplace=True)

In [13]:
# Step 1: Get non-empty name_of_deceased for each ref
ref_to_name = (
    df[df['name_of_deceased'].notna() & (df['name_of_deceased'].str.strip() != '')]
    .drop_duplicates('ref')
    .set_index('ref')['name_of_deceased']
)

# Step 2: Fill missing or empty name_of_deceased using that mapping
def fill_name(row):
    if pd.isna(row['name_of_deceased']) or str(row['name_of_deceased']).strip() == '':
        return ref_to_name.get(row['ref'], row['name_of_deceased'])
    return row['name_of_deceased']

df['name_of_deceased'] = df.apply(fill_name, axis=1)

In [15]:
import os
import json
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Load or create cache
cache_file = "url_status_cache.json"
if os.path.exists(cache_file):
    with open(cache_file, "r") as f:
        url_cache = json.load(f)
else:
    url_cache = {}

# Optional: force rechecking some URLs
force_recheck = True  # set to True to ignore cache

# Function to check a single URL
def check_url(url):
    try:
        if "judiciary.uk" in url:
            r = requests.get(url, allow_redirects=True, timeout=5)
            content = r.text.lower()
            broken = (
                r.status_code != 200 or
                "page not found" in content or
                "sorry, we can‚Äôt find" in content or
                "no results found" in content
            )
            return url, not broken
        else:
            r = requests.head(url, allow_redirects=True, timeout=5)
            return url, r.status_code == 200
    except requests.RequestException:
        return url, False

# Get URLs to check (exclude cached unless forced)
urls_to_check = [url for url in df['url'].dropna().unique() if force_recheck or url not in url_cache]

# Multithreaded URL checking with progress
with ThreadPoolExecutor(max_workers=20) as executor:
    futures = {executor.submit(check_url, url): url for url in urls_to_check}
    for future in tqdm(as_completed(futures), total=len(futures), desc="Checking URLs"):
        url, ok = future.result()
        url_cache[url] = ok

# Persist updated cache
with open(cache_file, "w") as f:
    json.dump(url_cache, f)

# Map results back to DataFrame
df['url_ok'] = df['url'].map(url_cache)

# Filter broken URLs
broken_urls = df[df['url_ok'] == False]

# Output summary
print(f"Checked {len(urls_to_check)} new URLs (cached: {len(url_cache) - len(urls_to_check)}).")
print(f"Found {len(broken_urls)} broken URLs.")

# Replace bad URLs using ref
df['url_ok'] = df['url_ok'].fillna(False).astype(bool)
ref_to_good_url = (
    df[df['url_ok']]
    .dropna(subset=['ref'])
    .drop_duplicates('ref')
    .set_index('ref')['url']
)
# ref_to_good_url = df[df['url_ok']].dropna(subset=['ref']).drop_duplicates('ref').set_index('ref')['url']
df['url'] = df.apply(
    lambda row: ref_to_good_url.get(row['ref'], row['url']) if not row.get('url_ok', True) else row['url'],
    axis=1
)


Checking URLs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5440/5440 [05:32<00:00, 16.34it/s]


Checked 5440 new URLs (cached: 8162).
Found 0 broken URLs.


In [16]:
# lets try setting url appropriately in another way..

mask = df['url_ok'] == False

df.loc[mask, 'url'] = (
    df.loc[mask, 'name_of_deceased']
      .str.replace(r'^Deceased name:\s*', '', regex=True)
      .str.strip()
      .str.lower()
      .str.replace(r'[^a-z0-9]+', '-', regex=True)
      .str.strip('-')
      .apply(lambda name: f"https://www.judiciary.uk/prevention-of-future-death-reports/{name}/" if pd.notna(name) and name != '' else np.nan)
)


In [21]:
def name_to_search_url(name):
    if not isinstance(name, str) or not name.strip().lower().startswith("deceased name:"):
        return np.nan
    clean_name = name.replace("Deceased name:", "").strip()
    query = '+'.join(part.capitalize() for part in clean_name.split())
    return f"https://www.judiciary.uk/?s={query}"

# Apply only to broken URLs
mask = df['url_ok'] == False
df.loc[mask, 'url'] = df.loc[mask, 'name_of_deceased'].apply(name_to_search_url)


In [24]:
# I found a mistake on the internet https://www.judiciary.uk/prevention-of-future-death-reports/williams-vickers/
# the 'broken' urls appear to be typos (based on checking the first three); strictly we ought to have captured
# the slug urls during our download process for the situation when it does not meet the expected pattern
# for now we accept this
# an elgant alternative is actually to revert to a search of the pfd website
# e.g. https://www.judiciary.uk/?s=FirstName+SecondName so lets do that
# transforms = {'https://www.judiciary.uk/prevention-of-future-death-reports/william-vickers/':'https://www.judiciary.uk/prevention-of-future-death-reports/williams-vickers/', 
#             'https://www.judiciary.uk/prevention-of-future-death-reports/grenfell-tower/':'https://www.judiciary.uk/prevention-of-future-death-reports/2018-0262-prevention-of-future-deaths-report/', 
#             'https://www.judiciary.uk/prevention-of-future-death-reports/colin-sluman/':'https://www.judiciary.uk/prevention-of-future-death-reports/colin-james/'}

In [25]:
df['ref'].str.len().unique() # some of our refs are not refs but are urls, I likely did that as some were missing
# should review

array([  9,  72,  77,  11,  81,  73,  86,  83,  80,  79,  84,  85,  98,
       136,  65,  97,  71,  78,  88,  87, 117, 109,  76, 116,  74,  94,
        10,  82,  95, 121, 105,  75,  67,   8,  89, 126, 100, 137,  69,
        92, 106,  90,  96, 110, 114,  68,  99,  91, 131,  66,  93, 104,
       124, 122, 154, 153, 112, 128, 127, 113, 118,  70, 111, 101, 108,
       149, 132, 129, 107, 171, 115, 142,  35, 135, 130,  41, 267, 245,
       281, 125])

In [29]:
df.name_of_deceased.nunique()

5375

In [30]:
df.name_of_deceased.unique()

array(['Deceased name: Malyun Karama', 'Deceased name: Theresa Robertson',
       'Deceased name: George Townsend', ...,
       'Deceased name: Susan Williams', 'Deceased name: Keith Weston',
       'Deceased name: Barbara Mitchell'], dtype=object)

In [35]:
df.person.unique()

array(['malyun-karama-2020-0162_redacted',
       'theresa-robertson-2020-0158_redacted',
       'george-townsend-2020-0157_redacted', ...,
       'john-jennings-2020-0257',
       'sean-ennis-prevention-of-future-deaths-report-2022-0054',
       'barbara-mitchell-prevention-of-future-deaths-report-2023-0153_published'],
      dtype=object)

In [42]:
df.name_of_deceased = df.name_of_deceased.str.replace(r'^Deceased name:\s*', '', regex=True)

In [43]:
df

Unnamed: 0,person,date_of_report,ref,name_of_deceased,coroner_name,coroner_area,category,filename,text,url,url_ok
0,malyun-karama-2020-0162_redacted,Date of report: 21 August 2020,2020-0162,5375,,,Category: Hospital death (Clinical procedure a...,Malyun-Karama-2020-0162_Redacted.pdf,\n\n \n\n \n1 \n\n \n2 \n\n \n3 \n\n \n4 \n\n...,https://www.judiciary.uk/prevention-of-future-...,True
1,theresa-robertson-2020-0158_redacted,Date of report: 6 August 2020,2020-0158,5375,,,"Category: Alcohol, drug and medication related...",Theresa-Robertson-2020-0158_Redacted.pdf,Regulation 28: Prevention of Future Deaths re...,https://www.judiciary.uk/prevention-of-future-...,True
2,george-townsend-2020-0157_redacted,Date of report: 4 June 2020,2020-0157,5375,,,"Category: Community healthcare related deaths,...",George-Townsend-2020-0157_Redacted.pdf,REGULATION 28: REPORT TO PREVENT FUTURE DEATH...,https://www.judiciary.uk/prevention-of-future-...,True
3,jerrelle-mckenzie-2020-0144_redacted,Date of report: 17 July 2020,2020-0144,5375,,,"Category: Alcohol, drug and medication related...",Jerrelle-McKenzie-2020-0144_Redacted.pdf,47812-2019 \n\nSenior Coroner - Emma Whitting ...,https://www.judiciary.uk/prevention-of-future-...,True
4,joan-williams-2020-0128_redacted,Date of report: 16 June 2020,2020-0128,5375,,,Category: Road (Highways Safety) related deaths,Joan-Williams-2020-0128_Redacted.pdf,48060-2019\n\nSenior Coroner - Emma Whitting\n...,https://www.judiciary.uk/prevention-of-future-...,True
...,...,...,...,...,...,...,...,...,...,...,...
11971,2014-0061-response-from-care-uk,,2014-0061,5375,,,,2014-0061-Response-from-Care-UK.pdf,care \n\nCare UK Clinical Services \nLimited ...,https://www.judiciary.uk/prevention-of-future-...,True
11972,neville-bardoliwalla-2020-0258,Date of report: 26 November 2020,2020-0258,5375,,,Category: Suicide; Other related deaths,Neville-Bardoliwalla-2020-0258.pdf,\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n ...,https://www.judiciary.uk/?s=Neville+Bardoliwalla,True
11973,john-jennings-2020-0257,Date of report: 26 November 2020,2020-0257,5375,,,Category: Emergency services related deaths; O...,John-Jennings-2020-0257.pdf,\n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n \n\n ...,https://www.judiciary.uk/prevention-of-future-...,True
11974,sean-ennis-prevention-of-future-deaths-report-...,Date of report: 21 February 2022,2022-0054,5375,,,Category: Other related deaths,Sean-Ennis-Prevention-of-future-deaths-report-...,Her Majesty‚Äôs Coroner for the \nNorthern Distr...,https://www.judiciary.uk/prevention-of-future-...,True


In [47]:
# Keep only the columns needed for Tantivy (order matters too if schema is sensitive)
columns = [
    "person", "date_of_report", "ref", "name_of_deceased", "coroner_name",
    "coroner_area", "category", "filename", "text", "url"
]
df_tantivy = df[columns]

# Save in newline-delimited JSON format (one JSON object per line)
df_tantivy.to_json("data.json", orient="records", lines=True, force_ascii=False)

In [None]:
df = pd.read_json("data.json", lines=True)

In [49]:
df.to_csv('elspeth_data.csv')