In [8]:
# 🐍 Standard library
import os
import io
import time
import glob
import json
import pickle
from datetime import datetime
import urllib.request

# 🌐 Third-party packages
import requests
import pandas as pd
from bs4 import BeautifulSoup
import pdfplumber
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdf2image import convert_from_path
import pytesseract

def fetch_all_pfd_pdf_links(base_url="https://www.judiciary.uk/page/{}/?s&pfd_report_type&post_type=pfd&order=date",
                             delay=1.0,
                             verbose=True,
                             cache_file="cached_pdf_links.json"):
    if os.path.exists(cache_file):
        with open(cache_file, "r") as f:
            cached_links = json.load(f)
            if verbose:
                print(f"[CACHE] Loaded {len(cached_links)} PDF links from {cache_file}")
            return cached_links

    page_num = 0
    people_urls = []

    while True:
        url = base_url.format(page_num)
        response = requests.get(url)
        if response.status_code == 404:
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        links = [a['href'] for a in soup.find_all('a', class_='card__link') if a.has_attr('href')]
        people_urls.extend(links)

        if verbose:
            print(f"[Page {page_num}] Fetched {len(links)} links")

        page_num += 1
        time.sleep(delay)

    pdf_urls = []
    for report_url in people_urls:
        try:
            print(f"[FETCHING PDF LINK] {report_url}")
            res = requests.get(report_url, timeout=15)
            soup = BeautifulSoup(res.content, 'html.parser')
            links = soup.find_all('a')
            found_pdf = False

            for link in links:
                href = link.get('href')
                if href and href.endswith('.pdf'):
                    pdf_urls.append(href)
                    found_pdf = True

            if not found_pdf:
                print(f"[WARNING] No PDF found on page: {report_url}")

        except requests.exceptions.Timeout:
            print(f"[TIMEOUT] Skipped {report_url} due to timeout.")
        except Exception as e:
            print(f"[ERROR] Failed to fetch PDF from {report_url}: {e}")

        time.sleep(delay)

    pdf_urls = list(set(pdf_urls))  # deduplicate
    pd.DataFrame({'urls': people_urls}).drop_duplicates().to_csv("urls.csv", index=False)
    pd.DataFrame({'pdf_urls': pdf_urls}).drop_duplicates().to_csv("pdf_urls.csv", index=False)

    with open(cache_file, "w") as f:
        json.dump(pdf_urls, f, indent=2)
        if verbose:
            print(f"[CACHE] Saved {len(pdf_urls)} PDF links to {cache_file}")

    return pdf_urls


def download_pdfs(pdf_urls, download_dir="downloads"):
    os.makedirs(download_dir, exist_ok=True)
    for link in pdf_urls:
        link = link.strip()
        filename = os.path.basename(link)
        filepath = os.path.join(download_dir, filename)
        if os.path.isfile(filepath):
            print(f"[SKIP] {filename} already exists")
            continue
        try:
            print(f"[DOWNLOADING] {filename}")
            urllib.request.urlretrieve(link, filepath)
        except Exception as e:
            print(f"[ERROR] Failed to download {link}: {e}")

def extract_text_smart(pdf_path, min_length=200, min_space_ratio=0.05):
    def space_ratio(text):
        if not text:
            return 0
        return text.count(" ") / len(text)

    # Step 1: Try pdfminer
    try:
        resource_manager = PDFResourceManager()
        fake_file_handle = io.StringIO()
        converter = TextConverter(resource_manager, fake_file_handle, laparams=LAParams())
        page_interpreter = PDFPageInterpreter(resource_manager, converter)

        with open(pdf_path, 'rb') as fh:
            for page in PDFPage.get_pages(fh, caching=True, check_extractable=False):
                page_interpreter.process_page(page)
            text = fake_file_handle.getvalue()

        converter.close()
        fake_file_handle.close()

        if text and len(text.strip()) >= min_length:
            ratio = space_ratio(text)
            if ratio >= min_space_ratio:
                return text, 'pdfminer'
            else:
                print(f"[LOW SPACE RATIO: {ratio:.3f}] Falling back to pdfplumber")
    except Exception as e:
        print(f"[PDFMINER ERROR] {e}")

    # Step 2: Try pdfplumber
    try:
        with pdfplumber.open(pdf_path) as pdf:
            text = "\n".join([page.extract_text() or "" for page in pdf.pages])
        if text and len(text.strip()) >= min_length:
            return text, 'pdfplumber'
    except Exception as e:
        print(f"[PDFPLUMBER ERROR] {e}")

    return "", "ocr"

            
    
    
    
def extract_text_from_pdfs(download_dir="downloads", text_dir="texts",
                           log_path="extraction_source.json"):
    os.makedirs(text_dir, exist_ok=True)

    # Load existing extraction log
    if os.path.exists(log_path):
        with open(log_path, "r") as f:
            source_log = json.load(f)
    else:
        source_log = {}

    for filepath in glob.glob(os.path.join(download_dir, '*.pdf')):
        filename = os.path.basename(filepath)
        output_path = os.path.join(text_dir, filename + '.txt')

        # Skip if already processed
        if filename in source_log and os.path.exists(output_path):
            print(f"[SKIP] Already extracted ({source_log[filename].upper()}): {filename}")
            continue

        try:
            print(f"[EXTRACTING] {filename}")
            text, source = extract_text_smart(filepath)

            if text:
                with open(output_path, "w", encoding="utf-8") as f:
                    f.write(text)
                source_log[filename] = source
                print(f"[SAVED] from {source.upper()}")
            else:
                print(f"[NEEDS OCR] {filename}")
                source_log[filename] = "ocr"

            # 💾 Save after every file so progress is retained
            with open(log_path, "w") as f:
                json.dump(source_log, f, indent=2)

        except Exception as e:
            print(f"[ERROR] Failed to extract {filename}: {e}")


    # Save updated log  
    source_log[filename] = source
    with open(log_path, "w") as f:
        json.dump(source_log, f, indent=2)
 

    print(f"\n📘 Updated extraction log saved to {log_path}")

           
def extract_ocr_text_for_missing(text_dir="texts", download_dir="downloads",
                                 extraction_log="extraction_source.json"):
    if not os.path.exists(extraction_log):
        print("[INFO] No extraction source log found. OCR step skipped.")
        return

    with open(extraction_log, "r") as f:
        extraction_source = json.load(f)

    for pdf_file, method in extraction_source.items():
        if method != "ocr":
            continue

        pdf_path = os.path.join(download_dir, pdf_file)
        txt_path = os.path.join(text_dir, pdf_file + ".txt")

        # Skip if already has decent text
        if os.path.exists(txt_path) and os.path.getsize(txt_path) > 50:
            print(f"[SKIP] OCR already done for {pdf_file}")
            continue

        try:
            print(f"[OCR] Running OCR for {pdf_file}")
            images = convert_from_path(pdf_path)

            ocr_text_chunks = []
            for i, img in enumerate(images):
                chunk = pytesseract.image_to_string(img)
                ocr_text_chunks.append(chunk)
                if i == 0:  # only print first page for debugging
                    print(f"[OCR DEBUG] Page 1 text preview:\n{chunk.strip()[:300]}\n")

            full_text = "\n".join(ocr_text_chunks)

            if full_text.strip():
                with open(txt_path, 'w', encoding='utf-8') as f:
                    f.write(full_text)
                print(f"[✅ OCR SAVED] {txt_path}")
            else:
                print(f"[❌ OCR TEXT EMPTY] {pdf_file}")

        except Exception as e:
            print(f"[ERROR] OCR failed for {pdf_file}: {e}")

def extract_metadata_from_urls(url_csv="urls.csv", pickle_path="people_data.pickle", json_path="people_data.json"):
    if os.path.exists(pickle_path):
        with open(pickle_path, 'rb') as f:
            people = pickle.load(f)
    else:
        people = {}

    df = pd.read_csv(url_csv)
    url_col = df.columns[0]
    urls = list(df[url_col])

    for url in urls:
        slug = url.replace("https://www.judiciary.uk/prevention-of-future-death-reports/", "").replace("/", "")
        if slug in people:
            continue
        try:
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            paragraphs = [str(p) for p in soup.find_all('p')]
            people[slug] = paragraphs
            print(f"[SCRAPED] {slug}")
        except Exception as e:
            print(f"[ERROR] Failed to get metadata from {url}: {e}")

    with open(pickle_path, 'wb') as f:
        pickle.dump(people, f, protocol=pickle.HIGHEST_PROTOCOL)
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(people, f, indent=2)
    return people


def combine_text_and_metadata_to_json(text_dir="texts", metadata_path="people_data.json",
                                      output_path="data.json", extraction_log="extraction_source.json"):
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Load extraction source log
    if os.path.exists(extraction_log):
        with open(extraction_log, 'r') as f:
            extraction_source = json.load(f)
    else:
        extraction_source = {}

    data = []
    for text_file in glob.glob(os.path.join(text_dir, '*.txt')):
        filename = os.path.basename(text_file).replace('.pdf.txt', '').replace('.txt', '')
        pdf_name = filename + ".pdf"

        with open(text_file, 'r', encoding='utf-8') as f:
            content = f.read()

        entry = {
            "filename": filename,
            "text": content,
            "metadata": metadata.get(filename, []),
            "source": extraction_source.get(pdf_name, "unknown")
        }
        data.append(entry)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    print(f"[DONE] Combined {len(data)} entries into {output_path}")

from datetime import datetime
import time as t

def run_full_pipeline(checkpoint_file="pipeline_checkpoint.json"):
    stages = [
        "fetch_pdf_links",
        "download_pdfs",
        "extract_text",
        "extract_ocr",
        "extract_metadata",
        "combine_json"
    ]

    # Load checkpoint if it exists
    checkpoint = {}
    if os.path.exists(checkpoint_file):
        with open(checkpoint_file, "r") as f:
            checkpoint = json.load(f)

    def mark_done(stage):
        checkpoint["stage"] = stage
        checkpoint["last_completed"] = datetime.now().isoformat()
        with open(checkpoint_file, "w") as f:
            json.dump(checkpoint, f, indent=2)

    def should_run(stage):
        return checkpoint.get("stage") is None or stages.index(checkpoint["stage"]) < stages.index(stage)

    # Stage 1: Fetch PDF links
    if should_run("fetch_pdf_links"):
        print("🔎 Fetching PDF links...")
        start = t.time()
        pdf_urls = fetch_all_pfd_pdf_links()
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("fetch_pdf_links")
    else:
        with open("cached_pdf_links.json") as f:
            pdf_urls = json.load(f)
        print("✅ Skipping PDF link fetch (already done)")

    # Stage 2: Download PDFs
    if should_run("download_pdfs"):
        print("\n⬇️ Downloading PDFs...")
        start = t.time()
        download_pdfs(pdf_urls)
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("download_pdfs")
    else:
        print("✅ Skipping PDF download (already done)")

    # Stage 3: Extract text
    if should_run("extract_text"):
        print("\n📄 Extracting text from PDFs...")
        start = t.time()
        extract_text_from_pdfs()
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("extract_text")
    else:
        print("✅ Skipping text extraction (already done)")

    # Stage 4: Run OCR
    if should_run("extract_ocr"):
        print("\n🧠 Running OCR on PDFs that need it...")
        start = t.time()
        extract_ocr_text_for_missing()
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("extract_ocr")
    else:
        print("✅ Skipping OCR (already done)")

    # Stage 5: Extract metadata
    if should_run("extract_metadata"):
        print("\n🗂️ Extracting metadata from report pages...")
        start = t.time()
        extract_metadata_from_urls()
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("extract_metadata")
    else:
        print("✅ Skipping metadata extraction (already done)")

    # Stage 6: Combine into JSON
    if should_run("combine_json"):
        print("\n📦 Combining text and metadata into JSON...")
        start = t.time()
        combine_text_and_metadata_to_json()
        print(f"✅ Done in {t.time() - start:.1f}s")
        mark_done("combine_json")
    else:
        print("✅ Skipping final JSON build (already done)")

    print("\n🏁 Pipeline complete.")

In [9]:
# add test case for pdfminer

In [10]:
# test case need pdfplumber

import unittest
import os
import json
import shutil

class TestPdfPlumberFallback(unittest.TestCase):
    def setUp(self):
        self.test_dir = "test_pdfplumber"
        self.download_dir = os.path.join(self.test_dir, "downloads")
        self.text_dir = os.path.join(self.test_dir, "texts")
        self.meta_json = os.path.join(self.test_dir, "people_data.json")
        self.output_json = os.path.join(self.test_dir, "data.json")
        self.extract_log = os.path.join(self.test_dir, "extraction_source.json")

        os.makedirs(self.download_dir, exist_ok=True)
        os.makedirs(self.text_dir, exist_ok=True)

        # ✅ Use your known-good local file
        self.filename = "Thomas-Ratchford-2018-0147_Redacted.pdf"
        self.local_pdf_path = os.path.join("downloads", self.filename)

        assert os.path.exists(self.local_pdf_path), "Missing local test PDF in downloads/"
        shutil.copy(self.local_pdf_path, os.path.join(self.download_dir, self.filename))

        # ✅ Create minimal dummy metadata
        dummy_slug = "Thomas-Ratchford-2018-0147_Redacted"
        dummy_data = {dummy_slug: ["<p>Dummy metadata paragraph</p>"]}
        with open(self.meta_json, "w") as f:
            json.dump(dummy_data, f, indent=2)

    def test_extraction_prefers_pdfplumber(self):
        extract_text_from_pdfs(download_dir=self.download_dir,
                               text_dir=self.text_dir,
                               log_path=self.extract_log)

        with open(self.extract_log) as f:
            log = json.load(f)

        self.assertIn(self.filename, log)
        self.assertEqual(log[self.filename], "pdfplumber", "Expected fallback to pdfplumber")

        combine_text_and_metadata_to_json(text_dir=self.text_dir,
                                          metadata_path=self.meta_json,
                                          output_path=self.output_json,
                                          extraction_log=self.extract_log)

        with open(self.output_json) as f:
            data = json.load(f)

        self.assertEqual(len(data), 1)
        self.assertIn("text", data[0])
        self.assertIn("metadata", data[0])
        self.assertEqual(data[0]["source"], "pdfplumber")
        self.assertGreater(len(data[0]["text"]), 100)

        # ✅ Show a preview of the extracted text
        print("\n✅ Extracted text preview:\n")
        print(data[0]["text"][:500])  # Show first 500 characters of the extracted text

    def tearDown(self):
        shutil.rmtree(self.test_dir)

unittest.main(argv=[''], verbosity=2, exit=False)


test_extraction_uses_pdfminer (__main__.TestPdfMinerPreferred) ... 

[EXTRACTING] 2014-0061-Response-from-Care-UK.pdf


ok
test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback) ... 

[SAVED] from PDFMINER

📘 Updated extraction log saved to test_pdfminer/extraction_source.json
[DONE] Combined 1 entries into test_pdfminer/data.json
[EXTRACTING] Thomas-Ratchford-2018-0147_Redacted.pdf
[LOW SPACE RATIO: 0.000] Falling back to pdfplumber
[SAVED] from PDFPLUMBER

📘 Updated extraction log saved to test_pdfplumber/extraction_source.json
[DONE] Combined 1 entries into test_pdfplumber/data.json

✅ Extracted text preview:

REGULATION 28: REPORT TO PREVENT FUTURE DEATHS (1)
REGULATION 28 REPORT TO PREVENT FUTURE DEATHS
THIS REPORT IS BEING SENT TO:
, Elizabeth House (Oldham) Limited, 35 Queens Road, Oldham, 0L8 2AX
CORONER
I am Catherine McKenna, Assistant Coroner for the Coroner area of Manchester North
2 CORONER’S LEGAL POWERS
I make this report under paragraph 7, Schedule 5, of the Coroner’s and Justice Act 2009 and
Regulations 28 and 29 of the Coroners (Investigations) Regulations 2013
3 INVESTIGATION and INQUE


ok

----------------------------------------------------------------------
Ran 2 tests in 0.863s

OK


<unittest.main.TestProgram at 0x7f4a9fb54ac8>

In [11]:
# test case need ocr

import unittest
import os
import json
import shutil

class TestPdfMinerPreferred(unittest.TestCase):
    def setUp(self):
        self.test_dir = "test_pdfminer"
        self.download_dir = os.path.join(self.test_dir, "downloads")
        self.text_dir = os.path.join(self.test_dir, "texts")
        self.meta_json = os.path.join(self.test_dir, "people_data.json")
        self.output_json = os.path.join(self.test_dir, "data.json")
        self.extract_log = os.path.join(self.test_dir, "extraction_source.json")

        os.makedirs(self.download_dir, exist_ok=True)
        os.makedirs(self.text_dir, exist_ok=True)

        self.filename = "2014-0061-Response-from-Care-UK.pdf"
        self.local_pdf_path = os.path.join("downloads", self.filename)

        assert os.path.exists(self.local_pdf_path), "Missing test PDF for pdfminer in downloads/"
        shutil.copy(self.local_pdf_path, os.path.join(self.download_dir, self.filename))

        dummy_slug = "2014-0061-Response-from-Care-UK"
        dummy_data = {dummy_slug: ["<p>Dummy metadata</p>"]}
        with open(self.meta_json, "w") as f:
            json.dump(dummy_data, f, indent=2)

    def test_extraction_uses_pdfminer(self):
        extract_text_from_pdfs(download_dir=self.download_dir,
                               text_dir=self.text_dir,
                               log_path=self.extract_log)

        with open(self.extract_log) as f:
            log = json.load(f)

        self.assertIn(self.filename, log)
        self.assertEqual(log[self.filename], "pdfminer")

        combine_text_and_metadata_to_json(text_dir=self.text_dir,
                                          metadata_path=self.meta_json,
                                          output_path=self.output_json,
                                          extraction_log=self.extract_log)

        with open(self.output_json) as f:
            data = json.load(f)

        self.assertEqual(len(data), 1)
        self.assertIn("text", data[0])
        self.assertIn("metadata", data[0])
        self.assertEqual(data[0]["source"], "pdfminer")
        self.assertGreater(len(data[0]["text"]), 100)

    def tearDown(self):
        shutil.rmtree(self.test_dir)

unittest.main(argv=[''], verbosity=2, exit=False)


test_extraction_uses_pdfminer (__main__.TestPdfMinerPreferred) ... 

[EXTRACTING] 2014-0061-Response-from-Care-UK.pdf


ok
test_extraction_prefers_pdfplumber (__main__.TestPdfPlumberFallback) ... 

[SAVED] from PDFMINER

📘 Updated extraction log saved to test_pdfminer/extraction_source.json
[DONE] Combined 1 entries into test_pdfminer/data.json
[EXTRACTING] Thomas-Ratchford-2018-0147_Redacted.pdf
[LOW SPACE RATIO: 0.000] Falling back to pdfplumber
[SAVED] from PDFPLUMBER

📘 Updated extraction log saved to test_pdfplumber/extraction_source.json
[DONE] Combined 1 entries into test_pdfplumber/data.json

✅ Extracted text preview:

REGULATION 28: REPORT TO PREVENT FUTURE DEATHS (1)
REGULATION 28 REPORT TO PREVENT FUTURE DEATHS
THIS REPORT IS BEING SENT TO:
, Elizabeth House (Oldham) Limited, 35 Queens Road, Oldham, 0L8 2AX
CORONER
I am Catherine McKenna, Assistant Coroner for the Coroner area of Manchester North
2 CORONER’S LEGAL POWERS
I make this report under paragraph 7, Schedule 5, of the Coroner’s and Justice Act 2009 and
Regulations 28 and 29 of the Coroners (Investigations) Regulations 2013
3 INVESTIGATION and INQUE


ok

----------------------------------------------------------------------
Ran 2 tests in 0.839s

OK


<unittest.main.TestProgram at 0x7f4a9fb76dd8>

In [12]:
run_full_pipeline()

✅ Skipping PDF link fetch (already done)
✅ Skipping PDF download (already done)
✅ Skipping text extraction (already done)
✅ Skipping OCR (already done)
✅ Skipping metadata extraction (already done)
✅ Skipping final JSON build (already done)

🏁 Pipeline complete.


In [13]:
def run_test_pipeline_single(url_csv="urls.csv", test_dir="test_run"):
    import shutil

    # Create a test directory
    os.makedirs(test_dir, exist_ok=True)

    # Clean test outputs inside test_dir
    for f in ["cached_pdf_links.json", "people_data.json", "people_data.pickle",
              "extraction_source.json", "data.json"]:
        path = os.path.join(test_dir, f)
        if os.path.exists(path):
            os.remove(path)
    shutil.rmtree(os.path.join(test_dir, "downloads"), ignore_errors=True)
    shutil.rmtree(os.path.join(test_dir, "texts"), ignore_errors=True)

    # Fetch PDF link from the single report URL
    print("\n🔗 Fetching PDF URL from page...")
    df = pd.read_csv(url_csv)
    url = df.iloc[0, 0]
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    pdf_links = [a["href"] for a in soup.find_all("a") if a.get("href", "").endswith(".pdf")]
    if not pdf_links:
        print("❌ No PDF link found.")
        return
    pdf_url = pdf_links[0]
    print(f"✅ Found PDF: {pdf_url}")

    # Save cached_pdf_links.json manually
    with open(os.path.join(test_dir, "cached_pdf_links.json"), "w") as f:
        json.dump([pdf_url], f)

    # Functions redirected to test_dir
    print("\n⬇️ Downloading...")
    download_pdfs([pdf_url], download_dir=os.path.join(test_dir, "downloads"))

    print("\n📄 Extracting text...")
    extract_text_from_pdfs(download_dir=os.path.join(test_dir, "downloads"),
                           text_dir=os.path.join(test_dir, "texts"),
                           log_path=os.path.join(test_dir, "extraction_source.json"))

    print("\n🧠 Running OCR if needed...")
    extract_ocr_text_for_missing(text_dir=os.path.join(test_dir, "texts"),
                                 download_dir=os.path.join(test_dir, "downloads"),
                                 extraction_log=os.path.join(test_dir, "extraction_source.json"))

    print("\n🗂️ Extracting metadata...")
    extract_metadata_from_urls(url_csv=url_csv,
                               pickle_path=os.path.join(test_dir, "people_data.pickle"),
                               json_path=os.path.join(test_dir, "people_data.json"))

    print("\n📦 Building final data.json...")
    combine_text_and_metadata_to_json(text_dir=os.path.join(test_dir, "texts"),
                                      metadata_path=os.path.join(test_dir, "people_data.json"),
                                      output_path=os.path.join(test_dir, "data.json"),
                                      extraction_log=os.path.join(test_dir, "extraction_source.json"))

    print("\n🔍 Preview of data.json:")
    with open(os.path.join(test_dir, "data.json")) as f:
        one = json.load(f)[0]
        for k in one:
            print(f"{k}: {str(one[k])[:100]}{'...' if len(str(one[k])) > 100 else ''}")


In [14]:
run_test_pipeline_single("urls.csv")


🔗 Fetching PDF URL from page...
✅ Found PDF: https://www.judiciary.uk/wp-content/uploads/2025/04/Sandra-Millard-Prevention-of-Future-Deaths-Report-2025-0175.pdf

⬇️ Downloading...
[DOWNLOADING] Sandra-Millard-Prevention-of-Future-Deaths-Report-2025-0175.pdf

📄 Extracting text...
[EXTRACTING] Sandra-Millard-Prevention-of-Future-Deaths-Report-2025-0175.pdf
[SAVED] from PDFMINER

📘 Updated extraction log saved to test_run/extraction_source.json

🧠 Running OCR if needed...

🗂️ Extracting metadata...
[SCRAPED] sandra-millard-prevention-of-future-deaths-report

📦 Building final data.json...
[DONE] Combined 1 entries into test_run/data.json

🔍 Preview of data.json:
filename: Sandra-Millard-Prevention-of-Future-Deaths-Report-2025-0175
text: Regulation 28: REPORT TO PREVENT FUTURE DEATHS

NOTE: This form is to be used after an inquest.

REG...
metadata: []
source: pdfminer


In [16]:
pickle_path="people_data.pickle"
with open(pickle_path, 'rb') as f:
            people = pickle.load(f)

In [17]:
people

{'sandra-millard-prevention-of-future-deaths-report': ['<p class="pill pill--single"><a href="https://www.judiciary.uk/pfd-types/emergency-services-related-deaths-2019-onwards/">Emergency services related deaths (2019 onwards)</a></p>',
  '<p>Date of report: 07/04/2025\xa0</p>',
  '<p>Ref: 2025-0175\xa0</p>',
  '<p>Deceased name: Sandra Millard\xa0</p>',
  '<p>Coroners name: Robert Simpson\xa0</p>',
  '<p>Coroners Area:\xa0Berkshire\xa0</p>',
  '<p>Category:\xa0Emergency services related deaths (2019 onwards)\xa0 \xa0</p>',
  '<p>This report is being sent to: South Central Ambulance Service | NHS England\xa0</p>',
  '<p></p>'],
 'andrew-waters-prevention-of-future-deaths-report': ['<p class="pill pill--single"><a href="https://www.judiciary.uk/pfd-types/emergency-services-related-deaths-2019-onwards/">Emergency services related deaths (2019 onwards)</a></p>',
  '<p>Date of report: 03/04/2025\xa0</p>',
  '<p>Ref: 2025-0174\xa0</p>',
  '<p>Deceased name: Andrew Waters\xa0</p>',
  '<p>Cor

In [None]:
# need to understand! what we have, what we want including print statement 
# I think issue is with combine_text_and_metadata_to_json
# look at how achieved before

def combine_text_and_metadata_to_json(text_dir="texts", metadata_path="people_data.json",
                                      output_path="data.json", extraction_log="extraction_source.json"):
    with open(metadata_path, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Load extraction source log
    if os.path.exists(extraction_log):
        with open(extraction_log, 'r') as f:
            extraction_source = json.load(f)
    else:
        extraction_source = {}

    data = []
    for text_file in glob.glob(os.path.join(text_dir, '*.txt')):
        filename = os.path.basename(text_file).replace('.pdf.txt', '').replace('.txt', '')
        pdf_name = filename + ".pdf"

        with open(text_file, 'r', encoding='utf-8') as f:
            content = f.read()

        entry = {
            "filename": filename,
            "text": content,
            "metadata": metadata.get(filename, []),
            "source": extraction_source.get(pdf_name, "unknown")
        }
        data.append(entry)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2)

    print(f"[DONE] Combined {len(data)} entries into {output_path}")
