In [1]:
import os
import re
import glob
import json
import csv

def check_text_quality(text, min_length=500, min_alpha_ratio=0.7, verbose=False):
    quality = {
        "length": len(text),
        "alnum_ratio": len(re.findall(r'\w', text)) / max(len(text), 1),
        "long_word_count": len([w for w in text.split() if len(w) > 3]),
    }

    quality["ok"] = (
        quality["length"] >= min_length and
        quality["alnum_ratio"] >= min_alpha_ratio and
        quality["long_word_count"] >= 10
    )

    if verbose and not quality["ok"]:
        print(f"⚠️ Quality check failed: {quality}")

    return quality

def clean_text(text):
    text = re.sub(r'(.)\1{3,}', r'\1', text)  # Collapse repeated characters
    text = text.replace('ﬁ', 'fi').replace('ﬂ', 'fl')  # Fix ligatures
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII
    text = re.sub(r'-\s*\n\s*', '', text)  # Remove hyphenation at line breaks
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize whitespace
    return text

def process_texts(text_dir="texts", cleaned_dir="texts_cleaned",
                  low_quality_log="low_quality_files.json",
                  stats_csv="text_quality_stats.csv",
                  skip_low_quality=True):
    os.makedirs(cleaned_dir, exist_ok=True)

    # Load low quality log if it exists
    low_quality = []
    if os.path.exists(low_quality_log):
        with open(low_quality_log, "r") as f:
            low_quality = json.load(f)

    cleaned_files = {os.path.basename(f) for f in glob.glob(os.path.join(cleaned_dir, "*.txt"))}
    files = glob.glob(os.path.join(text_dir, "*.txt"))
    print(f"🔍 Found {len(files)} files in '{text_dir}'")

    stats = []

    for path in files:
        filename = os.path.basename(path)
        if filename in cleaned_files:
            print(f"[SKIP] Already cleaned: {filename}")
            continue

        with open(path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        cleaned = clean_text(raw_text)
        quality = check_text_quality(cleaned, verbose=True)
        quality["filename"] = filename
        stats.append(quality)

        if skip_low_quality and not quality["ok"]:
            print(f"[LOW QUALITY] {filename} — skipping")
            low_quality.append(filename)
            continue

        out_path = os.path.join(cleaned_dir, filename)
        with open(out_path, "w", encoding="utf-8") as f:
            f.write(cleaned)
        print(f"[CLEANED] {filename}")

    # Deduplicate and save low quality log
    low_quality = sorted(set(low_quality))
    with open(low_quality_log, "w") as f:
        json.dump(low_quality, f, indent=2)

    print(f"\n📉 Logged {len(low_quality)} low-quality files to {low_quality_log}")

    # Write CSV of stats
    if stats:
        keys = ["filename", "length", "alnum_ratio", "long_word_count", "ok"]
        with open(stats_csv, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=keys)
            writer.writeheader()
            writer.writerows(stats)
        print(f"📊 Saved quality stats to {stats_csv}")

if __name__ == "__main__":
    process_texts()


🔍 Found 11976 files in 'texts'
[SKIP] Already cleaned: Malyun-Karama-2020-0162_Redacted.pdf.txt
[SKIP] Already cleaned: Theresa-Robertson-2020-0158_Redacted.pdf.txt
[SKIP] Already cleaned: George-Townsend-2020-0157_Redacted.pdf.txt
[SKIP] Already cleaned: Jerrelle-McKenzie-2020-0144_Redacted.pdf.txt
[SKIP] Already cleaned: Joan-Williams-2020-0128_Redacted.pdf.txt
[SKIP] Already cleaned: Lynda-Pedersen-2020-0112.pdf.txt
[SKIP] Already cleaned: Tomasz-Nowasad-2019-0445_Redacted.pdf.txt
[SKIP] Already cleaned: Keith-WHETTON-2019-0452_Redacted.pdf.txt
[SKIP] Already cleaned: Kieran-Hubbard-2019-0451_Redacted.pdf.txt
[SKIP] Already cleaned: Barry-Liffen-2019-0440_Redacted.pdf.txt
[SKIP] Already cleaned: Eugeniusz-MALEK-2019-0439_Redacted.pdf.txt
[SKIP] Already cleaned: Peter-Frosdick-2019-0423_Redacted.pdf.txt
[SKIP] Already cleaned: 2019-0378-Royal-College-of-General-Practitioners.pdf.txt
[SKIP] Already cleaned: David-Kirsch-2019-0362_Redacted.pdf.txt
[SKIP] Already cleaned: Alex-Malcolm-2

[SKIP] Already cleaned: 2018-0173-Kernow-NHS-Trust.pdf.txt
[SKIP] Already cleaned: Elaine-Horrocks-2018-0169_Redacted.pdf.txt
[SKIP] Already cleaned: Mwitumwa-Ngenda-2018-0167_Redacted.pdf.txt
[SKIP] Already cleaned: Michalla-Sweeting-2018-0165_Redacted.pdf.txt
[SKIP] Already cleaned: Alfie-Scambler-Holt-2018-0156_Redacted.pdf.txt
[SKIP] Already cleaned: George-Goldby-2018-0104_Redacted.pdf.txt
[SKIP] Already cleaned: Harry-Jellicoe-2018-0108_Redacted.pdf.txt
[SKIP] Already cleaned: Karen-Edgar-2018-0106_Redacted.pdf.txt
[SKIP] Already cleaned: Lakhminder-Kaur-2018-0029.pdf.txt
[SKIP] Already cleaned: Bethany-Shipsey-2018-0049_Redacted.pdf.txt
[SKIP] Already cleaned: Riaz-Begum-2018-0041_Redacted.pdf.txt
[SKIP] Already cleaned: David-Buttriss-2018-0010_Redacted.pdf.txt
[SKIP] Already cleaned: Rose-Ball-2017-0395_Redacted.pdf.txt
[SKIP] Already cleaned: Anthony-Grant-2017-0410_Redacted.pdf.txt
[SKIP] Already cleaned: Robert-Richards-2017-0406_Redacted.pdf.txt
[SKIP] Already cleaned: Mic

[SKIP] Already cleaned: Sousse-Inquest-2017-0206.pdf.txt
[SKIP] Already cleaned: Fairclough-2017-0119.pdf.txt
[SKIP] Already cleaned: Krasinsky-Lloyd-2017-0109.pdf.txt
[SKIP] Already cleaned: FOSTER-2017-0095.pdf.txt
[SKIP] Already cleaned: Hooper-2017-0068_Redacted.pdf.txt
[SKIP] Already cleaned: Bevan-2017-0060.pdf.txt
[SKIP] Already cleaned: Alexander-2017-0044.pdf.txt
[SKIP] Already cleaned: Evanson-2016-0359.pdf.txt
[SKIP] Already cleaned: Lock-2016-0355.pdf.txt
[SKIP] Already cleaned: Bassendine-2016-0424.pdf.txt
[SKIP] Already cleaned: Arthur-2017-0009.pdf.txt
[SKIP] Already cleaned: ODonoghue-2017-0007.pdf.txt
[SKIP] Already cleaned: Mills-2016-0416.pdf.txt
[SKIP] Already cleaned: Charles-2016-0465.pdf.txt
[SKIP] Already cleaned: Haughton-2016-0339.pdf.txt
[SKIP] Already cleaned: Jones-2016-0327.pdf.txt
[SKIP] Already cleaned: Cahill-2016-0304.pdf.txt
[SKIP] Already cleaned: Mindham-2016-0295.pdf.txt
[SKIP] Already cleaned: Olawale-ADELUSI_Redacted.pdf.txt
[SKIP] Already cleane

[SKIP] Already cleaned: 2016-0271-Response-by-University-health-Board.pdf.txt
[SKIP] Already cleaned: 2016-0276-Response-by-Medicines-and-Healthcare-Products-Regulatory-Agency.pdf.txt
[SKIP] Already cleaned: 2016-0303-Response-by-North-West-Ambulance-Service-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2016-0308-Response-by-Medicines-and-Healthcare-Products-Regulatory-Agency.pdf.txt
[SKIP] Already cleaned: 2016-0316-Response-by-Warwick-Medical-School.pdf.txt
[SKIP] Already cleaned: 2016-0367-Response-by-NMC.pdf.txt
[SKIP] Already cleaned: 2016-0376-Response-by-Resuscitation-Council-UK.pdf.txt
[SKIP] Already cleaned: 2016-0384-Response-by-Yorkshire-Ambulance-Service.pdf.txt
[SKIP] Already cleaned: 2016-0396-Response-by-Maritime-and-Coastguard-Agency.pdf.txt
[SKIP] Already cleaned: 2016-0402-Response-by-RYA.pdf.txt
[SKIP] Already cleaned: 2016-0459-Response-by-University-Health-Board.pdf.txt
[SKIP] Already cleaned: 2016-0600-Response-from-the-London-Ambulance-Service_Redacted.pdf.txt
[SKIP]

[SKIP] Already cleaned: 2017-0193-Response-by-NMC.pdf.txt
[SKIP] Already cleaned: 2017-0234-Response-by-The-Royal-College-of-Emergency-Medicine.pdf.txt
[SKIP] Already cleaned: 2017-0248-Response-by-Priory-Hospital_Redacted.pdf.txt
[SKIP] Already cleaned: 2017-0275-Response-by-Parkhill-Estates-Limited.pdf.txt
[SKIP] Already cleaned: 2017-0325-Response-by-Bedfordshire-Police.pdf.txt
[SKIP] Already cleaned: 2017-0348-Response-United-Lincolnshire-Hospitals-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2017-0376-Response-by-East-Midlands-Ambulance-Service-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2017-0434-Response-by-Draeger-Medical-UK-Limited.pdf.txt
[SKIP] Already cleaned: 2018-0007-Response.pdf.txt
[SKIP] Already cleaned: 2018-0008-Response-by-CAA.pdf.txt
[SKIP] Already cleaned: 2018-0021-Response-by-Sussex-Partnership-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2018-0066-Response-by-University-Health-Board.pdf.txt
[SKIP] Already cleaned: 2018-0084-Responses.pdf.txt
[SKIP] Already cleaned: 20

[CLEANED] Lee-Carpenter-2020-0052.pdf.txt
[SKIP] Already cleaned: Mohan-Acharya-2020-0045-Redacted.pdf.txt
[CLEANED] Jake-Lee-2020-0039-Redacted.pdf.txt
[SKIP] Already cleaned: Liam-Seager-2020-0029-Redacted.pdf.txt
[SKIP] Already cleaned: Gemma-Azhar-2020-0026-Redacted.pdf.txt
[SKIP] Already cleaned: GORDON-GILLOTT-2020-0020.pdf.txt
[CLEANED] Blaitthin-Buckley-2019-0465.pdf.txt
[CLEANED] Gareth-Williams-2019-0464.pdf.txt
[SKIP] Already cleaned: Joanna-Orpin-2019-0457_Redacted.pdf.txt
[CLEANED] Samantha-Brousas-2019-0443_Redacted.pdf.txt
[CLEANED] Shirley-Nightingale-2019-0431_Redacted.pdf.txt
[SKIP] Already cleaned: Terence-James-2019-0430_Redacted.pdf.txt
[SKIP] Already cleaned: Brenda-McWilliams-2019-0406.pdf.txt
[SKIP] Already cleaned: Costal-Stancu-2010-0379.pdf.txt
[CLEANED] Catherine-Gardiner-Others-2019-0350_Redacted.pdf.txt
[SKIP] Already cleaned: Ffion-Jones-2019-0298.pdf.txt
[SKIP] Already cleaned: Darren-CUMBERBATCH-2019-0289_Redacted.pdf.txt
[CLEANED] Carol-Jennings-2019-0

[CLEANED] 2014-0550-Response-by-Bupa.pdf.txt
[CLEANED] 2014-0555-Response-by-Central-North-West-London-NHS-Trust.pdf.txt
[CLEANED] 2014-0576-Response-by-NOMS.pdf.txt
[CLEANED] 2015-0007-Response-by-Home-Office.pdf.txt
[CLEANED] 2015-0088-Response-by-Royal-Orthopaedic-Hospital.pdf.txt
[CLEANED] 2015-0143-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2015-0150-Response-by-Department-for-Business-Innovation-Skills.pdf.txt
[CLEANED] 2015-0163-Response-by-NHS-Wales.pdf.txt
[CLEANED] 2015-0170-Response-by-Springfield-Medical-Centre.pdf.txt
[CLEANED] 2015-0177-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2015-0220-Response-by-Metropolitan-Police.pdf.txt
[CLEANED] 2015-0233-Response-by-Avon-and-Wiltshire-NHS-Trust.pdf.txt
[CLEANED] 2015-0252-Response-by-CAA.pdf.txt
[SKIP] Already cleaned: 2015-0266-Response-by-MHRA.pdf.txt
[CLEANED] 2015-0289-Response-by-NHS-England.pdf.txt
[CLEANED] 2015-0394-Response-by-NOMS.pdf.txt
[CLEANED] 2015-0444-Response.pdf.txt
[CLEANED] 2015-0453-Response

[CLEANED] Marion-Clode-Swinhoe-Farm-2021-0228-Redacted.pdf.txt
[SKIP] Already cleaned: Serena-Nicolle-2021-0212-Redacted.pdf.txt
[SKIP] Already cleaned: Nicholas-OBrien-2021-0197-Redacted.pdf.txt
⚠️ Quality check failed: {'length': 125, 'alnum_ratio': 0.808, 'long_word_count': 14, 'ok': False}
[LOW QUALITY] 2021-0131-Response-from-from-Canal-River-Trust-Redacted.pdf.txt — skipping
[SKIP] Already cleaned: 2021-0127-Response-from-Department-of-Health-and-Social-Care_Published.pdf.txt
[SKIP] Already cleaned: 2021-0113-Response-from-Royal-College-of-Physicians-Redacted.pdf.txt
[SKIP] Already cleaned: 2021-0106-Response-from-Wrexham-County-Borough-Council.pdf.txt
[SKIP] Already cleaned: 2021-0073-Response-from-Black-Country-Healthcare-NHS-Foundation-Trust_Published.pdf.txt
[SKIP] Already cleaned: Ellie-Issacs-2020-0169.pdf.txt
[SKIP] Already cleaned: Dereck-John-Chapman-2020-0165_Redacted.pdf.txt
[SKIP] Already cleaned: Pauline-Russell-2020-0149_Redacted.pdf.txt
[SKIP] Already cleaned: Alla

[CLEANED] 2014-0355-Response-by-The-Royal-College-of-Anaesthetists.pdf.txt
[CLEANED] 2014-0377-Response-by-Tees-Esk-and-Wear-Valleys-NHS-Trust.pdf.txt
[CLEANED] 2014-0410-Response-by-The-GEO-Group-UK-Ltd.pdf.txt
[CLEANED] 2014-0473-Response-by-Brighton-Sussex-University-Hospitals-NHS-Trust.pdf.txt
[CLEANED] 2014-0490-Response-by-BSI.pdf.txt
[CLEANED] 2014-0553-Response-by-Care-Uk.pdf.txt
[CLEANED] 2014-0575-Response-by-Ofsted.pdf.txt
[CLEANED] 2015-0003-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2015-0015-Response-by-Barts-Health-NHS-Trust.pdf.txt
[CLEANED] 2015-0035-Response-by-Ministry-of-Justice.pdf.txt
[CLEANED] 2015-0046-Response-by-Doncaster-Borough-Council.pdf.txt
[CLEANED] 2015-0053-Response-by-Barts-Health-NHS-Trust.pdf.txt
[CLEANED] 2015-0085-Response-by-University-Health-Board.pdf.txt
[CLEANED] 2015-0103-Response-by-BHTA.pdf.txt
[CLEANED] 2015-0120-Response-by-NOMS.pdf.txt
[CLEANED] 2015-0190-Response-by-Priory-Group.pdf.txt
[CLEANED] 2015-0195-Response-by-Gloucester

[CLEANED] Euan-ELLIS-2019-0264_Redacted.pdf.txt
[CLEANED] Kay-Martin-2019-0262_Redacted.pdf.txt
[SKIP] Already cleaned: Christopher-Summerhayes-2019-0263_Redacted.pdf.txt
[SKIP] Already cleaned: Elisa-Fuller-2019-0481_Redacted.pdf.txt
[CLEANED] Zona-Tebbs-2019-0248_Redacted.pdf.txt
[SKIP] Already cleaned: Alex-Blake-2019-0259_Redacted.pdf.txt
[CLEANED] Simon-Robinson-2019-0176_Redacted.pdf.txt
[SKIP] Already cleaned: James-Fletcher-2019-0146_Redacted.pdf.txt
[CLEANED] Benjamin-Murray-2019-0155_Redacted.pdf.txt
[CLEANED] June-Russell-2019-0128_Redacted.pdf.txt
[SKIP] Already cleaned: Patrick-Kelly-2019-0128_Redacted.pdf.txt
[SKIP] Already cleaned: Ann-CORFIELD-2019-0107_Redacted.pdf.txt
[CLEANED] John-Mellor-2019-0053_Redacted.pdf.txt
[CLEANED] Elizabeth-Curtis-2019-0018_Redacted.pdf.txt
[CLEANED] Brian-Frost-2018-0362_Redacted.pdf.txt
[SKIP] Already cleaned: Collette-DUNN-2018-0337.pdf.txt
[SKIP] Already cleaned: Catherine-Gibbon-2018-0317_Redacted.pdf.txt
[CLEANED] Mary-Johnson-2019-0

[CLEANED] 2016-0248-Response-by-Orchard-Care-Homes.pdf.txt
[CLEANED] 2016-0267-Response-by-Durham-County-Council.pdf.txt
[CLEANED] 2016-0305-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2016-0348-Response-by-Department-for-Transport.pdf.txt
[CLEANED] 2016-0350-Response-by-Isle-of-Wight-NHS-Trust.pdf.txt
[CLEANED] 2016-0363-Response-by-NHS-England.pdf.txt
[SKIP] Already cleaned: 2016-0409-Response-by-Care-Quality-Commission.pdf.txt
[CLEANED] 2016-0413-Response-by-Royal-Cornwall-Hospital-NHS-trust.pdf.txt
[CLEANED] 2016-0415-Response-by-Portsmouth-Hospitals-NHS-Trust.pdf.txt
[CLEANED] 2016-0462-Response-by-University-Health-Board.pdf.txt
[CLEANED] 2017-0044-Response-by-HSE.pdf.txt
[CLEANED] 2017-0047-Response-by-Kent-County-Council.pdf.txt
[CLEANED] 2017-0077-Response-by-University-Health-Board.pdf.txt
[SKIP] Already cleaned: 2017-0137-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2017-0180-Response-by-Pennine-Care-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2017-0183-Response-b

[CLEANED] Maureen-MARTIN-2019-0220.pdf.txt
[CLEANED] Feni-Lee-2019-0224_Redacted.pdf.txt
[SKIP] Already cleaned: Alexander-Boamah-2019-0232.pdf.txt
[CLEANED] Pauline-Howell-2019-0498.pdf.txt
[CLEANED] Maia-Strachan-2019-0174_Redacted.pdf.txt
[CLEANED] Christopher-Barnes-2019-0164_Redacted.pdf.txt
[SKIP] Already cleaned: John-Alliston-2019-0153_Redacted.pdf.txt
[CLEANED] Anthony-WALKER-2019-0152_Redacted.pdf.txt
[CLEANED] Tamsin-Grundy-2019-0088_Redacted.pdf.txt
[CLEANED] Doreen-Fell-2019-0109_Redacted.pdf.txt
[CLEANED] Jeremy-Sutch-2019-0065_Redacted.pdf.txt
[CLEANED] Terence-Penney-2019-0034_Redacted.pdf.txt
[CLEANED] George-THOMPSON-2019-0022.pdf.txt
[SKIP] Already cleaned: Marie-Millward-Winter-2019-0020.pdf.txt
[CLEANED] Ruth-Edwards-2018-0395_Redacted.pdf.txt
[CLEANED] Sheila-Hadfield-2018-0334_Redacted.pdf.txt
[CLEANED] Bradley-Morgan-2018-0412.pdf.txt
[SKIP] Already cleaned: Jacob-Sulaiman-2018-0252_Redacted.pdf.txt
[CLEANED] Astonn-Mitchell-Male-2018-0248.pdf.txt
[SKIP] Already

[CLEANED] 2017-0216-Response-by-Department-for-Transport.pdf.txt
[CLEANED] 2017-0259-Response-by-Department-for-Transport.pdf.txt
[CLEANED] 2017-0293-Response-by-Somerset-NHS-Trust.pdf.txt
[CLEANED] 2017-0356-Response-by-NHS-Engalnd.pdf.txt
[CLEANED] 2017-0392-Response.pdf.txt
[SKIP] Already cleaned: 2017-0402-Response-by-Heywood-Middleton-and-Rochdale-CCG.pdf.txt
[CLEANED] 2017-0449-Response-by-NHS-England.pdf.txt
[CLEANED] 2018-0010-Response-by-NHS-England.pdf.txt
[CLEANED] 2018-0018-Response-by-East-Sussex-Healthcare-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2018-0189-Response-by-Leicestershire-Partnershire-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2018-0212-Response-by-Barts-NHS-Trust.pdf.txt
[CLEANED] 2018-0256-Response-by-NHS-England.pdf.txt
[CLEANED] 2018-0265-Response-by-Bannatyne-Fitness-Limited.pdf.txt
[CLEANED] 2018-0333-Response-by-The-Leeds-Teaching-Hospitals-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2019-0005-Response-by-Barts-Health-NHS-Trust.pdf.txt
[SKIP] Already clean

[CLEANED] Jenson-Francis-2019-0158_Redacted.pdf.txt
[CLEANED] Heather-Birchall-2019-0223_Redacted.pdf.txt
[CLEANED] David-MOBSBY-2019-0087_Redacted.pdf.txt
[SKIP] Already cleaned: Andrew-Carr-2019-0038.pdf.txt
[CLEANED] Branko-Zdravkovic-2019-0047_Redacted.pdf.txt
[SKIP] Already cleaned: Paul-GILLAM-2019-0045_Redacted.pdf.txt
[CLEANED] Michael-FLYNN-2019-0008.pdf.txt
[CLEANED] Edward-Farmer-2018-0390_Redacted.pdf.txt
[CLEANED] Barnaby-Aylward-2018-0387_Redacted.pdf.txt
[CLEANED] Roy-Burgess-2018-0364_Redacted.pdf.txt
[CLEANED] Charles-Grainger-2019-0353_Redacted.pdf.txt
[CLEANED] Patricia-Chambers-2018-0350_Redacted.pdf.txt
[CLEANED] Madeline-Staples-2019-0041.pdf.txt
[CLEANED] Ryan-Williams-2018-0341_Redacted.pdf.txt
[CLEANED] Eileen-Cooke-2018-0311_Redacted.pdf.txt
[CLEANED] Daniel-Collins-2018-0283_Redacted.pdf.txt
[CLEANED] Joan-Wright-2018-0408_Redacted.pdf.txt
[CLEANED] Susan-Elliott-2018-0275.pdf.txt
[SKIP] Already cleaned: Kathleen-Allen-2018-0213.pdf.txt
[CLEANED] Andrew-Craig

[CLEANED] 2015-0233-Response-by-Wiltshire-Council.pdf.txt
[CLEANED] 2015-0234-Response-by-Transport-for-London.pdf.txt
[CLEANED] 2015-0259-Response-by-Greenlane-Care-Homes-Limited_Redacted.pdf.txt
[CLEANED] 2015-0290-Response-by-Rotherham-Borough-Council.pdf.txt
[CLEANED] 2015-0363-Response-by-Sub-C-Pre-Dive-Checklist.pdf.txt
[CLEANED] 2015-0379-response.pdf.txt
[CLEANED] 2015-0433-Response.pdf.txt
[CLEANED] 2015-0445-Response-by-Southern-Health-NHS-Trust.pdf.txt
[CLEANED] 2015-0466-Response.pdf.txt
[CLEANED] 2015-0469-Response-by-North-West-Ambulance-Service-NHS-Trust.pdf.txt
[CLEANED] 2016-0105-Response-by-Rotherham-Doncaster-and-South-Humber-NHS-Trust.pdf.txt
[CLEANED] 2016-0162-Response-by-Department-of-Health.pdf.txt
[CLEANED] 2016-0244-Response-by-Metropolitan-Police.pdf.txt
[CLEANED] 2016-0256-Response-by-Wychall-Lane-Surgery.pdf.txt
[SKIP] Already cleaned: 2016-0270-Response-by-Care-Quality-Commission.pdf.txt
[CLEANED] 2016-0296-Response-by-Brighton-and-Sussex-University-NHS-Tr

[CLEANED] Amy-Allan-2019-0343_Redacted.pdf.txt
[CLEANED] Fern-Marie-Choya-2019-0281_Redacted.pdf.txt
[CLEANED] Daniel-Davey-2019-0267_Redacted.pdf.txt
[CLEANED] Carl-Sargeant-2019-0236.pdf.txt
[CLEANED] Cherylee-Shennan-2019-02443_Redacted.pdf.txt
[CLEANED] Robert-Cobbina-2019-0210_Redacted.pdf.txt
[CLEANED] Tyereece-Johnson-2019-0166_Redacted.pdf.txt
[SKIP] Already cleaned: Graham-TAILBY-2019-0092.pdf.txt
[CLEANED] Tyrone-Evans-2018-0358_Redacted.pdf.txt
[CLEANED] Mary-Tyder-2018-0323_Redacted.pdf.txt
[CLEANED] Graeme-Mathieson-2018-0153_Redacted.pdf.txt
[CLEANED] Greg-Hutchins-2018-0129_Redacted.pdf.txt
[SKIP] Already cleaned: Natasha-Ford-2018-0052.pdf.txt
[CLEANED] Paul-Daniels-2018-0003_Redacted.pdf.txt
[SKIP] Already cleaned: Jeff-Antwis-2017-0392_Redacted.pdf.txt
[CLEANED] Harold-Chapman-2017-0377B_Redacted.pdf.txt
[SKIP] Already cleaned: Mark-Welsh-2017-0456_Redacted.pdf.txt
[CLEANED] Naomi-Sourbut_Redacted.pdf.txt
[CLEANED] Derek-Dudley-2017-0284_Redacted.pdf.txt
[CLEANED] Geo

[SKIP] Already cleaned: 2022-0325-Response-from-University-Hospital-of-Derby-and-Burton.pdf.txt
[SKIP] Already cleaned: Carl-Wright-Prevention-of-future-deaths-report-2022-0324_Published.pdf.txt
[SKIP] Already cleaned: Seth-Thind-Prevention-of-future-deaths-report-2022-0323_Published.pdf.txt
[SKIP] Already cleaned: Robert-Evans-Prevention-of-future-deaths-report-2022-0322_Published.pdf.txt
[SKIP] Already cleaned: Rebecca-Hayward-Prevention-of-future-deaths-report-2022-0321_Published.pdf.txt
[SKIP] Already cleaned: Adam-Simms-Prevention-of-future-deaths-report-2022-0320_Published.pdf.txt
[SKIP] Already cleaned: Neha-Raju-Prevention-of-future-deaths-report-2022-0319_Published-1.pdf.txt
[SKIP] Already cleaned: Kenneth-Goodwin-Prevention-of-future-deaths-report-2022-0318_Published.pdf.txt
[SKIP] Already cleaned: Charles-Stringer-Prevention-of-future-deaths-report-2022-0317_Published-1.pdf.txt
[SKIP] Already cleaned: Oli-Hoque-Prevention-of-future-deaths-report-2022-0316_Published-1.pdf.txt

[CLEANED] Connor-Marron-Prevention-of-future-deaths-report-2022-0190_Pulished.pdf.txt
[SKIP] Already cleaned: 2022-0190-Response-from-Alexandra-Palace.pdf.txt
[SKIP] Already cleaned: 2022-0190-Response-from-Thames-Water.pdf.txt
[SKIP] Already cleaned: Keith-Nottle-Prevention-of-future-deaths-report-2022-0189_Published.pdf.txt
[SKIP] Already cleaned: 2022-0189-Response-from-NHS-Nottinghamshire-Healthcare.pdf.txt
[SKIP] Already cleaned: 2022-0189-Response-from-Turning-Point.pdf.txt
[SKIP] Already cleaned: 2022-0188-Response-from-NHS-Tameside-and-Glossop-Integrated-Care.pdf.txt
[SKIP] Already cleaned: Derek-Holmes-Prevention-of-future-deaths-report-2022-0188_Published-1.pdf.txt
[SKIP] Already cleaned: Margaret-Stringer-Prevention-of-future-deaths-report-2022-0187_Published.pdf.txt
[SKIP] Already cleaned: 2022-0187-Response-from-DAC-Beachcroft.pdf.txt
[SKIP] Already cleaned: 2022-0187-Response-from-Adult-Community-Social-Care.pdf.txt
[SKIP] Already cleaned: 2022-0187-Response-from-Blackpoo

[CLEANED] Eirlys-Roberts-Prevention-of-future-deaths-2022-0034_Published.pdf.txt
[SKIP] Already cleaned: 2022-0034-Response-from-Welsh-Government_Published.pdf.txt
[SKIP] Already cleaned: 2022-0034-Response-from-Gwynedd-Council_Published.pdf.txt
[SKIP] Already cleaned: Carole-Cole-Prevention-of-future-deaths-report-2022-0033_Published.pdf.txt
[SKIP] Already cleaned: 2022-0033-Response-from-Dorset-Police_Published.pdf.txt
[SKIP] Already cleaned: 2022-0033-Response-from-Dorset-Council_Published.pdf.txt
[SKIP] Already cleaned: Jake-Cahill-Prevention-of-future-deaths-report-2022-0032_Published.pdf.txt
[SKIP] Already cleaned: 2022-0032-Response-from-Youth-Justice-Board_Published.pdf.txt
[SKIP] Already cleaned: Oskar-Nash-Prevention-of-future-deaths-report-2022-0031_Published-1.pdf.txt
[SKIP] Already cleaned: 2022-0031-Response-from-Child-Safeguarding-Practice-Review-Panel_Published.pdf.txt
[SKIP] Already cleaned: 2022-0031-Response-from-Surrey-County-Council_Published.pdf.txt
[SKIP] Already

[CLEANED] 2021-0349-Response-from-Droylsden-Road-Family-Practice_Published.pdf.txt
[SKIP] Already cleaned: 2021-0340-Response-from-Croft-Shifa-Health-Centre.pdf.txt
[CLEANED] 2021-0338-Response-from-Nottingham-Healthcare-NHS-Foundation-Trust_Published.pdf.txt
[SKIP] Already cleaned: 2021-0335-Response-from-NHS-England-and-NHS-Improvement_Published.pdf.txt
[SKIP] Already cleaned: 2021-0332-Response-from-HMPPS_Published.pdf.txt
[SKIP] Already cleaned: 2021-0332-Response-from-Department-of-Health-Social-Care_Published.pdf.txt
[SKIP] Already cleaned: 2021-0323-Response-from-Harden-Bingley-Park-Ltd_Published.pdf.txt
[SKIP] Already cleaned: 2021-0322-Response-from-MHRA_Published-1.pdf.txt
[SKIP] Already cleaned: 2021-0322-Response-from-Mid-Yorkshire-Hospitals_Published-1.pdf.txt
[SKIP] Already cleaned: 2021-0322-Response-from-Department-of-Health-and-Social-Care_Published-1.pdf.txt
[SKIP] Already cleaned: 2021-0322-Response-from-Philips_Published-1.pdf.txt
[SKIP] Already cleaned: 2021-0307-R

[SKIP] Already cleaned: Joseph-Price-Prevention-of-future-deaths-report-2023-0019_Published.pdf.txt
[SKIP] Already cleaned: Lyn-Brind-Prevention-of-future-deaths-report-2023-0017_Published.pdf.txt
[SKIP] Already cleaned: Derek-Larkin-Prevention-of-future-deaths-report-2023-0018_Published.pdf.txt
[SKIP] Already cleaned: 2023-0018-Response-from-Dorset-Council.pdf.txt
[SKIP] Already cleaned: 2023-0018-Response-from-Dorset-Integrated-Care-Board.pdf.txt
[SKIP] Already cleaned: Sean-Duignan-Prevention-of-future-deaths-report-2023-0016_Published.pdf.txt
[SKIP] Already cleaned: 2023-0016-Response-from-HMICFRS.pdf.txt
[SKIP] Already cleaned: 2023-0016-Response-from-Bedfordshire-Police.pdf.txt
[SKIP] Already cleaned: Gary-Cooper-Prevention-of-future-deaths-report-2023-0015_Published.pdf.txt
[SKIP] Already cleaned: Teegan-Barnard-Prevention-of-future-deaths-report-2023-0014_Published.pdf.txt
[SKIP] Already cleaned: 2023-0014-Response-from-Royal-College-of-Anaesthetists.pdf.txt
[SKIP] Already clea

⚠️ Quality check failed: {'length': 19621, 'alnum_ratio': 0.6400795066510372, 'long_word_count': 260, 'ok': False}
[LOW QUALITY] 2022-0183-Response-from-Department-of-Health-and-Social-Care.pdf.txt — skipping
[SKIP] Already cleaned: 2022-0183-Response-from-Donneybrook-Medical-Centre.pdf.txt
[SKIP] Already cleaned: 2022-0181-Response-from-Welsh-Ambulance-Services.pdf.txt
[SKIP] Already cleaned: 2022-0180-Response-from-NHS-England.pdf.txt
[SKIP] Already cleaned: 2022-0180-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Already cleaned: 2022-0180-Response-from-NHS-Birmingham-and-Soilhull.pdf.txt
[SKIP] Already cleaned: 2022-0176-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Already cleaned: 2022-0176-Response-from-Greater-Manchester-Integrated-Care.pdf.txt
[SKIP] Already cleaned: 2022-0175-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Already cleaned: 2022-0173-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Alrea

[CLEANED] 2024-0411-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: Wendy-Hammon-Prevention-of-Future-Deaths-Report-2024-0410.pdf.txt
[SKIP] Already cleaned: Zara-Aleena-Prevention-of-Future-Deaths-Report-2024-0409.pdf.txt
[SKIP] Already cleaned: Marjorie-Michael-Prevention-of-Future-Deaths-Report-2024-0408.pdf.txt
[SKIP] Already cleaned: Jessica-De-Souza-Prevention-of-Future-Deaths-Report-2024-0407.pdf.txt
[SKIP] Already cleaned: Jennifer-and-Marion-Bunyan-Prevention-of-Future-Deaths-Report-2024-0406.pdf.txt
[SKIP] Already cleaned: Danny-Anderson-Prevention-of-Future-Deaths-Report-2024-0405.pdf.txt
[SKIP] Already cleaned: Brogen-Lea-Storey-Prevention-of-Future-Deaths-Report-2024-0404.pdf.txt
[SKIP] Already cleaned: Elizabeth-Holder-Prevention-of-Future-Deaths-Report-2024-0403.pdf.txt
[SKIP] Already cleaned: Josh-Smith-Prevention-of-Future-Deaths-Report-2024-0402.pdf.txt
[SKIP] Already cleaned: David-Curry-Prevention-of-Future-Deaths-Report-2024-0401.pdf.txt
[SKIP] Already cleaned: 

[CLEANED] 2024-0310-Response-from-TEC-Quality-2.pdf.txt
[SKIP] Already cleaned: Dominic-Chapman-Prevention-of-future-deaths-report-2024-0309_Published.pdf.txt
[CLEANED] 2024-0309-Response-from-Ultra-Events.pdf.txt
[CLEANED] Exhibit-JL1-in-Response-to-Reg.28-Prevention-of-Future-Deaths-report-Ultra-Events.pdf.txt
[SKIP] Already cleaned: Alan-Lee-Prevention-of-future-deaths-report-2024-0308_Published-1.pdf.txt
[SKIP] Already cleaned: 2024-0308-Response-from-Care-Outlook.pdf.txt
[SKIP] Already cleaned: Robert-Fray-Prevention-of-future-deaths-report-2024-0307_Published.pdf.txt
[SKIP] Already cleaned: 2024-0307-Response-from-NHS-England.pdf.txt
[SKIP] Already cleaned: 2024-0307-Response-from-West-Midlands-Ambulance-Service.pdf.txt
[SKIP] Already cleaned: Michael-Pegg-Prevention-of-future-deaths-report-2024-0306_Published.pdf.txt
[SKIP] Already cleaned: 2024-0306-Response-from-NHS-England.pdf.txt
[SKIP] Already cleaned: 2024-0306-Response-from-Worcestershire-Acute-Hospitals-NHS-Trust.pdf.txt

[CLEANED] 2024-0252-Response-from-Ashlea-Medical-Practice.pdf.txt
[SKIP] Already cleaned: John-Bass-Prevention-of-future-deaths-report-2024-0251_Published.pdf.txt
[SKIP] Already cleaned: Bobilya-Mulonge-Prevention-of-future-deaths-report-2024-0250_Published.pdf.txt
[SKIP] Already cleaned: 2024-0250-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Already cleaned: Peter-Fanning-Prevention-of-future-deaths-report-2024-0249_Published.pdf.txt
[SKIP] Already cleaned: 2024-0249-Response-from-University-Hospitals-Birmingham.pdf.txt
[SKIP] Already cleaned: Colin-Waterhouse-Prevention-of-future-deaths-report-2024-0248_Published.pdf.txt
[SKIP] Already cleaned: Neville-Abbott-Prevention-of-future-deaths-report-2024-0247_Published.pdf.txt
[SKIP] Already cleaned: 2024-0247-Response-from-BCP-Council.pdf.txt
[SKIP] Already cleaned: Rosie-Young-Prevention-of-future-deaths-report-2024-0246_Published.pdf.txt
[SKIP] Already cleaned: Michael-Clarke-Prevention-of-future-deaths-report-2024-

[SKIP] Already cleaned: Joseph-Cattle-Prevention-of-future-deaths-report-2024-0107_Published.pdf.txt
[SKIP] Already cleaned: 2024-0107-Response-from-Welsh-Government.pdf.txt
[SKIP] Already cleaned: Benjamin-Leonard-Prevention-of-future-deaths-report-2024-0106_Published-1.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Childrens-Commissioner-for-Wales.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Charity-Commission-for-England-and-Wales.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Scouts.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Department-for-Education.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Childrens-Commissioner.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Unity.pdf.txt
[SKIP] Already cleaned: 2024-0106-Response-from-Health-and-Safety-Executive.pdf.txt
[SKIP] Already cleaned: Kim-Stroud-Prevention-of-future-deaths-report-2024-0105_Published.pdf.txt
[SKIP] Already cleaned: 2024-0105-Response-from-The-Queen-Elizabeth

⚠️ Quality check failed: {'length': 28317, 'alnum_ratio': 0.678391072500618, 'long_word_count': 884, 'ok': False}
[LOW QUALITY] 2024-0032-Response-from-UK-Health-Security-Agency.pdf.txt — skipping
[SKIP] Already cleaned: 2024-0031-Prevention-of-future-deaths-report_Published.pdf.txt
[SKIP] Already cleaned: 2024-0031-Prevention-of-future-deaths-report_Published-1.pdf.txt
[SKIP] Already cleaned: 2024-0031-Response-from-London-Fire-Brigade-.pdf.txt
[SKIP] Already cleaned: William-Helstrip-Prevention-of-future-deaths-report-2024-0030_Published.pdf.txt
[SKIP] Already cleaned: 2024-0030-Response-from-Humberside-Police.pdf.txt
[SKIP] Already cleaned: Thomas-Langley-Prevention-of-future-deaths-report-2024-0029_Published.pdf.txt
[SKIP] Already cleaned: 2024-0029-Response-from-Travelodge.pdf.txt
[SKIP] Already cleaned: John-Gray-Prevention-of-future-deaths-report-2024-0028_Published.pdf.txt
[SKIP] Already cleaned: 2024-0028-Response-from-East-Suffolk-Council.pdf.txt
[SKIP] Already cleaned: Dorot

[CLEANED] 2023-0243-Response-from-Highgrove-Rest-Home.pdf.txt
[SKIP] Already cleaned: John-James-Prevention-of-future-deaths-report-2023-0242_Published.pdf.txt
[SKIP] Already cleaned: 2023-0242-Response-from-Bart-Health-NHS-Trust-1.pdf.txt
[SKIP] Already cleaned: Mohammed-Hussain-Prevention-of-future-deaths-report-2023-0241_Published.pdf.txt
[SKIP] Already cleaned: 2023-0241-Response-Birmingham-and-Solihull-Mental-Health-NHS-Foundation-Trust.pdf.txt
[SKIP] Already cleaned: 2023-0241-Response-Medicines-Healthcare-products-Regulatory-Agency.pdf.txt
[SKIP] Already cleaned: Roy-Walklet-Prevention-of-future-deaths-report-2023-0240_Published.pdf.txt
[SKIP] Already cleaned: 2023-0240-University-Hospitals-of-North-Midlands-.pdf.txt
[SKIP] Already cleaned: Chrisitian-Tuvi-Prevention-of-future-deaths-report-2023-0239_Published.pdf.txt
[SKIP] Already cleaned: 2023-0239-Response-from-Transport-for-London.pdf.txt
[SKIP] Already cleaned: 2023-0239-Response-from-Department-for-Transport.pdf.txt
[SKIP

[CLEANED] MapAa3_1869316399.pdf.txt
[CLEANED] MapDa3_1869316399.pdf.txt
[SKIP] Already cleaned: 2014-0335-Response-from-Care-UK.pdf.txt
[SKIP] Already cleaned: 2014-0287-Response-from-University-Hospital-of-Leicester-NHS-Trust.pdf.txt
[SKIP] Already cleaned: 2014-0240-Response-from-Motor-Sports-Association.pdf.txt
[SKIP] Already cleaned: 2014-0175-Response-from-Department-of-Health-and-Social-Care.pdf.txt
[SKIP] Already cleaned: 2013-0325-Response-from-My-Mil-Limited.pdf.txt
[SKIP] Already cleaned: 2013-0234-Response-from-The-Manor-.pdf.txt
[CLEANED] Rushworth-2013-0264_Redacted.pdf.txt
[CLEANED] Rushworth2013-0264.pdf.txt
[CLEANED] Dunham2013-0229.pdf.txt
[CLEANED] Dunham-R2013-0229_Redacted.pdf.txt
[SKIP] Already cleaned: 2014-0116-Response-from-Highways-Agency.pdf.txt
[SKIP] Already cleaned: 2013-0247-Response-from-NPOWER.pdf.txt
[SKIP] Already cleaned: 2013-0247-Response-from-OFGEM.pdf.txt
[SKIP] Already cleaned: 2013-0176-Response-from-South-Central-Ambulance-Service.pdf.txt
[SKIP

[CLEANED] 2024-0348-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: 2024-0347-Response-from-University-Hospitals-Leicester.pdf.txt
[SKIP] Already cleaned: 2024-0346-Response-from-Evolve.pdf.txt
[SKIP] Already cleaned: 2024-0343-Response-from-OPSS.pdf.txt
[SKIP] Already cleaned: 2024-0342-Response-from-HCA-Healthcare-.pdf.txt
[SKIP] Already cleaned: 2024-0339-Response-from-MFT.pdf.txt
[SKIP] Already cleaned: 2024-0339-Response-from-MCC.pdf.txt
[SKIP] Already cleaned: 2024-0339-Response-from-EMAS.pdf.txt
[CLEANED] 2024-0338-Response-from-Tesco.pdf.txt
[SKIP] Already cleaned: 2024-0331-Response-from-Mid-and-South-Essex-NHS.pdf.txt
[SKIP] Already cleaned: 2024-0331-Response-from-NHS-England.pdf.txt
[SKIP] Already cleaned: 2024-0331-Response-from-Essex-Partnership-NHS.pdf.txt
[SKIP] Already cleaned: 2024-0320-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: 2024-0320-Response-from-NHSE.pdf.txt
[SKIP] Already cleaned: 2024-0319-Response-from-Welsh-Ambulance-Service.pdf.txt
[SKIP] Alread

[CLEANED] Sheridan-Pickett-Prevention-of-Future-Deaths-Report-2025-0150-1.pdf.txt
[SKIP] Already cleaned: Renate-Mark-Prevention-of-Future-Deaths-Report-2025-0149.pdf.txt
[CLEANED] Nicholas-Gedge-Prevention-of-Future-Deaths-Report-2025-0148.pdf.txt
[CLEANED] Mark-Fernandez-Prevention-of-Future-Deaths-Report-2025-0147-1.pdf.txt
[SKIP] Already cleaned: Billie-Wicks-Prevention-of-Future-Deaths-Report-2025-0146.pdf.txt
[SKIP] Already cleaned: Colin-Colley-Prevention-of-Future-Deaths-Report-2025-0145.pdf.txt
[SKIP] Already cleaned: Darren-Turner-Prevention-of-Future-Deaths-Report-2025-0144.pdf.txt
[SKIP] Already cleaned: William-Radford-Prevention-of-Future-Deaths-Report-2025-0143.pdf.txt
[SKIP] Already cleaned: Alexander-Eastwood-Prevention-of-Future-Deaths-Report-2025-0142.pdf.txt
[SKIP] Already cleaned: Barry-Myers-Prevention-of-Future-Deaths-Report-2025-0141.pdf.txt
[SKIP] Already cleaned: James-Keen-Prevention-of-Future-Deaths-Report-2025-0140.pdf.txt
[SKIP] Already cleaned: 2025-0140-

[CLEANED] 2025-0039-Response-from-Saville-Town-Medical-Centre.pdf.txt
[SKIP] Already cleaned: 2025-0039-Treatment-of-Immediate-Family-Members.pdf.txt
[SKIP] Already cleaned: Nathan-Shepherd-Prevention-of-Future-Deaths-Report-2025-0038.pdf.txt
[SKIP] Already cleaned: Reginald-Smith-Prevention-of-Future-Deaths-Report-2025-0037.pdf.txt
[SKIP] Already cleaned: 2025-0037-Response-from-Stryker-UK-Ltd.pdf.txt
[SKIP] Already cleaned: Paul-Williams-Prevention-of-Future-Deaths-Report-2025-0036.pdf.txt
[SKIP] Already cleaned: 2025-0036-Response-from-Ministry-of-Housing-Communities-Local-Government.pdf.txt
[SKIP] Already cleaned: Carl-Butler-and-Sean-Brett-Prevention-of-Future-Deaths-Report-2025-0035.pdf.txt
[SKIP] Already cleaned: 2025-0035-Response-from-Cheshire-Constabulary-.pdf.txt
[SKIP] Already cleaned: Harry-Southern-Prevention-of-Future-Deaths-Report-2025-0034-1.pdf.txt
[SKIP] Already cleaned: 2025-0034-Response-from-Sussex-Partnership-Foundation-Trust-.pdf.txt
[SKIP] Already cleaned: Vaun

[CLEANED] Timothy-De-Boos-Prevention-of-Future-Deaths-Report-2024-0691.pdf.txt
[SKIP] Already cleaned: 2024-0691-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: Matthew-Sheldrick-2-Prevention-of-Future-Deaths-Report-2024-0690.pdf.txt
[SKIP] Already cleaned: 2024-0690-Response-from-NHS-England.pdf.txt
[SKIP] Already cleaned: 2024-0690-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: Matthew-Sheldrick-1-Prevention-of-Future-Deaths-Report-2024-0689.pdf.txt
[SKIP] Already cleaned: 2024-0689-Response-from-Sussex-ICB.pdf.txt
[SKIP] Already cleaned: Laura-Jane-Seaman-Prevention-of-Future-Deaths-Report-2024-0688.pdf.txt
[SKIP] Already cleaned: 2024-0688-Response-from-Royal-College-of-Obstetricians-and-Gynaecologists.pdf.txt
[SKIP] Already cleaned: 2024-0688-Response-from-Mid-South-Essex-NHS-Trust.pdf.txt
[SKIP] Already cleaned: Susan-Evans-Prevention-of-Future-Deaths-Report-2024-0687.pdf.txt
[SKIP] Already cleaned: 2024-0687-Response-from-Portsmouth-Hospital-NHS-Trust.pdf.txt
[SKIP] Alrea

[CLEANED] 2024-0643-Response-from-Mid-and-West-Wales-Fire-and-Rescue-Service.pdf.txt
[SKIP] Already cleaned: 2024-0643-Response-from-Welsh-Government.pdf.txt
[SKIP] Already cleaned: Dorothy-Nias-Prevention-of-Future-Deaths-Report-2024-0642.pdf.txt
[SKIP] Already cleaned: 2024-0642-Response-from-DVLA.pdf.txt
[SKIP] Already cleaned: 2024-0642-Response-from-Department-for-Transport.pdf.txt
[SKIP] Already cleaned: Kevin-Ince-Prevention-of-Future-Deaths-Report-2024-0641.pdf.txt
[SKIP] Already cleaned: 2024-0641-Response-from-The-Priory.pdf.txt
[SKIP] Already cleaned: Edward-Barnard-Prevention-of-Future-Deaths-Report-2024-0640.pdf.txt
[SKIP] Already cleaned: 2024-0640-Response-from-RCVS.pdf.txt
[SKIP] Already cleaned: 2024-0640-Veterinary-Medicines-Directorate.pdf.txt
[SKIP] Already cleaned: Charlotte-Roscoe-Prevention-of-Future-Deaths-Report-2024-0639.pdf.txt
[SKIP] Already cleaned: 2024-0639-Response-from-The-Royal-College-of-Radiologists.pdf.txt
[SKIP] Already cleaned: 2024-0639-Response-

[CLEANED] Mark-Eccles-Prevention-of-Future-Deaths-Report-2024-0579.pdf.txt
[SKIP] Already cleaned: 2024-0579-Response-from-Herefordshire-Council.pdf.txt
[SKIP] Already cleaned: Chloe-Every-Prevention-of-Future-Deaths-Report-2024-0578.pdf.txt
[SKIP] Already cleaned: 2024-0578-Response-from-Barking-Havering-and-Redbridge-NHS-Foundation-Trust.pdf.txt
[SKIP] Already cleaned: 2024-0578-Response-from-DHSC.pdf.txt
[SKIP] Already cleaned: Mark-Beresford-Prevention-of-Future-Deaths-Report-2024-0577.pdf.txt
[SKIP] Already cleaned: 2024-0577-Response-from-HMPPS.pdf.txt
[SKIP] Already cleaned: Sylvia-Prichard-Prevention-of-Future-Deaths-Report-2024-0576.pdf.txt
[CLEANED] 2024-0576-Response-from-Avery-Healthcare-Group.pdf.txt
[SKIP] Already cleaned: 2024-0575-Response-from-NHS-Stockport.pdf.txt
[SKIP] Already cleaned: Charles-Daniels-Prevention-of-Future-Deaths-Report-2024-0575.pdf.txt
[SKIP] Already cleaned: Patricia-Lines-Prevention-of-Future-Deaths-Report-2024-0574.pdf.txt
[SKIP] Already cleaned

[CLEANED] 2024-0429-Response-from-Westmorland-Court-Care-Home.pdf.txt
[CLEANED] 2024-0429-Response-from-NMC.pdf.txt
[CLEANED] 2024-0429-Response-from-CQC.pdf.txt
[SKIP] Already cleaned: 2024-0428-Response-from-Durham-County-Council.pdf.txt
[SKIP] Already cleaned: 2024-0427-Response-from-North-East-Ambulance-Service.pdf.txt
[SKIP] Already cleaned: 2024-0426-Response-from-HSE.pdf.txt
[SKIP] Already cleaned: 2024-0425-Response-from-Department-for-Education.pdf.txt
[SKIP] Already cleaned: 2024-0424-Response-from-ROSPA.pdf.txt
[SKIP] Already cleaned: 2024-0422-Response-from-Birmingham-and-Solihull-Mental-Health-NHS-Foundation-Trust.pdf.txt
[CLEANED] 2024-0421-Response-from-James-Paget-University-Hospitals-page-2.pdf.txt
[CLEANED] 2024-0421-Response-from-James-Paget-University-Hospitals-page-1.pdf.txt
[SKIP] Already cleaned: 2024-0418-Response-from-HMPPS.pdf.txt
[SKIP] Already cleaned: 2024-0416-Response-from-Royal-College-of-Physicians.pdf.txt
[SKIP] Already cleaned: 2024-0416-Response-from

[CLEANED] 2020-0002-Response-from-County-Durham-and-Darlington-NHS.pdf.txt
[CLEANED] 2019-0413-Response-from-Durham-County-Council.pdf.txt
[CLEANED] 2016-0230-Response-from-Cornwall-Council.pdf.txt
[SKIP] Already cleaned: 2014-0061-Response-from-Care-UK.pdf.txt
[SKIP] Already cleaned: Neville-Bardoliwalla-2020-0258.pdf.txt
[SKIP] Already cleaned: John-Jennings-2020-0257.pdf.txt
[CLEANED] Sean-Ennis-Prevention-of-future-deaths-report-2022-0054.pdf.txt
[CLEANED] Barbara-Mitchell-Prevention-of-future-deaths-report-2023-0153_Published.pdf.txt

📉 Logged 2143 low-quality files to low_quality_files.json
📊 Saved quality stats to text_quality_stats.csv


In [2]:
import pandas as pd
df = pd.read_csv('text_quality_stats.csv')

In [11]:
df[~df['ok']]

Unnamed: 0,filename,length,alnum_ratio,long_word_count,ok
0,2021-0077-Response-from-St-Pancras-Hospital-Re...,0,0.0,0,False
1,2021-0058-Academy-of-Medical-Royal-Colleges-Re...,250,0.832,28,False
2,Liane-Davenport-2020-0136_Redacted-1.pdf.txt,0,0.0,0,False
187,2019-0494-Response-by-Stockport-NHS-Trust.pdf.txt,118,0.754237,11,False
204,2021-0131-Response-from-from-Canal-River-Trust...,125,0.808,14,False
357,2019-0119-Response-from-the-Department-for-Tra...,1473,0.397828,36,False
731,2017-0205-Response-by-South-London-and-Maudsle...,3031,0.690861,137,False
1082,2022-0233-Response-from-Royal-College-General-...,0,0.0,0,False
1085,2022-0123-Response-from-British-Standards-Inst...,8358,0.562216,58,False
1086,2022-0101-Response-from-Spire-Healthcare.pdf.txt,0,0.0,0,False


In [17]:
df[df['filename'].str.contains("Andrea-Mann-Prevention-of-Future-Deaths-Report")].filename.values

array(['Andrea-Mann-Prevention-of-Future-Deaths-Report-2025-0130-1.pdf.txt'],
      dtype=object)

In [18]:
with open("texts/Andrea-Mann-Prevention-of-Future-Deaths-Report-2025-0130-1.pdf.txt", "r", encoding="utf-8") as f:
    print(f.read())

, Chief Executive (Bradford District Care NHS Trust)




In [19]:
import pdfplumber

with pdfplumber.open("downloads/Andrea-Mann-Prevention-of-Future-Deaths-Report-2025-0130-1.pdf") as pdf:
    full_text = "\n".join([page.extract_text() or "" for page in pdf.pages])

print(full_text[:1500])

, Chief Executive (Bradford District Care NHS Trust)




In [6]:
with open("texts/Shariff-2016-0321.pdf.txt", "r", encoding="utf-8") as f:
    print(f.read())

REGULATION 28: REPORT TO PREVENT FUTURE DEATHS (1)
REGULATION 28 REPORT TO PREVENT FUTURE DEATHS
THIS REPORT IS BEING SENT TO:
1. Department of Health
2. NICE
3. Pennine Acute NHS Trust
4. Chief Coroner
CORONER
I am Ms Julie Robertson, Assistant Coroner for the Coroner area of Manchester North
2 CORONER’S LEGAL POWERS
I make this report under paragraph 7, Schedule 5, of the Coroner’s and Justice Act 2009 and Regulations 28
and 29 of the Coroners (Investigations) Regulations 2013
3 INVESTIGATION and INQUEST
On the 10 May 2016 I commenced an investigation into the death of Dildar Shariff. The inquest into Mr
Shariff’s death was heard on 7 September 2016.
4 CIRCUMSTANCES OF DEATH
Mr Shariff died on 10 May 2016 at Fairfield General Hospital having been admitted following a
cardiac arrest at his home address that day. He had had an unwitnessed fallen from a chair onto
his kitchen floor on 8 May and attended at the Urgent Care Centre promptly following that fall.
Neither a CT scan nor additi

In [7]:
with open("texts/2016-0230-Response-from-Cornwall-Council.pdf.txt", "r", encoding="utf-8") as f:
    print(f.read())

one and all ‘onen hag all

CORNWALL
COUNCIL

Dr E Carlyon
Coroner for Cornwall

The New Lodge Your ref:

Newquay Road , My ref:

Penmount po Date: 13 July 2016
Truro

TR4 9AA

Dear Dr Carlyon

 

A3075 Lambourne Mill Bend, Perranzabuloe

Thank you for your letter dated 24 May and attached report with respect to a road
traffic collision on the above highway.

The Council has now thoroughly investigated the circumstances of this unfortunate
incident and as we did not attend the inquest have made efforts to speak to the
Senior Investigation Officer and Collision Investigator. We have, of course, also
examined and reviewed all the accident data kept by the Council.

After careful consideration of the facts resulting from our investigations, advice from
the Council’s lead Road Safety Auditor, the content of your report and the geometry of
this particular highway, the Council has concluded that any works at this site such as
crash barriers or Cornish hedging are as likely to cause injury to 

In [8]:
from pdf2image import convert_from_path
import pytesseract

pdf_path = "downloads/2016-0230-Response-from-Cornwall-Council.pdf"

# Convert first page to image
images = convert_from_path(pdf_path, dpi=300)
images[0].show()  # display first page


In [9]:
# Run OCR manually on the first page
ocr_text = pytesseract.image_to_string(images[0])
print(ocr_text.strip() or "❌ No OCR output")


one and all onen hag olf

CORNWALL
COUNCIL

   

Dr E Carlyon
Coroner for Cornwall

 

The New Lodge Your ref:

Newquay Road a My ref:

Penmount yp Date: 13 July 2016
Truro

TR4 9AA

 

Dear Dr Carlyon
A3075 Lambourne Mill Bend, Perranzabuloe

Thank you for your letter dated 24 May and attached report with respect to a road
traffic collision on the above highway.

The Council has now thoroughly investigated the circumstances of this unfortunate
incident and as we did not attend the inquest have made efforts to speak to the
Senior Investigation Officer and Collision Investigator. We have, of course, also
examined and reviewed all the accident data kept by the Council.

After careful consideration of the facts resulting from our investigations, advice from
the Council’s lead Road Safety Auditor, the content of your report and the geometry of
this particular highway, the Council has concluded that any works at this site such as
crash barriers or Cornish hedging are as likely to cause inju