# Extract data

In [2]:
# Import libraries
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import os
import re
import shutil
import unicodedata

## Get Strings of Case Summaries
Please create a folder named medical_pdfs, then upload the 93 case PDFs into that folder.

In [3]:
# --- Helper: Clean text from newlines ---
def clean_text(text):
    """Flatten a multi-line text fragment into a single clean line.

    Args:
        text: The string to clean, or ``None``.

    Returns:
        ``""`` when ``text`` is ``None``; otherwise the text with words that
        were hyphenated across a line break re-joined, every remaining line
        break turned into a space, whitespace runs collapsed to one space,
        and leading/trailing whitespace removed.
    """
    if text is None:
        return ""
    # Drop carriage returns first so that a hyphen followed by a Windows line
    # ending ("-\r\n") is recognised by the de-hyphenation rule below.
    text = text.replace('\r', '')
    # Re-join words split across lines: remove '\n' if preceded by '-'
    text = re.sub(r'-\n', '', text)
    # Remaining newlines (and any other whitespace runs) become a single space
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


# --- Helper: normalize and pre-clean text before regex ---
def normalize_text(t):
    """Bring raw extracted text into a canonical form for the later regex passes.

    Applies Unicode NFKC normalization, patches two known mis-encoded
    characters, and squeezes each run of spaces/tabs into a single space.
    Newlines are left untouched. ``None`` yields an empty string.
    """
    if t is None:
        return ""
    canonical = unicodedata.normalize("NFKC", t)
    # Mis-encoded characters common in author names:
    # "€" is mapped to "Ü", and "Â" is dropped entirely.
    mojibake_fixes = str.maketrans({"€": "Ü", "Â": None})
    patched = canonical.translate(mojibake_fixes)
    # Only horizontal whitespace is squeezed here; line breaks are preserved
    return re.sub(r"[ \t]+", " ", patched)


# --- 1. Function: Extract relevant sections from each PDF ---
def _first_group(pattern, text, flags=0):
    """Return the stripped first capture group of ``pattern`` in ``text``, or None if there is no match."""
    match = re.search(pattern, text, flags)
    return match.group(1).strip() if match else None


def extract_sections(pdf_path):
    """Extract the title and key narrative sections from one case PDF.

    The plain text of every page is concatenated, normalized with
    ``normalize_text`` and then searched with regular expressions for the
    title, the "History" section, the clinical findings/examination section,
    the "Discussion" section and the first line after "SUMMARY BOX".

    Args:
        pdf_path: Path of the PDF file, passed to ``fitz.open``.

    Returns:
        dict with the keys ``"Title"``, ``"History"``, ``"Clinical Findings"``,
        ``"Discussion"`` and ``"Summary Box First Line"``. Every value is run
        through ``clean_text``, so a section that was not found is returned
        as an empty string rather than ``None``.
    """
    # Open through a context manager so the document handle is released even
    # if text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") for page in doc)

    # Normalize and lightly flatten text before regex
    text = normalize_text(text)

    # Flatten hyphenated and broken lines for better matching
    text = re.sub(r"-\n", "", text)
    text = re.sub(r"\n{2,}", "\n", text)

    # --- Extract Title  ---

    # Manually assign known title for exception case
    # (This specific case does not follow normal title patterns)
    known_exception_title = "A 14-Year-Old Girl in the Solomon Islands With a Non-Healing Leg Ulcer"
    if known_exception_title in text:
        title = known_exception_title
    else:
        # Try multiline title extraction:
        #   1. a line holding "Case <n>:" or a bare number,
        #   2. the following line(s), captured lazily as the title,
        #   3. stopping before a run of >= 3 characters drawn from uppercase
        #      letters, whitespace and ", . - &" that ends in a newline
        #      (presumably the all-caps author line -- confirm against the PDFs).
        pattern_multiline = (
            r"^\s*(?:Case\s*\d+:|\d+)\s*\n"
            r"((?:[^\n]*\n)+?)"
            r"(?=\n?[A-ZÁÉÍÓÚÑÜ\s,.\-&]{3,}\n)"
        )
        title = _first_group(pattern_multiline, text, re.M)
        if title is not None:
            # Join a title that wraps over several lines
            title = re.sub(r"\s*\n\s*", " ", title)

        # Clean up whitespace
        if title:
            title = re.sub(r"\s+", " ", title).strip()

    # Headings that terminate a section when they appear at the start of a line.
    section_boundaries = (
        "Clinical Presentation|History|Clinical Findings|Clinical Examination|Examination|Examination findings|Physical Examination|"
        "Questions|Discussion|Laboratory Results|Investigations|Further Investigations|"
        "Laboratory Findings|Laboratory Results and Imaging|Abdominal Ultrasound|"
        "The Case Continued|SUMMARY BOX|Answer to Question|Diagnosis|Treatment"
    )

    # re.S lets "." span line breaks; re.I makes heading matching case-insensitive.
    section_flags = re.S | re.I

    # --- Extract "History" section ---
    history_pattern = rf"History\s*\n(.*?)(?=\n(?:{section_boundaries})\b)"
    history = _first_group(history_pattern, text, section_flags)

    # --- Extract "Clinical Findings" section ---
    findings_pattern = rf"(?:Clinical Findings|Clinical Examination|Examination|Examination findings|Physical Examination)\s*\n(.*?)(?=\n(?:{section_boundaries})\b)"
    findings = _first_group(findings_pattern, text, section_flags)

    # --- Extract "Discussion" section ---
    # NOTE(review): the pattern value is kept exactly as before. In the
    # lookahead the "?" after "management" is a regex quantifier (optional "t"),
    # not a literal question mark, and the leading "\n" applies only to the
    # first alternative. Tighten only after checking the result on all PDFs.
    discussion_pattern = r"Discussion\s*\n(.*?)(?=\n(?:Answer to Question 1)|What are the priorities for management?|Answer Question 1\b)"
    discussion = _first_group(discussion_pattern, text, section_flags)

    # --- Extract "SUMMARY BOX" (diagnosis line) ---
    # Case-sensitive on purpose: only the rest of the line after the heading is taken.
    summary_box_first_line = _first_group(r"SUMMARY BOX\s*\n([^\n]*)", text)

    # Apply cleaning to extracted sections
    return {
        "Title": clean_text(title),
        "History": clean_text(history),
        "Clinical Findings": clean_text(findings),
        "Discussion": clean_text(discussion),
        "Summary Box First Line": clean_text(summary_box_first_line),
    }


# --- 2. Load PDFs and extract summaries ---
pdf_folder_path = "./medical_pdfs"
pdf_summaries_dicts = []  # one dict of extracted sections per PDF
pdf_filenames = []        # file names, index-aligned with pdf_summaries_dicts

print("Extracting summaries from PDFs...")
# os.listdir() returns entries in arbitrary order; sorting makes list indices
# (e.g. pdf_filenames[1]) refer to the same file on every run and machine.
for filename in sorted(os.listdir(pdf_folder_path)):
    # Case-insensitive check so files named "*.PDF" are not silently skipped
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, filename)
        summary = extract_sections(pdf_path)
        pdf_summaries_dicts.append(summary)
        pdf_filenames.append(filename)

print(f"Extracted summaries from {len(pdf_summaries_dicts)} PDF files.")

# Convert list of dictionaries to a list of strings:
# each string is the dict's values joined by single spaces, in key order.
pdf_summaries_strings = []
for summary_dict in pdf_summaries_dicts:
    summary_string = " ".join(str(v) for v in summary_dict.values())
    pdf_summaries_strings.append(summary_string)

Extracting summaries from PDFs...
Extracted summaries from 93 PDF files.


## Some important variables

In [4]:
# Flattened summary string for the PDF at index 1: all extracted sections joined by spaces
pdf_summaries_strings[1]

'A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis You are working in a remote indigenous community in tropical northern Australia, and the community health worker asks you to visit a house to assess an elderly woman who has been living in the crowded back room. Her family are worried that she has become increasingly withdrawn and hasn’t been getting out of the house much at all. The patient is a 55-year-old indigenous Australian woman with a widespread exfoliative rash involving all limbs and especially the armpits, buttocks and thighs (Fig. 10.1). Many flakes of skin cover the mattress she is lying on. In addition, she has fissures over her wrists and knees. She also looks pale, is clammy and poorly responsive. Her temperature is 39.5°C (103.1°F), heart rate 110bpm, respiratory rate 28 breath cycles per minute and blood pressure 85mmHg systolic to radial pulsation. Oxygen saturation by pulse oximetry is 92% on room air. A 55-year-old indigeno

In [5]:
# File name of the PDF at index 1 (pdf_filenames is appended to in step with pdf_summaries_dicts)
pdf_filenames[1]

'10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.pdf'

In [6]:
# Per-section extraction (the dict returned by extract_sections) for the PDF at index 1
pdf_summaries_dicts[1]

{'Title': 'A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis',
 'History': 'You are working in a remote indigenous community in tropical northern Australia, and the community health worker asks you to visit a house to assess an elderly woman who has been living in the crowded back room. Her family are worried that she has become increasingly withdrawn and hasn’t been getting out of the house much at all.',
 'Clinical Findings': 'The patient is a 55-year-old indigenous Australian woman with a widespread exfoliative rash involving all limbs and especially the armpits, buttocks and thighs (Fig. 10.1). Many flakes of skin cover the mattress she is lying on. In addition, she has fissures over her wrists and knees. She also looks pale, is clammy and poorly responsive. Her temperature is 39.5°C (103.1°F), heart rate 110bpm, respiratory rate 28 breath cycles per minute and blood pressure 85mmHg systolic to radial pulsation. Oxygen saturation by pulse o

# Connect to LMStudio
We will use MedGemma as the "teacher" model.
Requirement: Install LMStudio and a version of the MedGemma model.

In [10]:
import google.generativeai as genai
import getpass
import os
import time
from dotenv import load_dotenv

In [None]:
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")