# Extract data

In [1]:
# Import libraries
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import os
import re
import shutil
import unicodedata

## Get Strings of Case Summaries
Please create a folder named medical_pdfs, then upload the 93 case PDFs into that folder.

In [2]:
# --- Helper: Clean text from newlines ---
def clean_text(text):
    """Flatten a multi-line text fragment into a single, whitespace-normalised line.

    Processing steps:
      1. Windows line endings (CRLF) are normalised to LF so that the
         hyphenation rule below also applies to them.
      2. A hyphen immediately followed by a newline is treated as a word
         split across lines: both characters are removed and the halves joined.
      3. Every remaining newline becomes a space.
      4. Stray carriage returns are dropped.
      5. Runs of whitespace collapse to one space; the result is stripped.

    Args:
        text: The string to clean, or None.

    Returns:
        The cleaned string; "" when `text` is None.
    """
    if text is None:
        return ""
    # Normalise CRLF first; otherwise "-\r\n" is not recognised as a
    # hyphenated line break and "multi-\r\nline" would become "multi- line".
    text = text.replace("\r\n", "\n")
    # Re-join words that were hyphenated across a line break
    text = re.sub(r"-\n", "", text)
    # Any newline that is left separates words
    text = text.replace("\n", " ")
    # Remove lone carriage returns
    text = text.replace("\r", "")
    # Collapse multiple whitespace characters
    return re.sub(r"\s+", " ", text).strip()


# --- Helper: normalize and pre-clean text before regex ---
def normalize_text(t):
    """Pre-clean raw text before the section regexes are applied.

    The text is NFKC-normalised, two known mis-encoded characters are
    patched ("€" becomes "Ü", "Â" is removed), and runs of spaces/tabs are
    squeezed to a single space. Newlines are left untouched.

    Args:
        t: The raw text, or None.

    Returns:
        The normalised text; "" when `t` is None.
    """
    if t is None:
        return ""

    normalized = unicodedata.normalize("NFKC", t)

    # Patch mis-encoded characters common in author names (applied in order)
    for broken, repaired in (("€", "Ü"), ("Â", "")):
        normalized = normalized.replace(broken, repaired)

    # Squeeze horizontal whitespace artifacts; line breaks are preserved
    return re.sub(r"[ \t]+", " ", normalized)


# --- 1. Function: Extract relevant sections from each PDF ---
def extract_sections(pdf_path):
    """Extract the title and selected narrative sections from one case PDF.

    The text of all pages is concatenated, normalised with `normalize_text`,
    lightly flattened, and then searched with regular expressions for the
    title, the "History" section, the clinical findings/examination section,
    the "Discussion" section and the first line after "SUMMARY BOX".

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys "Title", "History", "Clinical Findings",
        "Discussion" and "Summary Box First Line". Every value has been
        passed through `clean_text`, so a section that was not found is
        returned as an empty string.
    """
    # Use the document as a context manager so it is closed even when
    # text extraction raises; build the text with a single join.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text("text") for page in doc)

    # Normalize and lightly flatten text before regex
    text = normalize_text(text)

    # Flatten hyphenated and broken lines for better matching
    text = re.sub(r"-\n", "", text)
    text = re.sub(r"\n{2,}", "\n", text)

    def _first_group(match):
        """Return the stripped first capture group, or None if there is no match."""
        return match.group(1).strip() if match else None

    # --- Extract Title ---

    # Manually assign the known title for one exception case
    # (this specific case does not follow the normal title pattern)
    known_title = "A 14-Year-Old Girl in the Solomon Islands With a Non-Healing Leg Ulcer"
    if known_title in text:
        title = known_title
    else:
        title = None

        # Multiline title: a "Case N:" or bare-number line, then the title
        # lines, lazily captured up to a line made only of capital letters,
        # whitespace and a few punctuation characters.
        pattern_multiline = (
            r"^\s*(?:Case\s*\d+:|\d+)\s*\n"
            r"((?:[^\n]*\n)+?)"
            r"(?=\n?[A-ZÁÉÍÓÚÑÜ\s,.\-&]{3,}\n)"
        )
        match_multiline = re.search(pattern_multiline, text, re.M)
        if match_multiline:
            title = re.sub(r"\s*\n\s*", " ", match_multiline.group(1).strip())

        # Clean up whitespace
        if title:
            title = re.sub(r"\s+", " ", title).strip()

    # Headings that terminate the History / Clinical Findings sections
    section_boundaries = (
        "Clinical Presentation|History|Clinical Findings|Clinical Examination|Examination|Examination findings|Physical Examination|"
        "Questions|Discussion|Laboratory Results|Investigations|Further Investigations|"
        "Laboratory Findings|Laboratory Results and Imaging|Abdominal Ultrasound|"
        "The Case Continued|SUMMARY BOX|Answer to Question|Diagnosis|Treatment"
    )

    # --- Extract "History" section ---
    history_pattern = rf"History\s*\n(.*?)(?=\n(?:{section_boundaries})\b)"
    history = _first_group(re.search(history_pattern, text, re.S | re.I))

    # --- Extract "Clinical Findings" section ---
    findings_pattern = rf"(?:Clinical Findings|Clinical Examination|Examination|Examination findings|Physical Examination)\s*\n(.*?)(?=\n(?:{section_boundaries})\b)"
    findings = _first_group(re.search(findings_pattern, text, re.S | re.I))

    # --- Extract "Discussion" section ---
    # NOTE(review): the pattern is kept exactly as it was. The "?" after
    # "management" is an unescaped quantifier (it makes the final "t"
    # optional rather than matching a literal question mark), and the "\n"
    # prefix and "\b" suffix each apply to only one alternative. Confirm the
    # intended form before tightening it.
    discussion_pattern = r"Discussion\s*\n(.*?)(?=\n(?:Answer to Question 1)|What are the priorities for management?|Answer Question 1\b)"
    discussion = _first_group(re.search(discussion_pattern, text, re.S | re.I))

    # --- Extract "SUMMARY BOX" (diagnosis line) ---
    summary_box_first_line = _first_group(
        re.search(r"SUMMARY BOX\s*\n([^\n]*)", text)
    )

    # Apply cleaning to extracted sections (None becomes "")
    return {
        "Title": clean_text(title),
        "History": clean_text(history),
        "Clinical Findings": clean_text(findings),
        "Discussion": clean_text(discussion),
        "Summary Box First Line": clean_text(summary_box_first_line),
    }


# --- 2. Load PDFs and extract summaries ---
pdf_folder_path = "./medical_pdfs"
pdf_summaries_dicts = []  # one dict of extracted sections per PDF
pdf_filenames = []        # file names, index-aligned with pdf_summaries_dicts

print("Extracting summaries from PDFs...")
# os.listdir() returns entries in arbitrary order; sort them so that list
# indices (e.g. pdf_summaries_dicts[1]) are reproducible between runs.
for filename in sorted(os.listdir(pdf_folder_path)):
    # Accept ".pdf" in any letter case
    if filename.lower().endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder_path, filename)
        summary = extract_sections(pdf_path)
        pdf_summaries_dicts.append(summary)
        pdf_filenames.append(filename)

print(f"Extracted summaries from {len(pdf_summaries_dicts)} PDF files.")

# Flatten each dict into one string: all section values joined by spaces
pdf_summaries_strings = [
    " ".join(str(value) for value in summary_dict.values())
    for summary_dict in pdf_summaries_dicts
]

Extracting summaries from PDFs...
Extracted summaries from 93 PDF files.


## Some important variables

In [3]:
# pdf_summaries_strings holds one flattened string per PDF (all extracted
# section values joined by spaces); show the entry at index 1
pdf_summaries_strings[1]

'A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis You are working in a remote indigenous community in tropical northern Australia, and the community health worker asks you to visit a house to assess an elderly woman who has been living in the crowded back room. Her family are worried that she has become increasingly withdrawn and hasn’t been getting out of the house much at all. The patient is a 55-year-old indigenous Australian woman with a widespread exfoliative rash involving all limbs and especially the armpits, buttocks and thighs (Fig. 10.1). Many flakes of skin cover the mattress she is lying on. In addition, she has fissures over her wrists and knees. She also looks pale, is clammy and poorly responsive. Her temperature is 39.5°C (103.1°F), heart rate 110bpm, respiratory rate 28 breath cycles per minute and blood pressure 85mmHg systolic to radial pulsation. Oxygen saturation by pulse oximetry is 92% on room air. A 55-year-old indigeno

In [4]:
# pdf_filenames holds the PDF file names, index-aligned with the summaries;
# show the entry at index 1
pdf_filenames[1]

'10---A-55-Year-Old-Indigenous-Woman-from-Australia-W_2022_Clinical-Cases-in-.pdf'

In [5]:
# pdf_summaries_dicts holds the full extraction of each file as a dict of
# cleaned sections; show the entry at index 1
pdf_summaries_dicts[1]

{'Title': 'A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis',
 'History': 'You are working in a remote indigenous community in tropical northern Australia, and the community health worker asks you to visit a house to assess an elderly woman who has been living in the crowded back room. Her family are worried that she has become increasingly withdrawn and hasn’t been getting out of the house much at all.',
 'Clinical Findings': 'The patient is a 55-year-old indigenous Australian woman with a widespread exfoliative rash involving all limbs and especially the armpits, buttocks and thighs (Fig. 10.1). Many flakes of skin cover the mattress she is lying on. In addition, she has fissures over her wrists and knees. She also looks pale, is clammy and poorly responsive. Her temperature is 39.5°C (103.1°F), heart rate 110bpm, respiratory rate 28 breath cycles per minute and blood pressure 85mmHg systolic to radial pulsation. Oxygen saturation by pulse o

# Connect to LMStudio
We will use MedGemma as the "teacher" model.
Requirement: install LM Studio and download a MedGemma model (the run below uses `medgemma-4b-it`).

In [6]:
import google.generativeai as genai
import getpass
import os
import time
from openai import OpenAI
from dotenv import load_dotenv

In [7]:
# Connect to LM Studio local server using OpenAI client

# Read the server URL and model name from the environment (.env is loaded first)
load_dotenv()
base_url = os.getenv("LOCAL_LLM_URL")
lm_model = os.getenv("LMSTUDIO_MODEL")

# Fail fast on missing configuration: continuing would build a client that
# points at "None/v1" and report the model as ready although it is not.
if not base_url:
    raise RuntimeError("LOCAL_LLM_URL is not set. Please check your URL of the model")

if not lm_model:
    raise RuntimeError("LMSTUDIO_MODEL is not set. Please check the name of the model")

# Strip a trailing slash so the final URL does not contain "//v1".
# The api_key is a placeholder value, not a real credential
# (presumably not validated by the local server -- confirm).
client = OpenAI(base_url=f"{base_url.rstrip('/')}/v1", api_key="lm-studio")
print(f"LM Studio configured at {base_url}, model '{lm_model}' ready.")

# Quick smoke test: one round trip to verify that the server answers
resp = client.chat.completions.create(
    model=lm_model,
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Say hello and your model name."}
    ],
)
print(resp.choices[0].message.content)

LM Studio configured at http://127.0.0.1:1234, model 'medgemma-4b-it' ready.
Hello! I am Gemma, an open-weights AI assistant.



In [8]:
# Testing interaction via LMStudio API

# --- Main Program Loop ---
def main_lmstudio():
    """
    Run an interactive chat loop against the LMStudio model.

    Each line typed by the user is sent as a single-turn user message (no
    conversation history is kept) and the reply is printed. The loop ends
    when the user types 'quit' or 'exit' (case-insensitive, surrounding
    whitespace ignored), or when input ends or is interrupted.

    Relies on the module-level `client` created in the connection cell and
    reads the model name from the LMSTUDIO_MODEL environment variable.
    """
    load_dotenv()
    lm_model = os.getenv("LMSTUDIO_MODEL")

    print(f"Connected to LMStudio. You can start asking questions to {lm_model}.")
    print("Type 'quit' or 'exit' to end the chat.")

    while True:
        # Get input from the user; end the chat cleanly instead of showing a
        # traceback when stdin is closed or the prompt is interrupted
        try:
            prompt = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\nExiting the program. Goodbye!")
            break

        if prompt.lower() in ('quit', 'exit'):
            print("Exiting the program. Goodbye!")
            break

        # Whitespace-only input counts as empty because of the strip() above
        if not prompt:
            print("Please enter a question.")
            continue

        # Send the prompt to the model and get the response
        try:
            completion = client.chat.completions.create(
                model=lm_model,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=0.7,
            )
            print(f"\nLMStudio Model: {completion.choices[0].message.content}")
        except Exception as e:
            # Keep the chat alive after a failed request
            print(f"An error occurred while getting the response: {e}")

# Run the main function for LMStudio
# NOTE: inside a notebook kernel __name__ is '__main__', so the chat loop
# starts as soon as this cell runs and waits on input() until 'quit' or
# 'exit' is typed.
if __name__ == '__main__':
    main_lmstudio()

Connected to LMStudio. You can start asking questions to medgemma-4b-it.
Type 'quit' or 'exit' to end the chat.

LMStudio Model: I am Gemma, a large language model trained by Google DeepMind.

Exiting the program. Goodbye!


In [10]:
# Generate a rationale for every extracted case with the local model.
# Each case dict gains a 'Rationale' entry (the model reply, or a fixed
# error message when the request fails).
total_cases = len(pdf_summaries_dicts)
print(f"Starting batch processing for {total_cases} cases...")

# (prompt label, key in the case dict) in the order they appear in the prompt
prompt_fields = (
    ("Medical Case Title", "Title"),
    ("History", "History"),
    ("Clinical Findings", "Clinical Findings"),
    ("Discussion", "Discussion"),
    ("Diagnosis", "Summary Box First Line"),
)
prompt_instruction = (
    "\nBased on the information above, provide a concise medical rationale for the diagnosis. Remember to label it as Rationale \n"
    "Rationale:"
)

for case_no, medical_case in enumerate(pdf_summaries_dicts, start=1):
    # One "Label: value" line per field, followed by the instruction
    case_lines = "".join(
        f"{label}: {medical_case.get(key, '')}\n" for label, key in prompt_fields
    )
    prompt = case_lines + prompt_instruction

    try:
        # Single-turn request to the local model
        response = client.chat.completions.create(
            model=lm_model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
        )
        medical_case['Rationale'] = response.choices[0].message.content.strip()
        print(f"Generated rationale for case {case_no}/{total_cases}: {medical_case.get('Title', 'Unknown')}")
        print(f"Rationale for case {case_no}: {medical_case.get('Rationale')}")

        # Short pause between successful requests
        time.sleep(4)

    except Exception as e:
        # Record the failure on the case and carry on with the next one
        print(f"Error generating rationale for case {case_no} ({medical_case.get('Title', 'Unknown Title')}): {e}")
        medical_case['Rationale'] = "Error generating rationale."

print("All medical cases processed with generated rationales.")

Starting batch processing for 93 cases...
Generated rationale for case 1/93: A 20-Year-Old Woman from Sudan With Fever, Haemorrhage and Shock
Rationale for case 1: Rationale: The patient's presentation with fever, severe asthenia, chest/abdominal pain, nausea, vomiting, diarrhea, bleeding from gums, hepatosplenomegaly and hypotension strongly suggests Ebola virus disease (EVD). While the absence of clinical laboratory data hinders definitive confirmation, the epidemiological context (outbreak in northern Uganda) and the constellation of symptoms are highly suggestive. The patient's age, residence status (Sudanese refugee), and lack of contact with known sick individuals further support this suspicion.
Generated rationale for case 2/93: A 55-Year-Old Indigenous Woman from Australia With a Widespread Exfoliating Rash and Sepsis
Rationale for case 2: Rationale: The patient presents with a widespread, exfoliative rash involving typical areas affected by scabies (armpits, buttocks, thighs),

KeyboardInterrupt: 