In [15]:
import pymupdf
import os
from pathlib import Path

def batch_pdf_to_images(input_dir, output_dir, image_format="jpeg", zoom_factor=1.0):
    """
    Convert all PDFs in a directory to images

    Args:
        input_dir: Directory containing PDF files
        output_dir: Directory to save images
        image_format: Output image format
        zoom_factor: Resolution multiplier
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # Create output directory
    output_path.mkdir(parents=True, exist_ok=True)

    # Find all PDF files
    pdf_files = list(input_path.glob("*.pdf"))

    if not pdf_files:
        print("No PDF files found in the input directory.")
        return

    print(f"Found {len(pdf_files)} PDF files to process...")

    for pdf_file in pdf_files:
        try:
            doc = pymupdf.open(pdf_file)
            pdf_name = pdf_file.stem

            # Create subdirectory for this PDF's images
            pdf_output_dir = output_path / pdf_name
            pdf_output_dir.mkdir(exist_ok=True)

            # Convert each page
            for page_num in range(doc.page_count):
                page = doc[page_num]

                if zoom_factor != 1.0:
                    mat = pymupdf.Matrix(zoom_factor, zoom_factor)
                    pix = page.get_pixmap(matrix=mat)
                else:
                    pix = page.get_pixmap()

                output_file = pdf_output_dir / f"page_{page_num + 1}.{image_format}"
                pix.save(str(output_file))

            doc.close()
            print(f"✓ Processed {pdf_file.name}: {doc.page_count} pages")

        except Exception as e:
            print(f"✗ Error processing {pdf_file.name}: {str(e)}")

# Usage
batch_pdf_to_images("med_guidelines", "med_guidelines_images", "jpeg", 1.0)



Found 4 PDF files to process...
✗ Error processing 临床常用生化检验项目参考区间第2部分.pdf: document closed
✗ Error processing 临床常用生化检验项目参考区间第4部分.pdf: document closed
✗ Error processing 临床常用生化检验项目参考区间第1部分.pdf: document closed
✗ Error processing 临床常用生化检验项目参考区间第3部分.pdf: document closed


In [16]:
## convert image to base64
import base64

def image_to_base64(image_path):
    """
    Convert a JPG image file to base64 encoded string
    
    Args:
        image_path (str): Path to the JPG image file
        
    Returns:
        str: Base64 encoded string of the image
    """
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
        return encoded_string.decode('utf-8')


In [17]:

import json
from json_repair import repair_json
import ast
import traceback

from llama_index.core.llms import ChatMessage
from llama_index.llms.openai_like import OpenAILike
# from prompt_template.ocr_template import *
from configs import *


def completion_to_prompt(completion):
    return f"<|im_start|>system\n<|im_end|>\n<|im_start|>user\n{completion}<|im_end|>\n<|im_start|>assistant\n"


def messages_to_prompt(messages):
    prompt = ""
    for message in messages:
        if message.role == "system":
            prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n"
        elif message.role == "user":
            prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n"
        elif message.role == "assistant":
            prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n"

    if not prompt.startswith("<|im_start|>system"):
        prompt = "<|im_start|>system\n" + prompt

    prompt = prompt + "<|im_start|>assistant\n"
    return prompt


class MedDocProcessor:

    def __init__(self):
        print("Initializing LLM client...")
        self.llm = OpenAILike(
            model=VL_MODEL_NAME,
            api_base=VL_LLM_API_BASE,
            api_key="EMPTY",
            is_chat_model=True,
            temperature=0,
            # max_tokens=VL_MAX_TOKENS,
            messages_to_prompt=messages_to_prompt,
            completion_to_prompt=completion_to_prompt,
            timeout=TIME_OUT,
        )
        print("LLM client initialized.")


    def _call_llm_with_image(self, prompt_text: str, image_base64: str) -> str:
        """Constructs messages and calls the LLM, returning the response content."""
        messages = [
            ChatMessage(
                role="user",
                content=[
                    {"block_type": "text", "text": prompt_text},
                    {
                        "block_type": "image",
                        # "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"}
                        # "image": f'data:image/jpeg;base64,{image_base64}'
                        "image": image_base64
                    },
                ],
            )
        ]
        print("checkpoint 1")
        response = self.llm.chat(messages)
        print("checkpoint 2")
        return response.message.content.strip()

    def process_image(self, image_base64: str) -> dict:
        """
        Processes a single medical image file to extract data.

        Args:
            image_base64 (str): The file path to the imagebase64.

        Returns:
            dict: A dictionary containing the extracted data.
        """
        # Initialize the final result structure

        guideline_extraction_prompt = """
            你是一个专业的图像识别专家，请从图片中识别出医学信息，以markdown格式输出，保留原文件的结构。

        """
        # test_result = ""
        test_result = self._call_llm_with_image(guideline_extraction_prompt, image_base64)

        # try:
        #     print("hhh")
        # except Exception as e:
        #     final_result["error"] = str(e)
        # print(test_result)
        return test_result


In [18]:
import os
from pathlib import Path
from typing import Dict, List

def read_images_with_full_paths(root_directory: str,
                              image_extensions: tuple = ('.jpg', '.jpeg')) -> Dict[str, List[str]]:
    """
    Loop through subdirectories and read full image paths, grouping them by subfolder.
    
    Args:
        root_directory: Path to the root directory containing subfolders with images
        image_extensions: Tuple of image file extensions to look for
        
    Returns:
        Dictionary with subfolder names as keys and lists of full image paths as values
    """
    # Dictionary to store results
    image_groups = {}
    
    # Check if root directory exists
    if not os.path.exists(root_directory):
        print(f"Root directory not found: {root_directory}")
        return image_groups
    
    # Get all subdirectories in the root directory
    subfolders = [f.path for f in os.scandir(root_directory) if f.is_dir()]
    
    print(f"Found {len(subfolders)} subfolders in {root_directory}")
    
    # Process each subfolder
    for subfolder_path in subfolders:
        subfolder_name = os.path.basename(subfolder_path)
        print(f"\nProcessing subfolder: {subfolder_name}")
        
        # Get all image files in the subfolder
        image_files = []
        
        try:
            for file in os.listdir(subfolder_path):
                # Check if file has an image extension (case insensitive)
                if file.lower().endswith(image_extensions):
                    # Store full path
                    full_path = os.path.join(subfolder_path, file)
                    image_files.append(full_path)
            
            # Sort image files for consistent ordering
            image_files.sort()
            # Add to results dictionary
            image_groups[subfolder_name] = image_files
                        
        except Exception as e:
            print(f"  Error processing {subfolder_name}: {str(e)}")
            continue
    
    return image_groups

In [19]:
print(image_groups)

{'临床常用生化检验项目参考区间第3部分': ['/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_1.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_2.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_3.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_4.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_5.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_6.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_7.jpeg', '/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_8.jpeg'], '临床常用生化检验项目参考区间第4部分': ['/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第4部分/page_1.jpeg', '/Users/jlkj/work/medical_info_extraction_v

In [None]:
processer = MedDocProcessor()
root_dir = "/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images"
image_groups = read_images_with_full_paths(root_dir)
image_text = {}
for key, images in image_groups.items():
    print(f"Processing {key}")
    pdf_text = ""
    for image in images:
        print(f"Processing {image}")
        base64_result = image_to_base64(image)
        # result = processer.process_image(base64_result)
        pdf_text += processer.process_image(base64_result)
        # print(pdf_text)
    image_text.append(pdf_text)

# jpg_path = "/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第3部分/page_1.jpeg"
# base64_result = image_to_base64(jpg_path)
# processer = MedDocProcessor()
# result = processer.process_image(base64_result)
# print(result)
# print("Base64 encoded image:")
# print(len(base64_result))




In [None]:
# jpg_path = "/Users/jlkj/work/medical_info_extraction_vlm/med_guidelines_images/临床常用生化检验项目参考区间第1部分/page_3.jpeg"
# base64_result = image_to_base64(jpg_path)
# processer = MedDocProcessor()
# result = processer.process_image(base64_result)
# print(result)
# print("Base64 encoded image:")
# print(len(base64_result))

In [None]:
import json
from json_repair import repair_json

from llama_index.core.llms import ChatMessage
from llama_index.llms.openai_like import OpenAILike
# from prompt_template.ocr_template import *
from configs import *

LLM_API_BASE = "http://192.168.100.27:15001/v1/"
MODEL_NAME = "qwen3-32b"

class ChatQuery:
    def __init__(self) -> None:
        self.llm = OpenAILike(
            model=MODEL_NAME,
            api_base=LLM_API_BASE,
            api_key='EMPTY',
            is_chat_model=True,
            temperature=0.7,
            # max_tokens=MAX_TOKENS,
            timeout=TIME_OUT,
        ) 

        self.llm_2 = OpenAILike(
            model=MODEL_NAME,
            api_base=LLM_API_BASE,
            api_key='EMPTY',
            is_chat_model=True,
            temperature=0.7,
            # max_tokens=MAX_TOKENS,
            timeout=TIME_OUT,
            additional_kwargs={"extra_body": {"chat_template_kwargs": {"enable_thinking": False}}},
        )

        self.extraction_prompt = """
            你是一个专业的医学专家，请从输入的文档中识别提取医学指标参考值，有如下要求：
            1. 如果文档有上下标等特别说明，请将说明置于指标名后，以括号括起来表示。
            2. 输出内容请以json格式输出，格式如下：
            {
                "指标名": "血清钾 (K)",
                "适用人群": "(男/女)",
                "参考值范围": 3.5～5.3,
                “单位”:mmol/L
            }
            3. 如果没有找到相关信息，则置key的值为空。
        """
        
    # @mlflow.trace
    def chat_query(self, user_message, **kwargs):
        user_prompt = user_message
        messages = [
            ChatMessage(role="assistant", content=self.extraction_prompt),
            ChatMessage(role="user", content=user_prompt)
        ]
        response = self.llm.chat(messages, **kwargs)
        # result_str = response.message.content
        # if isinstance(response, str):
        #     response = repair_json(response)
        return response
        
    def chat_query_no_think(self, user_message, **kwargs):
        user_prompt = user_message
        messages = [
            ChatMessage(role="assistant", content=self.extraction_prompt),
            ChatMessage(role="user", content=user_prompt)
        ]
        response = self.llm_2.chat(messages, **kwargs)
        result_str = response.message.content
        print(result_str)
        return response

In [None]:
chat = ChatQuery()
for text in image_text:
    result = chat.chat_query_no_think(text)
    print(result)