# Imports

In [1]:
import os
import re
import time
import json
import copy
import PyPDF2
import requests
from pathlib import Path
from typing import List, Optional


# Global Configuration and Metadata Schema Definition

In [2]:
# Default record for one paper. Each saved entry starts as a deep copy of this
# dict; "paper_id" is derived from the PDF filename and the remaining fields
# are filled from the parsed LLM output.
METADATA_TEMPLATE = {
    "paper_id": None,
    "doi": None,
    "title": None,
    "published_year": None,
    "author_list": [],
    "countries": [],
    "purpose_of_work": None,
    "keywords": []
}

# Ollama endpoint that every LLM request is POSTed to.
OLLAMA_URL = "http://127.0.0.1:11434/api/generate"
# Folder scanned for input PDF files.
PDFS = "./pdfs"
# Default folder for the generated JSON files (used when no run_dir is given).
OUTPUT_FOLDER = "./extracted_metadata"

# Passed as the `timeout` argument of each requests.post call.
API_TIMEOUT = 1000
# Seconds handed to time.sleep() to pace the pipeline.
SLEEP_DURATION = 5
# Number of PDFs grouped into one output "part" file.
BATCH_SIZE = 10

# Model tags used for the extraction pass and for the correction pass.
EXTRACTION_MODEL = "qwen3:latest"
CORRECTION_MODEL = "qwen3:latest"



def get_output_filename():
    """Build the output JSON filename from the extraction model tag.

    Colons are dropped and slashes become underscores so the tag can be used
    as a filename, e.g. "qwen3:latest" -> "qwen3latest_extracted.json".
    """
    # One translation pass: ':' is deleted, '/' is mapped to '_'.
    safe_tag = EXTRACTION_MODEL.translate(str.maketrans({":": None, "/": "_"}))
    return safe_tag + "_extracted.json"

# Computed once when this cell runs; later changes to EXTRACTION_MODEL do not
# update it unless the cell is re-executed.
OUTPUT_FILENAME = get_output_filename()

# PDF File Discovery and Natural Sorting

The following cell locates all PDF files in the specified input directory and returns them in **naturally sorted order** (based on the numeric components of the filenames).

In [3]:
def natural_sorting_key(filename: str) -> List[int]:
    """Return the digit runs of *filename* as integers, for natural sorting.

    "10.pdf" yields [10] and "2.pdf" yields [2], so numeric order wins over
    lexical order. A name without any digit maps to [0].
    """
    digit_runs = [int(match.group()) for match in re.finditer(r'\d+', filename)]
    return digit_runs or [0]

def get_pdf_files(folder_path: str) -> List[str]:
    """List the PDF filenames in *folder_path* in natural (numeric) order.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        Filenames (not full paths) ending in ".pdf", case-insensitively,
        sorted with ``natural_sorting_key``. An empty list is returned, and
        a message printed, when the folder is missing, is not a directory,
        or contains no PDF.
    """
    # isdir rather than exists: a path pointing at a regular file passes an
    # existence check but makes os.listdir raise NotADirectoryError.
    if not os.path.isdir(folder_path):
        print(f"Error: '{folder_path}' folder not found")
        return []

    pdf_files = [name for name in os.listdir(folder_path) if name.lower().endswith('.pdf')]
    if not pdf_files:
        print(f"No pdf file found in '{folder_path}' folder")
        return []

    return sorted(pdf_files, key=natural_sorting_key)


In [4]:
# Discover the input PDFs once and show the order they will be processed in.
pdf_files = get_pdf_files(PDFS)
print(pdf_files)

['1.pdf', '2.pdf']


# Text Extraction from PDFs

The following cell reads a PDF file and extracts the text of its first page, which typically contains the key bibliographic information.

In [5]:
def extract_first_page(pdf_path: str) -> Optional[str]:
    """Return the text of the first page of the PDF at *pdf_path*.

    Best-effort: any failure while opening or parsing the file is printed
    and reported as None rather than raised. None is also returned for a
    PDF that has no pages.
    """
    try:
        with open(pdf_path, 'rb') as handle:
            pages = PyPDF2.PdfReader(handle).pages

            if len(pages) == 0:
                print(f"PDF empty - no pages: {pdf_path}")
                return None

            # Text must be extracted while the file handle is still open.
            return pages[0].extract_text()

    except Exception as e:
        print(f"PDF read error ({pdf_path}): {e}")
        return None

In [6]:
# Smoke test: print the first-page text of the first discovered PDF.
# NOTE(review): pdf_files[0] raises IndexError when no PDF was found above.
sample_path = os.path.join(PDFS, pdf_files[0])
sample_text = extract_first_page(sample_path)
print(sample_text)

Citation: Valea, A.; Costachescu, M.;
Stanciu, M.; Nistor, C.; Sima, O.-C.;
Carsote, M.; Nistor, T.V .I.; Tanasescu,
D.; Popa, F.L.; Ciobica, M.-L. A
Real-Life Study in Patients Newly
Diagnosed with Autoimmune
Hashimoto’s Thyroiditis: Analysis of
Asthenia as Admission Complaint.
Life2024 ,14, 1380. https://doi.org/
10.3390/life14111380
Academic Editors: Daniele Sola and
Stelvio Tonello
Received: 26 September 2024
Revised: 23 October 2024
Accepted: 24 October 2024
Published: 27 October 2024
Copyright: ©2024 by the authors.
Licensee MDPI, Basel, Switzerland.
This article is an open access article
distributed under the terms and
conditions of the Creative Commons
Attribution (CC BY) license (https://
creativecommons.org/licenses/by/
4.0/).
life
Article
A Real-Life Study in Patients Newly Diagnosed with
Autoimmune Hashimoto’s Thyroiditis: Analysis of
Asthenia as Admission Complaint
Ana Valea1,2,†, Mihai Costachescu3,4,†, Mihaela Stanciu5,6, Claudiu Nistor4,7,*
, Oana-Claudia Sima3,*,
Mara 

# Prompts

## Extraction Prompt 

In [None]:
def get_extraction_prompt(text: str) -> str:
    """Build the metadata-extraction prompt for one paper.

    The returned string contains the field-by-field instructions, the JSON
    template the model is asked to fill in, and finally *text* wrapped in
    triple quotes. The doubled braces around the template are f-string
    escapes that render as literal braces.
    """
    return f"""
You are an expert at extracting metadata from academic papers. Your task is to fill in the values in the exact JSON format provided below, **strictly and solely based on the content of the academic paper text you will be given. Do not invent or infer any information not explicitly present or clearly derivable from the text.**

IMPORTANT: Do not use any thinking tags like <think></think> in your response. Provide ONLY the JSON output directly.

- **doi**: The Digital Object Identifier of the paper. Look for it typically near the top or bottom of the first page, often preceded by "doi:" or "https://doi.org/". If not found, use `null`.
- **title**: The full title of the academic paper. Usually found prominently at the top of the first page. **You MUST find the title; it is a fundamental part of any academic paper. If it's not immediately obvious, thoroughly scan the first few paragraphs or the very top section of the page.** It is highly improbable for a paper to lack a title; therefore, only use `null` as an absolute last resort if, despite rigorous scanning, no identifiable title is present in the provided text.
- **published_year**: The year the paper was officially published. Look for it near the publication details (e.g., citation, received/accepted/published dates). If not found, use `null`.
- **author_list**: A list of the full names of all authors. Extract names exactly as they appear in the author section.
Example: ["John Doe", "Jane Smith"]
- **countries**: A list of unique countries associated with the authors' affiliations. ***If an affiliation mentions a city, identify its corresponding country.*** Ensure each country appears only once in the list. If no country is found, use `[]`.
Example: ["USA", "Germany", "Japan"]
- **purpose_of_work**: A concise summary (20-40 words) explaining the main goal or objective of the research presented in the paper. Extract this from anywhere within the provided text, identifying the core reason or problem the paper addresses. If multiple relevant parts are found, synthesize them into a single, concise summary within the word limit. If the main goal cannot be clearly identified or summarized within 20-40 words, use `null`.
- **keywords**: A list of significant keywords that describe the paper's main topics. Prioritize extracting these directly from a dedicated "Keywords" section if one is present. **If no such section exists, extract 3 to 4 key terms or phrases that best represent the paper's main topics from the abstract and main text.** Ensure individual keywords are extracted correctly, even if they are separated by commas (,) or semicolons (;). If no suitable keywords are found, use `[]`.
Example: ["Advanced oxidation protein products", "Apoptosis", "Reactive oxygen species", "Hashimoto's thyroiditis"]

Return ONLY a valid JSON object (no explanation, no markdown, no comments). If any field is missing (i.e., not found explicitly or clearly derivable from the text based on the strict instructions above), use `null` (for string fields) or `[]` (for list fields). Do NOT change the provided keys or the structure of the JSON.

JSON Template:
{{
"doi": null,
"title": null,
"published_year": null,
"author_list": [],
"countries": [],
"purpose_of_work": null,
"keywords": []
}}

Text to analyze:
\"\"\"
{text}
\"\"\"
""" 

## Correction Prompt

In [None]:
def get_correction_prompt(llm_response: str) -> str:
    """Build the proofreading prompt for a previously extracted response.

    The returned string asks the model to fix spelling and excess spaces
    only, to leave author-name letters unchanged, to deduplicate countries,
    and to answer with JSON only; *llm_response* is appended verbatim.
    """
    return f"""You are a meticulous proofreader. Your task is to correct only spelling errors and remove any excess spaces from the provided text. Do not change any other part of the text, its structure, or its meaning. For author_list names, you may only fix spacing and punctuation, but must NOT change, add, or remove letters. Return the corrected text exactly as it was given, with only the specified edits. Ensure each country appears only once in the list, even if it's mentioned multiple times.
Return ONLY a valid JSON object (no explanation, no markdown, no comments).
Text to correct:
{llm_response}
""" 

# Local LLM API Interaction

The following cell defines a unified interface for sending prompts to a local large language model via the Ollama API.

In [44]:
def make_llm_request(model: str, prompt: str) -> tuple[Optional[str], float]:
    """POST *prompt* to the LLM endpoint and return the generated text.

    Returns:
        A ``(text, 0.0)`` tuple. ``text`` is the ``response`` field of the
        JSON reply (``''`` when the field is absent), or None when the HTTP
        status is not 200 or any exception occurs. The second element is
        always ``0.0`` in this implementation.
    """
    request_body = {"model": model, "prompt": prompt, "stream": False}

    try:
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=API_TIMEOUT)

        if reply.status_code != 200:
            print(f"API Error: {reply.status_code}")
            return None, 0.0

        return reply.json().get('response', ''), 0.0

    except Exception as e:
        # Best-effort: connection, timeout and decoding problems are all
        # reported the same way so the caller can skip this document.
        print(f"LLM Error: {str(e)}")
        return None, 0.0
    
def extract_metadata_with_llm(prompt: str) -> tuple[Optional[str], float]:
    """Send *prompt* to the model configured as EXTRACTION_MODEL."""
    return make_llm_request(model=EXTRACTION_MODEL, prompt=prompt)

def correct_response_with_llm(prompt: str) -> tuple[Optional[str], float]:
    """Send *prompt* to the model configured as CORRECTION_MODEL."""
    return make_llm_request(model=CORRECTION_MODEL, prompt=prompt)

In [45]:
# Smoke test: run the extraction prompt on the sample PDF and print the raw reply.
text = extract_first_page(sample_path)
prompt = get_extraction_prompt(text)

# The second tuple element is not used here.
response, _ = extract_metadata_with_llm(prompt)
print(response)

{
"doi": "10.3390/life14111380",
"title": "A Real-Life Study in Patients Newly Diagnosed with Autoimmune Hashimoto’s Thyroiditis: Analysis of Asthenia as Admission Complaint",
"published_year": 2024,
"author_list": [
"Ana Valea",
"Mihai Costachescu",
"Mihaela Stanciu",
"Claudiu Nistor",
"Oana-Claudia Sima",
"Mara Carsote",
"Tiberiu Vasile Ioan Nistor",
"Denisa Tanasescu",
"Florina Ligia Popa",
"Mihai-Lucian Ciobica"
],
"countries": [
"Romania"
],
"purpose_of_work": "To analyze the relationship between thyroid panel results and asthenia as an admission complaint in newly diagnosed patients with Hashimoto’s thyroiditis.",
"keywords": [
"Hashimoto's thyroiditis",
"asthenia",
"thyroid panel",
"admission complaint"
]
}


# LLM Response parsing

The following cell sanitizes raw LLM responses by removing auxiliary text and formatting artifacts, extracts the first valid JSON object, and parses it into a Python dictionary.

In [46]:
def parse_llm_output(raw: str) -> Optional[dict]:
    """Extract the first JSON object from a raw LLM reply.

    The reply is cleaned in two steps before decoding: reasoning blocks
    (``<think>…</think>`` or ``<thinking>…</thinking>``) are removed, then
    Markdown code fences are stripped. Decoding starts at the first ``{``
    and stops at the end of that object, so any text the model appends
    after the JSON is ignored.

    Args:
        raw: Text returned by the model.

    Returns:
        The decoded dict, or its ``"metadata"`` value when the model nested
        the fields under that key. None when *raw* is not a string, holds
        no ``{``, or the object cannot be decoded.
    """
    if not isinstance(raw, str):
        print("NO Valid JSON object found (missing opening '{' )")
        return None

    # Strip reasoning blocks first: they may contain braces that would be
    # mistaken for the start of the JSON object.
    cleaned = re.sub(
        r"<(think(?:ing)?)\s*>.*?</\1\s*>", "", raw,
        flags=re.DOTALL | re.IGNORECASE,
    )
    cleaned = re.sub(
        r"^```(?:json)?|```$", "", cleaned.strip(),
        flags=re.IGNORECASE | re.MULTILINE,
    ).strip()

    start_idx = cleaned.find('{')
    if start_idx == -1:
        print("NO Valid JSON object found (missing opening '{' )")
        return None

    try:
        # raw_decode returns as soon as the first complete JSON value ends,
        # unlike json.loads which rejects trailing text as "Extra data".
        data, _ = json.JSONDecoder().raw_decode(cleaned, start_idx)
    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return None

    # Some replies wrap the fields as {"metadata": {...}}; unwrap them.
    if isinstance(data, dict) and isinstance(data.get("metadata"), dict):
        data = data["metadata"]

    print(f"LLM response successfully parsed: {len(data)} fields found")
    return data

In [47]:
# Smoke test: parse the sample reply and pretty-print the resulting dict.
parsed = parse_llm_output(response)
print(json.dumps(parsed, indent=2))

LLM response successfully parsed: 7 fields found
{
  "doi": "10.3390/life14111380",
  "title": "A Real-Life Study in Patients Newly Diagnosed with Autoimmune Hashimoto\u2019s Thyroiditis: Analysis of Asthenia as Admission Complaint",
  "published_year": 2024,
  "author_list": [
    "Ana Valea",
    "Mihai Costachescu",
    "Mihaela Stanciu",
    "Claudiu Nistor",
    "Oana-Claudia Sima",
    "Mara Carsote",
    "Tiberiu Vasile Ioan Nistor",
    "Denisa Tanasescu",
    "Florina Ligia Popa",
    "Mihai-Lucian Ciobica"
  ],
  "countries": [
    "Romania"
  ],
  "purpose_of_work": "To analyze the relationship between thyroid panel results and asthenia as an admission complaint in newly diagnosed patients with Hashimoto\u2019s thyroiditis.",
  "keywords": [
    "Hashimoto's thyroiditis",
    "asthenia",
    "thyroid panel",
    "admission complaint"
  ]
}


# Metadata Aggregation and JSON Saving

The following cell combines the metadata extracted by the LLM with a predefined template and saves the result to a JSON file. It also handles batch-based file naming.

In [60]:
def save_metadata_to_json(pdf_filename: str, llm_response: str, part_number: Optional[int] = None, run_dir: Optional[str] = None) -> Optional[str]:
    """Append one paper's metadata to the JSON output file.

    A deep copy of ``METADATA_TEMPLATE`` is updated with the fields parsed
    from *llm_response* and appended to the ``"papers"`` list of the target
    file, which is created when it does not exist yet. When the response
    cannot be parsed, the template with only ``paper_id`` set is stored.

    Args:
        pdf_filename: Source PDF name; its stem becomes ``paper_id``.
        llm_response: Raw model reply holding the JSON metadata.
        part_number: When given, ``_part_<n>`` is inserted before ``.json``
            in the output filename.
        run_dir: Output directory; falls back to ``OUTPUT_FOLDER``.

    Returns:
        The path of the written file as a string, or None when the
        directory or file could not be read/written or the existing file
        does not contain the expected structure. In that case the existing
        file is left untouched.
    """
    output_dir = run_dir or OUTPUT_FOLDER
    try:
        os.makedirs(output_dir, exist_ok=True)
    except OSError as e:
        print(f"File Write Error: {e}")
        return None

    print(f"Saving JSON to folder: {output_dir}")

    if part_number is not None:
        base_filename = OUTPUT_FILENAME.replace('.json', f'_part_{part_number}.json')
    else:
        base_filename = OUTPUT_FILENAME

    json_path = Path(output_dir) / base_filename

    metadata = copy.deepcopy(METADATA_TEMPLATE)
    parsed = parse_llm_output(llm_response)
    if parsed:
        metadata.update(parsed)
    # Assigned after the update so a "paper_id" key in the model output can
    # never replace the id derived from the filename.
    metadata["paper_id"] = Path(pdf_filename).stem

    try:
        paper_list = []
        if json_path.exists():
            with open(json_path, "r", encoding="utf-8") as fp:
                existing_data = json.load(fp)

            papers = existing_data.get("papers", []) if isinstance(existing_data, dict) else None
            if not isinstance(papers, list):
                # Refuse to overwrite a file that is not in the expected shape.
                print(f"File Write Error: unexpected JSON structure in {json_path}")
                return None
            paper_list = papers

        paper_list.append(metadata)

        with open(json_path, "w", encoding="utf-8") as fp:
            json.dump({"papers": paper_list}, fp, ensure_ascii=False, indent=2)
        print(f"JSON Saved -> {json_path}, (total {len(paper_list)} papers)")
        return str(json_path)
    except json.JSONDecodeError as e:
        # A corrupt existing file is reported and kept as is, not overwritten.
        print(f"File Write Error: existing file is not valid JSON ({json_path}): {e}")
        return None
    except OSError as e:
        print(f"File Write Error: {e}")
        return None

# End-to-End Processing of a Single PDF Document

The following cell orchestrates the complete processing pipeline for a single PDF: extracting first-page text, generating LLM prompts, performing metadata extraction and correction, and saving the finalized metadata to disk.

In [69]:
def process_single_pdf(pdf_file: str, pdf_index: int, total_files: int, part_numbers: int) -> bool:
    """Run the full pipeline for one PDF and store its metadata.

    Steps: read the first page, ask the extraction model for metadata, ask
    the correction model to proofread that reply, then save the corrected
    reply (or the initial one when correction returned nothing).

    Args:
        pdf_file: Filename inside the ``PDFS`` folder.
        pdf_index: 1-based position of this file, used for progress output.
        total_files: Total number of files, used for progress output.
        part_numbers: Part (batch) number forwarded to the save step.

    Returns:
        False when the PDF text could not be read or the extraction model
        returned nothing; True once a response was handed to the save step.
        A failed save is reported with a message but still yields True.
    """
    pdf_path = os.path.join(PDFS, pdf_file)
    print(f"=== PDF {pdf_index}/{total_files}: {pdf_file} (Part {part_numbers}) ===")

    first_page = extract_first_page(pdf_path)
    if not first_page:
        print("PDF Reading Error, LLM processing skipped")
        return False

    print("Sending to LLM...")
    extraction_prompt = get_extraction_prompt(first_page)
    initial_response, _ = extract_metadata_with_llm(extraction_prompt)
    if not initial_response:
        return False

    print("Initial LLM Response:")
    print("-" * 50)
    print(initial_response)
    print("-" * 50)

    print("Sending for correction...")
    correction_prompt = get_correction_prompt(initial_response)
    corrected_response, _ = correct_response_with_llm(correction_prompt)

    if corrected_response:
        print("Corrected LLM Response:")
        print("-" * 50)
        print(corrected_response)
        print("-" * 50)
        final_response = corrected_response
    else:
        print("Correction failed, using original response...")
        final_response = initial_response

    # Both outcomes share one save path so a failed save is always reported
    # and the pause before the next document is always applied.
    json_path = save_metadata_to_json(pdf_file, final_response, part_numbers)
    if json_path is None:
        print("JSON could not be saved, LLM response should be checked.")
    time.sleep(SLEEP_DURATION)
    return True

# Framework Entry Point: Batch PDF Metadata Extraction

The following cell serves as the entry point of the framework, coordinating batch-wise PDF discovery, progress tracking, and sequential metadata extraction across all documents.

In [70]:
def process_pdfs() -> None:
    """Process every PDF in the ``PDFS`` folder in parts of ``BATCH_SIZE``.

    Files are discovered in natural order, split into consecutive parts,
    and each file is passed to ``process_single_pdf`` together with its
    1-based global index and part number. Progress is printed throughout.
    Returns early, without output beyond the banner and the discovery
    message, when no PDF is found.
    """
    print("PDF Metadata Extractor (Batch Processing)")
    print("=" * 50)

    pdf_files = get_pdf_files(PDFS)
    if not pdf_files:
        return

    total_files = len(pdf_files)
    # Ceiling division: a final, partially filled part still counts.
    total_batches = (total_files + BATCH_SIZE - 1) // BATCH_SIZE

    print(f"{total_files} PDF files found.")
    print(f"Batch size: {BATCH_SIZE} PDFs per part")
    print(f"Total parts: {total_batches}")
    print(f"Processing order: {', '.join(pdf_files[:5])}{'...' if total_files > 5 else ''}\n")

    for part_number in range(1, total_batches + 1):
        start_idx = (part_number - 1) * BATCH_SIZE
        batch_files = pdf_files[start_idx:start_idx + BATCH_SIZE]
        end_idx = start_idx + len(batch_files)

        print(f"\n PART {part_number}/{total_batches} - Processing PDFs {start_idx + 1}-{end_idx}")
        print("=" * 60)

        for global_index, pdf_file in enumerate(batch_files, start=start_idx + 1):
            success = process_single_pdf(pdf_file, global_index, total_files, part_number)

            if not success:
                print(f"PDF {pdf_file} could not be processed completely")

        print(f"\n Part {part_number} completed!")

        if part_number < total_batches:
            # Use the configured pause so the message and the actual wait
            # cannot drift apart from the SLEEP_DURATION setting.
            print(f"Waiting {SLEEP_DURATION} seconds before next part...")
            time.sleep(SLEEP_DURATION)

    print(f"\n Complete: All {total_files} PDFs processed in {total_batches} parts!")

# Framework Execution

In [71]:
# Run the full batch pipeline over every PDF in the input folder.
process_pdfs()

PDF Metadata Extractor (Batch Processing)
4 PDF files found.
Batch size: 10 PDFs per part
Total parts: 1
Processing order: 1.pdf, 2.pdf, 3.pdf, 4.pdf


 PART 1/1 - Processing PDFs 1-4
=== PDF 1/4: 1.pdf (Part 1) ===
Sending to LLM...
Initial LLM Response:
--------------------------------------------------
{
"doi": "10.3390/life14111380",
"title": "A Real-Life Study in Patients Newly Diagnosed with Autoimmune Hashimoto’s Thyroiditis: Analysis of Asthenia as Admission Complaint",
"published_year": 2024,
"author_list": [
"Ana Valea",
"Mihai Costachescu",
"Mihaela Stanciu",
"Claudiu Nistor",
"Oana-Claudia Sima",
"Mara Carsote",
"Tiberiu Vasile Ioan Nistor",
"Denisa Tanasescu",
"Florina Ligia Popa",
"Mihai-Lucian Ciobica"
],
"countries": [
"Romania"
],
"purpose_of_work": "To analyze the relationship between thyroid function and asthenia as an admission complaint in newly diagnosed patients with Hashimoto’s thyroiditis.",
"keywords": [
"Hashimoto's Thyroiditis",
"Asthenia",
"Thyroid Panel",
