In [2]:
import fitz
import re
import os
import numpy as np
import csv
import spacy


def get_configuration_constants():
    """Return the default settings for the figure-extraction pipeline.

    Returns:
        dict: A newly built mapping on every call with the entries
            'DEFAULT_KEYWORDS' (list of str), 'SIMILARITY_THRESHOLD'
            (float) and 'SPACY_MODEL' (str).
    """
    keywords = ["Cu||Li", "Li||Cu", "Li-Cu", "Cu-Li", 'Aurbach']
    keywords += ["coulombic efficiency", "CE", "coulombic efficiencies"]
    keywords += ["conductivity", "conductivities"]
    keywords += ["viscosity", "viscosities"]

    settings = dict(
        DEFAULT_KEYWORDS=keywords,
        SIMILARITY_THRESHOLD=0.5,
        SPACY_MODEL="en_core_web_md",
    )
    return settings


def load_spacy_model(model_name=None):
    """Load a spaCy pipeline, falling back to 'en_core_web_sm' if needed.

    Args:
        model_name: Name of the spaCy model to load. When None, the
            'SPACY_MODEL' entry of get_configuration_constants() is used.

    Returns:
        The object returned by spacy.load() for the first model that loads.

    Raises:
        RuntimeError: If neither the requested model nor the fallback
            model can be loaded. The underlying OSError is chained as
            the cause.
    """
    fallback_name = "en_core_web_sm"
    if model_name is None:
        model_name = get_configuration_constants()['SPACY_MODEL']

    try:
        return spacy.load(model_name)
    except OSError as primary_error:
        # Keep a reference: the `as` name is cleared when the handler exits.
        last_error = primary_error

    # Retrying only makes sense when the fallback is a different model;
    # otherwise we would announce and repeat the exact load that just failed.
    if model_name != fallback_name:
        print(f"Model '{model_name}' not found. Trying '{fallback_name}'...")
        try:
            return spacy.load(fallback_name)
        except OSError as fallback_error:
            last_error = fallback_error

    raise RuntimeError(
        "No spaCy model found. Please install with: python -m spacy download en_core_web_sm"
    ) from last_error


def prepare_keyword_embeddings(nlp_model, keywords):
    """Build length-normalised embedding vectors for the given keywords.

    Args:
        nlp_model: Callable that maps a string to an object exposing
            `vector` and `vector_norm` (e.g. a loaded spaCy pipeline).
        keywords: Iterable of keyword strings.

    Returns:
        dict: keyword -> `vector / vector_norm`. Keywords whose
            `vector_norm` is falsy (zero) are omitted.
    """
    unit_vectors = {}
    for keyword in keywords:
        parsed = nlp_model(keyword)
        norm = parsed.vector_norm
        if not norm:
            # No usable vector for this keyword; dividing would be meaningless.
            continue
        unit_vectors[keyword] = parsed.vector / norm
    return unit_vectors


def calculate_similarity(nlp_model, caption, keyword_embeddings, similarity_threshold):
    """Score a caption against every keyword embedding.

    Args:
        nlp_model: Callable that maps a string to an object exposing
            `vector` and `vector_norm`.
        caption: Caption text to score.
        keyword_embeddings: Mapping of keyword -> vector, as produced by
            prepare_keyword_embeddings().
        similarity_threshold: Minimum dot-product score (inclusive) for a
            keyword to be reported.

    Returns:
        list of (keyword, score) tuples sorted by descending score, or an
        empty list when the caption's `vector_norm` is falsy.
    """
    parsed = nlp_model(caption)
    if not parsed.vector_norm:
        return []

    unit_caption = parsed.vector / parsed.vector_norm

    # Dot product of the normalised caption vector with each stored vector.
    scored = [
        (keyword, np.dot(unit_caption, vector))
        for keyword, vector in keyword_embeddings.items()
    ]
    matches = [pair for pair in scored if pair[1] >= similarity_threshold]
    return sorted(matches, key=lambda pair: pair[1], reverse=True)


# An optional "Supplementary" prefix, then "Figure" or "Fig"/"Fig." ending on
# a word boundary (so "Figures" or "Figment" do not match). Whitespace after
# "Supplementary" is flexible because extracted PDF text often contains
# doubled spaces.
_CAPTION_PREFIX_RE = re.compile(
    r"^(?:Supplementary\s+)?(?:Figure|Fig\.?)\b",
    re.IGNORECASE,
)


def is_caption_text(text):
    """Tell whether a text block looks like the start of a figure caption.

    Leading and trailing whitespace is ignored and matching is
    case-insensitive. Recognised openings are "Figure", "Fig", "Fig." and
    each of those preceded by "Supplementary" (e.g. "Supplementary Fig. 3").

    Args:
        text: Text of a single block.

    Returns:
        bool: True when the stripped text starts with a recognised caption
            prefix, otherwise False.
    """
    return _CAPTION_PREFIX_RE.match(text.strip()) is not None


def find_image_caption(page, img_info, text_blocks):
    """Find the caption block that sits closest to an image's bottom edge.

    A block qualifies when it contains text lines, its joined span text
    passes is_caption_text(), it overlaps the image horizontally, and the
    absolute distance between the block's top (y0) and the image's bottom
    (y1) is below 100 (in the coordinate units of the bounding boxes).

    Args:
        page: Page object providing get_image_bbox().
        img_info: Image entry passed through to page.get_image_bbox().
        text_blocks: Iterable of block dicts; text blocks carry "bbox" and
            "lines" (each line a dict with "spans", each span with "text").

    Returns:
        str: Text of the qualifying block with the smallest distance (the
            first one wins on ties), or "" when nothing qualifies.
    """
    image_box = page.get_image_bbox(img_info)
    candidates = []

    for entry in text_blocks:
        # Blocks without "lines" carry no text to inspect.
        if "lines" not in entry:
            continue

        text_box = fitz.Rect(entry["bbox"])

        pieces = []
        for line in entry["lines"]:
            for span in line["spans"]:
                pieces.append(span["text"])
        text = " ".join(pieces).strip()

        if not is_caption_text(text):
            continue

        gap = abs(text_box.y0 - image_box.y1)
        shared_width = min(image_box.x1, text_box.x1) - max(image_box.x0, text_box.x0)

        if shared_width > 0 and gap < 100:
            candidates.append((gap, text))

    if not candidates:
        return ""

    # min() returns the first minimal element, so earlier blocks win ties.
    return min(candidates, key=lambda candidate: candidate[0])[1]


def extract_and_save_image(pdf, xref, page_num, output_dir, pdf_name, img_index):
    """Extract one embedded image from a document and write it to disk.

    The file is named "<pdf_name>_p<page>_fig<index>.<ext>", where page and
    index are the 1-based forms of `page_num` and `img_index`.

    Args:
        pdf: Document object providing extract_image(xref), which returns
            a mapping with "image" (bytes) and "ext" (str).
        xref: Image reference passed to pdf.extract_image().
        page_num: Zero-based page index (used for naming and messages).
        output_dir: Directory the image file is written into.
        pdf_name: Base name used as the filename prefix.
        img_index: Zero-based index of the image on its page.

    Returns:
        tuple (image_filename, image_path) on success, or None when the
        extraction step raises (a message is printed in that case).
    """
    try:
        extracted = pdf.extract_image(xref)
        payload = extracted["image"]
        image_ext = extracted["ext"]
    except Exception as e:
        print(f"    - Could not extract image on page {page_num+1}: {str(e)}")
        return None
    else:
        image_filename = f"{pdf_name}_p{page_num+1}_fig{img_index+1}.{image_ext}"
        destination = os.path.join(output_dir, image_filename)

    with open(destination, "wb") as sink:
        sink.write(payload)

    return image_filename, destination


def process_single_pdf(pdf_path, nlp_model, keyword_embeddings, similarity_threshold, output_dir):
    """Extract every image of a PDF whose caption matches a keyword.

    For each image on each page, the nearest caption is looked up with
    find_image_caption(); if the caption scores at or above the threshold
    for at least one keyword, the image is saved to `output_dir`.

    Args:
        pdf_path: Path of the PDF file to open.
        nlp_model: Callable passed through to calculate_similarity().
        keyword_embeddings: Mapping of keyword -> vector.
        similarity_threshold: Minimum similarity for a keyword match.
        output_dir: Directory that receives the extracted image files.

    Returns:
        list of dicts with the keys 'image_filename', 'image_path',
        'page' (1-based), 'caption' and 'keywords' (list of
        (keyword, score) tuples).
    """
    results = []
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    pdf = fitz.open(pdf_path)
    try:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)

            images = page.get_images(full=True)
            if not images:
                continue

            text_blocks = page.get_text("dict")["blocks"]

            for img_index, img_info in enumerate(images):
                caption = find_image_caption(page, img_info, text_blocks)
                if not caption:
                    continue

                found_keywords = calculate_similarity(
                    nlp_model, caption, keyword_embeddings, similarity_threshold
                )
                if not found_keywords:
                    continue

                # The first element of an image entry is its xref.
                image_result = extract_and_save_image(
                    pdf, img_info[0], page_num, output_dir, pdf_name, img_index
                )
                if not image_result:
                    continue

                image_filename, image_path = image_result
                results.append({
                    'image_filename': image_filename,
                    'image_path': image_path,
                    'page': page_num + 1,
                    'caption': caption,
                    'keywords': found_keywords
                })
    finally:
        # Always release the document, even when a page or image step raises;
        # otherwise the file handle leaks for the rest of the batch run.
        pdf.close()

    return results


def get_all_pdf_files(paper_folder, si_folder):
    """Collect the PDF files found directly inside the two folders.

    Files from `paper_folder` come first, then those from `si_folder`;
    within each folder the entries are sorted by name so the processing
    order is the same on every platform. A folder that does not exist, or
    a path that is not a directory, contributes nothing.

    Args:
        paper_folder: Directory holding the main papers.
        si_folder: Directory holding the supporting-information files.

    Returns:
        list of (full_path, filename) tuples for every regular file whose
        name ends in ".pdf" (case-insensitive).
    """
    all_files = []

    for folder in (paper_folder, si_folder):
        # isdir (not exists): a plain file here would make listdir raise.
        if not os.path.isdir(folder):
            continue

        # os.listdir returns entries in arbitrary order; sort for stable output.
        for name in sorted(os.listdir(folder)):
            if not name.lower().endswith('.pdf'):
                continue
            full_path = os.path.join(folder, name)
            # Skip sub-directories that merely happen to be named "*.pdf".
            if os.path.isfile(full_path):
                all_files.append((full_path, name))

    return all_files


def write_results_to_csv(output_csv, all_results):
    """Write the extraction results to a UTF-8 CSV report.

    The file gets a header row followed by one row per result. Keyword
    names and their scores (three decimals) are each joined with "; ".

    Args:
        output_csv: Path of the CSV file to create or overwrite.
        all_results: Iterable of dicts with the keys 'pdf_file',
            'image_filename', 'page', 'caption' and 'keywords'
            (a list of (keyword, score) tuples).
    """
    columns = ['pdf_file', 'image_file', 'page', 'caption', 'keywords', 'similarity_scores']

    def as_row(entry):
        # Map one result dict onto the report's column names.
        return {
            'pdf_file': entry['pdf_file'],
            'image_file': entry['image_filename'],
            'page': entry['page'],
            'caption': entry['caption'],
            'keywords': "; ".join(name for name, _ in entry['keywords']),
            'similarity_scores': "; ".join(f"{s:.3f}" for _, s in entry['keywords']),
        }

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        report = csv.DictWriter(handle, fieldnames=columns)
        report.writeheader()
        report.writerows(as_row(entry) for entry in all_results)


def process_pdf_folders(paper_folder, si_folder, output_csv, extracted_images_folder, keywords=None, similarity_threshold=None):
    """Run the extraction over every PDF in both folders and write a report.

    Progress is printed per file. A failure while processing one PDF is
    reported on its progress line and does not stop the remaining files.

    Args:
        paper_folder: Directory holding the main papers.
        si_folder: Directory holding the supporting-information files.
        output_csv: Path of the CSV report to write.
        extracted_images_folder: Folder (joined onto the current working
            directory) that receives the extracted images; created if
            missing.
        keywords: Keywords to match; defaults to the configured
            'DEFAULT_KEYWORDS' when None.
        similarity_threshold: Minimum similarity; defaults to the
            configured 'SIMILARITY_THRESHOLD' when None.
    """
    defaults = get_configuration_constants()
    keywords = defaults['DEFAULT_KEYWORDS'] if keywords is None else keywords
    similarity_threshold = (
        defaults['SIMILARITY_THRESHOLD']
        if similarity_threshold is None
        else similarity_threshold
    )

    nlp = load_spacy_model()
    keyword_embeddings = prepare_keyword_embeddings(nlp, keywords)

    output_dir = os.path.join(os.getcwd(), extracted_images_folder)
    os.makedirs(output_dir, exist_ok=True)

    # Files from both folders, as (full_path, filename) pairs.
    all_files = get_all_pdf_files(paper_folder, si_folder)
    total_files = len(all_files)
    separator = "=" * 50

    print(f"Found {total_files} PDF files to process...")
    print(separator)

    all_results = []
    for idx, (file_path, filename) in enumerate(all_files, 1):
        # No newline: the outcome is appended to the same console line.
        print(f"[{idx}/{total_files}] Processing: {filename}", end="")

        try:
            pdf_results = process_single_pdf(file_path, nlp, keyword_embeddings, similarity_threshold, output_dir)

            # Tag each record with its source file before collecting it.
            for record in pdf_results:
                record['pdf_file'] = filename
            all_results.extend(pdf_results)

            if not pdf_results:
                print(f" - No images found")
            else:
                print(f" - ✓ Extracted {len(pdf_results)} images")

        except Exception as e:
            print(f" - ✗ Error: {str(e)}")

    print(separator)
    write_results_to_csv(output_csv, all_results)
    print(f"Processing completed! {len(all_results)} images extracted")
    print(f"Results saved to: {output_csv}")
    print(f"Images saved to: {output_dir}")


def main():
    """Run the pipeline on the "paper" and "paper_SI" folders of the cwd.

    The report goes to "extracted_images_report.csv" and the images to
    the "extracted_images" folder, using the configured default keywords
    and similarity threshold.
    """
    working_dir = os.getcwd()
    settings = get_configuration_constants()

    process_pdf_folders(
        paper_folder=os.path.join(working_dir, "paper"),
        si_folder=os.path.join(working_dir, "paper_SI"),
        output_csv="extracted_images_report.csv",
        extracted_images_folder="extracted_images",
        keywords=settings['DEFAULT_KEYWORDS'],
        similarity_threshold=settings['SIMILARITY_THRESHOLD'],
    )


# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Found 16 PDF files to process...
[1/16] Processing: Lia_1.pdf - ✓ Extracted 6 images
[2/16] Processing: Lia_2.pdf - ✓ Extracted 4 images
[3/16] Processing: Lia_3.pdf - ✓ Extracted 7 images
[4/16] Processing: Lia_4.pdf - ✓ Extracted 4 images
[5/16] Processing: Lia_5.pdf - ✓ Extracted 5 images
[6/16] Processing: Lia_6.pdf - ✓ Extracted 3 images
[7/16] Processing: Lia_7.pdf - ✓ Extracted 4 images
[8/16] Processing: Lia_8.pdf - ✓ Extracted 6 images
[9/16] Processing: Lib_1.pdf - ✓ Extracted 14 images
[10/16] Processing: Lib_2.pdf - ✓ Extracted 24 images
[11/16] Processing: Lib_3.pdf - ✓ Extracted 13 images
[12/16] Processing: Lib_4.pdf - ✓ Extracted 7 images
[13/16] Processing: Lib_5.pdf - ✓ Extracted 9 images
[14/16] Processing: Lib_6.pdf - ✓ Extracted 12 images
[15/16] Processing: Lib_7.pdf - ✓ Extracted 9 images
[16/16] Processing: Lib_8.pdf - ✓ Extracted 7 images
Processing completed! 134 images extracted
Results saved to: extracted_images_report.csv
Images saved to: c:\Users\dkbay\Dow