In [None]:
import fitz  # PyMuPDF
import re
import os
import numpy as np
import csv
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from datetime import datetime

# Load the spaCy language model
nlp = spacy.load("en_core_web_md")

# Folder that is scanned for PDF files, and the CSV report file name
paper_path = "D:/STUDY/CODE/Paper_retrieval/paper"
OUTPUT_CSV = "extracted_images_report.csv"

# Terms each image caption is compared against (order is preserved on purpose)
KEYWORDS = [
    "Cu||Li",
    "Li||Cu",
    "Li-Cu",
    "Cu-Li",
    "coulombic efficiency",
    "CE",
    "coulombic efficiencies",
    "conductivity",
    "conductivities",
    "viscosity",
    "viscosities",
    "lithium metal",
    "electrolyte",
    "anode",
    "cathode",
    "cycling performance",
    "capacity retention",
]

# Minimum caption/keyword similarity passed to process_pdf_folder
SIMILARITY_THRESHOLD = 0.4

In [None]:
def process_pdf_folder(folder_path, output_csv, keywords=None, similarity_threshold=0.4):
    """Process every PDF file in a folder and save the results to a CSV file.

    Args:
        folder_path: Directory whose entries ending in ``.pdf``
            (case-insensitive) are processed. An ``extracted_images``
            sub-directory is created inside it for the image files.
        output_csv: Path of the CSV report; it is overwritten.
        keywords: Terms to compare captions against. When ``None`` a built-in
            default list is used.
        similarity_threshold: Threshold forwarded to ``process_single_pdf``.

    A failure while handling one PDF is printed and does not stop the
    remaining files from being processed.
    """
    if keywords is None:
        keywords = [
            "Cu||Li", "Li||Cu", "Li-Cu", "Cu-Li",
            "coulombic efficiency", "CE", "coulombic efficiencies",
            "conductivity", "conductivities",
            "viscosity", "viscosities"
        ]

    # One normalised vector per keyword; keywords with a zero-norm vector
    # are left out because they cannot be normalised.
    keyword_embeddings = {
        term: term_doc.vector / term_doc.vector_norm
        for term, term_doc in ((term, nlp(term)) for term in keywords)
        if term_doc.vector_norm
    }

    # Directory that receives the extracted image files
    images_dir = os.path.join(folder_path, "extracted_images")
    os.makedirs(images_dir, exist_ok=True)

    columns = [
        'pdf_file', 'image_file', 'image_path', 'page',
        'caption', 'keywords', 'similarity_scores'
    ]

    with open(output_csv, 'w', newline='', encoding='utf-8') as report:
        writer = csv.DictWriter(report, fieldnames=columns)
        writer.writeheader()

        for entry in os.listdir(folder_path):
            if not entry.lower().endswith('.pdf'):
                continue

            source_pdf = os.path.join(folder_path, entry)
            print(f"\nProcessing PDF: {entry}")

            try:
                # Extract and score the images of this PDF
                matches = process_single_pdf(
                    source_pdf,
                    keyword_embeddings,
                    similarity_threshold,
                    images_dir
                )

                # One CSV row per relevant image
                for match in matches:
                    writer.writerow({
                        'pdf_file': entry,
                        'image_file': match['image_filename'],
                        'image_path': match['image_path'],
                        'page': match['page'],
                        'caption': match['caption'],
                        'keywords': "; ".join(term for term, _ in match['keywords']),
                        'similarity_scores': "; ".join(
                            f"{score:.3f}" for _, score in match['keywords']
                        )
                    })

                print(f"  - Extracted {len(matches)} relevant images")

            except Exception as err:
                print(f"  - Error processing {entry}: {str(err)}")

    print(f"\nProcessing completed! Results saved to {output_csv}")

def process_single_pdf(pdf_path, keyword_embeddings, similarity_threshold, output_dir):
    """Extract the images of one PDF and report those with a relevant caption.

    Every image that can be extracted is written to ``output_dir``, whether
    or not it ends up in the returned list. An image is reported only when a
    caption is found for it and the dot product between the normalised
    caption vector and at least one keyword vector reaches
    ``similarity_threshold``.

    Args:
        pdf_path: Path of the PDF file to open.
        keyword_embeddings: Mapping of keyword -> normalised vector.
        similarity_threshold: Minimum similarity for a keyword to count.
        output_dir: Directory the image files are written to; it is not
            created here, so it must already exist.

    Returns:
        A list of dicts with the keys ``image_filename``, ``image_path``,
        ``page`` (1-based), ``caption`` and ``keywords`` (a list of
        ``(keyword, similarity)`` tuples sorted by decreasing similarity).
    """
    results = []

    # Open the document as a context manager so its file handle is released
    # even when an exception propagates out of the page loop.
    with fitz.open(pdf_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)

            images = page.get_images(full=True)
            if not images:
                continue

            text_blocks = page.get_text("dict")["blocks"]

            for img_info in images:
                xref = img_info[0]
                try:
                    base_image = pdf.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]
                except Exception as e:
                    print(f"    - Could not extract image on page {page_num+1}: {str(e)}")
                    continue

                # Build a unique image file name (page, xref and a
                # microsecond-resolution timestamp)
                timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
                image_filename = f"img_p{page_num+1}_{xref}_{timestamp}.{image_ext}"
                image_path = os.path.join(output_dir, image_filename)

                with open(image_path, "wb") as img_file:
                    img_file.write(image_bytes)

                # Look for the caption that belongs to this image
                caption = find_image_caption(page, img_info, text_blocks)
                if not caption:
                    continue

                # A caption with a zero-norm vector cannot be normalised
                caption_doc = nlp(caption)
                if not caption_doc.vector_norm:
                    continue
                caption_vec = caption_doc.vector / caption_doc.vector_norm

                # Both vectors are normalised, so the dot product is the
                # cosine similarity.
                found_keywords = []
                for kw, kw_vec in keyword_embeddings.items():
                    similarity = np.dot(caption_vec, kw_vec)
                    if similarity >= similarity_threshold:
                        found_keywords.append((kw, similarity))

                if not found_keywords:
                    continue

                # Sort by decreasing similarity
                found_keywords.sort(key=lambda x: x[1], reverse=True)

                results.append({
                    'image_filename': image_filename,
                    'image_path': image_path,
                    'page': page_num + 1,
                    'caption': caption,
                    'keywords': found_keywords
                })

    return results

def find_image_caption(page, img_info, text_blocks):
    """Find the caption of an image based on its position on the page.

    Among the text blocks that look like a caption (see ``is_caption_text``)
    and overlap the image horizontally, the one whose top edge is closest to
    the bottom edge of the image is chosen, provided that distance is below
    100 in page coordinates. On equal distance the earlier block wins.

    Args:
        page: Page object providing ``get_image_bbox``.
        img_info: Image entry passed on to ``page.get_image_bbox``.
        text_blocks: Block dicts of the page; blocks without a ``"lines"``
            key are ignored.

    Returns:
        The caption text, or an empty string when no block qualifies.
    """
    image_box = page.get_image_bbox(img_info)
    closest_text = ""
    closest_gap = float('inf')

    for blk in text_blocks:
        if "lines" not in blk:
            continue

        blk_box = fitz.Rect(blk["bbox"])

        # Flatten all spans of the block into a single string
        pieces = []
        for line in blk["lines"]:
            for span in line["spans"]:
                pieces.append(span["text"])
        candidate = " ".join(pieces).strip()

        # Only caption-like text is considered
        if not is_caption_text(candidate):
            continue

        # Distance between the image bottom and the block top, plus the
        # width both rectangles share on the x axis
        gap = abs(blk_box.y0 - image_box.y1)
        shared_width = min(image_box.x1, blk_box.x1) - max(image_box.x0, blk_box.x0)

        # Keep the nearest horizontally overlapping candidate
        if shared_width > 0 and gap < 100 and gap < closest_gap:
            closest_text = candidate
            closest_gap = gap

    return closest_text

def is_caption_text(text):
    """Check whether a piece of text has the characteristics of a caption.

    The text qualifies when it contains one of the caption markers
    (Fig, Fig., Figure, Table, Scheme, Chart) as a whole word, ignoring
    case, or when it has more than 5 and fewer than 150
    whitespace-separated words.

    Returns:
        ``True`` if the text looks like a caption, otherwise ``False``.
    """
    marker_pattern = r"\b(Fig\.?|Figure|Table|Scheme|Chart)\b"
    has_marker = re.search(marker_pattern, text, re.IGNORECASE) is not None

    # Length that is plausible for a caption
    word_count = len(text.split())

    return has_marker or 5 < word_count < 150

if __name__ == "__main__":
    # Process every PDF in the configured folder
    process_pdf_folder(
        paper_path,
        OUTPUT_CSV,
        keywords=KEYWORDS,
        similarity_threshold=SIMILARITY_THRESHOLD,
    )

Image image_p0_xref8.png - Caption: [No caption found]
Image image_p0_xref28.png - Caption: Figure 1:  Correlation of two models with the same MSE loss.
Image image_p1_xref74.png - Caption: Figure 2:  Proposed No-Reference Speech Quality Assessment model.
