In [None]:
import fitz
import re
import os
import csv
import numpy as np
from PIL import Image
import io


def get_configuration_constants():
    """Build and return the settings dictionary used by the extractor.

    A new dictionary (with a new keyword list) is created on every call.

    Returns:
        dict: ``DEFAULT_KEYWORDS`` (caption search terms),
        ``ENABLE_IMAGE_FILTER`` (switch for image quality filtering) and the
        thresholds read by ``check_image_quality``.
    """
    default_keywords = [
        "Cu||Li", "Li||Cu", "Li-Cu", "Cu-Li", "Li || Cu", "Cu || Li",
        "coulombic efficiency", "CE", "coulombic efficiencies", "CEs",
        "conductivity", "conductivities", "viscosity", "viscosities",
        "aurbach",
    ]

    settings = dict(
        DEFAULT_KEYWORDS=default_keywords,
        # Set to True to enable image quality filtering.
        ENABLE_IMAGE_FILTER=False,
        # --- image quality filtering parameters ---
        MIN_WIDTH=200,                    # minimum width in pixels
        MIN_HEIGHT=150,                   # minimum height in pixels
        MIN_AREA=50000,                   # minimum total area (width x height)
        MIN_FILE_SIZE=10 * 1024,          # minimum file size in bytes (10KB)
        MAX_FILE_SIZE=3 * 1024 * 1024,    # maximum file size in bytes (3MB)
        COMPLEXITY_THRESHOLD=0.1,         # minimum image complexity (entropy-based)
    )
    return settings


def find_keywords_in_text(text, keywords):
    """Return the keywords that occur in *text*, compared case-insensitively.

    Keywords containing a character that is neither a word character nor
    whitespace (e.g. ``Cu||Li``, ``Li-Cu``) are matched as plain substrings.
    All other keywords must appear delimited by word boundaries, so ``CE``
    does not match inside ``concentrated``.

    Args:
        text: Text to search.
        keywords: Iterable of keyword strings.

    Returns:
        list: Matching keywords in their original spelling and input order.
    """
    haystack = text.lower()

    def _occurs(keyword):
        needle = keyword.lower()
        has_symbol = re.search(r'[^\w\s]', needle)
        if has_symbol:
            # \b is unreliable next to symbols, so use a substring test.
            return needle in haystack
        whole_word = r'\b' + re.escape(needle) + r'\b'
        return re.search(whole_word, haystack) is not None

    return [keyword for keyword in keywords if _occurs(keyword)]


def is_caption_text(text):
    """Report whether *text* begins like a figure or table caption.

    After surrounding whitespace is removed, the text must start with one of
    "Figure", "Fig" / "Fig.", "Supplementary Figure", "Table" or
    "Supplementary Table" (any letter case) followed by a word boundary.

    Returns:
        bool: True when the prefix matches, False otherwise.
    """
    caption_prefix = r"^(Figure|Fig\.?|Supplementary Figure|Table|Supplementary Table)\b"
    stripped = text.strip()
    return re.search(caption_prefix, stripped, re.IGNORECASE) is not None


def calculate_image_entropy(image_array):
    """Return the Shannon entropy of an image's gray levels, divided by 8.

    The histogram uses 256 bins over the range 0-256, so the largest possible
    entropy is log2(256) = 8 bits; dividing by 8 gives a score in [0, 1].
    Values outside the histogram range are not counted.

    Args:
        image_array: NumPy array of pixel values. A 3-D array with at least
            three channels is reduced to gray with luminance weights applied
            to the first three channels. A 3-D array with fewer channels
            (e.g. grayscale + alpha) contributes only its first channel.
            Any other array is used as-is.

    Returns:
        The normalised entropy, or 0.0 when no value falls into the
        histogram range (including an empty array).
    """
    if image_array.ndim == 3 and image_array.shape[-1] >= 3:
        gray = np.dot(image_array[..., :3], [0.2989, 0.5870, 0.1140])
    elif image_array.ndim == 3:
        # One- or two-channel images cannot be multiplied by the three
        # luminance weights (np.dot would raise); the first channel already
        # holds the gray levels.
        gray = image_array[..., 0]
    else:
        gray = image_array

    hist, _ = np.histogram(gray.ravel(), bins=256, range=(0, 256))
    counts = hist[hist > 0]  # drop empty bins so log2 never sees 0

    if counts.size == 0:
        return 0.0

    prob = counts / counts.sum()
    entropy = -np.sum(prob * np.log2(prob))

    return entropy / 8.0


def check_image_quality(image_bytes, config):
    """Decide whether encoded image data passes the quality thresholds.

    Args:
        image_bytes: Encoded image data (bytes) as read from the PDF or
            produced by rendering.
        config: Mapping providing ``MIN_FILE_SIZE``, ``MAX_FILE_SIZE``,
            ``MIN_WIDTH``, ``MIN_HEIGHT``, ``MIN_AREA`` and
            ``COMPLEXITY_THRESHOLD``.

    Returns:
        tuple: ``(is_quality, rejection_reasons)``. ``is_quality`` is True
        only when no check failed. ``rejection_reasons`` lists one string
        per failed check. If the image cannot be processed at all, the
        result is ``(False, ["processing_error: ..."])``.
    """
    try:
        # The context manager closes the image once the checks are done.
        with Image.open(io.BytesIO(image_bytes)) as image:
            width, height = image.size
            area = width * height
            file_size = len(image_bytes)

            rejection_reasons = []

            if file_size < config['MIN_FILE_SIZE']:
                rejection_reasons.append(f"file_size_too_small ({file_size} < {config['MIN_FILE_SIZE']})")
            elif file_size > config['MAX_FILE_SIZE']:
                rejection_reasons.append(f"file_size_too_large ({file_size} > {config['MAX_FILE_SIZE']})")

            if width < config['MIN_WIDTH']:
                rejection_reasons.append(f"width_too_small ({width} < {config['MIN_WIDTH']})")
            if height < config['MIN_HEIGHT']:
                rejection_reasons.append(f"height_too_small ({height} < {config['MIN_HEIGHT']})")
            if area < config['MIN_AREA']:
                rejection_reasons.append(f"area_too_small ({area} < {config['MIN_AREA']})")

            # Small perfect squares are treated as icons or logos.
            if width == height and width < 100:
                rejection_reasons.append("likely_icon_or_logo")

            # Very thin images are treated as decorative rules or bullets.
            if min(width, height) < 20:
                rejection_reasons.append("likely_decorative_element")

            try:
                entropy = calculate_image_entropy(np.array(image))
                if entropy < config['COMPLEXITY_THRESHOLD']:
                    rejection_reasons.append(f"low_complexity ({entropy:.3f} < {config['COMPLEXITY_THRESHOLD']})")
            except Exception:
                # Deliberate best effort: when the complexity score cannot be
                # computed, the image is judged on the other checks alone.
                pass

        return not rejection_reasons, rejection_reasons

    except Exception as e:
        return False, [f"processing_error: {str(e)}"]


def find_table_caption(page, text_blocks):
    """Collect the text blocks that look like table captions.

    A block qualifies when its joined span text starts with "Table" or
    "Supplementary Table" (any letter case) followed by a word boundary.

    Args:
        page: Not used by this function; kept for the call signature.
        text_blocks: Block dictionaries; blocks without a ``"lines"`` key
            are skipped.

    Returns:
        list: One ``{'text': str, 'rect': fitz.Rect}`` dictionary per
        matching block, in input order.
    """
    heading = re.compile(r"^(Table|Supplementary Table)\b", re.IGNORECASE)
    captions = []

    for block in text_blocks:
        if "lines" not in block:
            continue

        pieces = []
        for line in block["lines"]:
            for span in line["spans"]:
                pieces.append(span["text"])
        caption = " ".join(pieces).strip()

        if heading.search(caption) is None:
            continue

        captions.append({
            'text': caption,
            'rect': fitz.Rect(block["bbox"]),
        })

    return captions


def extract_table_as_image(page, caption_info, output_dir, pdf_name, page_num, table_index,
                           table_height=300, zoom=2):
    """Render the region below a table caption and save it as a PNG.

    The table body is not detected. The rendered area is a band as wide as
    the caption, starting at the caption's bottom edge and extending
    ``table_height`` units downwards, limited by the page height.

    Args:
        page: PyMuPDF page holding the caption.
        caption_info: Dictionary with a ``'rect'`` entry (caption rectangle),
            as produced by ``find_table_caption``.
        output_dir: Directory that receives the PNG file.
        pdf_name: Prefix for the output file name.
        page_num: Zero-based page index (file names use ``page_num + 1``).
        table_index: Zero-based index of the caption on the page.
        table_height: Height of the band below the caption (default 300).
        zoom: Scale factor applied on both axes when rendering (default 2).

    Returns:
        tuple | None: ``(table_filename, table_path)`` on success. ``None``
        when there is no room below the caption or the rendered image is
        rejected by the quality filter.
    """
    config = get_configuration_constants()

    caption_rect = caption_info['rect']

    table_rect = fitz.Rect(
        caption_rect.x0,
        caption_rect.y1,
        caption_rect.x1,
        min(caption_rect.y1 + table_height, page.rect.height),
    )

    # A caption at the bottom edge of the page leaves nothing to render;
    # skip it instead of asking for a pixmap of an empty clip.
    if table_rect.is_empty:
        return None

    pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), clip=table_rect)
    img_data = pix.tobytes("png")

    if config['ENABLE_IMAGE_FILTER']:
        is_quality, _ = check_image_quality(img_data, config)
        if not is_quality:
            return None

    table_filename = f"{pdf_name}_p{page_num+1}_table{table_index+1}.png"
    table_path = os.path.join(output_dir, table_filename)

    with open(table_path, "wb") as img_file:
        img_file.write(img_data)

    return table_filename, table_path


def find_image_caption(page, img_info, text_blocks):
    """Return the caption block closest to the bottom edge of an image.

    A block is a candidate when its text passes ``is_caption_text``, it
    overlaps the image horizontally, and the absolute distance between the
    block's top edge and the image's bottom edge is below 100.

    Args:
        page: Page object providing ``get_image_bbox``.
        img_info: Image entry passed through to ``page.get_image_bbox``.
        text_blocks: Block dictionaries; blocks without ``"lines"`` are
            skipped.

    Returns:
        str: Text of the nearest candidate (the earliest block wins a tie),
        or ``""`` when there is none.
    """
    img_rect = page.get_image_bbox(img_info)
    candidates = []

    for block in text_blocks:
        if "lines" not in block:
            continue

        block_rect = fitz.Rect(block["bbox"])
        spans = (span for line in block["lines"] for span in line["spans"])
        text = " ".join(span["text"] for span in spans).strip()

        if not is_caption_text(text):
            continue

        gap = abs(block_rect.y0 - img_rect.y1)
        left = max(img_rect.x0, block_rect.x0)
        right = min(img_rect.x1, block_rect.x1)

        if not (right - left > 0 and gap < 100):
            continue

        candidates.append((gap, text))

    if not candidates:
        return ""

    # min() returns the first of several equal minima, so the earliest
    # block wins a tie.
    return min(candidates, key=lambda candidate: candidate[0])[1]


def extract_and_save_image(pdf, xref, page_num, output_dir, pdf_name, img_index):
    """Extract one embedded image from the PDF and write it to disk.

    Args:
        pdf: Open document providing ``extract_image``.
        xref: Cross-reference number of the image to extract.
        page_num: Zero-based page index (messages and file names use
            ``page_num + 1``).
        output_dir: Directory that receives the image file.
        pdf_name: Prefix for the output file name.
        img_index: Zero-based index of the image on the page.

    Returns:
        tuple | None: ``(image_filename, image_path)`` on success. ``None``
        when extraction fails (a message is printed) or the image is
        rejected by the quality filter.
    """
    config = get_configuration_constants()

    try:
        extracted = pdf.extract_image(xref)
        payload, extension = extracted["image"], extracted["ext"]
    except Exception as e:
        print(f"    - Could not extract image on page {page_num+1}: {str(e)}")
        return None

    if config['ENABLE_IMAGE_FILTER'] and not check_image_quality(payload, config)[0]:
        return None

    image_filename = f"{pdf_name}_p{page_num+1}_fig{img_index+1}.{extension}"
    image_path = os.path.join(output_dir, image_filename)

    with open(image_path, "wb") as sink:
        sink.write(payload)

    return image_filename, image_path


def process_single_pdf(pdf_path, keywords, output_dir):
    """Extract keyword-matching figures and tables from one PDF.

    For every page, each embedded image whose nearby caption contains a
    keyword is saved. Then every table caption containing a keyword has the
    region below it rendered and saved.

    Args:
        pdf_path: Path of the PDF file to open.
        keywords: Keywords to look for in captions.
        output_dir: Directory that receives the extracted files.

    Returns:
        list: One dictionary per saved file with the keys ``type``
        (``'image'`` or ``'table'``), ``image_filename``, ``image_path``,
        ``page`` (1-based), ``caption`` and ``keywords``.
    """
    results = []
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]

    # The context manager closes the document even when a page raises;
    # the caller catches such errors and moves on to the next file.
    with fitz.open(pdf_path) as pdf:
        for page_num in range(len(pdf)):
            page = pdf.load_page(page_num)

            images = page.get_images(full=True)
            text_blocks = page.get_text("dict")["blocks"]

            for img_index, img_info in enumerate(images):
                caption = find_image_caption(page, img_info, text_blocks)
                if not caption:
                    continue

                found_keywords = find_keywords_in_text(caption, keywords)
                if not found_keywords:
                    continue

                xref = img_info[0]
                image_result = extract_and_save_image(
                    pdf, xref, page_num, output_dir, pdf_name, img_index
                )
                if not image_result:
                    continue

                image_filename, image_path = image_result
                results.append({
                    'type': 'image',
                    'image_filename': image_filename,
                    'image_path': image_path,
                    'page': page_num + 1,
                    'caption': caption,
                    'keywords': found_keywords,
                })

            table_captions = find_table_caption(page, text_blocks)

            for table_index, caption_info in enumerate(table_captions):
                caption_text = caption_info['text']
                found_keywords = find_keywords_in_text(caption_text, keywords)
                if not found_keywords:
                    continue

                table_result = extract_table_as_image(
                    page, caption_info, output_dir, pdf_name, page_num, table_index
                )
                if not table_result:
                    continue

                table_filename, table_path = table_result
                results.append({
                    'type': 'table',
                    'image_filename': table_filename,
                    'image_path': table_path,
                    'page': page_num + 1,
                    'caption': caption_text,
                    'keywords': found_keywords,
                })

    return results


def get_all_pdf_files(paper_folder, si_folder):
    """List the PDF files found directly inside the two input folders.

    Args:
        paper_folder: Folder with the main papers.
        si_folder: Folder with the supplementary-information files.

    Returns:
        list: ``(full_path, filename)`` tuples. Entries from
        ``paper_folder`` come first, then ``si_folder``, and each folder's
        entries are sorted by name so the order is the same on every run.
        A path that is missing or is not a directory contributes nothing.
    """
    all_files = []

    for folder in (paper_folder, si_folder):
        # isdir (rather than exists) so a regular file at this path is
        # skipped instead of making os.listdir raise.
        if not os.path.isdir(folder):
            continue

        # os.listdir returns entries in arbitrary order; sort them.
        for name in sorted(os.listdir(folder)):
            if name.lower().endswith('.pdf'):
                all_files.append((os.path.join(folder, name), name))

    return all_files


def write_results_to_csv(output_csv, all_results):
    """Write the extraction report as a UTF-8 CSV file.

    Args:
        output_csv: Destination path; an existing file is overwritten.
        all_results: Result dictionaries carrying ``pdf_file``, ``type``,
            ``image_filename``, ``page``, ``caption`` and ``keywords``
            (a list of strings, joined with ``"; "`` in the output).
    """
    columns = ['pdf_file', 'type', 'image_file', 'page', 'caption', 'keywords']

    def _as_row(entry):
        # The report column is "image_file"; the result key is "image_filename".
        return {
            'pdf_file': entry['pdf_file'],
            'type': entry['type'],
            'image_file': entry['image_filename'],
            'page': entry['page'],
            'caption': entry['caption'],
            'keywords': "; ".join(entry['keywords']),
        }

    with open(output_csv, 'w', newline='', encoding='utf-8') as handle:
        report = csv.DictWriter(handle, fieldnames=columns)
        report.writeheader()
        report.writerows(_as_row(entry) for entry in all_results)


def process_pdf_folders(paper_folder, si_folder, output_csv, extracted_images_folder, keywords=None):
    """Run the extraction over every PDF in both folders and write the report.

    Progress is printed per file. A failure while processing one PDF is
    reported on its progress line and does not stop the run.

    Args:
        paper_folder: Folder with the main papers.
        si_folder: Folder with the supplementary-information files.
        output_csv: Path of the CSV report to write.
        extracted_images_folder: Output folder name, resolved against the
            current working directory and created if missing.
        keywords: Keywords to search for; ``None`` selects the configured
            ``DEFAULT_KEYWORDS``.
    """
    config = get_configuration_constants()
    keywords = config['DEFAULT_KEYWORDS'] if keywords is None else keywords

    output_dir = os.path.join(os.getcwd(), extracted_images_folder)
    os.makedirs(output_dir, exist_ok=True)

    def _count(entries, kind):
        return sum(1 for entry in entries if entry['type'] == kind)

    all_results = []
    all_files = get_all_pdf_files(paper_folder, si_folder)
    total_files = len(all_files)
    separator = "=" * 50

    print(f"Found {total_files} PDF files to process...")
    print(f"Searching for keywords: {', '.join(keywords)}")
    print(f"Image quality filtering: {'Enabled' if config['ENABLE_IMAGE_FILTER'] else 'Disabled'}")
    print(separator)

    for idx, (file_path, filename) in enumerate(all_files, 1):
        # No newline here: the outcome is appended to the same line below.
        print(f"[{idx}/{total_files}] Processing: {filename}", end="")

        try:
            pdf_results = process_single_pdf(file_path, keywords, output_dir)

            for entry in pdf_results:
                entry['pdf_file'] = filename
            all_results.extend(pdf_results)

            images_count = _count(pdf_results, 'image')
            tables_count = _count(pdf_results, 'table')

            if not pdf_results:
                print(" - N/A")
            else:
                print(f" - ✓ Extracted {images_count} images, {tables_count} tables")

        except Exception as e:
            print(f" - ✗ Error: {str(e)}")

    print(separator)
    write_results_to_csv(output_csv, all_results)

    total_images = _count(all_results, 'image')
    total_tables = _count(all_results, 'table')

    print(f"Processing completed! {total_images} images and {total_tables} tables extracted")
    print(f"Results saved to: {output_csv}")
    print(f"Files saved to: {output_dir}")


def main():
    """Run the extraction with the default folders, outputs and keywords.

    Reads PDFs from ``paper`` and ``paper_SI`` under the current working
    directory, writes the report to ``extracted_images_report.csv`` and the
    extracted files to ``extracted_images``.
    """
    working_dir = os.getcwd()
    default_keywords = get_configuration_constants()['DEFAULT_KEYWORDS']

    process_pdf_folders(
        paper_folder=os.path.join(working_dir, "paper"),
        si_folder=os.path.join(working_dir, "paper_SI"),
        output_csv="extracted_images_report.csv",
        extracted_images_folder="extracted_images",
        keywords=default_keywords,
    )


# Run the extraction only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Found 16 PDF files to process...
Searching for keywords: Cu||Li, Li||Cu, Li-Cu, Cu-Li, Li || Cu, Cu || Li, coulombic efficiency, CE, coulombic efficiencies, CEs, conductivity, conductivities, viscosity, viscosities, aurbach
Image quality filtering: Disabled
[1/16] Processing: Lia_1.pdf - ✓ Extracted 2 images, 1 tables
[2/16] Processing: Lia_2.pdf - ✓ Extracted 1 images, 0 tables
[3/16] Processing: Lia_3.pdf - ✓ Extracted 1 images, 0 tables
[4/16] Processing: Lia_4.pdf - ✓ Extracted 2 images, 0 tables
[5/16] Processing: Lia_5.pdf - N/A
[6/16] Processing: Lia_6.pdf - N/A
[7/16] Processing: Lia_7.pdf - ✓ Extracted 2 images, 0 tables
[8/16] Processing: Lia_8.pdf - ✓ Extracted 1 images, 0 tables
[9/16] Processing: Lib_1.pdf - ✓ Extracted 3 images, 0 tables
[10/16] Processing: Lib_2.pdf - ✓ Extracted 6 images, 1 tables
[11/16] Processing: Lib_3.pdf - ✓ Extracted 1 images, 0 tables
[12/16] Processing: Lib_4.pdf - N/A
[13/16] Processing: Lib_5.pdf - ✓ Extracted 1 images, 0 tables
[14/16] Proce