In [9]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import json
import os
from collections import defaultdict

# Load the block definitions. Each entry is read below for its "page", "x",
# "y", "width", "height" and "uid" keys, plus an optional "label".
with open('blocks.json', 'r', encoding='utf-8') as f:
    blocks = json.load(f)

# PDF the blocks refer to
pdf_file = "Parker.pdf"

# Directory for the optional debug crops (only written when save_crops is True)
output_dir = "cropped_regions"
os.makedirs(output_dir, exist_ok=True)

# Set to True to also write every cropped region to output_dir as a PNG
save_crops = False

# One record per block: page, uid, label and the OCR'd text
results = []

render_dpi = 300
pdf_dpi = 72
# NOTE(review): the origin of this constant is not visible here. The net
# scaling_factor works out to ~0.667, i.e. block coordinates are multiplied by
# 2/3 before being used as the clip rectangle — confirm against the tool that
# produced blocks.json.
magnifier = 2.77777777778
scaling_factor = pdf_dpi / render_dpi * magnifier

# The render matrix is identical for every region, so build it once up front
zoom = render_dpi / pdf_dpi
mat = fitz.Matrix(zoom, zoom)

# Group blocks by page so each page object is looked up only once
grouped_blocks = defaultdict(list)
for block in blocks:
    grouped_blocks[block["page"]].append(block)

# Open the document in a context manager so it is closed even if OCR fails
with fitz.open(pdf_file) as doc:
    for page_num, rectangles in grouped_blocks.items():
        # Block page numbers are treated as 1-based; the document index is 0-based
        page = doc[page_num - 1]

        for rect in rectangles:
            # Scale the block coordinates into the clip rectangle
            x = rect["x"] * scaling_factor
            y = rect["y"] * scaling_factor
            width = rect["width"] * scaling_factor
            height = rect["height"] * scaling_factor
            rect_coords = fitz.Rect(x, y, x + width, y + height)

            # Render only the clipped region, at render_dpi
            pix = page.get_pixmap(matrix=mat, clip=rect_coords)

            # Wrap the raw samples in a PIL image for pytesseract
            image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

            if save_crops:
                output_filename = os.path.join(output_dir, f"page_{page_num}_uid_{rect['uid']}.png")
                image.save(output_filename)
                print(f"Saved cropped region to {output_filename}")

            results.append({
                "page": page_num,
                "uid": rect["uid"],  # Include UID for reference
                "label": rect.get("label", ""),  # Optional label for debugging
                "text": pytesseract.image_to_string(image)
            })

# Export the results to a JSON file
with open('extracted_text.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=4)

# Report only what was actually written
if save_crops:
    print(f"Cropped regions saved to '{output_dir}' and results saved to 'extracted_text.json'.")
else:
    print("Results saved to 'extracted_text.json'.")


Cropped regions saved to 'cropped_regions' and results saved to 'extracted_text.json'.


In [None]:
import pandas as pd

# Load the OCR results from the JSON export
with open('extracted_text.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert the list of records to a DataFrame
df = pd.DataFrame(data)

# Render the frame as CSV text without the index column; with no path given,
# to_csv returns the CSV as a string, which becomes the cell's output.
df.to_csv(index=False)
    

Unnamed: 0,page,uid,label,text
0,1,59cbb14d-52c7-4835-886a-fa9ea35cc0b9,session,"CHARLIE PARKER.\nPR, Kansas City, MISSOURI, pr..."
1,1,2d70373d-b98b-4ec9-b7b2-c2e2e1503f07,session,"JAY McSHANN and his ORCHESTRA.\nPR, ‘Trocadero..."
2,1,dd412ca2-0d82-44b9-88d9-d2c7b31d7dcb,musos,"EE OE BD NE Nt!\n\nCharlie Parker, unaccompani..."
3,1,72385c2a-7fa3-47b5-a9dd-c2d46d53f9aa,musos,"Bernard ‘Buddy’ Anderson, Orville ‘Piggy’ Mino..."
4,1,ba007d0b-b954-43c4-a6e0-df0057e5cc45,tracks,HONEYSUCKLE ROSE -inc. into 2:15\nBODY AND SOU...
...,...,...,...,...
733,254,4de93ea2-e3f7-4e30-860a-fc16d1195c0a,tracks,@ America(F) 30AM-6053 @ B&C(GB) Bird-1 # Debu...
734,257,59905df3-6275-40fd-a489-a563f934acf7,musos,Pp @ Clef 11100\n@® Clef EPC-287\n
735,257,077d6007-e346-4197-8786-1463f42acee3,tracks,Note: -q: the session ledger lists eight incom...
736,263,c942dc82-afca-49c5-8850-19ad7bcb4dcf,musos,Aw sf bt Wt Vet OV ONS IME IS A\n\n@ GoJz(i) C...
