In [1]:
import pypdf
import re
import json

def read_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Pages are joined with a newline so that the last line of one page and
    the first line of the next stay on separate lines. Without a separator
    a heading at the top of a page could be fused onto the preceding line
    and no longer be recognisable by line-based parsing.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text of all pages, in page order, separated by
        newlines. An empty string if the document has no pages.
    """
    with open(pdf_path, 'rb') as file:
        reader = pypdf.PdfReader(file)
        # Extraction stays inside the ``with`` block, as before, so the
        # file handle is still open while pages are being read.
        # ``or ""`` is defensive: a page yielding no text contributes an
        # empty string rather than breaking the join.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_texts)

def parse_poems_to_json(raw_text):
    """Split raw text into titled poems.

    A line of the form ``<number>. <title>`` (for example
    ``6. Telephone Booth Number 580``) starts a new poem. Every following
    non-blank line, with surrounding whitespace stripped, belongs to that
    poem until the next title line. Text before the first title line is
    discarded, and blank lines are dropped.

    Args:
        raw_text: The full text to split.

    Returns:
        A dict ``{"poems": [...]}`` where each entry is a dict with a
        ``"title"`` string and a ``"content"`` string (the poem's lines
        joined with newlines), in order of appearance.
    """
    title_pattern = re.compile(r"^(\d+)\.\s+(.*)")

    sections = []      # (title, list_of_lines) pairs in document order
    open_body = None   # line list of the poem being read; None before the first title

    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        heading = title_pattern.match(stripped)

        if heading is not None:
            # A new title closes the previous poem implicitly: its line
            # list is already stored in ``sections``.
            open_body = []
            sections.append((heading.group(2), open_body))
            continue

        if open_body is not None and stripped:
            open_body.append(stripped)

    return {
        "poems": [
            {"title": title, "content": "\n".join(body)}
            for title, body in sections
        ]
    }

# Extract the text of the collection and split it into individual poems.
pdf_path = "Pedro Pietri Loose joints.pdf"
pdf_text = read_pdf(pdf_path)
poems_json = parse_poems_to_json(pdf_text)

# Serialize to a UTF-8 JSON file; ensure_ascii=False writes non-ASCII
# characters as-is instead of \uXXXX escapes.
with open("pedro_pietri_poems.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(poems_json, ensure_ascii=False, indent=2))

print("JSON saved: pedro_pietri_poems.json")



JSON saved: pedro_pietri_poems.json
