# PyMuPDF reader
- Read the tutorial here: https://pymupdf.readthedocs.io/en/latest/tutorial.html

In [None]:
import fitz
import re

# Report the PyMuPDF version string and the file the module was loaded from.
print(f"Using PyMuPDF v{fitz.__version__} from {fitz.__file__}")


: 

In [5]:
# Location of the sample résumé; `doc` is shared by the cells that follow.
pdf_path = "/users/ChandlerShortlidge/Downloads/data-scientist-resume-example.pdf"
doc = fitz.open(pdf_path)

In [6]:
# How many pages does the opened document contain?
page_total = doc.page_count
print(f"Pages: {page_total}")

Pages: 1


### Accessing Metadata

In [7]:
# Show every metadata field the document reports, one "key: value" per line.
meta = doc.metadata

for key in meta:
    value = meta[key]
    print(f"{key}: {value}")

format: PDF 1.4
title: 
author: 
subject: 
keywords: 
creator: 
producer: PDFShift.io
creationDate: D:20220728203846+00'00'
modDate: D:20220728203846+00'00'
trapped: 
encryption: None


### Extracting Text and Images

In [8]:
# Preview the plain text of every page.
# Iterating the document yields its pages in order.
for page_num, page in enumerate(doc):
    text = page.get_text()
    preview = text[:200]  # first 200 characters only
    print(f"--- Page {page_num + 1} ---")
    print(preview, "...")

# `doc` stays open because the cells below keep using it.
# doc.close()

--- Page 1 ---
KANDACE LOUDOR
DATA SCIENTIST
CONTACT
kloudor@email.com
(123) 456-7890
Mount Laurel, NJ
LinkedIn
Github
EDUCATION
B.S.
Statistics
Rutgers University
September 2011 - April 2015
New Brunswick, NJ
SKILL ...


### Keyword Searching
- To locate where a term appears on the page (useful for highlighting)

In [9]:
# Term to look for; change this one value to search for something else.
keyword = "experience"

# Loop over pages, report where the keyword occurs and highlight each hit.
# NOTE: the highlights are added to the in-memory `doc`, so re-running this
# cell on the same `doc` adds the annotations again.
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)

    # Search for the configured keyword (not a hard-coded literal), so the
    # message below always describes what was actually searched.
    matches = page.search_for(keyword)
    if matches:
        print(f"Found '{keyword}' on page {page_num + 1} at:")
        for r in matches:
            # print bounding box coordinates
            print(f"    {r}")
            # add a highlight annotation over the matched rectangle
            page.add_highlight_annot(r)


# save PDF with highlights
out_path = "highlighted_output.pdf"
doc.save(out_path, garbage=4, deflate=True, clean=True)


print(f"Saved highlighted PDF as: {out_path}")

Found 'experience' on page 1 at:
    Rect(265.640625, 117.10921478271484, 363.8422546386719, 137.010009765625)
Saved highlighted PDF as: highlighted_output.pdf


### get_links()
- https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_links

In [10]:
# Loop through pages and inspect links
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    # get_links() returns a list of link dicts. page.links() is a generator,
    # which is always truthy, so the emptiness check below would never skip
    # a page that has no links.
    links = page.get_links()

    if not links:
        continue

    print(f"Page {page_num + 1} links:")
    for link in links:
        # Common keys: 'kind', 'from', plus 'uri' (external) or 'page'/'to' (internal)
        kind = link["kind"]
        location = link["from"]  # where the link is located on this page
        uri = link.get("uri")  # external URL, if any
        # 0-based destination page of an internal jump. The 'to' entry is a
        # point on that page, not a page number, so it must not be indexed
        # to obtain the page.
        target_page = link.get("page")

        if uri:
            print(f"  → URI: {uri}")
        elif kind == fitz.LINK_GOTO and target_page is not None:
            print(f"  → Internal link to page {target_page + 1}")
        else:
            print("  → Other link kind:", kind)

        print(f"    location on page: {location}")


Page 1 links:
  → URI: https://linkedin.com/
    location on page: Rect(116.25, 190.5, 160.5, 205.5)
  → URI: https://github.com/
    location on page: Rect(125.25, 206.25, 160.5, 221.25)


## Convert to Markdown

In [11]:
# Leading bullet marker: "•" (U+2022), en dash (U+2013) or a plain hyphen,
# followed by at least one whitespace character.
_BULLET_PREFIX = re.compile(r"^[\u2022\u2013\-]\s+")


def _block_to_markdown(text):
    """Map the stripped, non-empty text of one block to a Markdown fragment."""
    # Simple heuristic: ALL CAPS + short → H2
    if text.isupper() and len(text) < 60:
        return f"## {text.title()}"

    # Bullet item: Markdown needs a space between the marker and the text.
    if _BULLET_PREFIX.match(text):
        return "- " + _BULLET_PREFIX.sub("", text, count=1)

    # Otherwise treat as a paragraph
    return text


def pdf_to_markdown(pdf_path, md_path):
    """Convert the text blocks of a PDF into a simple Markdown file.

    Each text block (block type 0) of every page is flattened into one string
    by joining its span texts with spaces. Short all-caps blocks become
    ``##`` headings, blocks starting with a bullet marker become ``- `` list
    items, and everything else is written as a paragraph. A ``---`` marker is
    appended after each page.

    Args:
        pdf_path: Path of the PDF to read.
        md_path: Path of the Markdown file to write (UTF-8, overwritten).

    Returns:
        None. The result is written to ``md_path`` and a confirmation line
        is printed.
    """
    md_lines = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for p in range(doc.page_count):
            page = doc.load_page(p)
            # get a nested dict of blocks → lines → spans
            page_dict = page.get_text("dict")

            for block in page_dict["blocks"]:
                # Only text blocks (ignore images, drawings)
                if block["type"] != 0:
                    continue

                # Join all spans in this block into one text string
                text = " ".join(
                    span["text"]
                    for line in block["lines"]
                    for span in line["spans"]
                ).strip()
                if not text:
                    continue

                md_lines.append(_block_to_markdown(text))

            # Add a page break marker after every page
            md_lines.append("\n---\n")

    # Write out the Markdown
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(md_lines))

    print(f"Converted {pdf_path} → {md_path}")

# Usage: convert the sample résumé and write the Markdown next to the notebook.
source_pdf = "/Users/chandlershortlidge/Downloads/data-scientist-resume-example.pdf"
markdown_out = "KANDACE LOUDOR.md"
pdf_to_markdown(source_pdf, markdown_out)


Converted /Users/chandlershortlidge/Downloads/data-scientist-resume-example.pdf → KANDACE LOUDOR.md
