# PyMuPDF reader
- Read the tutorial here: https://pymupdf.readthedocs.io/en/latest/tutorial.html

In [2]:
# automatically reload modules, including the ones you wrote yourself
%load_ext autoreload
%autoreload 2

import fitz
import re

import pdf_processor as ppr

print(f"Using PyMuPDF v{fitz.__version__} from {fitz.__file__}")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using PyMuPDF v1.26.3 from /Users/eliza/Desktop/resume-reader/.venv/lib/python3.12/site-packages/fitz/__init__.py


In [3]:
# Location of the PDF that every later cell reads.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists — consider a path relative to the project.
RESUME_PATH = "/Users/eliza/Desktop/resume-reader/data-scientist-resume-example.pdf"

In [5]:
# Open the PDF through the project's pdf_processor helper and print its metadata.
# `doc` is reused by the later cells (text extraction, keyword search, links).
# NOTE(review): presumably load_file returns a PyMuPDF document — confirm in pdf_processor.
doc = ppr.load_file(RESUME_PATH)
ppr.report_metadata(doc)

Loaded document with 1 pages
format: PDF 1.4
title: 
author: 
subject: 
keywords: 
creator: 
producer: PDFShift.io
creationDate: D:20220728203846+00'00'
modDate: D:20220728203846+00'00'
trapped: 
encryption: None


### Extracting Text and Images

In [6]:
# Extract the plain text of every page, in page order.
page_texts = [
    doc.load_page(page_num).get_text()
    for page_num in range(doc.page_count)
]

# Keep the text of the whole document. Assigning inside the loop would leave
# only the last page in `text` (and leave it undefined for an empty document).
text = "\n".join(page_texts)

# `doc` is intentionally left open: the cells below search it, annotate it
# and read its links.

In [7]:
text

'KANDACE LOUDOR\nDATA SCIENTIST\nCONTACT\nkloudor@email.com\n(123) 456-7890\nMount Laurel, NJ\nLinkedIn\nGithub\nEDUCATION\nB.S.\nStatistics\nRutgers University\nSeptember 2011 - April 2015\nNew Brunswick, NJ\nSKILLS\nPython (NumPy, Pandas,\nScikit-learn, Keras, Flask)\nSQL (MySQL, Postgres)\nGit\nTime Series Forecasting\nProductionizing Models\nRecommendation Engines\nCustomer Segmentation\nAWS\nWORK EXPERIENCE\nData Scientist\nGrubhub\nJune 2018 - current / Princeton, NJ\nDeployed a recommendation engine to production to\nconditionally recommend other menu items based on past order\nhistory, increasing average order size by 7%\nImplemented various time series forecasting techniques to\npredict surge in orders, lowering customer wait by 10 minutes\nDesigned a model in a pilot to increase incentives for drivers\nduring peak hours, increasing driver availability by 22%\nLed a team of 3 data scientist to model the ordering process 5\nunique ways, reported results, and made recommendation

### Keyword Searching
- `Page.search_for()` returns the rectangles where a term appears on a page, which can then be highlighted.

In [None]:
keyword = "AWS"


def highlight_keywords(doc: fitz.Document, kw_list: list[str]) -> fitz.Document:
    """Highlight every occurrence of each keyword in ``doc``.

    Each page is searched with ``Page.search_for`` and a highlight annotation
    is added over every hit rectangle. Pages and hits are reported on stdout.

    Args:
        doc: An open PyMuPDF document. It is annotated in place.
        kw_list: Search terms to look for on every page.

    Returns:
        The same ``doc`` object, now carrying the highlight annotations.
    """
    for page_num in range(doc.page_count):
        page = doc.load_page(page_num)
        print(page)

        for kw in kw_list:
            matches = page.search_for(kw)
            if not matches:
                continue

            print(f"Found '{kw}' on page {page_num + 1} at:")
            for rect in matches:
                # print bounding box coordinates
                print(f"    {rect}")
                # add a yellow highlight annotation
                page.add_highlight_annot(rect)

    return doc


# NOTE: this annotates the shared `doc` in place, so re-running the cell
# without reloading the document adds the highlights a second time.
highlight_keywords(doc, [keyword])

# save PDF with highlights
out_path = "highlighted_output.pdf"
doc.save(out_path, garbage=4, deflate=True, clean=True)

print(f"Saved highlighted PDF as: {out_path}")

page 0 of /Users/eliza/Desktop/resume-reader/data-scientist-resume-example.pdf
Found 'AWS' on page 1 at:
    Rect(152.390625, 516.9819946289062, 176.43511962890625, 531.9755859375)
Saved highlighted PDF as: highlighted_output.pdf


### get_links()
- https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_links

In [None]:
# Walk every page and report each link: its target and where it sits on the page.
for page_num in range(doc.page_count):
    page = doc.load_page(page_num)
    # get_links() returns a list of link dicts, so the emptiness check below
    # works. page.links() is a generator, which is always truthy.
    links = page.get_links()

    if not links:
        continue

    print(f"Page {page_num + 1} links:")
    for link in links:
        # Common keys: 'kind', 'from', plus 'uri' for external links or
        # 'page'/'to' for jumps inside the document.
        kind = link["kind"]
        location = link["from"]  # rectangle of the clickable area on this page
        uri = link.get("uri")  # external URL, if any
        # 0-based destination page of an internal jump; 'to' is only the
        # point on that page, not the page number.
        target_page = link.get("page")

        if uri:
            print(f"  → URI: {uri}")
        elif kind == fitz.LINK_GOTO and target_page is not None:
            print(f"  → Internal link to page {target_page + 1}")
        else:
            print("  → Other link kind:", kind)

        print(f"    location on page: {location}")


Page 1 links:
  → URI: https://linkedin.com/
    location on page: Rect(116.25, 190.5, 160.5, 205.5)
  → URI: https://github.com/
    location on page: Rect(125.25, 206.25, 160.5, 221.25)


## Convert to Markdown

In [12]:
def pdf_to_markdown(pdf_path, md_path):
    """Convert the text blocks of a PDF into a simple Markdown file.

    Every text block becomes one Markdown element, chosen by heuristics:
    short ALL-CAPS blocks become ``##`` headings, blocks starting with a
    bullet or dash become list items, and everything else is a paragraph.
    A ``---`` rule is appended after each page.

    Args:
        pdf_path: Path of the PDF to read.
        md_path: Path of the Markdown file to write (UTF-8, overwritten).

    Returns:
        None. The result is written to ``md_path`` and a confirmation line
        is printed.
    """
    # Leading bullet (•), hyphen or en dash followed by whitespace.
    bullet_prefix = re.compile(r"^[•\-–]\s+")
    md_lines = []

    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        for p in range(doc.page_count):
            page = doc.load_page(p)
            # get a nested dict of blocks → lines → spans
            page_dict = page.get_text("dict")

            for block in page_dict["blocks"]:
                # Only text blocks (type 0) carry lines/spans; skip the rest.
                if block["type"] != 0:
                    continue

                # Join all spans in this block into one text string
                text = " ".join(
                    span["text"]
                    for line in block["lines"]
                    for span in line["spans"]
                ).strip()
                if not text:
                    continue

                if text.isupper() and len(text) < 60:
                    # Simple heuristic: ALL CAPS + short → H2
                    md_lines.append(f"## {text.title()}")
                elif bullet_prefix.match(text):
                    # Markdown needs a space after the dash to form a list item.
                    item = bullet_prefix.sub("", text)
                    md_lines.append(f"- {item}")
                else:
                    # Otherwise treat as paragraph
                    md_lines.append(text)

            # Add a page break marker after each page
            md_lines.append("\n---\n")

    # Write out the Markdown
    with open(md_path, "w", encoding="utf-8") as f:
        f.write("\n\n".join(md_lines))

    print(f"Converted {pdf_path} → {md_path}")

# Usage: convert the PDF at RESUME_PATH and write the Markdown to output.md
# (a relative path, so it lands in the current working directory).
pdf_to_markdown(RESUME_PATH, "output.md")


Converted /Users/eliza/Desktop/resume-reader/data-scientist-resume-example.pdf → output.md
