This notebook attempts to parse resumes by splitting each page into blocks and applying OCR to them.
The first step is to define the target data structure.

In [None]:
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Resume:
    """Target container for the information extracted from a single resume.

    Every field can be omitted at construction time: the string fields
    fall back to ``None`` and the list fields fall back to a new, empty
    list that is created separately for each instance.
    """

    # Scalar text fields (None until a value has been extracted).
    name: Optional[str] = None
    email: Optional[str] = None
    phone: Optional[str] = None

    # Collection fields; element types are not constrained here.
    education: List = field(default_factory=list)
    experience: List = field(default_factory=list)
    skills: List = field(default_factory=list)

    introduction: Optional[str] = None

    technologies: List = field(default_factory=list)
    hyperlinks: List = field(default_factory=list)

   

Enumerate the available resumes, load one of them, and render its first page as an image

In [None]:
import os
import io
import PyPDF2
import matplotlib.pyplot as plt
from pdf2image import convert_from_bytes

# Directory containing the Poppler binaries used by pdf2image.
# Set the POPPLER_PATH environment variable to override the default below.
# If the directory does not exist on this machine, fall back to None so that
# pdf2image looks for Poppler on the system PATH instead.
POPPLER_PATH = os.environ.get(
    "POPPLER_PATH",
    r"C:\ProgramData\chocolatey\lib\poppler-24.08.0\Library\bin",
)
if not os.path.isdir(POPPLER_PATH):
    POPPLER_PATH = None

# Index (into the sorted list of PDFs) of the resume to work on.
CV_NUMBER = 9
# Rendering resolution for the page image handed to the segmentation step.
PAGE_DPI = 450

# 1) locate the PDF selected by CV_NUMBER
resumes_dir = os.path.join(os.getcwd(), "resumes")
files = os.listdir(resumes_dir)
# os.listdir returns entries in arbitrary order; sort so that CV_NUMBER
# always designates the same file across runs and machines.
pdf_files = sorted(f for f in files if f.lower().endswith(".pdf"))

if not pdf_files:
    raise RuntimeError("No PDF found in resumes/")

# Fail with an explicit message instead of a bare "list index out of range".
# Negative indices stay allowed, as with plain list indexing.
if not -len(pdf_files) <= CV_NUMBER < len(pdf_files):
    raise IndexError(
        f"CV_NUMBER={CV_NUMBER} is out of range: "
        f"resumes/ contains {len(pdf_files)} PDF file(s)"
    )

# NOTE: the name `first_pdf` is kept for compatibility with other cells;
# it holds the PDF selected by CV_NUMBER, not necessarily the first one.
first_pdf = pdf_files[CV_NUMBER]
file_path = os.path.join(resumes_dir, first_pdf)
print(f"Loading {first_pdf!r}")

# 2) read its bytes and load into PyPDF2 (used here only for the page count)
with open(file_path, "rb") as f:
    pdf_bytes = f.read()

pdf_reader = PyPDF2.PdfReader(io.BytesIO(pdf_bytes))
print(f"→ {len(pdf_reader.pages)} pages")

# 3) convert only page 1 to a PIL.Image
images = convert_from_bytes(
    pdf_bytes,
    dpi=PAGE_DPI,
    first_page=1,
    last_page=1,
    poppler_path=POPPLER_PATH
)
img = images[0]  # PIL.Image

# 4) display inline in Jupyter
plt.figure(figsize=(8, 11))
plt.imshow(img)
plt.axis("off")
plt.show()

Work on a single resume page: binarize it and split it into column and row sections

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw

# Assumes `img` is the PIL page image produced by the previous cell.


def _gap_midpoints(sums, threshold, min_gap):
    """Return the midpoint of every sufficiently long low-ink run.

    Parameters
    ----------
    sums : sequence of numbers
        Projection profile: ink-pixel count per column (or per row).
    threshold : number
        A position counts as "empty" when its sum is strictly below this.
    min_gap : int
        Minimum length (in pixels) of an empty run for it to yield a split.

    Returns
    -------
    list of int
        ``(start + end) // 2`` for each qualifying run, in increasing order,
        where ``end`` is the first index after the run.
    """
    midpoints = []
    n = len(sums)
    pos = 0
    while pos < n:
        if sums[pos] < threshold:
            start = pos
            # Advance to the first position that is no longer empty.
            while pos < n and sums[pos] < threshold:
                pos += 1
            if pos - start >= min_gap:
                midpoints.append((start + pos) // 2)
        else:
            pos += 1
    return midpoints


# --- 0) PREPROCESSING: grayscale → binarize so text is black, background white ---
gray_arr = np.array(img.convert("L"))
thresh = 128
# ink mask: True where the pixel is dark (text)
ink = gray_arr < thresh
# display image: 0 = black text, 255 = white background
bin_arr = np.where(ink, 0, 255).astype("uint8")
# NOTE: `img` is rebound to the binarized page (RGB so red lines can be drawn).
# Later cells crop from this binarized image. Re-running this cell is safe:
# a pure 0/255 image thresholds back to the same mask.
# A 2-D uint8 array is interpreted as mode "L", so no explicit mode is passed.
img = Image.fromarray(bin_arr).convert("RGB")

w, h = img.size

# 1) detect vertical splits: wide runs of columns with almost no ink
col_sums = ink.sum(axis=0)
min_v_gap = 50
v_thresh = h * 0.01  # a column is "empty" if under 1% of its pixels are ink
v_splits = _gap_midpoints(col_sums, v_thresh, min_v_gap)

# 2) define column dividers (page edges plus the detected splits)
x_divs = [0] + v_splits + [w]

# 3) find horizontal splits independently inside each column
min_h_gap = 50
h_splits_by_col = []
for x0, x1 in zip(x_divs[:-1], x_divs[1:]):
    row_sums = ink[:, x0:x1].sum(axis=1)
    h_thresh = (x1 - x0) * 0.01  # same 1% rule, relative to the column width
    h_splits_by_col.append(_gap_midpoints(row_sums, h_thresh, min_h_gap))

# 4) draw the splits and collect rect info
out = img.copy()
draw = ImageDraw.Draw(out)
for x in v_splits:
    draw.line([(x, 0), (x, h)], fill="red", width=2)

# Each entry: (column index, row index, left, top, width, height)
sections = []
for ci, (x0, x1) in enumerate(zip(x_divs[:-1], x_divs[1:])):
    y_divs = [0] + h_splits_by_col[ci] + [h]
    for ri, (y0, y1) in enumerate(zip(y_divs[:-1], y_divs[1:])):
        # Mark the bottom edge of every section within its own column.
        draw.line([(x0, y1), (x1, y1)], fill="red", width=4)
        width = x1 - x0
        height = y1 - y0
        sections.append((ci, ri, x0, y0, width, height))

# 5) print each section's top-left corner and size
print(f"Found {len(sections)} sections:")
for ci, ri, x0, y0, w0, h0 in sections:
    print(f"  col {ci}, row {ri}: start=({x0},{y0}), "
          f"width={w0}px, height={h0}px")

# 6) display the annotated page
plt.figure(figsize=(8, 11))
plt.imshow(out)
plt.axis("off")
plt.show()

For each section, we can apply an OCR algorithm

In [None]:
from PIL import ImageDraw
import matplotlib.pyplot as plt

# 1) List the sections so that one can be picked by index
print("Available sections:")
for idx, (ci, ri, x0, y0, w0, h0) in enumerate(sections):
    print(f"{idx}: col={ci}, row={ri}, start=({x0},{y0}), "
          f"size=({w0}×{h0})")

# 2) Choose which one to display
section_idx = 7
# The number of sections depends on the page layout, so validate the index
# and report the valid range instead of a bare "list index out of range".
# Negative indices stay allowed, as with plain list indexing.
if not -len(sections) <= section_idx < len(sections):
    raise IndexError(
        f"section_idx={section_idx} is out of range: "
        f"only {len(sections)} section(s) were found"
    )
ci, ri, x0, y0, w0, h0 = sections[section_idx]

# 3) Crop the section (box is left, top, right, bottom)
sec_img = img.crop((x0, y0, x0 + w0, y0 + h0))

# 4) Display at full (native) resolution:
#    figure size in inches = pixel size / dpi, so one image pixel maps to
#    roughly one figure pixel.
dpi = plt.rcParams['figure.dpi']
figsize = (w0 / dpi, h0 / dpi)
plt.figure(figsize=figsize, dpi=dpi)
plt.imshow(sec_img, interpolation='nearest')
plt.axis('off')
plt.title(f"Section {section_idx} (col={ci},row={ri}), {w0}×{h0}px")

# 5) (Optional) Show full page with that region highlighted
out2 = img.copy()
draw2 = ImageDraw.Draw(out2)
draw2.rectangle(
    [(x0, y0), (x0 + w0, y0 + h0)],
    outline="blue",
    width=4
)
plt.figure(figsize=(8, 11))
plt.imshow(out2)
plt.axis("off")
plt.title("Full page with selected section highlighted")
# Render both figures and suppress the Text(...) repr of the last call.
plt.show()