In [17]:
import fitz
import re
import os
from PIL import Image

In [None]:
def generate_questions_answer_images(
    test_pdf: str, 
    answers_pdf: str | None = None, 
    dpi: int = 200, 
    margin_top: int = 8
):
    """
    Has test and answers, with each doc being the same length
    Has test and answers, with answers being shorter
    Has test only with answers at bottom
    Has test only with answers filled in
    """

    test_doc = fitz.open(test_pdf)
    current_question_num = 1

    question_tops = []

    for page_idx, page in enumerate(test_doc):
        rd = page.get_text("dict")

        for block_idx, block in enumerate(rd["blocks"]):
            for line in block.get("lines", []):
                x0, y0, x1, y1 = line["bbox"]

                line_text = "".join(span["text"] for span in line["spans"]).strip().lower()
                if any([
                    line_text.startswith(prefix.replace("_", str(current_question_num))) 
                    for prefix in ["_.", "problem _.", "question _.", "q_.", "_ ("]
                ]):
                    question_top = {
                        "page_idx": page_idx,
                        "y": max(y0 - margin_top, 0)
                    }

                    question_tops.append(question_top)
                    current_question_num += 1

    if len(question_tops) == 0:
        raise Exception(f"Error: No questions found")
    elif len(question_tops) == 1:
        raise Exception(f"Error: Only one question found")

    os.makedirs("data/images", exist_ok=True)

    def get_slice(page_idx, y0, y1, doc=test_doc):
        page = doc[page_idx]
        page_width, page_height = page.rect.width, page.rect.height

        if y0 < 0 or y0 > y1 or y1 < 0 or y1 > page.rect.height:
            raise ValueError("y0 or y1 is out of the page.")

        rect = fitz.Rect(0, y0, page_width, y1)
        pix = page.get_pixmap(clip=rect, dpi=dpi)

        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    for question_idx, question_top in enumerate(question_tops):
        question_num = question_idx + 1

        page_start = question_top["page_idx"]
        y_start = question_top["y"]

        if question_idx + 1 < len(question_tops):
            next_question_top = question_tops[question_idx + 1]
            page_end = next_question_top["page_idx"]
            y_end = next_question_top["y"]
        else:
            page_end = test_doc.page_count - 1
            y_end = test_doc[page_end].rect.height

        question_slices = []
        curr_page_idx = page_start

        while True:
            curr_page = test_doc[curr_page_idx]
            page_width, page_height = curr_page.rect.width, curr_page.rect.height

            if curr_page_idx == page_start and curr_page_idx == page_end:
                question_slices.append(get_slice(curr_page_idx, y_start, y_end))
                break
            elif curr_page_idx == page_start:
                question_slices.append(get_slice(curr_page_idx, y_start, page_height))
            elif curr_page_idx == page_end:
                question_slices.append(get_slice(curr_page_idx, 0, y_end))
                break
            else:
                question_slices.append(get_slice(curr_page_idx, 0, page_height))

            curr_page_idx += 1

        question_height = sum(question_slice.height for question_slice in question_slices)
        question_width = max(question_slice.width for question_slice in question_slices)
        question_canvas = Image.new("RGB", (question_width, question_height), "white")

        curr_y = 0

        for question_slice in question_slices:
            question_canvas.paste(question_slice, (0, curr_y))
            curr_y += question_slice.height

        question_canvas.save(f"data/images/Q{question_num}.png")

generate_questions_answers_images("data/example/prefix-is-_-(.pdf")

math 126c
second midterm
spring 2014
your name
your signature
student id #
section
10:30
11:30
(circle one)
ca
cb
problem
total points
score
1
16
2
9
3
8
4
8
5
9
total
50
• this exam is closed book. you may use one 8 1
2 × 11 sheet of notes.
• graphing calculators are not allowed.
• in order to receive credit, you must show your work. explain why your answers are correct.
• if you use a trial and error (or guess and check) method when a calculus method is available,
you will not receive full credit.
• place a box around
your final answer
to each question.
• if you need more room, use the backs of the pages and indicate to the reader that you have
done so.
• raise your hand if you have a question.
math 126c
second midterm
spring 2014
1
(16 points)
evaluate the following double integrals.
(a) (8 points)
zz
r
x
1 + xy da,
r = [0, 1] × [0, 2]
(b) (8 points)
zz
d xy2 da,
d is the triangle with vertices (0, 0), (0, 2) and (1, 2).
math 126c
second midterm
spring 2014
2
(9 points)
let f(x, y) 

Exception: Error: No questions found

In [None]:
def 