<a href="https://colab.research.google.com/github/elijahManPerson/Flappy-Bird/blob/master/OCR_working_pynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 1) Install system packages and fonts
# Refresh the apt package index quietly (-qq) so the font package below resolves.
!apt-get update -qq
# DejaVu core fonts: register_dejavu_font() looks for DejaVuSans.ttf under
# /usr/share/fonts/truetype before falling back to a download.
!apt-get install -y fonts-dejavu-core -qq
# Python packages imported by the next cell (OpenAI client, PyMuPDF, reportlab,
# pandas + openpyxl for the Excel export, tqdm, Pillow).
# NOTE(review): --upgrade with no version pins means every run may pull different
# releases; consider pinning versions for reproducibility.
!pip -q install --upgrade openai pymupdf reportlab pandas openpyxl tqdm pillow

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package fonts-dejavu-core.
(Reading database ... 126675 files and directories currently installed.)
Preparing to unpack .../fonts-dejavu-core_2.37-2build1_all.deb ...
Unpacking fonts-dejavu-core (2.37-2build1) ...
Setting up fonts-dejavu-core (2.37-2build1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m67.8 MB/s[0m eta [

In [2]:
# 2) Python implementation
import os
import io
import time
import re
import base64
import urllib.request
from pathlib import Path
from typing import List, Tuple

from PIL import Image
import fitz  # PyMuPDF
import pandas as pd
from tqdm import tqdm

from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import A4
from reportlab.lib.utils import ImageReader
from reportlab.pdfbase.pdfmetrics import stringWidth
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

from openai import OpenAI

In [3]:
# 3) Mount Drive
# Colab-only step: exposes Google Drive under /content/drive, which is where
# DRIVE_FOLDER and OUTPUT_DIR (configured in the next cell) point.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even if Drive is already
# mounted, so re-running this cell is safe — confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [4]:
# 4) Configuration - change these paths if you want
# Both paths live under the Drive mount point used by the mount cell above.
# process_pdfs() searches DRIVE_FOLDER recursively for *.pdf files.
DRIVE_FOLDER = "/content/drive/MyDrive/handwriting_pdfs"   # where your PDFs live
OUTPUT_DIR   = "/content/drive/MyDrive/handwriting_outputs"  # where outputs will be written
# Create the output folder (and any missing parents) up front; no error if it already exists.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

In [5]:
# 5) Secure API key input
# getpass prompts without echoing what is typed, so the key is not written into
# this cell's output. Never print the key or paste it into another cell: outputs
# are saved inside the notebook file and travel with it when it is shared or committed.
from getpass import getpass
OPENAI_KEY = getpass("Paste your OpenAI API key (starts with sk-): ").strip()
# Also exported to the environment for any code that reads OPENAI_API_KEY from there.
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
# Module-level client; process_pdfs() passes it to ocr_image_with_openai().
client = OpenAI(api_key=OPENAI_KEY)

Paste your OpenAI API key (starts with sk-): ··········


[REDACTED: an OpenAI API key was captured in this cell output. Revoke and rotate that key immediately, and clear cell outputs before sharing or committing the notebook.]




In [6]:
# 6) Register DejaVu font (robust fallback) - TODO: work out which font the marking platform uses
def register_dejavu_font():
    """Register a TrueType body font with reportlab and return the font name to use.

    Resolution order:
      1. the first local candidate file that exists AND registers successfully,
      2. a copy of DejaVuSans downloaded to /tmp,
      3. the built-in "Helvetica" (limited Unicode coverage).

    Returns:
        "DejaVuSans" when a TrueType font was registered, otherwise "Helvetica".

    Note:
        Whatever file is used (including the FreeSans candidate) is registered
        under the name "DejaVuSans", so callers only ever see one of two names.
    """
    font_name = "DejaVuSans"
    candidates = [
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/DejaVu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    ]
    for p in candidates:
        if not os.path.exists(p):
            continue
        try:
            pdfmetrics.registerFont(TTFont(font_name, p))
            return font_name
        except Exception:
            # The file exists but could not be loaded (unreadable or corrupt):
            # keep going instead of aborting, so the remaining fallbacks still run.
            continue
    # fallback: download DejaVu
    try:
        url = "https://github.com/dejavu-fonts/dejavu-fonts/raw/master/ttf/DejaVuSans.ttf"
        local = "/tmp/DejaVuSans.ttf"
        urllib.request.urlretrieve(url, local)
        pdfmetrics.registerFont(TTFont(font_name, local))
        return font_name
    except Exception:
        # last resort, use builtin Helvetica (limited unicode)
        return "Helvetica"

# Resolve the body font once, when this cell runs. wrap_text() captures BODY_FONT
# as a default argument and build_transcript_pdf() uses it for all drawn text,
# so this cell must run before those definitions.
BODY_FONT = register_dejavu_font()
print("Using font:", BODY_FONT)

Using font: DejaVuSans


In [7]:
# 7) Utilities: render page, compress image, data URL
def render_pdf_page_to_png(page: fitz.Page, dpi: int = 200) -> bytes:
    """Rasterise a single PDF page and return the result as PNG bytes.

    Args:
        page: The PyMuPDF page to render.
        dpi: Target resolution; converted to a zoom factor relative to 72.

    Returns:
        PNG-encoded image data, rendered without an alpha channel.
    """
    scale = dpi / 72.0
    pixmap = page.get_pixmap(matrix=fitz.Matrix(scale, scale), alpha=False)
    return pixmap.tobytes("png")

def prepare_image_for_api(png_bytes: bytes, max_width: int = 1600, jpeg_quality: int = 75) -> str:
    """Convert PNG bytes into a size-limited JPEG data URL for the vision API.

    Args:
        png_bytes: Encoded source image (anything Pillow can open).
        max_width: Images wider than this are downscaled to this width,
            keeping the aspect ratio. Narrower images are left untouched.
        jpeg_quality: JPEG quality setting passed to Pillow.

    Returns:
        A ``data:image/jpeg;base64,...`` URL containing the re-encoded image.
    """
    img = Image.open(io.BytesIO(png_bytes)).convert("RGB")
    if img.width > max_width:
        # Clamp to 1px: int() truncation could otherwise produce a zero height
        # for extremely wide, short images, which Pillow refuses to resize to.
        new_h = max(1, int(max_width * img.height / img.width))
        img = img.resize((max_width, new_h), Image.LANCZOS)
    buff = io.BytesIO()
    img.save(buff, format="JPEG", quality=jpeg_quality, optimize=True)
    b64 = base64.b64encode(buff.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{b64}"

In [8]:
# 8) Robust Responses API call with fallback
# Models tried in order by ocr_image_with_openai(); the first one that yields text wins.
MODEL_CANDIDATES = ["gpt-4o", "gpt-4o-mini"]  # change if you have other vision models available


def _response_field(obj, name):
    """Read ``name`` from a dict-style or attribute-style object; None when absent."""
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)


def extract_text_from_response(resp) -> str:
    """Pull the transcribed text out of a Responses API result.

    Prefers the convenience attribute ``output_text``. If that is missing or
    empty, walks ``resp.output`` and collects plain-string content plus any
    ``output_text`` parts found in list-style content. Items may be dicts or
    objects exposing the same fields as attributes.

    Args:
        resp: The response object returned by ``client.responses.create``.

    Returns:
        The stripped text, or an empty string when the response contains no
        text. (Previously this path referenced an undefined ``last_err`` and
        raised ``NameError``; callers already treat a falsy result as "no text".)
    """
    # prefer .output_text
    text = getattr(resp, "output_text", None)
    if text:
        return text.strip()

    # attempt to parse .output list style
    parts = []
    out = getattr(resp, "output", None)
    if isinstance(out, list):
        for item in out:
            c = (
                _response_field(item, "content")
                or _response_field(item, "text")
                or _response_field(item, "message")
            )
            if isinstance(c, str):
                parts.append(c)
            elif isinstance(c, list):
                for sub in c:
                    if _response_field(sub, "type") == "output_text":
                        # "or ''" guards against an explicit None text field.
                        parts.append(_response_field(sub, "text") or "")
    return "\n".join(parts).strip()


In [9]:
# 8A) This is the prompt wording...
def ocr_image_with_openai(client: OpenAI, png_bytes: bytes, max_output_tokens: int = 4096) -> str:
    """Transcribe one page image with an OpenAI vision model.

    Each model in MODEL_CANDIDATES is tried in order. For every model the
    Responses API is called first; if that call raises, the Chat Completions
    API is tried with the same model, prompt and token limit.

    Args:
        client: Configured OpenAI client.
        png_bytes: Encoded page image; it is re-encoded by prepare_image_for_api().
        max_output_tokens: Output token limit (at least 16 is always requested).

    Returns:
        The transcription, or a string of the form ``"[OCR error: ...]"``
        carrying the most recent exception when no model produced text.
    """
    data_url = prepare_image_for_api(png_bytes)
    # Joined with explicit spaces: the sentences were previously adjacent string
    # literals with no separator, so the model saw "commentary.Look for ...".
    prompt = " ".join((
        "You are an OCR engine. Transcribe all readable text exactly.",
        "Preserve line breaks and paragraphs. Do not add summaries, corrections, or commentary.",
        "Look for small or faint writing.",
        "Ignore pictures, emojis and symbols that can't be typed on a standard keyboard.",
    ))
    token_limit = max(16, int(max_output_tokens))
    last_err = None
    for model_name in MODEL_CANDIDATES:
        try:
            resp = client.responses.create(
                model=model_name,
                input=[{
                    "role": "user",
                    "content": [
                        {"type": "input_text", "text": prompt},
                        {"type": "input_image", "image_url": data_url},
                    ],
                }],
                temperature=0,
                max_output_tokens=token_limit,
            )
            text = extract_text_from_response(resp)
            if text:
                return text
        except Exception as e:
            last_err = e
            # attempt chat completions fallback if available
            try:
                completion = client.chat.completions.create(
                    model=model_name,
                    messages=[
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": prompt},
                                {"type": "image_url", "image_url": {"url": data_url}},
                            ],
                        }
                    ],
                    temperature=0,
                    # Honour the caller's limit instead of a hard-coded 2048.
                    max_tokens=token_limit,
                )
                content = completion.choices[0].message.content
                if isinstance(content, str):
                    return content.strip()
            except Exception as fallback_err:
                # Keep the most recent failure so the returned marker is informative
                # rather than silently discarding the fallback's error.
                last_err = fallback_err
        # Brief pause before trying the next model.
        time.sleep(1)
    return f"[OCR error: {repr(last_err)}]"


In [10]:
# 9) Clean transcript lines, with extra aggressiveness for pages 2 and 3 - this is the text filter for removing standard booklet print
# Pre-printed booklet phrases; a line is dropped when its upper-cased form matches exactly.
_BOOKLET_PHRASES = frozenset({
    "WRITING",
    "DO NOT WRITE OUTSIDE THE BORDER",
    "END OF TEST",
    "DO NOT WRITE OUTSIDE THE BOX",
    "DO NOT WRITE OUTSIDE THE ANSWER AREA",
    "DO NOT WRITE OUTSIDE THE MARGIN",
    "START OF TEST",
})
# Lines made only of punctuation / underscores.
_PUNCTUATION_ONLY = re.compile(r'[\W_]+')
# Up to 12 characters of digits, dashes, slashes and whitespace: likely booklet codes.
_CODE_LIKE = re.compile(r'[\d\-\s\/]{1,12}')
# Number of lines at the top and bottom of a page treated as header/footer.
_EDGE_LINES = 6


def clean_transcript_text(text: str, page_index: int = None) -> str:
    """Strip booklet boilerplate from one page of OCR output.

    Every line is whitespace-stripped; blank lines are dropped. A line is also
    dropped when it is:
      * one of the known pre-printed phrases (case-insensitive exact match),
      * made up only of punctuation/underscores,
      * a short digits/dashes/slashes code that sits within the first or last
        six lines of the page, or anywhere on the page when ``page_index`` is
        2 or 3 (1-based), where removal is more aggressive.

    Args:
        text: Raw transcript of one page; falsy input yields "".
        page_index: 1-based page number, or None when unknown.

    Returns:
        The remaining lines joined with newlines.
    """
    if not text:
        return ""

    # We treat header/footer removal more aggressively for pages 2 and 3 (1-based)
    aggressive = page_index in (2, 3)

    lines = text.splitlines()
    total = len(lines)
    cleaned_lines = []
    for position, raw_line in enumerate(lines, start=1):
        line = raw_line.strip()
        if not line:
            continue
        if line.upper() in _BOOKLET_PHRASES:
            continue
        if _PUNCTUATION_ONLY.fullmatch(line):
            continue
        if _CODE_LIKE.fullmatch(line):
            near_edge = position <= _EDGE_LINES or (total - position) < _EDGE_LINES
            # This also covers lone 1-2 digit page markers on aggressive pages,
            # so no separate check for them is needed.
            if aggressive or near_edge:
                continue
        cleaned_lines.append(line)

    return "\n".join(cleaned_lines).strip()

In [11]:
# 10) Combine images for pages 1..3 into one stacked image - this is an important step.
def compose_combined_image(image_bytes_list: List[bytes], page_width: float, margin: float = 36, per_image_max_height: int = 2000) -> bytes:
    """Stack several page images vertically into one PNG.

    Each image is scaled to the target width ``int(page_width - 2 * margin)``.
    If that would make it taller than ``per_image_max_height`` it is scaled
    down further, keeping its aspect ratio, and centred horizontally on a
    white background. (Previously only the height was reduced in that case,
    which squashed the image.)

    Args:
        image_bytes_list: Encoded images, top to bottom.
        page_width: Page width used to derive the target pixel width.
        margin: Margin subtracted on both sides of ``page_width``.
        per_image_max_height: Height cap, in pixels, for each stacked image.

    Returns:
        PNG bytes of the stacked image.

    NOTE(review): build_transcript_pdf() passes the A4 width in points here, so
    the stacked image is only about 520 px wide — confirm that resolution is intended.
    """
    target_w = int(page_width - 2 * margin)
    resized = []
    for raw in image_bytes_list:
        img = Image.open(io.BytesIO(raw)).convert("RGB")
        w, h = img.size
        ratio = target_w / w if w > 0 else 1.0
        new_w = target_w
        new_h = max(1, int(h * ratio))
        if new_h > per_image_max_height:
            # Shrink both dimensions by the same factor so the aspect ratio is kept.
            scale = per_image_max_height / new_h
            ratio *= scale
            new_h = max(1, int(new_h * scale))
            new_w = max(1, min(target_w, int(round(w * ratio))))
        resized.append(img.resize((new_w, new_h), Image.LANCZOS))

    total_h = sum(im.size[1] for im in resized)
    combined = Image.new("RGB", (target_w, total_h), color=(255, 255, 255))
    y = 0
    for im in resized:
        # Full-width images land at x=0; height-capped ones are centred.
        combined.paste(im, ((target_w - im.size[0]) // 2, y))
        y += im.size[1]
    out = io.BytesIO()
    combined.save(out, format="PNG", optimize=True)
    return out.getvalue()

In [12]:
# 11) Word-wrap helper for PDF - targeting in
def wrap_text(text: str, font: str = BODY_FONT, size: int = 11, width: float = A4[0] - 72) -> List[str]:
    """Greedily break ``text`` into lines no wider than ``width``.

    Words are split on whitespace and packed onto a line while the measured
    width (reportlab ``stringWidth`` for ``font`` at ``size``) still fits.
    A single word that is wider than ``width`` is broken character by character.

    Returns:
        The wrapped lines; an empty list when ``text`` contains no words.
    """
    def fits(fragment: str) -> bool:
        return stringWidth(fragment, font, size) <= width

    wrapped = []
    pending = ""
    for token in text.split():
        joined = f"{pending} {token}".strip() if pending else token
        if fits(joined):
            pending = joined
            continue

        # The token does not fit on the current line: flush what we have.
        if pending:
            wrapped.append(pending)

        if fits(token):
            pending = token
            continue

        # Over-long token: emit as many characters per line as will fit.
        piece = ""
        for char in token:
            if fits(piece + char):
                piece += char
            else:
                wrapped.append(piece)
                piece = char
        pending = piece

    if pending:
        wrapped.append(pending)
    return wrapped

In [13]:
# 12) Build transcript PDF with first 3 pages combined - this is the details
def build_transcript_pdf(original_images: List[bytes], transcripts: List[str], output_path: str, title: str):
    """Write a PDF containing the page images followed by the cleaned transcript.

    Layout:
      * one page with the first (up to) three originals stacked into one image,
      * one page per remaining original image,
      * a "Full transcript" section, where the first (up to) three transcripts
        form a single block and each later page gets its own block.
    When ``original_images`` is empty a blank page is emitted in place of the images.

    Args:
        original_images: Encoded page images in page order.
        transcripts: Raw OCR text per page; cleaned with clean_transcript_text().
        output_path: Destination path of the PDF.
        title: Document title, also used in the per-page captions.
    """
    page_width, page_height = A4
    margin = 36
    line_height = 14
    c = canvas.Canvas(output_path, pagesize=A4)
    c.setTitle(title)

    # combine pages 1..3 if present
    if original_images:
        n_combine = min(3, len(original_images))
        combined_img_bytes = compose_combined_image(original_images[:n_combine], page_width, margin=margin)
        _draw_image_page(
            c, combined_img_bytes, f"{title} | original pages 1-{n_combine}",
            page_width, page_height, margin,
        )
        for idx in range(n_combine, len(original_images)):
            _draw_image_page(
                c, original_images[idx], f"{title} | original page {idx+1}",
                page_width, page_height, margin,
            )
    else:
        c.showPage()

    # transcript section
    c.setFont(BODY_FONT, 14)
    c.drawString(margin, page_height - margin, "Full transcript")
    y = page_height - margin - 24
    c.setFont(BODY_FONT, 11)

    # prepare combined transcripts: pages 1..3 combined as one block
    combined_blocks = []
    n_combine = min(3, len(transcripts))
    if n_combine > 0:
        t = "\n".join(clean_transcript_text(transcripts[i], page_index=i+1) for i in range(n_combine))
        combined_blocks.append((f"[Pages 1-{n_combine}]", t))
    for i in range(n_combine, len(transcripts)):
        header = f"[Page {i+1}]"
        txt = clean_transcript_text(transcripts[i], page_index=i+1)
        combined_blocks.append((header, txt))

    for header, block in combined_blocks:
        # Header, blank line, the block's lines, then a blank line as separator.
        for chunk in [header, ""] + (block.splitlines() if block else []) + [""]:
            lines = [""] if chunk == "" else wrap_text(chunk, BODY_FONT, 11)
            for line in lines:
                if y < margin:
                    # Out of vertical space: start a continuation page.
                    c.showPage()
                    y = page_height - margin - 24
                    c.setFont(BODY_FONT, 14)
                    c.drawString(margin, page_height - margin, "Full transcript (continued)")
                    c.setFont(BODY_FONT, 11)
                c.drawString(margin, y, line)
                y -= line_height

    c.showPage()
    c.save()


def _draw_image_page(c, img_bytes: bytes, caption: str, page_width: float, page_height: float, margin: float) -> None:
    """Draw a caption plus one image, scaled to fit inside the margins and centred, then end the page."""
    iw, ih = Image.open(io.BytesIO(img_bytes)).size
    ratio = min((page_width - 2 * margin) / iw, (page_height - 2 * margin) / ih)
    dw, dh = iw * ratio, ih * ratio
    x_pos = (page_width - dw) / 2
    y_pos = (page_height - dh) / 2
    c.setFont(BODY_FONT, 10)
    c.drawString(margin, page_height - margin + 6, caption)
    c.drawImage(ImageReader(io.BytesIO(img_bytes)), x_pos, y_pos, dw, dh, preserveAspectRatio=True)
    c.showPage()


PDFs: 100%|██████████| 6/6 [04:15<00:00, 42.64s/it]


Done. Results saved to /content/drive/MyDrive/handwriting_outputs


In [None]:

# 13) Main processing function - making it happen
def process_pdfs(root_dir: str, output_dir: str, dpi: int = 180, pause: float = 0.45):
    """OCR every PDF under ``root_dir`` and write per-file PDFs plus one Excel sheet.

    For each PDF found recursively under ``root_dir``:
      * every page is rendered to PNG and sent to ocr_image_with_openai()
        using the module-level ``client``,
      * ``<stem>__with_transcript.pdf`` is written to ``output_dir``,
      * transcript rows are collected (pages 1-3 merged into one row).
    Finally all rows are written to ``handwriting_ocr_transcripts.xlsx``.

    Args:
        root_dir: Folder searched recursively for ``*.pdf``.
        output_dir: Folder for the generated PDFs and the Excel file (created if missing).
        dpi: Render resolution for each page.
        pause: Seconds to sleep after each page's OCR call.

    NOTE(review): output names use only the file stem, so PDFs with the same
    name in different sub-folders overwrite each other. Also, if ``output_dir``
    lies inside ``root_dir``, generated PDFs are picked up on the next run.
    """
    root = Path(root_dir)
    out = Path(output_dir)
    out.mkdir(parents=True, exist_ok=True)
    pdf_paths = sorted(root.rglob("*.pdf"))
    if not pdf_paths:
        print(f"No PDFs found under {root}")
        return

    rows = []
    for pdf_path in tqdm(pdf_paths, desc="PDFs"):
        try:
            doc = fitz.open(pdf_path)
        except Exception as e:
            print("Failed to open", pdf_path, ":", e)
            continue

        # try/finally guarantees the document is closed even when rendering,
        # OCR or PDF generation raises part-way through.
        try:
            original_images = []
            transcripts = []
            for page_index in range(len(doc)):
                page = doc.load_page(page_index)
                png_bytes = render_pdf_page_to_png(page, dpi=dpi)
                original_images.append(png_bytes)

                try:
                    transcript = ocr_image_with_openai(client, png_bytes, max_output_tokens=4096)
                except Exception as e:
                    # Keep going: record the failure in place of the page's text.
                    transcript = f"[OCR error: {repr(e)}]"
                transcripts.append(transcript)
                time.sleep(pause)

            # Save combined PDF (with pages 1..3 merged)
            out_pdf = out / f"{pdf_path.stem}__with_transcript.pdf"
            build_transcript_pdf(original_images, transcripts, str(out_pdf), title=pdf_path.name)

            # Prepare Excel rows: combine pages 1..3 into one row labelled "1-3"
            n_combine = min(3, len(transcripts))
            if n_combine > 0:
                combined_text = "\n".join(
                    clean_transcript_text(transcripts[i], page_index=i+1) for i in range(n_combine)
                )
                rows.append({
                    "file_name": pdf_path.name,
                    "file_path": str(pdf_path),
                    "page_number": f"1-{n_combine}",
                    "transcript": combined_text,
                })
            for i in range(n_combine, len(transcripts)):
                cleaned = clean_transcript_text(transcripts[i], page_index=i+1)
                rows.append({
                    "file_name": pdf_path.name,
                    "file_path": str(pdf_path),
                    "page_number": str(i+1),
                    "transcript": cleaned,
                })
        finally:
            doc.close()

    # write Excel
    if rows:
        df = pd.DataFrame(rows)
        excel_path = out / "handwriting_ocr_transcripts.xlsx"
        with pd.ExcelWriter(excel_path, engine="openpyxl") as writer:
            df.to_excel(writer, index=False, sheet_name="transcripts")
        print("Done. Results saved to", out)
    else:
        print("No transcripts produced.")


In [None]:

# 14) Run
# Processes every PDF under DRIVE_FOLDER and writes results to OUTPUT_DIR.
# process_pdfs() makes one OCR API call per page and sleeps `pause` seconds
# after each, so runtime grows with the total page count.
process_pdfs(DRIVE_FOLDER, OUTPUT_DIR, dpi=180, pause=0.45)