In [None]:


import os, json, glob
from PIL import Image

# ---- 0) Mount Drive safely ----
from google.colab import drive

MOUNT_POINT = "/content/gdrive"   # use a clean mount point
os.makedirs(MOUNT_POINT, exist_ok=True)

# A previous run of this cell may already have mounted Drive here. In that
# case the folder is non-empty *because* it is the mount, so keep using it:
# later cells hard-code /content/gdrive/MyDrive/... and must stay valid on
# re-runs.
_already_mounted = os.path.ismount(MOUNT_POINT) or os.path.isdir(
    os.path.join(MOUNT_POINT, "MyDrive")
)

# Only fall back to another folder when stray (non-Drive) files occupy the
# mount point.
if not _already_mounted and os.listdir(MOUNT_POINT):
    MOUNT_POINT = "/content/gdrive2"
    os.makedirs(MOUNT_POINT, exist_ok=True)

# NOTE(review): drive.mount is expected to simply report "already mounted"
# when Drive is mounted at this path - confirm against the Colab docs.
drive.mount(MOUNT_POINT)

MYDRIVE = f"{MOUNT_POINT}/MyDrive"
print("✅ Mounted. MYDRIVE =", MYDRIVE)


# ---- 1) Auto-find dataset folders ----
def find_one(pattern, label):
    """Resolve a recursive glob pattern to a single best match.

    Args:
        pattern: Glob pattern; ``**`` is expanded recursively.
        label: Human-readable name of what is being searched for. Used only
            in the error message.

    Returns:
        A ``(best, hits)`` tuple. ``hits`` holds every match ordered from the
        shortest to the longest path, and ``best`` is ``hits[0]``.

    Raises:
        FileNotFoundError: If the pattern matches nothing.
    """
    hits = glob.glob(pattern, recursive=True)
    if not hits:
        raise FileNotFoundError(
            f"Could not find {label}.\nPattern tried: {pattern}\n"
            f"Tip: check folder names/case in Drive and ensure SinFUND exists under MyDrive."
        )
    # Shortest path wins. Equal-length paths are ordered alphabetically so the
    # choice does not depend on the order the filesystem happens to list them.
    hits = sorted(hits, key=lambda p: (len(p), p))
    return hits[0], hits

train_images_pattern = f"{MYDRIVE}/**/SinFUND/**/training_data/images"
test_images_pattern  = f"{MYDRIVE}/**/SinFUND/**/testing_data/images"

TRAIN_IMAGES, _ = find_one(train_images_pattern, "training_data/images")
TEST_IMAGES,  _ = find_one(test_images_pattern,  "testing_data/images")

# Annotations normally live in a sibling folder of "images". Swap only the
# final path component: str.replace("/images", ...) would also rewrite any
# earlier "/images..." segment that happens to appear in the Drive path.
TRAIN_ANNS = os.path.join(os.path.dirname(TRAIN_IMAGES), "annotations")
TEST_ANNS  = os.path.join(os.path.dirname(TEST_IMAGES), "annotations")

# If annotations path isn't standard, locate them
if not os.path.exists(TRAIN_ANNS):
    TRAIN_ANNS, _ = find_one(f"{MYDRIVE}/**/SinFUND/**/training_data/annotations", "training_data/annotations")
if not os.path.exists(TEST_ANNS):
    TEST_ANNS, _ = find_one(f"{MYDRIVE}/**/SinFUND/**/testing_data/annotations", "testing_data/annotations")

print("\n✅ Using paths:")
print("TRAIN_IMAGES:", TRAIN_IMAGES)
print("TRAIN_ANNS:  ", TRAIN_ANNS)
print("TEST_IMAGES: ", TEST_IMAGES)
print("TEST_ANNS:   ", TEST_ANNS)


# ---- 2) Output folder ----
# All generated pdf/md pairs are written below this Drive folder.
OUT_ROOT = f"{MYDRIVE}/sinfund_olmocr_pairs"
print("\nOUT_ROOT:", OUT_ROOT)


# ---- 3) Annotation -> words ----
def _annotation_entry_to_word(entry, text_keys=("text",)):
    """Turn one annotation dict into a ``(text, box)`` pair, or None if unusable."""
    if not isinstance(entry, dict):
        return None

    # First truthy value among the candidate keys, mirroring `a or b or ""`.
    raw = ""
    for key in text_keys:
        if entry.get(key):
            raw = entry[key]
            break

    # JSON may store numeric fields as numbers; transcribe them rather than
    # crashing on .strip().
    if isinstance(raw, (int, float)) and not isinstance(raw, bool):
        raw = str(raw)
    if not isinstance(raw, str):
        return None
    text = raw.strip()

    box = entry.get("box") or entry.get("bbox")
    if not text or not isinstance(box, (list, tuple)) or len(box) != 4:
        return None
    # Non-numeric coordinates would only fail later during line grouping.
    if not all(isinstance(v, (int, float)) for v in box):
        return None
    return text, box


def load_words_from_annotation(json_path):
    """Read an annotation JSON file and return its words with bounding boxes.

    Two layouts are recognised, and both are applied if present:

    * FUNSD style: ``{"form": [{"words": [{"text", "box"}, ...]}, ...]}``.
      A form item without a ``words`` list contributes its own text/box.
    * Flat lists under the top-level keys ``words``, ``annotations`` or
      ``tokens``, where the text may be stored under ``text`` or ``label``.

    Entries that are not dicts, have empty text, or lack a 4-number
    ``box``/``bbox`` are skipped.

    Args:
        json_path: Path to a UTF-8 encoded JSON annotation file.

    Returns:
        List of ``(text, box)`` tuples in file order, with exact duplicates
        (same text and same box) removed. ``box`` is the 4-element sequence
        taken from the file.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    candidates = []
    if isinstance(data, dict):
        # FUNSD classic: {"form":[{"words":[{"text":..,"box":[..]}], ...}]}
        form = data.get("form")
        if isinstance(form, list):
            for item in form:
                if not isinstance(item, dict):
                    continue
                if isinstance(item.get("words"), list):
                    candidates.extend(
                        _annotation_entry_to_word(w) for w in item["words"]
                    )
                else:
                    candidates.append(_annotation_entry_to_word(item))

        # other variants
        for key in ("words", "annotations", "tokens"):
            if isinstance(data.get(key), list):
                candidates.extend(
                    _annotation_entry_to_word(w, text_keys=("text", "label"))
                    for w in data[key]
                )

    # dedup while preserving first-seen order
    seen = set()
    out = []
    for word in candidates:
        if word is None:
            continue
        text, box = word
        k = (text, tuple(box))
        if k not in seen:
            seen.add(k)
            out.append((text, box))
    return out


def words_to_lines(words, y_threshold=12):
    """Group word boxes into lines of text in reading order.

    Words are ordered by vertical centre (then left edge). A word joins the
    current line when its vertical centre is within ``y_threshold`` of the
    previously placed word; otherwise it starts a new line. Each line is then
    read left to right and joined with single spaces.

    Args:
        words: Iterable of ``(text, (x0, y0, x1, y1))`` pairs.
        y_threshold: Maximum vertical-centre distance, in box units, between
            consecutive words of the same line.

    Returns:
        List of non-empty line strings, top to bottom.
    """
    if not words:
        return []

    # One (text, left edge, vertical centre) triple per word.
    entries = [
        (text, left, (top + bottom) / 2)
        for text, (left, top, _right, bottom) in words
    ]
    entries.sort(key=lambda e: (e[2], e[1]))

    rows = [[entries[0]]]
    for entry in entries[1:]:
        previous = rows[-1][-1]
        if abs(entry[2] - previous[2]) <= y_threshold:
            rows[-1].append(entry)
        else:
            rows.append([entry])

    rendered = (
        " ".join(text for text, _left, _yc in sorted(row, key=lambda e: e[1])).strip()
        for row in rows
    )
    return [ln for ln in rendered if ln]


# ---- 4) Writers ----
def write_md(md_path, text, split):
    """Write a transcription as markdown with a YAML front-matter header.

    Args:
        md_path: Destination file path. Missing parent folders are created.
        text: Page transcription. Surrounding whitespace is stripped and a
            single trailing newline is appended.
        split: Dataset split name, recorded as ``source: SinFUND_<split>``.
    """
    front = (
        "---\n"
        "primary_language: si\n"
        f"source: SinFUND_{split}\n"
        "task: ocr_transcription\n"
        "---\n\n"
    )
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    parent = os.path.dirname(md_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(md_path, "w", encoding="utf-8") as f:
        f.write(front + text.strip() + "\n")


def image_to_pdf(image_path, pdf_path):
    """Convert an image file to a PDF tagged with a 300 dpi resolution.

    Args:
        image_path: Source image readable by Pillow.
        pdf_path: Destination PDF path. Missing parent folders are created.
    """
    # Close the source file deterministically. Pillow may keep the handle
    # open after loading (e.g. multi-frame TIFFs), and this runs once per page.
    with Image.open(image_path) as src:
        img = src.convert("RGB")  # independent in-memory copy

    # dirname is "" for a bare filename, and os.makedirs("") raises.
    parent = os.path.dirname(pdf_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    img.save(pdf_path, "PDF", resolution=300.0)


# ---- 5) Convert split ----
def prepare_split(src_images, src_ann, out_root, split, y_threshold=12):
    """Convert one dataset split into matching PDF / markdown pairs.

    For every image in ``src_images`` that has a same-named ``.json`` file in
    ``src_ann``, writes ``<out_root>/<split>/pdfs/<name>.pdf`` and
    ``<out_root>/<split>/md/<name>.md``. Images without an annotation are
    counted and skipped. A page with no recoverable text gets the placeholder
    transcription ``[empty]``. A summary is printed at the end.

    Args:
        src_images: Folder containing the page images.
        src_ann: Folder containing the per-image JSON annotations.
        out_root: Root output folder.
        split: Split name, used for the sub-folder and the markdown header.
        y_threshold: Forwarded to ``words_to_lines``.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff")

    split_root = os.path.join(out_root, split)
    out_pdf = os.path.join(split_root, "pdfs")
    out_md = os.path.join(split_root, "md")
    for folder in (out_pdf, out_md):
        os.makedirs(folder, exist_ok=True)

    image_names = sorted(
        name for name in os.listdir(src_images)
        if name.lower().endswith(image_exts)
    )

    skipped = 0
    written = 0
    for image_name in image_names:
        stem, _ext = os.path.splitext(image_name)
        annotation = os.path.join(src_ann, stem + ".json")
        if not os.path.exists(annotation):
            skipped += 1
            continue

        text_lines = words_to_lines(
            load_words_from_annotation(annotation), y_threshold=y_threshold
        )
        transcript = "\n".join(text_lines).strip() or "[empty]"

        image_to_pdf(
            os.path.join(src_images, image_name),
            os.path.join(out_pdf, stem + ".pdf"),
        )
        write_md(os.path.join(out_md, stem + ".md"), transcript, split)
        written += 1

    print(f"\n[{split}] created pairs: {written} | missing annotations: {skipped}")
    print(f"pdfs -> {out_pdf}")
    print(f"md   -> {out_md}")


# ---- 6) Run ----
# Convert both splits with the same line-grouping threshold.
for _split, _images, _anns in (
    ("train", TRAIN_IMAGES, TRAIN_ANNS),
    ("test", TEST_IMAGES, TEST_ANNS),
):
    prepare_split(_images, _anns, OUT_ROOT, _split, y_threshold=12)

print("\n✅ DONE. olmOCR-style pairs written to:", OUT_ROOT)


Mounted at /content/gdrive
✅ Mounted. MYDRIVE = /content/gdrive/MyDrive

✅ Using paths:
TRAIN_IMAGES: /content/gdrive/MyDrive/sinhala dataset/SinFUND/dataset/training_data/images
TRAIN_ANNS:   /content/gdrive/MyDrive/sinhala dataset/SinFUND/dataset/training_data/annotations
TEST_IMAGES:  /content/gdrive/MyDrive/sinhala dataset/SinFUND/dataset/testing_data/images
TEST_ANNS:    /content/gdrive/MyDrive/sinhala dataset/SinFUND/dataset/testing_data/annotations

OUT_ROOT: /content/gdrive/MyDrive/sinfund_olmocr_pairs

[train] created pairs: 80 | missing annotations: 0
pdfs -> /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/pdfs
md   -> /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/md

[test] created pairs: 20 | missing annotations: 0
pdfs -> /content/gdrive/MyDrive/sinfund_olmocr_pairs/test/pdfs
md   -> /content/gdrive/MyDrive/sinfund_olmocr_pairs/test/md

✅ DONE. olmOCR-style pairs written to: /content/gdrive/MyDrive/sinfund_olmocr_pairs


In [None]:
import os, random

# Spot-check a few generated training pairs by printing the start of each
# transcription next to the paths of its md/pdf files.
root = "/content/gdrive/MyDrive/sinfund_olmocr_pairs"
md_dir = os.path.join(root, "train", "md")
pdf_dir = os.path.join(root, "train", "pdfs")

md_files = [f for f in os.listdir(md_dir) if f.endswith(".md")]
# random.sample raises ValueError when asked for more items than exist.
samples = random.sample(md_files, min(5, len(md_files)))

for f in samples:
    # Strip only the extension; str.replace would remove every ".md" in the name.
    base = os.path.splitext(f)[0]
    print("\n====", base, "====")
    print("MD:", os.path.join(md_dir, f))
    print("PDF:", os.path.join(pdf_dir, base + ".pdf"))
    with open(os.path.join(md_dir, f), "r", encoding="utf-8") as fh:
        print(fh.read()[:600])



==== sin_train_7 ====
MD: /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/md/sin_train_7.md
PDF: /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/pdfs/sin_train_7.pdf
---
primary_language: si
source: SinFUND_train
task: ocr_transcription
---

පොදු
பொது } 204
General
( F * S . , T. & E. ) 2/77
( A4 * S.T . & E. 06/2023 - Amended ]
මු. රෙ. 104 ( 4 ) යටතේ අලාභයන් පිළිබද යොමු අංක / தொடர் இல / Ref . No.
අවසාන වාර්තාව
நி . பி . 104 ( 4 ) இன் கீழ் இழப்புக்கள் பற்றிய
இறுதி
அறிக்கை
FINAL REPORT OF LOSSES UNDER .R . 104 ( 4 )
වාරිමාර්ග අමාත්‍යංශය { අමාත්‍යංශයේ ලේකම්
Secretary to Ministry of அமைச்சின் செயலாளருக்கு
පිටපත
: විගණකාධිපති
பிரதி : சணக்காய்வு அதிபதி
Copy to : Auditor - General
1. දෙපාර්තමේන්තුව/සංස්ථාව වාරිමාර්ග දෙපාර්තමේන්තුව
திணைக்களம் / கூட்டுத்தபாளம்
Dep

==== sin_train_52 ====
MD: /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/md/sin_train_52.md
PDF: /content/gdrive/MyDrive/sinfund_olmocr_pairs/train/pdfs/sin_train_52.pdf
---
primary_language: si
source: SinFUND_train
tas

In [None]:
# Install/upgrade the fine-tuning and evaluation stack.
# NOTE(review): in the recorded run, upgrading `pillow` here pulled in 12.0.0,
# which pip reports as incompatible with the preinstalled gradio
# (requires pillow<12.0).
!pip -q install -U transformers accelerate peft bitsandbytes datasets pillow pymupdf jiwer


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m84.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.50.0 requires pillow<12.0,>=8.0, but you have pillow 12.0.0 which is incompatible.[0m[31m


In [None]:
import os, re
import fitz  # PyMuPDF
from PIL import Image
from datasets import Dataset

# Locations of the generated PDF / markdown pairs, one sub-folder per split.
# Assumes Drive is mounted at /content/gdrive - adjust ROOT if it is mounted elsewhere.
ROOT = "/content/gdrive/MyDrive/sinfund_olmocr_pairs"
TRAIN_PDF = f"{ROOT}/train/pdfs"
TRAIN_MD  = f"{ROOT}/train/md"
TEST_PDF  = f"{ROOT}/test/pdfs"
TEST_MD   = f"{ROOT}/test/md"

def strip_front_matter(md_text: str) -> str:
    """Return ``md_text`` without its leading YAML front-matter block.

    Front matter is recognised only when the first line is exactly ``---``
    and a later line is exactly ``---``. Everything after that closing line
    is returned, stripped. Text without a complete front-matter block is
    returned stripped but otherwise unchanged.

    Delimiters are matched as whole lines, so a ``---`` inside a header value
    or a horizontal rule in the body is never mistaken for one.
    """
    md_text = md_text.strip()
    lines = md_text.split("\n")
    if lines[0].rstrip() != "---":
        return md_text
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() == "---":
            return "\n".join(lines[idx + 1:]).strip()
    # Opening delimiter without a closing one: not front matter.
    return md_text

def pdf_page_to_pil(pdf_path: str, max_side=1024):
    """Render the first page of a PDF to an RGB PIL image.

    The page is rasterised at a zoom chosen so that its longest side comes
    out at about ``max_side`` pixels. Rendering at PyMuPDF's default scale
    (72 dpi) would produce an image far smaller than the source scan for
    high-resolution pages, and later resizing cannot recover that detail.

    Args:
        pdf_path: Path to the PDF file.
        max_side: Target length in pixels of the longest image side.

    Returns:
        A ``PIL.Image.Image`` in RGB mode whose longest side is at most
        ``max_side``.
    """
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        page = doc[0]
        longest_pt = max(page.rect.width, page.rect.height)
        zoom = max_side / longest_pt if longest_pt > 0 else 1.0
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Rounding during rendering can overshoot by a pixel; clamp to max_side.
    w, h = img.size
    scale = max_side / max(w, h)
    if scale < 1.0:
        img = img.resize((max(1, int(w*scale)), max(1, int(h*scale))))
    return img

def build_records(pdf_dir, md_dir):
    """Pair each markdown transcription with its PDF.

    Args:
        pdf_dir: Folder containing ``<name>.pdf`` files.
        md_dir: Folder containing ``<name>.md`` transcriptions.

    Returns:
        List of dicts with keys ``id`` (file stem), ``pdf_path`` and
        ``target_text`` (markdown with front matter removed), sorted by
        markdown file name. Markdown files without a matching PDF are skipped.
    """
    recs = []
    for f in sorted(os.listdir(md_dir)):
        if not f.endswith(".md"):
            continue
        base = f[:-3]
        pdf_path = os.path.join(pdf_dir, base + ".pdf")
        md_path  = os.path.join(md_dir, f)
        if not os.path.exists(pdf_path):
            continue
        # Context manager so the handle is closed promptly for every file.
        with open(md_path, "r", encoding="utf-8") as fh:
            txt = strip_front_matter(fh.read())

        recs.append({
            "id": base,
            "pdf_path": pdf_path,
            "target_text": txt
        })
    return recs

# Collect the (pdf, transcription) records of each split, report their sizes,
# then wrap them as Hugging Face datasets.
train_recs = build_records(pdf_dir=TRAIN_PDF, md_dir=TRAIN_MD)
test_recs = build_records(pdf_dir=TEST_PDF, md_dir=TEST_MD)
print("train:", len(train_recs), " test:", len(test_recs))

train_ds, test_ds = (Dataset.from_list(recs) for recs in (train_recs, test_recs))


train: 80  test: 20


In [None]:
# 1) Clean reinstall Pillow (fixes PIL._typing/_Ink mismatch)
!pip -q uninstall -y Pillow pillow
!pip -q install --no-cache-dir --force-reinstall "Pillow==10.4.0"

# 2) Reinstall the training stack cleanly
# (pillow is deliberately absent from this list so the pin above is not upgraded away)
!pip -q install --no-cache-dir -U transformers accelerate peft bitsandbytes datasets pymupdf jiwer

# 3) Sanity check (note the leading !)
# `!python -c` runs in a fresh subprocess, so it reports the version installed
# on disk rather than whatever the notebook kernel has already imported.
# NOTE(review): in the recorded run this printed 10.4.0 while the next cell's
# in-kernel check printed 11.3.0 - presumably PIL was imported before the
# reinstall; restart the runtime and re-run to confirm the pin takes effect.
!python -c "import PIL; print('Pillow:', PIL.__version__)"


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/4.5 MB[0m [31m31.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m76.8 MB/s[0m eta [36m0:00:00[0m
[?25hPillow: 10.4.0


In [None]:
# Sanity check: show the Pillow version loaded in this kernel and confirm that
# the training stack imports without errors.
# NOTE(review): the recorded output shows 11.3.0 here although 10.4.0 was
# pinned just before - presumably the kernel still holds a previously imported
# PIL; restart the runtime if the pinned version is required.
import PIL
print(PIL.__version__)

import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
print("Imports OK ✅")


11.3.0




Imports OK ✅


In [None]:
import os
import fitz  # pymupdf
from PIL import Image
from datasets import Dataset

# Locations of the generated PDF / markdown pairs, one sub-folder per split.
# Assumes Drive is mounted at /content/gdrive - adjust ROOT if it is mounted elsewhere.
ROOT = "/content/gdrive/MyDrive/sinfund_olmocr_pairs"
TRAIN_PDF = f"{ROOT}/train/pdfs"
TRAIN_MD  = f"{ROOT}/train/md"
TEST_PDF  = f"{ROOT}/test/pdfs"
TEST_MD   = f"{ROOT}/test/md"

def strip_front_matter(md_text: str) -> str:
    """Return ``md_text`` without its leading YAML front-matter block.

    Front matter is recognised only when the first line is exactly ``---``
    and a later line is exactly ``---``. Everything after that closing line
    is returned, stripped. Text without a complete front-matter block is
    returned stripped but otherwise unchanged.

    Delimiters are matched as whole lines, so a ``---`` inside a header value
    or a horizontal rule in the body is never mistaken for one.
    """
    md_text = md_text.strip()
    lines = md_text.split("\n")
    if lines[0].rstrip() != "---":
        return md_text
    for idx in range(1, len(lines)):
        if lines[idx].rstrip() == "---":
            return "\n".join(lines[idx + 1:]).strip()
    # Opening delimiter without a closing one: not front matter.
    return md_text

def pdf_page_to_pil(pdf_path: str, max_side=1024):
    """Render the first page of a PDF to an RGB PIL image.

    The page is rasterised at a zoom chosen so that its longest side comes
    out at about ``max_side`` pixels. Rendering at PyMuPDF's default scale
    (72 dpi) would produce an image far smaller than the source scan for
    high-resolution pages, and later resizing cannot recover that detail.

    Args:
        pdf_path: Path to the PDF file.
        max_side: Target length in pixels of the longest image side.

    Returns:
        A ``PIL.Image.Image`` in RGB mode whose longest side is at most
        ``max_side``.
    """
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_path) as doc:
        page = doc[0]
        longest_pt = max(page.rect.width, page.rect.height)
        zoom = max_side / longest_pt if longest_pt > 0 else 1.0
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    # Rounding during rendering can overshoot by a pixel; clamp to max_side.
    w, h = img.size
    scale = max_side / max(w, h)
    if scale < 1.0:
        img = img.resize((max(1, int(w*scale)), max(1, int(h*scale))))
    return img

def build_records(pdf_dir, md_dir):
    """Pair each markdown transcription with its PDF.

    Args:
        pdf_dir: Folder containing ``<name>.pdf`` files.
        md_dir: Folder containing ``<name>.md`` transcriptions.

    Returns:
        List of dicts with keys ``id`` (file stem), ``pdf_path`` and
        ``target_text`` (markdown with front matter removed), sorted by
        markdown file name. Markdown files without a matching PDF are skipped.
    """
    recs = []
    for f in sorted(os.listdir(md_dir)):
        if not f.endswith(".md"):
            continue
        base = f[:-3]
        pdf_path = os.path.join(pdf_dir, base + ".pdf")
        md_path  = os.path.join(md_dir, f)
        if not os.path.exists(pdf_path):
            continue
        # Context manager so the handle is closed promptly for every file.
        with open(md_path, "r", encoding="utf-8") as fh:
            txt = strip_front_matter(fh.read())
        recs.append({"id": base, "pdf_path": pdf_path, "target_text": txt})
    return recs

# Collect the (pdf, transcription) records of each split, report their sizes,
# then wrap them as Hugging Face datasets.
train_recs = build_records(pdf_dir=TRAIN_PDF, md_dir=TRAIN_MD)
test_recs = build_records(pdf_dir=TEST_PDF, md_dir=TEST_MD)

print("train:", len(train_recs), " test:", len(test_recs))

train_ds, test_ds = (Dataset.from_list(recs) for recs in (train_recs, test_recs))


train: 80  test: 20


In [None]:
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

# Base vision-language model to fine-tune.
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

# Load the weights 4-bit quantized (NF4 with double quantization) and run the
# computations in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

# The processor handles both the chat template / tokenization and the image inputs.
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

# LoRA adapters (rank 16, alpha 32) on the modules named like the attention
# and MLP projections; bias terms are left untouched.
# NOTE(review): for training on a k-bit quantized base, PEFT documents
# `prepare_model_for_kbit_training` as a preparation step - confirm whether it
# should be applied before `get_peft_model`.
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)
model = get_peft_model(model, lora)
# Reports how many parameters the adapters add relative to the full model.
model.print_trainable_parameters()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/347 [00:00<?, ?B/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]



config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

trainable params: 18,464,768 || all params: 2,227,450,368 || trainable%: 0.8290


In [None]:
import random, torch

# Make sure PROMPT exists
PROMPT = (
    "You are an OCR system. Read the document image and output the text exactly as written.\n"
    "- Do NOT translate.\n"
    "- Do NOT romanize.\n"
    "- Preserve numbers, dates, punctuation.\n"
    "- If unreadable, write [illegible].\n"
    "Return only the transcription."
)

# Smoke test: transcribe one random test page and compare it with its reference.
sample = random.choice(test_recs)
print("Testing on:", sample["id"])
print("PDF:", sample["pdf_path"])

image = pdf_page_to_pil(sample["pdf_path"], max_side=768)  # 768 = safer on T4

messages = [{"role":"user","content":[
    {"type":"image","image":image},
    {"type":"text","text":PROMPT}
]}]

text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True).to(model.device)

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=256)

# generate() returns the prompt tokens followed by the continuation. Decode
# only the newly generated part, otherwise PRED starts with the echoed
# system/user prompt and cannot be compared with REF.
prompt_len = inputs["input_ids"].shape[1]
pred = processor.batch_decode(out[:, prompt_len:], skip_special_tokens=True)[0]
print("\nPRED:\n", pred[:2000])
print("\nREF:\n", sample["target_text"][:2000])


Testing on: sin_val_8
PDF: /content/gdrive/MyDrive/sinfund_olmocr_pairs/test/pdfs/sin_val_8.pdf

PRED:
 system
You are a helpful assistant.
user
You are an OCR system. Read the document image and output the text exactly as written.
- Do NOT translate.
- Do NOT romanize.
- Preserve numbers, dates, punctuation.
- If unreadable, write [illegible].
Return only the transcription.
assistant
[illegible]

REF:
 සේවායෝජක ශාඛා නියෝජිත
අංකය | අංකය අංකය
කාඩ් වර්ගය
මාස්ටර් කාඩ් වීසා
Classic ☑ Gold Platinum Signature World
11/03/2024
දිනය අනු අංකය
පුද්ගලික විස්තර
යා මිය මෙනවිය ආචාර්ය වෙනත් ( සදහන් කරන්න ) .
සයුරංග
මුල් නම් සු පු න් . S
වෙනත් නම්
හෙට්ටි
වාසගම පතිර ණ ආ ර ච් චි ගේ
කාඩ් පතේ සඳහත් විය යුතු තම ( උපරිම අකුරු 19 යි )
සුපුන්
සයුරංග
ජාතික හැඳුනුම් පතේ හෝ ව්දෙස් ගමන් බලපත්‍රයට අනුව
පුරුෂ ස්ත්‍රී උපත් දිනය
දිනය මාසය වසර
ශ්‍රී
පුරවැසිභාවය ලාංකික
ජාතික හැඳුනුම් පත් අංකය 9 8 2 4 5 1 4 3 4 V
C 1 6 4 3 367
විදෙස් ගමන් බලපත්‍ර / රියදුරු බලපත්‍ර අංකය .
( අදාළ ඡායා පිටපත් අමුණන්න )
විවාහක අවිවාහක බව – 