<a href="https://colab.research.google.com/github/divyanshpanwar03/Checkbox-Detection-in-PDF-via-Textract/blob/main/textract_no_OCR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install boto3
!pip install pdf2image
!pip install PyMuPDF
!pip install PyPDF2
!apt-get update
!apt-get install -y poppler-utils
!pip install pdf2image pillow



Get:1 https://cli.github.com/packages stable InRelease [3,917 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:6 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [83.8 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://cli.github.com/packages stable/main amd64 Packages [356 B]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [9,684 kB]
Get:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease [24.6 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,888 kB]
Get:14 http://security.u

In [4]:
import os
# AWS credentials and region are placed in the process environment before the
# boto3 client is created further down, so the client picks them up from there.
# NOTE(review): fill these in locally only; do not commit real keys with the
# notebook. Prefer a secrets store or pre-configured environment — confirm
# what the deployment setup expects.
os.environ["AWS_ACCESS_KEY_ID"] = ""          # <-- CHANGE: your access key id
os.environ["AWS_SECRET_ACCESS_KEY"] = ""      # <-- CHANGE: your secret access key
os.environ["AWS_DEFAULT_REGION"] = "ap-south-1"  # region used for the Textract client


import boto3
from pdf2image import convert_from_path
import re
from collections import defaultdict


# Path of the PDF to analyse; passed to process_pdf() when the script runs.
PDF_PATH = "/content/1 Depersonalised IA Report_testing.pdf"   # <-- CHANGE


# Shared Textract client; analyze_page() issues all requests through it.
textract = boto3.client("textract")


# Checkbox label vocabularies. Line text is stripped and upper-cased before
# being compared against these sets, so every entry must be upper-case.
SEVERITY = {"HIGH", "MEDIUM", "LOW"}
CATEGORY = {"R", "C", "F", "O", "U"}
ROOT_CAUSE = {"S", "P", "SD", "LD", "PD"}

# Section titles that end an issue's checkbox region (see
# assign_checkbox_to_issue). Matching is exact after strip + upper-case.
# NOTE(review): the first entry uses a typographic apostrophe (’); a line
# extracted with a straight apostrophe (') would not match — confirm which
# form Textract returns for this report.
ISSUE_TERMINATORS = {
    "AUDITOR’S RECOMMENDATION",
    "AGREED ACTION PLAN",
    "PERSON RESPONSIBLE",
    "TARGET DATE"
}


def analyze_page(image_bytes):
    """Send one page image to Textract and return the detected blocks.

    Args:
        image_bytes: Encoded image bytes, passed as ``Document={"Bytes": ...}``.

    Returns:
        The ``"Blocks"`` entry of the ``analyze_document`` response, requested
        with the ``FORMS`` and ``LAYOUT`` feature types.
    """
    response = textract.analyze_document(
        Document={"Bytes": image_bytes},
        FeatureTypes=["FORMS", "LAYOUT"],
    )
    return response["Blocks"]


# -----------------------------
# PAGE HAS SEVERITY?
# Blocks continuation pages like page 6
# -----------------------------
def page_has_severity_group(lines):
    """Return True when at least one line is exactly a severity label.

    Each line's ``"Text"`` is stripped and upper-cased, then compared against
    the ``SEVERITY`` set. Pages without any such line are treated by the
    caller as having no issue headings.
    """
    normalized = set()
    for entry in lines:
        normalized.add(entry["Text"].strip().upper())
    return not normalized.isdisjoint(SEVERITY)


# -----------------------------
# STRICT ISSUE HEADING DETECTION
# -----------------------------
def detect_issue_headings(lines):
    """Return the lines that look like numbered issue headings, top to bottom.

    A page with no severity label yields an empty list. Otherwise a line
    qualifies when its stripped text:

    * starts with digits, a dot and whitespace (e.g. ``"3. "``),
    * contains at least four whitespace-separated tokens, and
    * is not entirely lower-case.

    The result is ordered by the ``Top`` coordinate of each bounding box.
    """
    if not page_has_severity_group(lines):
        return []

    numbered_prefix = re.compile(r"^\d+\.\s+")

    def looks_like_heading(text):
        # Same three filters as documented above, applied in order.
        if numbered_prefix.match(text) is None:
            return False
        if len(text.split()) < 4:
            return False
        return not text.islower()

    found = [entry for entry in lines if looks_like_heading(entry["Text"].strip())]
    return sorted(found, key=lambda entry: entry["Geometry"]["BoundingBox"]["Top"])


# -----------------------------
# ISSUE TERMINATORS
# -----------------------------
def detect_terminators(lines):
    """Return the lines whose text is one of the issue-terminating titles.

    The comparison is an exact match against ``ISSUE_TERMINATORS`` after the
    line text has been stripped and upper-cased. Input order is preserved.
    """
    matches = []
    for entry in lines:
        title = entry["Text"].strip().upper()
        if title in ISSUE_TERMINATORS:
            matches.append(entry)
    return matches


# -----------------------------
# ASSIGN CHECKBOX TO ISSUE
# -----------------------------
def assign_checkbox_to_issue(cb, issues, terminators):
    """Return the heading text of the issue whose vertical span contains *cb*.

    Args:
        cb: Checkbox block; only ``Geometry.BoundingBox.Top`` is read.
        issues: Heading blocks, expected in top-to-bottom order.
        terminators: Blocks that close an issue's span early.

    Returns:
        The ``"Text"`` of the first issue whose span ``[top, bottom)`` holds
        the checkbox's top coordinate, or ``None`` when no span matches.

    An issue's span starts at its own top. It ends at the first terminator
    strictly between this heading and the next one, or else at the next
    heading (``1.0`` for the last issue).
    """
    def top_of(block):
        return block["Geometry"]["BoundingBox"]["Top"]

    checkbox_top = top_of(cb)
    last_index = len(issues) - 1

    for idx, heading in enumerate(issues):
        start = top_of(heading)
        following = top_of(issues[idx + 1]) if idx < last_index else 1.0

        # Terminators lying strictly between this heading and the next one.
        inside = [y for y in map(top_of, terminators) if start < y < following]
        end = min(inside, default=following)

        if start <= checkbox_top < end:
            return heading["Text"]

    return None


# -----------------------------
# HORIZONTAL BAND LABEL MATCHING
# -----------------------------
def map_checkbox_to_label(cb, lines, allowed_labels):
    """Return the nearest allowed label to the right of *cb* on the same row.

    Args:
        cb: Checkbox block; its bounding box ``Top``, ``Height`` and ``Left``
            are read.
        lines: Candidate text lines with ``"Text"`` and a bounding box.
        allowed_labels: Labels to accept, compared against the stripped,
            upper-cased line text.

    Returns:
        The normalized label text with the smallest horizontal distance from
        the checkbox's left edge, or ``None`` if no line qualifies. On equal
        distance the line that appears first in *lines* wins.

    A line qualifies when its vertical centre is within 0.6 of the checkbox
    height from the checkbox's vertical centre and its left edge lies
    strictly to the right of the checkbox's left edge.
    """
    box = cb["Geometry"]["BoundingBox"]
    box_mid_y = box["Top"] + box["Height"] / 2
    box_left = box["Left"]
    tolerance = box["Height"] * 0.6

    best_gap = None
    best_label = None

    for entry in lines:
        label = entry["Text"].strip().upper()
        if label not in allowed_labels:
            continue

        bounds = entry["Geometry"]["BoundingBox"]
        mid_y = bounds["Top"] + bounds["Height"] / 2

        if abs(mid_y - box_mid_y) <= tolerance and bounds["Left"] > box_left:
            gap = bounds["Left"] - box_left
            # Strict '<' keeps the earliest line when distances are equal.
            if best_gap is None or gap < best_gap:
                best_gap = gap
                best_label = label

    return best_label


# -----------------------------
# CLASSIFY GROUP
# -----------------------------
def classify_group(label):
    """Map a label to its group name.

    Returns ``"Severity"``, ``"Category"`` or ``"Root Cause"`` depending on
    which of ``SEVERITY``, ``CATEGORY`` or ``ROOT_CAUSE`` contains *label*
    (checked in that order), or ``None`` when it is in none of them.
    """
    groups = (
        (SEVERITY, "Severity"),
        (CATEGORY, "Category"),
        (ROOT_CAUSE, "Root Cause"),
    )
    for members, group_name in groups:
        if label in members:
            return group_name
    return None


# -----------------------------
# MAIN PIPELINE
# -----------------------------
def process_pdf(pdf_path):
    """Extract the ticked severity/category/root-cause labels per issue.

    Each PDF page is rendered to an image at 300 dpi, sent to Textract via
    ``analyze_page``, and the selected checkboxes are attributed to the issue
    heading whose vertical region contains them.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A dict keyed by ``"page_<n>"`` (1-based). Each value maps an issue
        heading's text to ``{"Severity": str | None, "Category": [str, ...],
        "Root Cause": [str, ...]}``. Pages with no detected issue heading, or
        with no checkbox assigned to any issue, are omitted.
    """
    import io  # local import: only needed for the in-memory JPEG buffer

    pages = convert_from_path(pdf_path, dpi=300)
    final_output = {}

    for page_idx, page in enumerate(pages, start=1):
        print(f"\nProcessing page {page_idx}...")

        # Encode through Image.save() into a buffer. Image.tobytes() exposes
        # Pillow's raw-storage encoder interface and its documentation says to
        # use save() with a BytesIO for compressed formats such as JPEG.
        buffer = io.BytesIO()
        page.convert("RGB").save(buffer, format="JPEG")
        blocks = analyze_page(buffer.getvalue())

        lines = [b for b in blocks if b["BlockType"] == "LINE"]
        # .get() so a selection block without a status is skipped, not fatal.
        checkboxes = [
            b for b in blocks
            if b["BlockType"] == "SELECTION_ELEMENT"
            and b.get("SelectionStatus") == "SELECTED"
        ]

        issues = detect_issue_headings(lines)
        if not issues:
            # No headings (or no severity label) on this page: nothing to map.
            continue

        terminators = detect_terminators(lines)

        # One entry per issue heading, created lazily on first assignment.
        page_result = defaultdict(lambda: {
            "Severity": None,
            "Category": [],
            "Root Cause": []   # several root causes may be ticked per issue
        })

        for cb in checkboxes:
            issue = assign_checkbox_to_issue(cb, issues, terminators)
            if not issue:
                continue

            # Try each vocabulary separately so the nearest label is found
            # per group rather than across all groups at once.
            for label_set in (SEVERITY, CATEGORY, ROOT_CAUSE):
                label = map_checkbox_to_label(cb, lines, label_set)
                if not label:
                    continue

                group = classify_group(label)

                if group == "Severity":
                    # Single-valued: a later checkbox overwrites an earlier one.
                    page_result[issue]["Severity"] = label

                elif group == "Category":
                    if label not in page_result[issue]["Category"]:
                        page_result[issue]["Category"].append(label)

                elif group == "Root Cause":
                    if label not in page_result[issue]["Root Cause"]:
                        page_result[issue]["Root Cause"].append(label)

        if page_result:
            final_output[f"page_{page_idx}"] = dict(page_result)

    return final_output


# -----------------------------
# RUN
# -----------------------------
# Runs the full pipeline on the configured PDF. This renders every page and
# calls Textract once per page, so it performs network requests.
output = process_pdf(PDF_PATH)


# -----------------------------
# PRINT FINAL OUTPUT
# -----------------------------
# `output` maps "page_<n>" -> {issue heading text -> result dict}; print it
# as an indented page / issue / field listing.
print("\n================ FINAL OUTPUT ================\n")
for page, issues in output.items():
    print(page)
    for issue, data in issues.items():
        print(" ", issue)
        print("    Severity   :", data["Severity"])
        print("    Category   :", data["Category"])
        print("    Root Cause :", data["Root Cause"])



Processing page 1...

Processing page 2...

Processing page 3...

Processing page 4...

Processing page 5...

Processing page 6...

Processing page 7...

Processing page 8...


page_2
  1. Inadequate IKEJFHKEG QEJGBQEkwjbgg
    Severity   : LOW
    Category   : ['C', 'O']
    Root Cause : ['PD']
page_4
  2. Inadequate kjdbgkjg qejtboqeutoqgjqhoqdbg
    Severity   : MEDIUM
    Category   : ['C', 'O']
    Root Cause : ['PD']
page_5
  3. Delay in kefkejf kjegbjqebgoqkegkqgjadvbkj
    Severity   : MEDIUM
    Category   : ['C', 'O']
    Root Cause : ['PD']
  4. Deficiencies in DR drill report
    Severity   : LOW
    Category   : ['O']
    Root Cause : []
page_7
  5. Absence of Business skjbfkjsebfksjebf
    Severity   : LOW
    Category   : ['U']
    Root Cause : ['PD', 'LD']
