In [None]:
"""
Diploma Document OCR and Configuration Suite

This extended Jupyter notebook provides a complete, modular, and scalable OCR-based document
processing pipeline tailored for high-volume, multi-page diploma or academic document processing.
It includes advanced configuration options, metadata entry, batch processing support, page-level
navigation, similarity evaluation, and validation mechanisms to ensure extracted data is both accurate
and properly attributed.

Primary Features:
- Accepts long, high-resolution multi-page documents (PDF or image)
- Supports multiple OCR engines: Tesseract, DocTR (TensorFlow, PyTorch)
- Allows metadata entry: university name, degree type, graduate name, issue date, etc.
- Provides structured output for downstream validation or archival
- Built-in interface for comparing OCR results against ground truth entered manually by the operator
- Optional export to structured JSON for further processing (e.g., database or backend validation)
"""

import pytesseract
import fitz  # PyMuPDF
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
from PIL import Image
from IPython.display import display, clear_output
import ipywidgets as widgets
import io
import json
from difflib import SequenceMatcher

# Section: Upload interface
# A single document (PDF or image) is accepted per run.
upload_widget = widgets.FileUpload(accept='.pdf,.png,.jpg,.jpeg', multiple=False)
display(widgets.HTML("<h2>Diploma Document Upload</h2>"))
display(upload_widget)

# Section: OCR Engine Selection
ocr_selector = widgets.Dropdown(
    options=['Tesseract', 'DocTR (TensorFlow)', 'DocTR (PyTorch)'],
    value='DocTR (TensorFlow)',
    description='OCR Engine:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
display(widgets.HTML("<h3>Select OCR Engine</h3>"))
display(ocr_selector)

# Section: Diploma Metadata Configuration
# Maps each exported metadata key to the label shown next to its text box.
_metadata_labels = {
    'university_name': 'University Name:',
    'degree_type': 'Degree Type:',
    'graduate_name': 'Graduate Name:',
    'graduation_date': 'Graduation Date:',
    'document_id': 'Document ID (Optional):',
}
# Use the same 'initial' description width as the OCR engine dropdown so the
# longer labels are sized to their text instead of the default label column.
metadata_fields = {
    key: widgets.Text(description=label, style={'description_width': 'initial'})
    for key, label in _metadata_labels.items()
}
display(widgets.HTML("<h3>Diploma Metadata Entry</h3>"))
for field in metadata_fields.values():
    display(field)

# OCR Execution Function
def perform_ocr(file, engine_choice):
    """Run the selected OCR engine on an uploaded document.

    Args:
        file: Uploaded document exposing ``name`` (used to detect PDFs by
            extension) and ``content`` (the raw file bytes, or a buffer such
            as a ``memoryview`` over them).
        engine_choice: ``'Tesseract'`` selects pytesseract; any other value
            selects docTR.

    Returns:
        A list with one string of recognised text per page, in page order.
    """
    # Normalise the payload once: bytes() accepts both bytes and memoryview.
    raw = bytes(file.content)
    # Compare case-insensitively so "SCAN.PDF" is not mistaken for an image.
    is_pdf = file.name.lower().endswith('.pdf')

    if engine_choice == 'Tesseract':
        if is_pdf:
            # Rasterise each page at 300 dpi; the context manager closes the
            # PyMuPDF document once every page has been rendered to PNG bytes.
            with fitz.open(stream=raw, filetype="pdf") as doc:
                images = [
                    Image.open(io.BytesIO(page.get_pixmap(dpi=300).tobytes()))
                    for page in doc
                ]
        else:
            images = [Image.open(io.BytesIO(raw))]
        return [pytesseract.image_to_string(image) for image in images]

    # docTR's DocumentFile loaders take file paths or raw bytes (not PIL images
    # or file-like objects), so hand them the uploaded bytes directly.
    doc_input = DocumentFile.from_pdf(raw) if is_pdf else DocumentFile.from_images([raw])
    # NOTE(review): docTR chooses its deep-learning backend when the package is
    # imported (USE_TF / USE_TORCH environment variables) and ocr_predictor has
    # no per-call backend switch, so both "DocTR" choices run on whichever
    # backend is active in this kernel.
    model = ocr_predictor(pretrained=True, assume_straight_pages=True)
    result = model(doc_input)
    # Blocks are docTR objects, not dicts; render() returns their text.
    return [
        '\n'.join(block.render() for block in page.blocks)
        for page in result.pages
    ]

# Page Navigator

def show_page_navigation(pages):
    """Display a dropdown that switches between the OCR text of each page.

    Args:
        pages: Sequence of per-page text strings; entry ``i`` is shown when
            "Page i+1" is selected.
    """
    viewer = widgets.Output()
    choices = [(f"Page {number}", number - 1) for number in range(1, len(pages) + 1)]
    page_selector = widgets.Dropdown(options=choices, description="Select Page:")

    def on_page_change(change):
        # Replace whatever page is currently shown with the newly selected one.
        with viewer:
            clear_output()
            selected_index = change['new']
            print(pages[selected_index])

    page_selector.observe(on_page_change, names='value')

    for element in (widgets.HTML("<h3>Page Navigator</h3>"), page_selector, viewer):
        display(element)

    # Render the first page immediately instead of waiting for a selection.
    on_page_change({'new': 0})

# Similarity Checker Interface
def show_evaluation_interface(ocr_pages):
    """Display a form that scores the OCR text against operator-supplied text.

    The operator pastes the expected (ground-truth) text and clicks the
    button; the similarity between that text and all OCR pages joined by
    newlines is printed as a percentage.

    Args:
        ocr_pages: Sequence of per-page OCR text strings.
    """
    expected_textarea = widgets.Textarea(
        value='',
        placeholder='Paste the expected ground-truth result here for validation...',
        layout=widgets.Layout(width='100%', height='150px')
    )
    eval_button = widgets.Button(description="Evaluate OCR Similarity")
    eval_output = widgets.Output()

    def on_eval_clicked(b):
        with eval_output:
            clear_output()
            expected_text = expected_textarea.value.strip()
            if not expected_text:
                # Without ground truth the ratio would be a meaningless 0%.
                print("Enter the expected text before evaluating.")
                return
            combined_ocr_text = '\n'.join(ocr_pages).strip()
            # autojunk=False: the default heuristic treats frequent characters
            # as junk once a sequence reaches 200 items, which skews the ratio
            # for document-length text.
            matcher = SequenceMatcher(None, expected_text, combined_ocr_text, autojunk=False)
            print(f"OCR Similarity: {matcher.ratio():.2%}")

    eval_button.on_click(on_eval_clicked)
    display(widgets.HTML("<h3>OCR Evaluation</h3>"))
    display(expected_textarea)
    display(eval_button)
    display(eval_output)

# Export Metadata + OCR to JSON

def export_to_json(metadata, ocr_text):
    """Print the metadata and OCR text as one JSON document and return it.

    Args:
        metadata: Mapping of field name to an object whose ``value`` attribute
            holds the entered text (e.g. the metadata text widgets).
        ocr_text: JSON-serialisable OCR output, stored under ``'ocr_text'``.

    Returns:
        The JSON string that was printed, so callers can store or forward it.
    """
    payload = {
        'metadata': {name: widget.value for name, widget in metadata.items()},
        'ocr_text': ocr_text,
    }
    # ensure_ascii=False keeps accented and non-Latin names readable instead
    # of emitting \uXXXX escape sequences.
    serialized = json.dumps(payload, indent=4, ensure_ascii=False)
    print("\nStructured Export:")
    print(serialized)
    return serialized

export_btn = widgets.Button(description="Export to JSON")
# Everything the pipeline shows is routed through this Output widget so that
# it can be produced from widget callbacks and cleared between runs.
pipeline_output = widgets.Output()


def _first_uploaded_file(uploads):
    """Return the first upload as an object exposing ``name`` and ``content``.

    Handles both layouts of ``FileUpload.value``: a dict keyed by file name
    whose entries carry a ``'content'`` item (ipywidgets 7), and a sequence of
    per-file records with ``'name'`` and ``'content'`` items (ipywidgets 8).
    """
    from types import SimpleNamespace

    if isinstance(uploads, dict):
        name, entry = next(iter(uploads.items()))
        return SimpleNamespace(name=name, content=bytes(entry['content']))
    first = uploads[0]
    return SimpleNamespace(name=first['name'], content=bytes(first['content']))


def on_export_clicked(b):
    """Print the entered metadata and the latest OCR results as JSON."""
    with pipeline_output:
        export_to_json(metadata_fields, ocr_results)


def run_ocr_pipeline(change=None):
    """Run OCR on the uploaded document and show the result widgets.

    Used as the observer for ``upload_widget`` (hence the unused ``change``
    argument) and also callable directly. Does nothing when no file is
    uploaded.
    """
    # Kept as module-level names so other cells can still read the latest run.
    global file_data, engine_selected, ocr_results
    if not upload_widget.value:
        return
    file_data = _first_uploaded_file(upload_widget.value)
    engine_selected = ocr_selector.value
    with pipeline_output:
        clear_output()
        display(widgets.HTML("<h3>Running OCR - Please Wait</h3>"))
        ocr_results = perform_ocr(file_data, engine_selected)
        display(widgets.HTML("<h3>OCR Completed</h3>"))
        show_page_navigation(ocr_results)
        show_evaluation_interface(ocr_results)
        display(export_btn)


# Bind export once, so repeated runs do not stack click handlers.
export_btn.on_click(on_export_clicked)
# The widget is always empty when this cell first executes, so the pipeline
# has to be triggered by the upload itself rather than by a one-off check.
upload_widget.observe(run_ocr_pipeline, names='value')
display(pipeline_output)

if upload_widget.value:
    run_ocr_pipeline()
else:
    with pipeline_output:
        display(widgets.HTML("<b>Please upload a document above to begin processing.</b>"))