# OCR

> OCR evaluation reports

In [None]:
#| default_exp ocr

In [None]:
#| export
import os
import re
import time
from io import BytesIO
from dotenv import load_dotenv
import base64
from fastcore.all import *
from rich import print
import urllib.parse
from pathlib import Path
from tqdm import tqdm

import pandas as pd
from mistralai import Mistral
from PIL import Image
# from tqdm import tqdm

from evaluatr.readers import load_evals


In [None]:
#| export
# Pull variables from a local `.env` file (if present) into the environment,
# then read the Mistral API key; `os.getenv` gives `None` when it is not set.
load_dotenv()
mistral_api_key = os.getenv("MISTRAL_API_KEY")

In [None]:
#| exports
def get_doc_subtype(
    id:str, # ID of the evaluation
    fname:str, # Name of the file (URL-encoded or already decoded)
    evals # Evaluations data
    )->str: # Document Subtype, or `None` when the evaluation or the file is not found
    "Get Document Subtype for a given file in the evaluation dataset"
    # Only the first evaluation carrying this id is considered
    eval_data = next((o for o in (evals or ()) if o['id']==id), None)
    if eval_data is None: return None

    target = urllib.parse.unquote(fname)
    for doc in eval_data['docs']:
        url = doc['File URL']
        # Exact comparison on the raw URL base name (the historical rule) ...
        if Path(url).name == fname: return doc['Document Subtype']
        # ... otherwise compare decoded base names, ignoring any query string
        # or fragment, so `My%20Report.pdf` and `My Report.pdf` both match.
        url_name = urllib.parse.unquote(Path(urllib.parse.urlsplit(url).path).name)
        if url_name == target: return doc['Document Subtype']
    return None

Given an evaluation `id` and the `pdf` file name of one of its supporting documents, we can look up that document's `subtype`.

For instance, for the "Final Evaluation of the EU-IOM Joint ..." evaluation:

In [None]:
#| eval: false
# Location of the evaluations metadata and of the downloaded PDFs
fname_json = '../_data/output/evaluations.json'
path_pdf = Path('../_data/pdf_library')

# Title (or part of it) of the evaluation we want to inspect
title = 'Final Evaluation of the EU-IOM Joint Initiative for migrant protection and reintegration in the Horn of Africa'
evals = load_evals(fname_json)

# Case-insensitive substring match on the evaluation title; trailing `results` displays the matches
results = [o for o in evals.filter(lambda x: title.lower() in x['meta']['Title'].lower())]; results

[{'id': '49d2fba781b6a7c0d94577479636ee6f',
  'docs': [{'Document Subtype': 'Evaluation report',
    'File URL': 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Abridged%20Evaluation%20Report_%20Final_Olta%20NDOJA.pdf',
    'File description': 'Evaluation Report'},
   {'Document Subtype': 'Evaluation brief',
    'File URL': 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Evaluation%20Learning%20Brief_Final_Olta%20NDOJA.pdf',
    'File description': 'Evaluation Brief'},
   {'Document Subtype': 'Annexes',
    'File URL': 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/Final%20Evaluation%20Report%20Final_Olta%20NDOJA.pdf',
    'File description': 'Abridged Report'},
   {'Document Subtype': 'Management response',
    'File URL': 'https://evaluation.iom.int/sites/g/files/tmzbdl151/files/docs/resources/HoA%20EU%20JI%20Final%20Eval%20-%20Management%20Response%20Matrix%20-%20Final.pdf',
    'File description': 'Manageme

In [None]:
#| eval: false
# Print the subtype of every file found in the PDF library folder named after this evaluation id
id = '49d2fba781b6a7c0d94577479636ee6f'
for o in path_pdf.ls().filter(lambda x: x.name == id)[0].ls():
    print(f'Name: {o.name}\nSubtype: {get_doc_subtype(id, o.name, evals)}')

In [None]:
#| export
# Base data directory, expressed relative to the current working directory
src_dir = Path("../_data/")

## Mistral client

In [None]:
#| export
def ocr(
    pdf_path:Union[Path,str], # Path to the PDF file to process
    model:str="mistral-ocr-latest", # Model name to use for OCR processing
    include_images:bool=True, # Whether to include base64-encoded images in the response
    api_key:str=mistral_api_key # Mistral API key for authentication
):
    "Send a local PDF to Mistral and return the OCR response for the whole document"
    if isinstance(pdf_path, str): pdf_path = Path(pdf_path)
    client = Mistral(api_key=api_key)

    # 1. Upload the raw PDF bytes, flagged with purpose "ocr"
    upload_payload = {
        "file_name": pdf_path.stem,
        "content": pdf_path.read_bytes(),
    }
    uploaded = client.files.upload(file=upload_payload, purpose="ocr")

    # 2. Ask for a signed URL pointing at the uploaded file
    doc_url = client.files.get_signed_url(file_id=uploaded.id).url

    # 3. Run OCR on the document behind that URL
    document = {
        "type": "document_url",
        "document_url": doc_url,
    }
    return client.ocr.process(
        model=model,
        document=document,
        include_image_base64=include_images
    )

In [None]:
# pdf_report = '../_data/pdf_library/49d2fba781b6a7c0d94577479636ee6f/Abridged%20Evaluation%20Report_%20Final_Olta%20NDOJA.pdf'

In [None]:
# Note: filename will be cleaned upstream in a next version
#| exports
def clean_pdf_name(pdf_name: str) -> str:
    """
    Turn a (possibly URL-encoded) PDF file name into a folder-friendly slug.

    The name is URL-decoded, every run of spaces, hyphens, underscores and
    other non-word characters (including the extension dot) becomes a single
    underscore, leading/trailing underscores are dropped and the result is
    lower-cased.
    """
    from urllib.parse import unquote

    slug = unquote(pdf_name)

    # Applied in order, each match being replaced by one underscore:
    #   1. anything that is not a word character, whitespace or a hyphen
    #   2. runs of hyphens / whitespace
    #   3. runs of underscores left over by the previous passes
    for pattern in (r'[^\w\s-]', r'[-\s]+', r'_+'):
        slug = re.sub(pattern, '_', slug)

    return slug.strip('_').lower()

In [None]:
clean_pdf_name("Final%20Evaluation%20Report%20Final_Olta%20NDOJA.pdf")

'final_evaluation_report_final_olta_ndoja_pdf'

In [None]:
# Notes, brainstorm
## we can fix the headings hierarchy passing the headings to an LLM
## we can also pass the toc if found
## we could then replace each heading with the corresponding fixed text + the page number
## Hence we could ask an LLM to propose semantic chunks based on titles, levels and number of pages

In [None]:
# pdf_report

In [None]:
#| eval: false
# Sample report to OCR (file names in the PDF library are still URL-encoded);
# defined here so the cell does not depend on a commented-out assignment.
pdf_report = '../_data/pdf_library/49d2fba781b6a7c0d94577479636ee6f/Abridged%20Evaluation%20Report_%20Final_Olta%20NDOJA.pdf'
r = ocr(pdf_report)

In [None]:
#| eval: false
len(r.pages)

31

In [None]:
#| eval: false
r.pages[2].markdown

In [None]:
#| eval: false 
print(r.pages[3].markdown)

In [None]:
#| eval: false
print(r.pages[10].markdown)

In [None]:
#| eval: false
# from IPython.display import display, Markdown
# Markdown(r.pages[4].markdown)

In [None]:
# def combine_pages(r):
#     "Combine all pages into single markdown"
#     return "".join([page.markdown + "\n" for page in r.pages])

In [None]:
# r_all = combine_pages(r); 
# print(r_all);

In [None]:
#| eval: false
# Extract all lines starting with one or more # using regex
import re
# Combine every OCR page of `r` into a single markdown string (one trailing
# newline per page) so the cell does not depend on commented-out code.
r_all = "".join(page.markdown + "\n" for page in r.pages)
# Find all markdown headings in the text
# ^ - matches start of line
# #+ - matches one or more # characters that denote heading level
# .* - matches any characters until end of line
# $ - matches end of line
# re.MULTILINE flag makes ^ and $ match start/end of each line rather than whole string
re.findall(r'^#+.*$', r_all, re.MULTILINE)

['# **PPMi**',
 '# CONTENTS ',
 '# 1. Introduction ',
 '# 2. Background of the JI-HoA ',
 '### 2.1. Context and design of the JI-HoA',
 '# 2.2. External factors affecting the implementation of the JI ',
 '# 3. Methodology ',
 '# 4. Findings ',
 '### 4.1. Relevance',
 '### 4.1.1. Relevance of programme activities for migrants, returnees, and communities',
 '## Overall performance score for relevance: $3.9 / 5$ <br> Robustness score for the evidence: $4.5 / 5$',
 '### 4.1.1.1 Needs of migrants',
 '### 4.1.1.2 Needs of returnees',
 '# 4.1.1.3 Needs of community members ',
 "### 4.1.2. Programme's relevance to the needs of stakeholders",
 '### 4.1.2.1 Needs of governments',
 '# 4.1.2.2 Needs of other stakeholders ',
 '### 4.2. Coherence',
 "# 4.2.1. The JI-HoA's alignment with the objectives and standards of IOM, and objectives of the EU ",
 '### 4.2.2. Alignment with other initiatives',
 '# 4.3. Effectiveness ',
 '### 4.3.1. Specific Objective 1: Partner countries and relevant stakeholder

In [None]:
#| export
def save_page_images(
    page, # OCR page object containing images
    dest_folder: Path # Destination folder path
):
    "Save all images from a page to destination folder, each one named after its image id"
    dest_folder = Path(dest_folder)
    for img in page.images or []:
        payload = img.image_base64
        # Nothing to decode for this image (presumably an OCR run without embedded images)
        if not payload: continue
        # Drop the `data:<mime>;base64,` prefix when present; bare base64 contains no comma
        img_data = base64.b64decode(payload.split(',', 1)[-1])
        # No explicit format is passed: Pillow picks it from the extension of `img.id`
        with Image.open(BytesIO(img_data)) as pil_img:
            pil_img.save(dest_folder / img.id)

In [None]:
#| eval: false
save_page_images(r.pages[4], Path('.'))

In [None]:
# def clean_fname(fname):
#     "Convert filename to a folder-friendly string"
#     sanitized = fname.replace(' ', '_').replace('-', '_')
#     sanitized = re.sub(r'_{2,}', '_', sanitized) 
#     sanitized = re.sub(r'[^\w\s]', '', sanitized)
#     return sanitized.lower()

## Process PDFs

In [None]:
#| exports
def setup_output_dirs(md_library_path="../_data/md_library"):
    "Make sure the markdown library root exists (existing content is kept) and return it as a `Path`"
    library_root = Path(md_library_path)
    # Missing parents are created; an already existing folder is left untouched
    mkdir(library_root, parents=True, exist_ok=True, overwrite=False)
    return library_root

In [None]:
#| exports
def process_report(
    report_path:Path, # Path to the report directory
    md_output_dir:Path # Path to the output directory
    ) -> tuple[list[Path], str]:
    "List the PDFs of a report directory and create its matching output folder"
    pdf_files = report_path.ls(file_exts='.pdf')
    # The output folder reuses the report directory name
    report_name = report_path.name
    report_out_dir = md_output_dir / report_name
    mkdir(report_out_dir, parents=True, exist_ok=True, overwrite=False)
    return pdf_files, report_name

In [None]:
#| exports
def process_pdf_page(
    pdf_path:Path, # Path to the PDF file
    page_nb:int, # Zero-based index of the page (written out as `page_{page_nb+1}.md`)
    md_output_dir:Path, # Path to the output directory
    eval_report_path:str, # Name of the report
    r=None # OCR response of `pdf_path` to reuse; when `None` the PDF is OCR'd here
    ):
    "Process a single page from a PDF, saving markdown and images"
    pdf_name = clean_pdf_name(pdf_path.name)
    pdf_dir = md_output_dir / eval_report_path / pdf_name
    mkdir(pdf_dir, parents=True, exist_ok=True, overwrite=False)

    # `ocr` uploads and processes the whole document, so only call it when the
    # caller has no response to hand over (avoids one full OCR run per page).
    if r is None: r = ocr(pdf_path)
    page = r.pages[page_nb]

    # Save markdown
    fname_page = f"page_{page_nb+1}.md"
    page_path = pdf_dir / fname_page
    page_path.write_text(page.markdown)

    # Save images (the `img` folder is created even when the page has none)
    img_dir = pdf_dir / 'img'
    mkdir(img_dir, parents=True, exist_ok=True, overwrite=False)
    if hasattr(page, 'images') and page.images:
        save_page_images(page, img_dir)

    return r, page

In [None]:
#| exports
def _save_ocr_page(page, page_nb, pdf_dir):
    "Write the markdown and images of one OCR page under `pdf_dir`"
    (pdf_dir / f"page_{page_nb+1}.md").write_text(page.markdown)

    img_dir = pdf_dir / 'img'
    mkdir(img_dir, parents=True, exist_ok=True, overwrite=False)
    if getattr(page, 'images', None):
        save_page_images(page, img_dir)

def process_pdf(
    pdf_path:Path, # Path to the PDF file
    md_output_dir:Path, # Path to the output directory
    eval_report_path:str # Name of the report
    ):
    "Process all pages in a PDF"
    # A single upload + OCR call covers the whole document; every page is then
    # written from that one response instead of re-running OCR per page.
    r = ocr(pdf_path)

    pdf_dir = md_output_dir / eval_report_path / clean_pdf_name(pdf_path.name)
    mkdir(pdf_dir, parents=True, exist_ok=True, overwrite=False)

    for page_nb, page in enumerate(r.pages):
        _save_ocr_page(page, page_nb, pdf_dir)
    return r

In [None]:
#| exports
def process_all_reports(
    reports:list[Path], # List of paths to the report directories
    md_library_path="../_data/md_library" # Path to the output directory
    ):
    "Process every PDF of each given report directory, writing results under `md_library_path`"
    out_root = setup_output_dirs(md_library_path)

    for report_dir in tqdm(reports, desc="Processing reports"):
        report_pdfs, report_name = process_report(report_dir, out_root)

        # Inner progress bar, cleared once the report is done (`leave=False`)
        pdf_bar = tqdm(report_pdfs, desc=f"Processing PDFs in {report_name}", leave=False)
        for pdf_file in pdf_bar:
            process_pdf(pdf_file, out_root, report_name)

In [None]:
#| eval: false
# Try the pipeline on a single report: keep only the library folder named after the test id.
# NOTE: this goes through `ocr` (Mistral API) for the PDFs of that report.
pdf_library = Path("../_data/pdf_library")
report_id_test = '49d2fba781b6a7c0d94577479636ee6f'
reports = [p for p in pdf_library.ls() if p.name == report_id_test]; reports
process_all_reports(reports)

Processing reports:   0%|          | 0/1 [40:12<?, ?it/s]


KeyboardInterrupt: 