<a href="https://colab.research.google.com/github/drfperez/utilities/blob/main/PDF2Word.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# =====================================================
# Powerful math OCR - single cell (Google Colab)
# =====================================================

# ---------- CONFIGURATION ----------
# dpi is handed to convert_from_path() when the PDF pages are rasterised.
# upsample is handed to preprocess_math() as its resize factor.
# crear_pdf_searchable: when True, the end of the cell also runs ocrmypdf to
# produce a searchable PDF next to the DOCX.
dpi = 450            # key for small symbols
upsample = 2         # improves Greek letters and operators
crear_pdf_searchable = True

# ---------- INSTALLATION ----------
# System packages: poppler-utils, Tesseract plus its Catalan/Spanish/English
# language data, and ghostscript. Installed quietly and without prompting.
!apt-get update -qq
!apt-get install -y -qq poppler-utils tesseract-ocr \
    tesseract-ocr-cat tesseract-ocr-spa tesseract-ocr-eng ghostscript
# Python packages used by the imports that follow.
!pip install -q pytesseract pdf2image python-docx easyocr \
    opencv-python-headless ocrmypdf Pillow numpy

# ---------- IMPORTS ----------
import os, re, cv2, numpy as np
from pdf2image import convert_from_path
from google.colab import files
from PIL import Image
import pytesseract, easyocr, ocrmypdf
from docx import Document

# ---------- TESSERACT (CATALAN + MATHS) ----------
# TESS_LANG is passed as `lang=` and TESS_CONFIG as `config=` to
# pytesseract.image_to_string() in the OCR loop.
TESS_LANG = "cat+spa+eng"
# The adjacent string literals below are concatenated into one option string.
# Everything after "tessedit_char_whitelist=" forms the value of that variable.
# NOTE(review): per the Tesseract docs, --oem 1 selects the LSTM engine and
# --psm 6 assumes a single uniform block of text - confirm against the
# installed Tesseract version.
# NOTE(review): the only plain letters whitelisted are unaccented ASCII a-z/A-Z
# although the languages are cat+spa+eng; confirm that excluding accented
# letters is intended.
TESS_CONFIG = (
    "--oem 1 --psm 6 "
    "-c preserve_interword_spaces=1 "
    "-c tessedit_char_whitelist="
    # Digits and ASCII letters.
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    # Presumably the Greek alphabet (lower case, then upper case), going by the
    # configuration note about Greek letters - verify.
    "Œ±Œ≤Œ≥Œ¥ŒµŒ∂Œ∑Œ∏ŒπŒ∫ŒªŒºŒΩŒæŒøœÄœÅœÉœÑœÖœÜœáœàœâ"
    "ŒëŒíŒìŒîŒïŒñŒóŒòŒôŒöŒõŒúŒùŒûŒüŒ†Œ°Œ£Œ§Œ•Œ¶ŒßŒ®Œ©"
    # Arithmetic/comparison operators and brackets.
    "+-‚àí√ó√∑=‚â†‚âà‚â§‚â•<>^*/()[]{}|"
    # Further mathematical symbols.
    "‚àë‚àè‚à´‚àö‚àû¬∞¬±¬µœÄŒªŒîŒ©"
    # Punctuation. The literal ends with a backslash followed by a space;
    # presumably an escaped blank so that the space stays in the whitelist when
    # the option string is split into arguments - TODO confirm.
    ".,;:!?@#%‚Ç¨$\\ "
)

# ---------- FUNCIONS ----------
def pil_to_cv(img):
    """Turn a PIL image into an OpenCV-style BGR array.

    The image is materialised as a NumPy array and its channels are then
    reordered with ``cv2.COLOR_RGB2BGR``.

    Args:
        img: Image to convert (assumed to be in RGB channel order - the
            conversion code used here expects that).

    Returns:
        The array returned by ``cv2.cvtColor``.
    """
    rgb_pixels = np.array(img)
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    return bgr_pixels

def cv_to_pil(img):
    """Turn an OpenCV-style BGR array into a PIL image.

    The channels are reordered with ``cv2.COLOR_BGR2RGB`` and the result is
    wrapped with ``Image.fromarray``.

    Args:
        img: Array in BGR channel order.

    Returns:
        The object returned by ``Image.fromarray``.
    """
    rgb_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb_pixels)

def deskew(img):
    """Rotate a BGR page image so that its block of ink is axis-aligned.

    Ink pixels are located on an inverted, Otsu-binarised copy of the page,
    the minimum-area rectangle enclosing them is measured, and the page is
    rotated by the opposite of that rectangle's tilt.

    Args:
        img: BGR image as a NumPy array (as produced by ``pil_to_cv``).

    Returns:
        The rotated image, same width and height as ``img``. ``img`` itself is
        returned untouched when fewer than 20 ink pixels are found.
    """
    gray = cv2.bitwise_not(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY))
    # Binarise before collecting ink pixels. Testing the raw inverted image
    # against zero treats every pixel that is not pure white as ink, so
    # scanner noise makes the rectangle span the whole page.
    _, ink = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    coords = np.column_stack(np.where(ink > 0))
    if len(coords) < 20:
        return img
    angle = cv2.minAreaRect(coords)[-1]
    # A rectangle's orientation is only defined modulo 90 degrees, and OpenCV
    # changed the reported range from [-90, 0) to (0, 90] in 4.5.1. Folding
    # the value into [-45, 45) gives the same small tilt under either
    # convention; without it an upright page reported as 90 was turned
    # sideways and cropped.
    tilt = (angle + 45.0) % 90.0 - 45.0
    h, w = img.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), -tilt, 1)
    return cv2.warpAffine(img, M, (w, h),
        flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def preprocess_math(img, up=2):
    """Prepare a BGR page image for OCR and return a binarised BGR image.

    Steps, in order: optional cubic upscaling, grayscale conversion,
    non-local-means denoising, CLAHE contrast equalisation, Gaussian adaptive
    thresholding, and conversion of the single-channel result back to BGR.

    Args:
        img: BGR image array.
        up: Scale factor applied to both axes; resizing is skipped unless it
            is greater than 1.

    Returns:
        The thresholded image converted back to three channels.
    """
    # Enlarge first so that the later filters work on the bigger glyphs.
    scaled = (
        cv2.resize(img, None, fx=up, fy=up, interpolation=cv2.INTER_CUBIC)
        if up > 1
        else img
    )

    gray = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, None, 12, 7, 21)

    # Local contrast equalisation before thresholding.
    equalised = cv2.createCLAHE(3.0, (8, 8)).apply(denoised)

    binary = cv2.adaptiveThreshold(
        equalised,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        35,
        11,
    )
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2BGR)

def clean_math_text(t):
    """Normalise raw OCR text before it is written to the document.

    Processing order:
      1. Line endings are unified to ``\\n`` (CRLF counts as one break).
      2. Control characters other than tab/newline are removed.
      3. Words hyphenated across a line break are joined.
      4. Runs of spaces/tabs collapse to one space, and three or more
         consecutive newlines collapse to a blank line.
      5. The literal substitutions in ``fixes`` are applied.

    Args:
        t: Text returned by the OCR engine; ``None`` or an empty string
            yields an empty string.

    Returns:
        The cleaned text with leading/trailing whitespace stripped.
    """
    if not t:
        return ""

    # Handle CRLF first so a Windows line ending does not become two breaks.
    t = t.replace("\r\n", "\n").replace("\r", "\n")
    # Same character class as clean_text(): newlines must survive, otherwise
    # every line of the page is glued together without a separator.
    t = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', t)
    t = t.replace("-\n", "")
    t = re.sub(r'[ \t]+', ' ', t)
    t = re.sub(r'\n{3,}', '\n\n', t)

    # Plain substring substitutions: two-character ASCII comparison operators,
    # space-delimited " - ", " x ", " / ", and an isolated "O" / "l" which is
    # read as the digit 0 / 1.
    fixes = {
        "<=": "‚â§", ">=": "‚â•", "!=": "‚â†",
        " - ": " ‚àí ", " x ": " √ó ", " / ": " √∑ ",
        " O ": " 0 ", " l ": " 1 "
    }
    for k, v in fixes.items():
        t = t.replace(k, v)
    return t.strip()

# ---------- UPLOAD PDF ----------
uploaded = files.upload()
# An empty result (nothing chosen / dialog cancelled) would otherwise surface
# as a bare StopIteration from next(); fail with an explicit message instead.
if not uploaded:
    raise RuntimeError("No s'ha pujat cap fitxer PDF.")
# Only the first uploaded file is processed.
pdf_path = next(iter(uploaded))

# ---------- OCR ----------
# Rasterise every page at the configured resolution.
images = convert_from_path(pdf_path, dpi=dpi)
reader = easyocr.Reader(["en"], gpu=False)  # English only (marked as correct by the author)
doc = Document()

for i, img in enumerate(images):
    print(f"üßÆ P√†gina {i+1}/{len(images)}")
    # Deskew, then upscale/denoise/binarise the page.
    cv = preprocess_math(deskew(pil_to_cv(img)), upsample)

    # Engine 1: Tesseract with the whitelist configuration.
    t_text = pytesseract.image_to_string(
        cv_to_pil(cv),
        lang=TESS_LANG,
        config=TESS_CONFIG
    )

    # Engine 2: EasyOCR; detail=0 yields plain strings, joined one per line.
    e_text = "\n".join(reader.readtext(cv, detail=0))

    # Keep the EasyOCR transcript when it is longer than 85% of the
    # Tesseract transcript's length; otherwise keep Tesseract's.
    final = e_text if len(e_text) > len(t_text)*0.85 else t_text
    final = clean_math_text(final)

    doc.add_heading(f"P√†gina {i+1}", level=2)
    doc.add_paragraph(final)
    # Page break between pages, but not after the last one.
    if i < len(images)-1:
        doc.add_page_break()

# ---------- SAVE ----------
base = os.path.splitext(pdf_path)[0]
docx_out = f"OCR_MATES_{base}.docx"
doc.save(docx_out)
files.download(docx_out)

if crear_pdf_searchable:
    # Second output: the original PDF re-OCRed by ocrmypdf.
    pdf_out = f"OCR_MATES_searchable_{base}.pdf"
    ocrmypdf.ocr(
        pdf_path,
        pdf_out,
        language="cat",
        deskew=True,
        force_ocr=True
    )
    files.download(pdf_out)

print("‚úÖ OCR matem√†tic complet")

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Saving digitalproblemes.pdf to digitalproblemes (1).pdf




Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet

In [None]:
# @title üìÑ OCR per a PDF Escanejats (Imatges)
idioma_seleccionat = "Catal√†" # @param ["Catal√†", "Castell√†", "Angl√®s", "Mix (Catal√† + Castell√†)"]

import re
import pytesseract
from pdf2image import convert_from_path
from docx import Document
from google.colab import files

# 1. INSTAL¬∑LACI√ì (M√©s r√†pida si ja s'ha executat abans)
print("‚öôÔ∏è Preparant eines...")
!sudo apt-get update > /dev/null
!sudo apt-get install tesseract-ocr tesseract-ocr-cat tesseract-ocr-spa tesseract-ocr-eng poppler-utils > /dev/null
!pip install pytesseract pdf2image python-docx > /dev/null

lang_codes = {"Catal√†": "cat", "Castell√†": "spa", "Angl√®s": "eng", "Mix (Catal√† + Castell√†)": "cat+spa"}
lang_code = lang_codes[idioma_seleccionat]

def clean_text(text):
    """Strip control characters that are not valid in XML/Word content.

    Removes the characters 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F. Tab,
    line feed and carriage return are left in place.

    Args:
        text: String to sanitise.

    Returns:
        ``text`` without the characters listed above.
    """
    forbidden = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]')
    return forbidden.sub('', text)

# 2. EXECUTION
uploaded = files.upload()

# Nothing happens when no file was uploaded.
if uploaded:
    # Only the first uploaded file is processed.
    pdf_filename = next(iter(uploaded))
    try:
        # Raise the resolution (dpi=300) for better reading of scanned images.
        print(f"üì∏ Convertint p√†gines a alta resoluci√≥...")
        images = convert_from_path(pdf_filename, dpi=300)
        doc = Document()

        for i, image in enumerate(images):
            print(f"üîç Llegint p√†gina {i+1}...")
            # '--psm 1' automatically detects the orientation and text segments.
            raw_text = pytesseract.image_to_string(image, lang=lang_code, config='--psm 1')

            # Drop characters python-docx cannot store before adding the text.
            safe_text = clean_text(raw_text)
            doc.add_heading(f'P√†gina {i+1}', level=2)
            doc.add_paragraph(safe_text)

            # Page break between pages, but not after the last one.
            if i < len(images) - 1:
                doc.add_page_break()

        # Strip only a trailing ".pdf", in any letter case. A plain
        # str.replace() left names such as "scan.PDF" unchanged (saving the
        # DOCX under a PDF name) and also rewrote ".pdf" in mid-name.
        stem = re.sub(r'\.pdf$', '', pdf_filename, flags=re.IGNORECASE)
        out_name = f"OCR_{stem}.docx"
        doc.save(out_name)
        print(f"\n‚úÖ Enllestit! Descarregant...")
        files.download(out_name)

    except Exception as e:
        # Best-effort UX: report the failure in the cell output, do not raise.
        print(f"\n‚ùå Error: {e}")

‚öôÔ∏è Preparant eines...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


Saving digitalproblemes.pdf to digitalproblemes (1).pdf
üì∏ Convertint p√†gines a alta resoluci√≥...
üîç Llegint p√†gina 1...
üîç Llegint p√†gina 2...
üîç Llegint p√†gina 3...
üîç Llegint p√†gina 4...
üîç Llegint p√†gina 5...
üîç Llegint p√†gina 6...

‚úÖ Enllestit! Descarregant...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>