<a href="https://colab.research.google.com/github/dawissem/fullpack-ia-module-1/blob/main/OCR_facture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🧾 Projet 4 – OCR intelligent de factures (Tesseract + OpenCV + spaCy) – Bloc unique Colab
#==========================================================================================

########################################
# 1) Dependencies and installation
########################################
# System packages: the Tesseract OCR engine (driven below through pytesseract)
# and poppler-utils (presumably the PDF rendering backend pdf2image needs —
# TODO confirm). The install output is discarded to keep the cell log short.
!sudo apt-get update -qq
!sudo apt-get install -y tesseract-ocr poppler-utils >/dev/null 2>&1
# Python packages used by the rest of this cell.
!pip install -q pytesseract opencv-python-headless pdf2image spacy kaggle tqdm

########################################
# 2) Authentification Kaggle (→ dataset)
########################################
import os, json, zipfile, glob, re, csv
from pathlib import Path
from tqdm import tqdm

# Ask for kaggle.json only when no Kaggle credentials are available yet, i.e.
# neither the KAGGLE_USERNAME/KAGGLE_KEY environment variables nor an existing
# ~/.kaggle/kaggle.json token file.
_kaggle_token = Path.home() / ".kaggle" / "kaggle.json"
_has_env_credentials = ("KAGGLE_USERNAME" in os.environ) and ("KAGGLE_KEY" in os.environ)
if not _has_env_credentials and not _kaggle_token.exists():
    from google.colab import files
    print("⬆️  Uploade ton fichier kaggle.json (Settings Kaggle → Create API Token)")
    _uploaded = files.upload()               # pick kaggle.json in the dialog
    if not _uploaded:
        raise RuntimeError(
            "No file uploaded: Kaggle credentials are required to download the dataset."
        )
    # The browser may rename the upload (e.g. "kaggle (1).json"), so use
    # whatever file was sent instead of assuming it is called kaggle.json.
    _name, _content = next(iter(_uploaded.items()))
    _kaggle_token.parent.mkdir(parents=True, exist_ok=True)
    _kaggle_token.write_bytes(_content)
    _kaggle_token.chmod(0o600)               # keep the API key private to the current user
    Path(_name).unlink(missing_ok=True)      # do not leave a copy of the key in the working dir

# Download the dataset archive into /content, then unpack it into
# /content/invoices (the directory scanned later for images and PDFs).
# unzip's output is discarded to keep the cell log short.
!kaggle datasets download -d osamahosamabdellatif/high-quality-invoice-images-for-ocr -p /content -q
!unzip -o /content/high-quality-invoice-images-for-ocr.zip -d /content/invoices >/dev/null

########################################
# 3) Imports Python
########################################
import cv2
import pytesseract
from pdf2image import convert_from_path
import spacy
import pandas as pd

# spaCy : petit modèle EN pour NER (org/person/…) – suffisant pour démos rapides
!python -m spacy download en_core_web_sm -q
nlp = spacy.load("en_core_web_sm")

########################################
# 4) Fonctions utilitaires
########################################
def pdf_to_images(pdf_path, out_dir):
    """Convert every page of a PDF into a PNG file rendered at 300 DPI.

    Args:
        pdf_path: Path of the PDF file to convert.
        out_dir: Directory receiving the PNG files; created if it is missing.

    Returns:
        The written image paths as strings, one per page, named
        ``<pdf stem>_p<page number>.png`` with pages numbered from 1.
    """
    target_dir = Path(out_dir)
    # Saving a page into a non-existent directory would raise, so create it first.
    target_dir.mkdir(parents=True, exist_ok=True)
    stem = Path(pdf_path).stem

    img_paths = []
    for page_no, page in enumerate(convert_from_path(pdf_path, dpi=300), start=1):
        img_path = target_dir / f"{stem}_p{page_no}.png"
        page.save(img_path, "PNG")
        img_paths.append(str(img_path))
    return img_paths

# Numeric dates such as 12/05/2023, 5-1-23 or 2023-05-12 (the day/month order
# is not interpreted, the matched text is returned as-is).
date_regex   = re.compile(r"\b(?:\d{1,2}[/-]\d{1,2}[/-]\d{2,4}|\d{4}[/-]\d{1,2}[/-]\d{1,2})\b")

# Amounts with exactly two decimals, an optional currency prefix and optional
# thousands separators: "$1,234.56", "EUR 99.00", "45.00".
# The numeric part is a non-capturing *group*: writing it between square
# brackets turns it into a character class that also accepts "(", "|", "?",
# ":" … and therefore matches OCR noise such as "(250.00" or "?:.99".
# The look-arounds prevent matching a slice of a longer digit run ("3.14159").
amount_regex = re.compile(
    r"(?:USD|EUR|£|\$)?\s?(?<!\d)(?:\d{1,3}(?:,\d{3})+|\d+)\.\d{2}(?!\d)"
)


def _amount_to_float(raw):
    """Return the numeric value of a matched amount string such as "$1,234.56"."""
    # Dropping everything but digits and the dot removes the currency prefix,
    # spaces and thousands separators in one pass.
    return float(re.sub(r"[^\d.]", "", raw))


def extract_fields(text: str) -> dict:
    """Extract the vendor, date and total from raw OCR text.

    Args:
        text: Raw text of one invoice page. Empty or ``None`` input is
            accepted and yields a result where every field is ``None``.

    Returns:
        A dict with the keys ``"vendor"``, ``"date"`` and ``"total"``:

        * ``date``: the first substring matching ``date_regex``.
        * ``total``: the matched amount with the largest numeric value
          (the first one in case of a tie), returned as it appears in the
          text, currency prefix included.
        * ``vendor``: the first ORG or PERSON entity spaCy finds in the
          first 15 lines of the text.

        A field that cannot be found is ``None``.
    """
    if not text:
        return {"vendor": None, "date": None, "total": None}

    # 1) Date: first match wins.
    date_match = date_regex.search(text)
    date_val = date_match.group(0) if date_match else None

    # 2) Total: heuristic – the largest two-decimal amount on the page.
    amounts = [m.group(0).strip() for m in amount_regex.finditer(text)]
    amount_val = max(amounts, key=_amount_to_float) if amounts else None

    # 3) Vendor: heuristic – the issuer is usually named at the top of the
    #    document, so only the first lines are run through NER.
    vendor_val = None
    for line in text.split("\n")[:15]:
        if len(line.strip()) < 3:            # too short to hold a name
            continue
        ents = [e.text for e in nlp(line).ents if e.label_ in ("ORG", "PERSON")]
        if ents:
            vendor_val = ents[0]
            break

    return {"vendor": vendor_val, "date": date_val, "total": amount_val}

########################################
# 5) File discovery & OCR
########################################
root_dir = Path("/content/invoices")
images   = []
pdfs     = []

for ext in ("*.png","*.jpg","*.jpeg"):
    images.extend(root_dir.rglob(ext))
pdfs.extend(root_dir.rglob("*.pdf"))

# Add the pages rendered from PDFs. pdf_to_images returns strings, so they are
# wrapped in Path to keep `images` homogeneous: the OCR loop below reads
# `img_path.name`, which a plain string does not have.
for pdf in tqdm(pdfs, desc="Conversion PDF→PNG"):
    images += [Path(p) for p in pdf_to_images(str(pdf), root_dir)]

# Rendered pages are written inside root_dir, so when the cell is re-run they
# are found both by the glob above and by the conversion. Drop the duplicates
# while keeping the original order.
images = list(dict.fromkeys(images))

results = []

for img_path in tqdm(images, desc="OCR & extraction"):
    # Load the image; cv2.imread returns None (no exception) when the file is
    # unreadable, so skip it rather than abort a multi-hour run.
    img = cv2.imread(str(img_path))
    if img is None:
        tqdm.write(f"⚠️  Skipping unreadable image: {img_path}")
        continue

    # Quick pre-processing: grayscale + Otsu binarisation.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    thr  = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # OCR
    raw_text = pytesseract.image_to_string(thr, lang="eng")

    # Structured extraction
    fields = extract_fields(raw_text)
    fields["file"] = img_path.name
    results.append(fields)

########################################
# 6) JSON export + preview
########################################
# Serialise every extracted record, then write the document in one go.
out_json = "/content/extracted_invoices.json"
serialized = json.dumps(results, indent=2)
with open(out_json, "w") as fh:
    fh.write(serialized)

print(f"\n✅ Extraction terminée ! {len(results)} fichiers traités.")
print(f"📄 Résultats enregistrés dans : {out_json}\n")

# Last expression of the cell: the notebook renders the first rows as a table.
preview = pd.DataFrame(results).head()
preview


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
⬆️  Uploade ton fichier kaggle.json (Settings Kaggle → Create API Token)


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/osamahosamabdellatif/high-quality-invoice-images-for-ocr
License(s): DbCL-1.0
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Conversion PDF→PNG: 0it [00:00, ?it/s]
OCR & extraction:  11%|█         | 872/8181 [46:13<6:57:26,  3.43s/it]