In [3]:
pip install pdfplumber pytesseract pillow python-docx docx2pdf


Collecting docx2pdf
  Downloading docx2pdf-0.1.8-py3-none-any.whl.metadata (3.3 kB)
Downloading docx2pdf-0.1.8-py3-none-any.whl (6.7 kB)
Installing collected packages: docx2pdf
Successfully installed docx2pdf-0.1.8
Note: you may need to restart the kernel to use updated packages.


    click (>=7.0<=8.1.*)
          ~~~~~~^


In [6]:
import os
import pdfplumber
import pytesseract
from PIL import Image
from docx2pdf import convert
import shutil
import json

# Folder whose entries are scanned for input documents (.pdf and .docx).
PDF_FOLDER = "data"
# Scratch folder that receives the PDFs converted from .docx files; it is removed at the end of a run.
TEMP_FOLDER = "temp_pdf"
# Path of the JSON file the extracted texts are written to.
OUTPUT_JSON = "result.json"

# === Convert a .docx file into a .pdf ===
def convert_docx_to_pdf(docx_path, output_dir):
    """Convert one Word document to PDF inside ``output_dir``.

    Args:
        docx_path: Path of the .docx file to convert.
        output_dir: Directory handed to ``convert`` as the destination; it is
            created first if it does not exist yet.

    Returns:
        ``output_dir`` joined with the source file's base name carrying a
        ``.pdf`` extension, or ``None`` when any step raised. The path is
        computed, not checked: it assumes ``convert`` names the PDF after the
        source document, so callers should verify that the file exists.
    """
    try:
        os.makedirs(output_dir, exist_ok=True)
        convert(docx_path, output_dir)
        source_name = os.path.basename(docx_path)
        stem, _extension = os.path.splitext(source_name)
        return os.path.join(output_dir, stem + ".pdf")
    except Exception as error:
        # Best effort: report the failure and let the caller skip this document.
        print(f"❌ Erreur de conversion {docx_path} → PDF : {error}")
    return None

# === Unified extraction for every PDF (including converted DOCX files) ===
def extract_text_from_pdf(pdf_path, min_chars=20, ocr_lang="eng+fra", ocr_resolution=300):
    """Return the text of a PDF, falling back to OCR for pages with little text.

    Each page is first read with ``page.extract_text()``. When that yields
    nothing, or no more than ``min_chars`` characters once stripped, the page
    is rendered to an image and passed to Tesseract instead.

    Args:
        pdf_path: Path of the PDF file to read.
        min_chars: A page whose stripped embedded text is not longer than this
            is sent to OCR.
        ocr_lang: Language string forwarded to ``pytesseract.image_to_string``.
        ocr_resolution: Resolution passed to ``page.to_image`` before OCR.

    Returns:
        The page texts, each followed by a newline, concatenated and stripped;
        ``None`` when nothing was extracted. Errors are printed, never raised:
        a failure while opening or reading the PDF stops the extraction and
        whatever was collected so far is returned.
    """
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text and len(text.strip()) > min_chars:
                    page_texts.append(text)
                    continue

                # OCR is handled per page so that one failing page (for example
                # when Tesseract is unavailable) does not discard the pages
                # that follow it.
                try:
                    image = page.to_image(resolution=ocr_resolution).original
                    page_texts.append(pytesseract.image_to_string(image, lang=ocr_lang))
                except Exception as e:
                    print(f"❌ Erreur OCR {pdf_path} (page {page_number}) : {e}")
                    if text:
                        # Keep the short embedded text rather than losing the page.
                        page_texts.append(text)
    except Exception as e:
        print(f"❌ Erreur PDF {pdf_path} : {e}")
    return "".join(chunk + "\n" for chunk in page_texts).strip() or None

# === Main processing ===
def process_documents(folder):
    """Extract the text of every PDF and DOCX in ``folder`` and save it as JSON.

    Entries whose name ends in ``.pdf`` (case-insensitive) are read directly;
    entries ending in ``.docx`` are first converted to PDF inside
    ``TEMP_FOLDER``. Any entry that yields no text, including entries with
    another extension, is reported and counted as ignored.

    The result, a list of ``{"filename": ..., "text": ...}`` objects, is written
    to ``OUTPUT_JSON`` as UTF-8 JSON, and a summary is printed.

    Args:
        folder: Directory whose entries are processed (not recursive).

    Returns:
        None.
    """
    all_docs = []
    ignored = []

    os.makedirs(TEMP_FOLDER, exist_ok=True)
    try:
        # Sorted so the order of the JSON entries does not depend on the order
        # in which the file system happens to list the directory.
        for filename in sorted(os.listdir(folder)):
            filepath = os.path.join(folder, filename)
            lowered = filename.lower()
            text = None

            if lowered.endswith(".pdf"):
                # PDF: read as is.
                print(f"📄 PDF : {filename}")
                text = extract_text_from_pdf(filepath)
            elif lowered.endswith(".docx"):
                # DOCX: convert to PDF first, then reuse the PDF extraction.
                print(f"📄 DOCX : {filename} → PDF")
                converted_pdf = convert_docx_to_pdf(filepath, TEMP_FOLDER)
                if converted_pdf and os.path.exists(converted_pdf):
                    text = extract_text_from_pdf(converted_pdf)

            if text:
                all_docs.append({
                    "filename": filename,
                    "text": text
                })
            else:
                print(f"⚠️ Aucun texte extrait de : {filename}")
                ignored.append(filename)

        with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
            json.dump(all_docs, f, ensure_ascii=False, indent=2)
    finally:
        # Always drop the scratch folder, even when the run is interrupted or
        # fails; ignore_errors keeps a cleanup problem from masking the
        # original exception.
        shutil.rmtree(TEMP_FOLDER, ignore_errors=True)

    print(f"\n✅ {len(all_docs)} fichiers traités avec succès.")
    if ignored:
        print(f"🚫 {len(ignored)} fichiers ignorés :", ignored)

# Entry point: run the pipeline on the configured input folder when this code is executed directly.
if __name__ == "__main__":
    process_documents(PDF_FOLDER)


📄 DOCX : CV AHMED DIOUF DIRIEH DIBAD AVRIL 2024 VF.docx → PDF


100%|██████████| 1/1 [00:03<00:00,  3.25s/it]


📄 PDF : CV Anis JENHANI_EU_fr_Expert SIG.pdf
📄 DOCX : CV FR BM Ahmed Faresse_Consultant international Mars25.docx → PDF


100%|██████████| 1/1 [00:04<00:00,  4.36s/it]


📄 DOCX : CV Habib BEN ALI .docx → PDF


100%|██████████| 1/1 [00:03<00:00,  3.99s/it]


📄 DOCX : CV HmidaKarboul_V2.docx → PDF


100%|██████████| 1/1 [00:04<00:00,  4.48s/it]


📄 PDF : CV Jalel ZAGRANI Ex DSI BCT.pdf
📄 DOCX : CV Mohamed Anis BEN ROMDHANE -FR.docx → PDF


100%|██████████| 1/1 [00:04<00:00,  4.14s/it]


📄 DOCX : CV Mouhsine LAKHDISSI.docx → PDF


100%|██████████| 1/1 [00:04<00:00,  4.51s/it]


📄 PDF : CV Saber Jaffel.PDF
📄 DOCX : CV-Ali Kedhai-Expert SI Bancaire.docx → PDF


100%|██████████| 1/1 [00:03<00:00,  3.88s/it]


📄 PDF : CV-Expert T24.pdf
📄 PDF : CV-NIANG-Oumar-Seydou-fr.pdf
📄 PDF : CV. OMAR FEHRI FR..pdf
📄 PDF : CV_HatemGhazala_02112023.pdf

✅ 14 fichiers traités avec succès.
