In [None]:
!pip install pytesseract pdfplumber python-docx openpyxl pymupdf


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.6-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfminer.six==20250327 (from pdfplumber)
  Downloading pdfminer_six-20250327-py3-none-any.whl.metadata (4.1 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading pdfplumber-0.11.6-py3

In [None]:
import os
import logging
import pytesseract
import pdfplumber
import fitz
from PIL import Image
from docx import Document
from openpyxl import load_workbook
from pathlib import Path


# Route log records of level INFO and above to 'extraction_log.log' (a relative
# path, so it is resolved against the current working directory), each line
# carrying a timestamp and the level name.
# NOTE(review): logging.basicConfig does nothing when the root logger already
# has handlers — confirm the log file is actually created in this environment.
logging.basicConfig(
    filename='extraction_log.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Lower-case file suffixes (with the leading dot) that
# extract_all_text_from_folder accepts; every other file is skipped.
SUPPORTED_EXTENSIONS = ['.pdf', '.docx', '.xlsx']

def extract_text_from_pdf(filepath):
    """Extract the text of a PDF, falling back to OCR when no text is found.

    Every page is read with pdfplumber and the page texts are joined with a
    newline, so the last word of one page is not fused with the first word of
    the next page. When the combined text is empty or whitespace-only, the
    file is handed to ``extract_text_from_pdf_with_ocr`` instead.

    Args:
        filepath: Location of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        The extracted text, or "" when any step raised (the error is logged).
    """
    try:
        with pdfplumber.open(filepath) as pdf:
            # A page may yield a falsy value (None / ""); normalise it to "".
            page_texts = [page.extract_text() or '' for page in pdf.pages]
        text = '\n'.join(page_texts)
        if not text.strip():
            # Nothing usable was extracted, so try OCR on the rendered pages.
            text = extract_text_from_pdf_with_ocr(filepath)
        return text
    except Exception as e:
        logging.error(f"Failed to extract PDF text: {filepath} - {e}")
        return ""

def extract_text_from_pdf_with_ocr(filepath):
    """OCR every page of a PDF and return the recognised text.

    Each page is rendered to a pixmap at 300 dpi with PyMuPDF (``fitz``),
    wrapped in a PIL image and passed to ``pytesseract.image_to_string``.

    Args:
        filepath: Location of the PDF, passed unchanged to ``fitz.open``.

    Returns:
        The OCR output of all pages concatenated in page order, or "" when any
        step raised (the error is logged).
    """
    try:
        page_texts = []
        # The context manager closes the document even when rendering or OCR
        # fails part-way through, so the file handle is never leaked.
        with fitz.open(filepath) as doc:
            for page in doc:
                pix = page.get_pixmap(dpi=300)
                # NOTE(review): "RGB" assumes the pixmap has three channels and
                # no alpha, which is PyMuPDF's default — confirm if options change.
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(pytesseract.image_to_string(image))
        return ''.join(page_texts)
    except Exception as e:
        logging.error(f"OCR failed for PDF: {filepath} - {e}")
        return ""

def extract_text_from_docx(filepath):
    """Return the paragraph text of a .docx file, one paragraph per line.

    Only the document's ``paragraphs`` collection is read.

    Args:
        filepath: Location of the .docx file, passed unchanged to ``Document``.

    Returns:
        The paragraph texts joined with newlines, or "" when opening or
        reading the document raised (the error is logged).
    """
    try:
        paragraph_texts = []
        for paragraph in Document(filepath).paragraphs:
            paragraph_texts.append(paragraph.text)
        return '\n'.join(paragraph_texts)
    except Exception as e:
        logging.error(f"Failed to extract DOCX text: {filepath} - {e}")
        return ""

def extract_text_from_xlsx(filepath):
    """Flatten every worksheet of an .xlsx workbook into plain text.

    The workbook is loaded with ``data_only=True``. Each row becomes one line
    holding the ``str()`` of its non-None cells separated by single spaces; a
    row whose cells are all None therefore yields an empty line.

    Args:
        filepath: Location of the workbook, passed to ``load_workbook``.

    Returns:
        All rendered rows, each terminated by a newline, or "" when loading or
        reading the workbook raised (the error is logged).
    """
    try:
        rendered_rows = []
        for sheet in load_workbook(filepath, data_only=True).worksheets:
            for row in sheet.iter_rows(values_only=True):
                cells = (str(value) for value in row if value is not None)
                rendered_rows.append(' '.join(cells) + '\n')
        return ''.join(rendered_rows)
    except Exception as e:
        logging.error(f"Failed to extract XLSX text: {filepath} - {e}")
        return ""

def extract_all_text_from_folder(folder_path='uploads'):
    """Extract text from every supported document below a folder.

    The folder is walked recursively. Entries whose lower-cased suffix is in
    ``SUPPORTED_EXTENSIONS`` are passed to the matching extractor; directories
    are skipped even when their name ends in a supported suffix, so they are
    never handed to an extractor. Entries are visited in sorted order so the
    returned dict has a deterministic key order.

    Args:
        folder_path: Folder to scan; defaults to 'uploads'.

    Returns:
        A dict mapping ``str(path)`` of each processed file to its extracted
        text. Empty when the folder is missing or is not a directory (an
        error is logged). A file whose extractor raises is logged and left
        out of the dict.
    """
    extracted_data = {}
    folder = Path(folder_path)

    # is_dir() is False for a missing path as well as for a non-directory.
    if not folder.is_dir():
        logging.error(f"Folder not found: {folder_path}")
        return extracted_data

    # Suffix -> extractor dispatch table, replacing the if/elif chain.
    extractors = {
        '.pdf': extract_text_from_pdf,
        '.docx': extract_text_from_docx,
        '.xlsx': extract_text_from_xlsx,
    }

    for filepath in sorted(folder.rglob("*")):
        suffix = filepath.suffix.lower()
        if suffix not in SUPPORTED_EXTENSIONS or not filepath.is_file():
            continue

        extractor = extractors.get(suffix)
        if extractor is None:
            # Listed as supported but no extractor is wired up for it.
            continue

        try:
            logging.info(f"Processing file: {filepath}")
            extracted_data[str(filepath)] = extractor(filepath)
        except Exception as e:
            logging.error(f"Unhandled error while processing {filepath}: {e}")

    return extracted_data

# Run the extraction here so that `results` exists on a fresh kernel; nothing
# else in this cell defines it. The function's default folder ('uploads') is used.
results = extract_all_text_from_folder()

os.makedirs("extracted_texts", exist_ok=True)

# Output names already written in this run, lower-cased so the check also
# holds on case-insensitive file systems.
used_names = set()

for filepath, content in results.items():
    filename = os.path.basename(filepath)
    base_name = os.path.splitext(filename)[0]

    # Sources such as report.pdf and report.docx (or same-named files in
    # different subfolders) share a base name; add a numeric suffix instead of
    # silently overwriting the earlier output.
    output_name = base_name + ".txt"
    duplicate_index = 1
    while output_name.lower() in used_names:
        duplicate_index += 1
        output_name = f"{base_name}_{duplicate_index}.txt"
    used_names.add(output_name.lower())

    output_path = os.path.join("extracted_texts", output_name)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(content)

print(" Tous les fichiers extraits ont été sauvegardés dans le dossier 'extracted_texts/'")


 Tous les fichiers extraits ont été sauvegardés dans le dossier 'extracted_texts/'
