In [12]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image,UnidentifiedImageError
import io
import os
from tqdm import tqdm
from pdf2image import convert_from_path
import pytesseract
from dask import delayed, compute
import dask

In [14]:
def extract_text_with_ocr_alternative(pdf_path, dpi=150, lang=None):
    """OCR every page of a PDF and return the concatenated text.

    The PDF is rasterised with ``convert_from_path`` and each resulting page
    image is passed to ``pytesseract.image_to_string``. Processing is
    best-effort: a page whose OCR raises is reported on stdout and skipped,
    and a PDF that cannot be rasterised at all yields an empty string.

    Args:
        pdf_path: Path of the PDF file to OCR.
        dpi: Rasterisation resolution handed to ``convert_from_path``.
            Defaults to 150, the value that used to be hard-coded.
        lang: Optional Tesseract language code(s) (e.g. ``"spa"``) forwarded
            to ``image_to_string``. ``None`` keeps pytesseract's default.

    Returns:
        str: Text of all successfully OCR'd pages, in page order; ``''`` when
        nothing could be extracted.
    """
    try:
        pages = convert_from_path(pdf_path, dpi)  # Convert PDF to list of images
    except Exception as e:
        print(f"Error converting PDF to images: {e}")
        return ''

    # Collect per-page text and join once instead of growing a string with +=.
    page_texts = []
    for page in pages:
        try:
            page_texts.append(pytesseract.image_to_string(page, lang=lang))
        except Exception as e:
            # One unreadable page must not discard the rest of the document.
            print(f"Error during OCR processing: {e}")
    return ''.join(page_texts)


In [18]:
def convert_pdf_to_text(pdf_path):
    """Return the embedded text layer of a PDF as a single string.

    Opens the file with ``fitz.open`` and concatenates ``page.get_text()``
    for every page, in document order, with no separator added.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The concatenated page text (``''`` for a document with no pages
        or no extractable text).

    Raises:
        Exception: Whatever ``fitz.open`` or ``get_text`` raises is propagated
        to the caller; nothing is caught here.
    """
    # The context manager closes the document even when a page fails to
    # extract; a trailing doc.close() would be skipped in that case.
    with fitz.open(pdf_path) as doc:
        return ''.join(page.get_text() for page in doc)


In [26]:
def is_scanned_pdf_with_fitz(pdf_path, min_avg_chars=100):
    """Heuristically decide whether a PDF lacks a usable text layer.

    The document is opened with ``fitz.open`` and the length of
    ``page.get_text()`` is summed over all pages (whitespace included). The
    PDF is treated as scanned when no characters were found at all or when
    the average number of characters per page is below ``min_avg_chars``.

    Args:
        pdf_path: Path of the PDF file to inspect.
        min_avg_chars: Minimum average characters per page for the PDF to
            count as text-based. Defaults to 100, the value that used to be
            hard-coded.

    Returns:
        bool | None: ``True`` if the PDF looks scanned (this includes
        documents with zero pages), ``False`` if it has enough extractable
        text, and ``None`` if the file could not be opened or read (the
        error is printed, not raised).
    """
    try:
        # ``with`` guarantees the document is closed even when a page raises
        # during extraction, which a trailing doc.close() does not.
        with fitz.open(pdf_path) as doc:
            num_pages = len(doc)
            total_chars = sum(len(page.get_text()) for page in doc)
    except Exception as e:
        print(f"An error occurred with {pdf_path}: {e}")
        return None

    avg_chars_per_page = total_chars / num_pages if num_pages > 0 else 0

    # "No text at all" is checked separately so it still counts as scanned
    # when a caller passes a threshold of 0.
    return total_chars == 0 or avg_chars_per_page < min_avg_chars

In [28]:
def count_pdf_types(folder_path):
    """Tally the PDFs directly inside a folder by classification result.

    Every directory entry whose name ends in ``.pdf`` (compared
    case-insensitively, so ``.PDF`` files are counted too) is passed to
    ``is_scanned_pdf_with_fitz``; other entries are ignored. Sub-folders are
    not descended into.

    Args:
        folder_path: Directory to scan.

    Returns:
        tuple[int, int, int]: ``(scanned_count, text_count, error_count)``
        where a result of ``True`` counts as scanned, ``False`` as text, and
        anything else (``None`` on read errors) as an error.
    """
    scanned_count = text_count = error_count = 0

    for filename in tqdm(os.listdir(folder_path)):
        # Lower-casing the name keeps files such as "REPORT.PDF" from being
        # silently left out of all three counters.
        if not filename.lower().endswith(".pdf"):
            continue

        result = is_scanned_pdf_with_fitz(os.path.join(folder_path, filename))
        if result is True:
            scanned_count += 1
        elif result is False:
            text_count += 1
        else:
            error_count += 1

    return scanned_count, text_count, error_count

# Example usage: classify every PDF in the folder and report the three tallies.
folder_path = './files'  # Update with the path to your folder
scanned_count, text_count, error_count = count_pdf_types(folder_path)
print(
    f"Scanned PDFs: {scanned_count}",
    f"Text PDFs: {text_count}",
    f"Files with errors: {error_count}",
    sep="\n",
)

  0%|          | 0/1898 [00:00<?, ?it/s]

An error occurred with ./files/52001312100220160024100 Los Andes 3 mayo 2022.pdf: Cannot open empty file: filename='./files/52001312100220160024100 Los Andes 3 mayo 2022.pdf'.
An error occurred with ./files/52001312100220160005300 Pasto 30 junio 2022.pdf: Cannot open empty file: filename='./files/52001312100220160005300 Pasto 30 junio 2022.pdf'.
An error occurred with ./files/52001312100120180011400 El Tambo 8 junio 2022.pdf: Cannot open empty file: filename='./files/52001312100120180011400 El Tambo 8 junio 2022.pdf'.
An error occurred with ./files/52001312100120180002800 Ipiales 9 mayo 2022.pdf: Cannot open empty file: filename='./files/52001312100120180002800 Ipiales 9 mayo 2022.pdf'.
An error occurred with ./files/47001312100220180005501 Tenerife 11 julio 2022.pdf: Cannot open empty file: filename='./files/47001312100220180005501 Tenerife 11 julio 2022.pdf'.
An error occurred with ./files/52001312100120210001000 El Tablon de Gomez 1 junio 2022.pdf: Cannot open empty file: filename='

In [23]:
import os
from tqdm.auto import tqdm  # Assuming you're using tqdm for progress indication

# Assumes is_scanned_pdf_with_fitz, extract_text_with_ocr_alternative and
# convert_pdf_to_text have already been defined in earlier cells.

def process_pdf_folder(source_folder, target_folder):
    """Extract text from every PDF in ``source_folder`` into ``target_folder``.

    Each entry whose name ends in ``.pdf`` (case-insensitive) is classified
    with ``is_scanned_pdf_with_fitz``. Scanned PDFs go through
    ``extract_text_with_ocr_alternative``, the others through
    ``convert_pdf_to_text``. Non-empty results are written as UTF-8 to
    ``<name>.txt`` in ``target_folder``, which is created if missing.
    Progress and errors are printed; errors on one file never abort the run.

    Args:
        source_folder: Directory containing the PDFs.
        target_folder: Directory that receives the ``.txt`` files.

    Returns:
        int: Number of PDFs for which no text file was written (unreadable
        file, empty extraction result, or an exception while processing).
    """
    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(target_folder, exist_ok=True)

    pdf_suffix = ".pdf"
    num_skipped = 0

    for filename in tqdm(os.listdir(source_folder)):
        if not filename.lower().endswith(pdf_suffix):
            continue

        pdf_path = os.path.join(source_folder, filename)
        # Swap only the trailing extension; str.replace would also rewrite a
        # ".pdf" occurring earlier in the name (e.g. "a.pdf.copy.pdf").
        text_name = filename[:-len(pdf_suffix)] + ".txt"
        text_file_path = os.path.join(target_folder, text_name)

        try:
            # Determine if the PDF is scanned
            is_scanned = is_scanned_pdf_with_fitz(pdf_path)
            if is_scanned is None:
                # None means the classifier could not read the file (it has
                # already printed why). Treating it as falsy would send the
                # file down the text-extraction path only to fail again.
                print(f"Skipping {filename}: file could not be read.")
                num_skipped += 1
                continue

            if is_scanned:
                print(f"Processing {filename} with OCR...")
                extracted_text = extract_text_with_ocr_alternative(pdf_path)
            else:
                print(f"Processing {filename} by extracting text...")
                extracted_text = convert_pdf_to_text(pdf_path)

            # Save the extracted text to a new text file
            if extracted_text:
                with open(text_file_path, 'w', encoding='utf-8') as text_file:
                    text_file.write(extracted_text)
                print(f"Saved extracted text to {text_file_path}")
            else:
                print(f"No text extracted from {filename}.")
                num_skipped += 1
        except Exception as e:
            print(f"Error processing file {filename}: {e}")
            num_skipped += 1

    return num_skipped
