Install the `pypdf` library, which is used to read PDF files and extract their content. It provides the PDF-parsing functionality that the text extraction tasks in this notebook rely on.

In [None]:
%pip install pypdf

Define the function `extract_pdf_pages`, which takes an input folder containing PDF files and an output folder in which to save the extracted text. The function iterates over each PDF file in the input folder, reads it with `pypdf`, and saves each page's text to a separate text file (`Page_<n>.txt`, with the page number zero-padded) inside a subfolder of the output folder named after the PDF file. The example at the end of the cell shows how to call the function with specific input and output directories.

In [None]:
import os
from pypdf import PdfReader

def extract_pdf_pages(input_folder, output_folder):
    """Extract the text of every PDF in a folder, one text file per page.

    For each PDF file found directly inside ``input_folder``, a subfolder
    named after the PDF (its file name without the extension) is created in
    ``output_folder``. The text of each page is written to ``Page_<n>.txt``
    inside that subfolder, where ``<n>`` is the 1-based page number,
    zero-padded to the width of the document's page count so the files sort
    in page order.

    Args:
        input_folder: Path of the directory to scan for PDF files. Only its
            direct entries are considered; subdirectories are not searched.
        output_folder: Path of the directory that receives the per-PDF
            subfolders. It is created if it does not exist.

    Raises:
        OSError: If ``input_folder`` cannot be listed, or an output folder
            or file cannot be created.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sort so files are processed in a deterministic order; os.listdir
    # returns entries in arbitrary order.
    for pdf_file in sorted(os.listdir(input_folder)):
        pdf_path = os.path.join(input_folder, pdf_file)

        # Match the extension case-insensitively ('.PDF' is common) and skip
        # directories that merely happen to be named like a PDF.
        if not pdf_file.lower().endswith('.pdf') or not os.path.isfile(pdf_path):
            continue

        # One output subfolder per PDF, named after the file without extension.
        pdf_output_folder = os.path.join(output_folder, os.path.splitext(pdf_file)[0])
        os.makedirs(pdf_output_folder, exist_ok=True)

        reader = PdfReader(pdf_path)
        # Padding width depends only on the page count, so compute it once.
        digits = len(str(len(reader.pages)))

        for page_number, page in enumerate(reader.pages, start=1):
            output_file_path = os.path.join(
                pdf_output_folder, f'Page_{str(page_number).zfill(digits)}.txt'
            )
            with open(output_file_path, 'w', encoding='utf-8') as output_file:
                output_file.write(page.extract_text())

# Example usage: read every PDF in ./Input and write its pages as text
# files under ./ToText.
input_folder, output_folder = './Input', './ToText'
extract_pdf_pages(input_folder=input_folder, output_folder=output_folder)
