In [45]:
from PyPDF2 import PdfReader
from tqdm import tqdm


def _merge_wrapped_lines(text):
    """Re-join lines of ``text`` that look like soft-wrapped continuations.

    A line is appended (with a single space) to the previous output line when
    it is non-empty, starts with a lowercase letter, and the previous output
    line does not end with a period. All other lines start a new output line.

    Args:
        text: Page text with lines separated by ``'\\n'``.

    Returns:
        The text with continuation lines merged, still ``'\\n'``-separated.
    """
    lines = text.split('\n')
    # ''.split('\n') == [''], so index 0 always exists, even for empty text.
    merged = [lines[0]]
    for line in lines[1:]:
        if line and line[0].islower() and not merged[-1].endswith('.'):
            merged[-1] += ' ' + line
        else:
            merged.append(line)
    return '\n'.join(merged)


def extract_text_from_pdf(input_pdf_path, output_txt_path='book_content.txt'):
    """Extract the text of every page of a PDF into a single UTF-8 text file.

    The destination file name is the input path with a trailing ``.pdf``
    removed, followed by ``-`` and ``output_txt_path`` (for example
    ``'LLM.pdf'`` -> ``'LLM-book_content.txt'``). Each page is written as
    ``"Page N:"`` followed by its text and a blank line, after merging
    soft-wrapped lines (see ``_merge_wrapped_lines``).

    Args:
        input_pdf_path: Path of the PDF to read.
        output_txt_path: Suffix appended (after a ``-``) to the input path's
            base name to form the destination file name.

    Returns:
        None. The text is written to disk and a summary line is printed.
    """
    # Build the destination outside the f-string: reusing the same quote
    # character inside an f-string replacement field is a SyntaxError before
    # Python 3.12. Only a *trailing* '.pdf' is stripped, so a '.pdf' occurring
    # earlier in the path (e.g. in a directory name) no longer truncates it.
    pdf_path = str(input_pdf_path)
    suffix = '.pdf'
    base_path = pdf_path[:-len(suffix)] if pdf_path.endswith(suffix) else pdf_path
    destination_path = f'{base_path}-{output_txt_path}'

    # Creating a pdf reader object
    reader = PdfReader(input_pdf_path)

    with open(destination_path, 'w', encoding='utf-8') as output_file:
        # Use tqdm to iterate through all pages with a progress bar
        for page_number in tqdm(range(len(reader.pages)), desc="Extracting Pages"):
            # Guard against a page that yields no text at all (None).
            text = reader.pages[page_number].extract_text() or ''

            output_file.write("Page {}:\n{}\n\n".format(
                page_number + 1, _merge_wrapped_lines(text)))

    # Report the file that was actually written, not just the suffix argument.
    print(f"Text extracted from all pages and saved to '{destination_path}'")


# Example usage: reads 'LLM.pdf' and writes the text to 'LLM-book_content.txt'
# (the input name minus '.pdf', then '-', then the default output_txt_path).
extract_text_from_pdf('LLM.pdf')

Extracting Pages: 100%|██████████| 361/361 [00:04<00:00, 87.44it/s]

Text extracted from all pages and saved to 'page_extracted.txt'





In [44]:
def join_lines(input_file_path, output_file_path):
    """Merge lines that end with a space into the preceding line.

    Each line of the input file has its trailing newline removed. A line whose
    remaining text ends with a space is concatenated (no separator added) onto
    the previous output line; any other line starts a new output line. If the
    very first line ends with a space there is no previous line to extend, so
    it starts the output instead of raising ``IndexError``.

    The result is written ``'\\n'``-joined, without a trailing newline.

    Args:
        input_file_path: Path of the UTF-8 text file to read.
        output_file_path: Path of the UTF-8 text file to write. May be the
            same as ``input_file_path``.

    Returns:
        None.
    """
    # Read everything before opening the output: opening with 'w' truncates,
    # which would wipe the input first if both paths name the same file.
    with open(input_file_path, 'r', encoding='utf-8') as input_file:
        lines = [raw_line.rstrip('\n') for raw_line in input_file]

    processed_lines = []
    for line in lines:
        # Only extend a previous line when one exists.
        if line.endswith(' ') and processed_lines:
            processed_lines[-1] += line
        else:
            processed_lines.append(line)

    # Writing the processed lines to the output file
    with open(output_file_path, 'w', encoding='utf-8') as output_file:
        output_file.write('\n'.join(processed_lines))


# Example usage: post-process the file produced by extract_text_from_pdf('LLM.pdf'),
# which is written as 'LLM-book_content.txt' (input base name + '-' + default suffix),
# so that this cell works on a fresh Restart & Run All.
join_lines('LLM-book_content.txt', 'book_content_processed.txt')