In [4]:
!pip install pdfminer.six

[0m

In [1]:
import os
from pdfminer.high_level import extract_text

In [2]:
def clear_directory(directory):
    """Delete every regular file located directly inside ``directory``.

    Entries that are not files (for example subdirectories) are left in
    place, and nothing below the top level is visited.
    """
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    for target in filter(os.path.isfile, candidates):
        os.remove(target)

def split_text_to_chunks(text, chunk_size, overlap):
    """Split ``text`` into consecutive chunks that overlap their successor.

    Chunk starts advance by ``chunk_size`` characters. Each chunk also carries
    up to ``overlap`` additional characters past its nominal end (i.e. the
    beginning of the following chunk), truncated at the end of ``text``.

    Args:
        text: The string to split.
        chunk_size: Number of characters between chunk starts; must be > 0.
        overlap: Number of extra trailing characters per chunk; must be >= 0.

    Returns:
        The chunks in order, as a list of strings; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        # A non-positive step never moves past the end of the text, which
        # would otherwise loop forever.
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        # A negative overlap would shorten chunks and silently drop text.
        raise ValueError("overlap must be non-negative")

    # Slicing clamps at len(text), so the overlap is applied uniformly without
    # a bounds check; chunks near the end simply get whatever text remains.
    return [
        text[start:start + chunk_size + overlap]
        for start in range(0, len(text), chunk_size)
    ]

def save_chunks_to_txt(chunks, base_filename, output_dir):
    """Write each chunk to its own UTF-8 text file inside ``output_dir``.

    Files are named ``<base_filename>_<n>.txt`` where ``n`` counts the chunks
    starting from 1. An existing file with the same name is overwritten.
    """
    index = 0
    for piece in chunks:
        index += 1
        target = os.path.join(output_dir, f"{base_filename}_{index}.txt")
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(piece)

def main(pdf_path, chunk_size, overlap, output_dir):
    """Extract the text of one PDF and save it as a series of chunk files.

    Args:
        pdf_path: Path of the PDF file to read.
        chunk_size: Chunk size forwarded to ``split_text_to_chunks``.
        overlap: Overlap forwarded to ``split_text_to_chunks``.
        output_dir: Directory the chunk files are written into.
    """
    # Extract text from the PDF
    extracted_text = extract_text(pdf_path)

    # Split the extracted text into chunks considering the overlap
    chunks = split_text_to_chunks(extracted_text, chunk_size, overlap)

    # Strip only the final extension: "report.v2.pdf" becomes "report.v2".
    # Splitting on the first dot would map "report.v1.pdf" and "report.v2.pdf"
    # to the same base name, so their chunk files would overwrite each other.
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0]

    # Save each chunk to a .txt file
    save_chunks_to_txt(chunks, base_filename, output_dir)

In [4]:
if __name__ == "__main__":
    data_directory = './data'
    desired_chunk_size = 1024  # Characters between the starts of consecutive chunks.
    token_overlap = 128  # Extra characters each chunk carries past its end (counted in characters, not tokens).
    output_directory = './output'

    # Start every run from an output directory with no leftover files:
    # create it when missing, then remove any files from a previous run.
    os.makedirs(output_directory, exist_ok=True)
    clear_directory(output_directory)

    # Iterate over all PDF files in the data directory. Sorting gives a
    # reproducible processing order (os.listdir order is arbitrary), and the
    # case-insensitive match also picks up files named like "SCAN.PDF".
    for pdf_file in sorted(os.listdir(data_directory)):
        if pdf_file.lower().endswith('.pdf'):
            pdf_file_path = os.path.join(data_directory, pdf_file)
            main(pdf_file_path, desired_chunk_size, token_overlap, output_directory)