In [8]:
!pip install -q py_pdf_parser PyMuPDF

In [2]:
import boto3
import io
import fitz  # PyMuPDF 

In [6]:
def download_pdf_from_s3_to_memory(bucket, key):
    """Fetch a single S3 object and return its full content, held in memory.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside that bucket.

    Returns:
        The result of reading the object's ``Body`` stream to the end.
    """
    s3_object = boto3.client('s3').get_object(Bucket=bucket, Key=key)
    body_stream = s3_object['Body']
    return body_stream.read()

def _store_chunk(chunks, heading_parts, body_parts):
    """Record the chunk being built, merging it into an existing entry for the same heading."""
    # A chunk is only kept when it has both a heading and at least one body span.
    if not (heading_parts and body_parts):
        return
    heading = " ".join(heading_parts)
    body = " ".join(body_parts).strip()
    if heading in chunks:
        # The same heading text occurred earlier: append instead of overwriting,
        # otherwise the earlier section's text would be lost.
        body = f"{chunks[heading]} {body}".strip()
    chunks[heading] = body


def extract_text_with_headers(pdf_document, header_size_threshold=12, header_flag_mask=2):
    """Split a PDF's text into chunks keyed by the heading that precedes them.

    Every text span of every page is visited in order. A span counts as
    heading text when its font size is strictly greater than
    ``header_size_threshold`` or when any bit of ``header_flag_mask`` is set in
    its ``flags``; every other span is body text of the current heading.

    Rules applied while walking the spans:
      * Consecutive heading spans with no body text between them are joined
        with a single space, so a heading split over several spans or lines
        is kept whole.
      * Heading-styled spans that contain only whitespace are ignored.
      * If the same heading text occurs more than once, the bodies are
        concatenated (space separated) under that one key.
      * Body text that appears before the first heading is discarded.
      * A heading that is never followed by a body span produces no entry.
      * Chunks continue across page boundaries.

    Args:
        pdf_document: An opened PyMuPDF document, or any object exposing
            ``page_count`` and ``load_page(index)`` whose pages provide
            ``get_text("dict")``.
        header_size_threshold: Font size above which a span is a heading.
        header_flag_mask: Bit mask tested against each span's ``flags``.
            NOTE(review): in PyMuPDF's span flags, value 2 is the *italic* bit
            and bold is 16. The default is kept at 2 for backward
            compatibility; confirm whether bold (16) was intended.

    Returns:
        dict mapping heading text to the stripped body text found under it.
    """
    chunks = {}
    heading_parts = []  # spans that make up the heading currently being built
    body_parts = []     # body spans collected since that heading

    for page_number in range(pdf_document.page_count):
        page = pdf_document.load_page(page_number)
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry contribute no spans.
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    is_heading = (
                        span["size"] > header_size_threshold
                        or span["flags"] & header_flag_mask
                    )
                    if not is_heading:
                        body_parts.append(span["text"])
                        continue

                    title = span["text"].strip()
                    if not title:
                        # A blank heading-styled span must not start a new,
                        # nameless chunk that swallows the text after it.
                        continue
                    if body_parts:
                        # Body text has been seen, so this span opens a new chunk.
                        _store_chunk(chunks, heading_parts, body_parts)
                        heading_parts = [title]
                        body_parts = []
                    else:
                        # Still inside the heading itself: keep extending it.
                        heading_parts.append(title)

    # Single final flush: chunks may span pages, and flushing per page would
    # append the same text repeatedly now that repeated headings are merged.
    _store_chunk(chunks, heading_parts, body_parts)
    return chunks

def process_pdf_from_s3_and_chunk_by_headers(bucket, key):
    """Download one PDF from S3 and return its text chunked by heading.

    The PDF is fetched with ``download_pdf_from_s3_to_memory``, opened from
    memory with PyMuPDF and handed to ``extract_text_with_headers``.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the PDF object inside that bucket.

    Returns:
        The chunk mapping produced by ``extract_text_with_headers``.
    """
    file_content = download_pdf_from_s3_to_memory(bucket, key)
    # Open the document as a context manager so it is closed when extraction
    # finishes or raises; previously every processed PDF stayed open.
    with fitz.open(stream=io.BytesIO(file_content), filetype="pdf") as pdf_document:
        return extract_text_with_headers(pdf_document)

def list_pdfs_in_s3_folder(bucket, folder):
    """Return the keys of all PDF objects stored under an S3 prefix.

    The listing is paginated, so prefixes holding more objects than a single
    ``list_objects_v2`` response can carry are returned in full. The ``.pdf``
    suffix is matched case-insensitively, so keys ending in ``.PDF`` are
    included as well.

    Args:
        bucket: Name of the S3 bucket.
        folder: Key prefix to list.

    Returns:
        list of matching object keys, in listing order; empty when nothing
        matches.
    """
    s3 = boto3.client('s3')
    paginator = s3.get_paginator('list_objects_v2')

    pdf_files = []
    for page in paginator.paginate(Bucket=bucket, Prefix=folder):
        # A page carries no 'Contents' entry when it holds no objects.
        pdf_files.extend(
            obj['Key']
            for obj in page.get('Contents', [])
            if obj['Key'].lower().endswith('.pdf')
        )
    return pdf_files

def process_all_pdfs_in_folders(bucket, folders):
    """Print the heading/content chunks of every PDF under the given S3 folders.

    Folders are handled in the order given. For each PDF key returned by
    ``list_pdfs_in_s3_folder`` a "Processing PDF" line is printed, the file is
    chunked with ``process_pdf_from_s3_and_chunk_by_headers`` and every chunk
    is printed as a "Header" line followed by a "Content" line and a blank line.

    Args:
        bucket: Name of the S3 bucket.
        folders: Iterable of key prefixes to scan.

    Returns:
        None; the results are only written to standard output.
    """
    for prefix in folders:
        for pdf_key in list_pdfs_in_s3_folder(bucket, prefix):
            print(f"Processing PDF: {pdf_key}")
            sections = process_pdf_from_s3_and_chunk_by_headers(bucket, pdf_key)
            # Output or further process the chunks as needed
            for title, body in sections.items():
                print(f"Header: {title}", f"Content: {body}\n", sep="\n")

In [None]:
# S3 bucket passed to process_all_pdfs_in_folders.
bucket_name = 'canada-gen-ai'

# Key prefixes ("folders") inside that bucket that are scanned for PDFs.
folders = [
    'asset-management-documents/',
    'research paper/',
]

In [7]:
# Process every PDF under the configured folders; the function prints a
# "Processing PDF" line per file followed by "Header"/"Content" lines per chunk.
# NOTE(review): the output saved with this cell is a dict repr, which the
# print-based function defined in this notebook would not produce - it looks
# stale; re-run after Restart Kernel -> Run All to refresh it.
process_all_pdfs_in_folders(bucket_name, folders)

{'CORRELATION CO-EFFICIENT ANALYSIS METHOD': 'A thesis submitted to the Department of Electrical and Electronic Engineering in partial  fulfillment of the requirements for the degree of  Master of Science in Electrical and Electronic Engineering (EEE)',
 'ASIF ISLAM': 'Department of Electrical and Electronic Engineering (EEE)  BANGLADESH UNIVERSITY OF ENGINEERING AND YECHNOLOGY (BUET)  DHAKA    May, 2012   i',
 'Declaration': 'This is to certify that this work has been done by the undersigned and it has not been submitted  elsewhere for the award of any degree or diploma.      Signature of the Student      ‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶‚Ä¶.  (Asif Islam)    ii    The thesis titled ‚ÄúDetection of Mechanical Deformation in Old Aged Power Transformer Using  Cross Correlation Co-Efficient Analysis Method‚Äù submitted by Asif Islam, Roll ‚Äì 0409062127F,  Session ‚Äì April/2009, has been accepted as satisfactory in partial fulfillment of requirements for  the degree of Master of Science in En