In [None]:
import io
import base64
import pdfplumber
from PyPDF2 import PdfReader, PdfWriter
import logging

logger = logging.getLogger(__name__)

def process_pdf(
    decoded_pdf: bytes, 
    max_size_mb: float = 2.0, 
    max_pages: int = 30
) -> tuple[str, str]:
    """
    Extract text from a PDF and build a page- and size-limited copy of it.

    Text is read with pdfplumber from at most ``max_pages`` pages. The same
    leading pages are then re-serialised with PyPDF2, and trailing pages are
    dropped one at a time until the serialised PDF is no larger than
    ``max_size_mb`` (or no pages are left).

    Args:
        decoded_pdf: Raw bytes of the PDF file.
        max_size_mb: Upper bound, in MiB, for the re-serialised PDF.
        max_pages: Maximum number of leading pages to keep. Values <= 0
            keep no pages.

    Returns:
        ``(extracted_text, base64_pdf_string)``. Page texts are joined with
        ``"\\n"``. If pdfplumber fails, the text is a one-line error
        placeholder. If PyPDF2 cannot read the file, the *original* bytes
        are returned base64-encoded, without any page or size limiting.

    Raises:
        Errors raised by PyPDF2 while adding pages or writing the output are
        not caught here and propagate to the caller.
    """

    # --- Step 1: Extract text from up to 'max_pages' pages ---
    extracted_pages = []
    text_extracted = True
    try:
        with pdfplumber.open(io.BytesIO(decoded_pdf)) as pdfplumb:
            for idx, page in enumerate(pdfplumb.pages):
                if idx >= max_pages:
                    break
                extracted_pages.append(page.extract_text() or "")
    except Exception as e:
        logger.error("pdfplumber failed: %s", e)
        # Fallback: no text extracted. Remember that this list no longer
        # maps one entry to one page, so it must not be trimmed per page.
        text_extracted = False
        extracted_pages = ["[Error reading PDF text via pdfplumber]"]

    # --- Step 2: Open the PDF with PyPDF2 ---
    # A fresh stream is used instead of rewinding the one handed to
    # pdfplumber: depending on the pdfplumber version, leaving its context
    # manager may close the stream it was given.
    try:
        reader = PdfReader(io.BytesIO(decoded_pdf))
        # Counting pages can itself fail (parsing is partly lazy), so it is
        # covered by the same fallback as constructing the reader.
        pages_kept = max(0, min(max_pages, len(reader.pages)))
    except Exception as e:
        logger.error("PyPDF2 failed to read PDF: %s", e)
        # If PyPDF2 bombs, fallback to returning entire PDF
        return "\n".join(extracted_pages), base64.b64encode(decoded_pdf).decode()

    # --- Step 3: Serialise once, then drop trailing pages until it fits ---
    max_bytes = max_size_mb * 1024 * 1024
    final_pdf_bytes = _serialize_leading_pages(reader, pages_kept)
    while len(final_pdf_bytes) > max_bytes and pages_kept > 0:
        pages_kept -= 1
        final_pdf_bytes = _serialize_leading_pages(reader, pages_kept)

    # --- Step 4: Keep the text aligned with the pages that survived ---
    # Slicing (rather than popping once per removed page) cannot underflow
    # and leaves the error placeholder intact when extraction failed.
    if text_extracted:
        extracted_pages = extracted_pages[:pages_kept]
    extracted_text = "\n".join(extracted_pages)

    # --- Done. Return text plus base64-encoded PDF ---
    encoded_pdf = base64.b64encode(final_pdf_bytes).decode()
    return extracted_text, encoded_pdf


def _serialize_leading_pages(reader, n_pages: int) -> bytes:
    """Return the bytes of a new PDF holding the first ``n_pages`` pages of ``reader``."""
    # A new writer is built for every attempt so that only the
    # add_page()/write() calls are needed, with no in-place page removal.
    writer = PdfWriter()
    for idx in range(n_pages):
        writer.add_page(reader.pages[idx])
    buffer = io.BytesIO()
    writer.write(buffer)
    return buffer.getvalue()


In [None]:
def register_file_upload_callback(app, name):
    """
    Register the upload-handling callback for one upload widget on ``app``.

    The callback listens to the ``contents`` of the component with id
    ``"upload-data" + name`` and stores its result in the ``data`` property
    of the component with id ``"uploaded-file" + name``.
    NOTE(review): ``Output``/``Input``/``State`` and ``app.callback`` look
    like the Dash API; they are not defined in this block -- confirm.

    Args:
        app: Application object exposing a ``callback`` decorator.
        name: Suffix appended to both component ids, so several independent
            upload widgets can be registered.
    """
    @app.callback(
        Output("uploaded-file" + name, component_property="data"),
        Input("upload-data" + name, component_property="contents"),
        State("upload-data" + name, component_property="filename"),
        State("uploaded-file" + name, component_property="data"),
    )
    def upload_files(contents, filenames, current_data):
        """
        Append newly uploaded PDF/TXT files to the stored data.

        The stored value is three parallel lists:
        ``(encoded_files, file_texts, file_names)`` -- data-URL strings,
        extracted text, and original filenames. Files that cannot be decoded
        or have an unsupported extension are logged and skipped.
        """
        # Work on copies so the incoming state object is never mutated.
        encoded_files = list(current_data[0]) if current_data else []
        file_texts = list(current_data[1]) if current_data else []
        file_names = list(current_data[2]) if current_data else []

        if not contents:
            return encoded_files, file_texts, file_names

        # A single-file upload delivers bare strings instead of lists;
        # zipping those would iterate character by character.
        if isinstance(contents, str):
            contents, filenames = [contents], [filenames]

        for content, filename in zip(contents, filenames):
            # Expected shape is "<header>,<base64 payload>". partition() never
            # raises, so a malformed entry is skipped instead of aborting the
            # whole callback with an unpacking error.
            _header, separator, content_string = content.partition(",")
            if not separator:
                logger.error(
                    "Malformed upload content for %s: missing ',' separator",
                    filename,
                )
                continue

            try:
                decoded = base64.b64decode(content_string)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                logger.error("Error decoding base64 for %s: %s", filename, e)
                continue

            lowered_name = filename.lower()
            if lowered_name.endswith(".pdf"):
                logger.info("Processing PDF: %s", filename)
                # Limit the stored PDF to 30 pages / 2 MB and get its text.
                pdf_text, pdf_b64 = process_pdf(decoded, max_size_mb=2.0, max_pages=30)
                file_texts.append(pdf_text)
                encoded_files.append("data:application/pdf;base64," + pdf_b64)
                file_names.append(filename)

            elif lowered_name.endswith(".txt"):
                logger.info("Processing text file: %s", filename)
                # Undecodable bytes become U+FFFD rather than raising.
                file_texts.append(decoded.decode(errors="replace"))
                # Stored as a base64 data URL, same shape as the PDF entries.
                encoded_text = base64.b64encode(decoded).decode()
                encoded_files.append("data:text/plain;base64," + encoded_text)
                file_names.append(filename)

            else:
                logger.warning("Skipping unsupported file type: %s", filename)

        return encoded_files, file_texts, file_names
