In [None]:
def register_file_upload_callback(app, name):
    """
    Register the upload-processing callback for one upload widget on ``app``.

    Component ids are built by suffixing ``name``: the callback is triggered by
    the ``contents`` property of ``'upload-data' + name`` and writes to the
    ``data`` property of ``'uploaded-file' + name``.

    Args:
        app: Object exposing a ``callback`` decorator (presumably a Dash app).
        name: Suffix appended to the component ids.
    """
    @app.callback(
        Output('uploaded-file' + name, component_property='data'),
        Input('upload-data' + name, component_property='contents'),
        State('upload-data' + name, component_property='filename'),
        State('uploaded-file' + name, component_property='data')
    )
    def upload_files(contents, filenames, current_data):
        """
        Process user-uploaded files (PDFs, TXTs) and append them to the store.

        Args:
            contents: One data-URL string, or a list of them, as delivered by
                the upload component; falsy when nothing was uploaded.
            filenames: The filename (or list of filenames) matching ``contents``.
            current_data: The previously stored value (three parallel lists),
                or a falsy value when nothing has been stored yet.

        Returns:
            Three parallel lists:
                encoded_files  # data-URL strings (for display in an iframe, e.g.)
                file_texts     # the text extracted from each file
                file_names     # the actual filenames
            Files that cannot be decoded or have an unsupported extension are
            logged and skipped.
        """
        # Copy the stored lists so the incoming state object is never mutated.
        encoded_files = list(current_data[0]) if current_data else []
        file_texts = list(current_data[1]) if current_data else []
        file_names = list(current_data[2]) if current_data else []

        if not contents:
            return encoded_files, file_texts, file_names

        # A single-file upload arrives as plain strings; wrap them so the loop
        # below does not iterate over individual characters.
        if isinstance(contents, str):
            contents = [contents]
        if isinstance(filenames, str):
            filenames = [filenames]
        filenames = filenames or []

        for content, filename in zip(contents, filenames):
            # Only the first comma separates the data-URL header from the payload.
            _header, separator, content_string = content.partition(',')
            if not separator:
                logger.error(f"Malformed upload content for {filename}: no data-URL header")
                continue

            try:
                decoded = base64.b64decode(content_string)
            except ValueError as e:  # binascii.Error is a ValueError subclass
                logger.error(f"Error decoding base64 content for {filename}: {e}")
                continue

            lowered_name = filename.lower()

            # Process PDF file
            if lowered_name.endswith('.pdf'):
                logger.info(f"STARTING PDF EXTRACTION: {filename}")
                # text_from_pdf: extracted text
                # pdf_data_encoded: base64-encoded PDF, possibly truncated to max_size_mb
                text_from_pdf, pdf_data_encoded = process_pdf(decoded, max_size_mb=2.0)

                file_texts.append(text_from_pdf if text_from_pdf else "")
                encoded_files.append('data:application/pdf;base64,' + pdf_data_encoded)
                file_names.append(filename)
                logger.info(f"FINISHED PDF EXTRACTION: {filename}")

            # Process TXT file
            elif lowered_name.endswith('.txt'):
                logger.info(f"Reading plain text: {filename}")
                file_texts.append(decoded.decode(errors='replace'))
                # The payload is base64, so the data URL must carry the
                # ';base64' marker; without it a consumer would treat the
                # base64 characters as the literal text.
                encoded_files.append(
                    'data:text/plain;charset=utf-8;base64,' + base64.b64encode(decoded).decode()
                )
                file_names.append(filename)

            else:
                logger.warning(f"Unsupported file type: {filename}")

        return encoded_files, file_texts, file_names


def process_pdf(decoded_pdf: bytes, max_size_mb: float = 1.0) -> Tuple[str, str]:
    """
    Extract the text of a PDF and build a size-limited base64 copy of it.

    - Text is extracted page by page with pdfplumber (pages that yield no
      text contribute an empty string).
    - The PDF is re-assembled page by page with PyPDF2 and cut off before the
      first page that would push the serialised size above ``max_size_mb``,
      so the front-end is not handed a huge document to display.

    Args:
        decoded_pdf: Raw bytes of the PDF file.
        max_size_mb: Maximum size, in MiB, of the PDF that gets encoded.

    Returns:
      (extracted_text, base64_pdf_string). If PyPDF2 cannot read or rebuild
      the document, the whole original PDF is encoded instead (no size limit).
    """
    page_texts = []

    # --- Extract Text Safely Using pdfplumber ---
    # Each library gets its own BytesIO so neither depends on the other
    # leaving the stream open or rewound.
    try:
        with pdfplumber.open(io.BytesIO(decoded_pdf)) as pdfplumb:
            for page in pdfplumb.pages:
                page_texts.append(page.extract_text() or "")  # handle None returns
    except Exception as e:
        logger.error(f"pdfplumber failed to read PDF: {e}")
        page_texts.append("[Error reading PDF text via pdfplumber]")

    extracted_text = "\n".join(page_texts)

    # --- Potentially limit PDF size for embedding ---
    try:
        reader = PdfReader(io.BytesIO(decoded_pdf))
    except Exception as e:
        logger.error(f"PyPDF2 failed to read PDF: {e}")
        # If PyPDF2 fails, fall back to returning the entire (original) PDF as base64
        return extracted_text, base64.b64encode(decoded_pdf).decode()

    try:
        writer = PdfWriter()
        # Last serialisation that stayed within the limit. Keeping these bytes
        # avoids having to remove a page from the writer after the fact.
        final_pdf_bytes = None
        current_size_mb = 0.0
        for i, page in enumerate(reader.pages):
            writer.add_page(page)
            temp_stream = io.BytesIO()
            writer.write(temp_stream)
            pdf_chunk = temp_stream.getvalue()
            size_in_mb = len(pdf_chunk) / (1024 * 1024)
            if size_in_mb > max_size_mb:
                # This page pushed the document over the limit: stop and keep
                # the previous within-limit serialisation.
                logger.warning(
                    f"Max size reached at page {i}. Final embedded PDF size ~ {current_size_mb:.2f} MB."
                )
                break
            final_pdf_bytes = pdf_chunk
            current_size_mb = size_in_mb

        if final_pdf_bytes is None:
            # No page fit (or the PDF has no pages): emit a page-less PDF.
            empty_stream = io.BytesIO()
            PdfWriter().write(empty_stream)
            final_pdf_bytes = empty_stream.getvalue()
    except Exception as e:
        # Page access or writing can fail too (e.g. encrypted or damaged
        # files); use the same fallback as a failed PdfReader construction.
        logger.error(f"PyPDF2 failed to build size-limited PDF: {e}")
        return extracted_text, base64.b64encode(decoded_pdf).decode()

    encoded_pdf = base64.b64encode(final_pdf_bytes).decode()
    return extracted_text, encoded_pdf