In [3]:
import io
import logging
from typing import Tuple, List, Optional
from dataclasses import dataclass
from contextlib import contextmanager
import pdfplumber
from PyPDF2 import PdfReader
import base64

# Module-level logger named after this module; the file-processing code below
# reports extraction fallbacks and per-file failures through it.
logger = logging.getLogger(__name__)

@dataclass
class FileProcessingResult:
    """Outcome of processing a batch of uploaded files.

    The three lists are parallel: index ``i`` of each one refers to the same
    successfully processed file.
    """

    # Data-URL strings ("data:<mime>...") for each processed file.
    encoded_files: List[str]
    # Text extracted or decoded from each processed file.
    file_contents: List[str]
    # Original filename of each processed file.
    file_names: List[str]
    # Optional error description; None when no error is recorded.
    error: Optional[str] = None

class FileSizeError(Exception):
    """Raised when a file's byte content exceeds the allowed size limit."""

class FileTypeError(Exception):
    """Raised when a filename's extension is not one of the supported types."""

class PDFProcessor:
    """Validate uploaded files and extract plain text from PDF bytes.

    Validation helpers are static and raise ``FileSizeError`` /
    ``FileTypeError``; ``process_pdf`` extracts text with pdfplumber and
    falls back to PyPDF2 when pdfplumber raises.
    """

    # Upper bound on a file's size, in units of 1024 * 1024 bytes.
    MAX_FILE_SIZE_MB = 10
    # Accepted filename extensions, lower-case and without the leading dot.
    SUPPORTED_TYPES = ['pdf', 'txt']

    @staticmethod
    def validate_file_size(content: bytes) -> None:
        """Raise ``FileSizeError`` if ``content`` is larger than the limit.

        Args:
            content: Raw (already decoded) file bytes.

        Raises:
            FileSizeError: If ``len(content)`` exceeds ``MAX_FILE_SIZE_MB``.
        """
        size_mb = len(content) / (1024 * 1024)
        if size_mb > PDFProcessor.MAX_FILE_SIZE_MB:
            raise FileSizeError(f"File size {size_mb:.1f}MB exceeds limit of {PDFProcessor.MAX_FILE_SIZE_MB}MB")

    @staticmethod
    def validate_file_type(filename: str) -> None:
        """Raise ``FileTypeError`` unless ``filename`` has a supported extension.

        The comparison is case-insensitive. A name without any dot has no
        extension and is rejected, even if the whole name happens to equal a
        supported type (e.g. a file literally called ``pdf``).

        Args:
            filename: Name of the uploaded file.

        Raises:
            FileTypeError: If the extension is missing or unsupported.
        """
        # rpartition yields an empty separator when there is no dot at all;
        # in that case ``ext`` holds the whole lower-cased name.
        _stem, dot, ext = filename.lower().rpartition('.')
        if not dot or ext not in PDFProcessor.SUPPORTED_TYPES:
            raise FileTypeError(f"Unsupported file type: {ext}")

    @contextmanager
    def safe_pdf_processing(self, content: bytes):
        """Yield an in-memory binary stream over ``content`` and always close it.

        Args:
            content: Raw PDF bytes.

        Yields:
            An ``io.BytesIO`` positioned at the start of ``content``.
        """
        pdf_file = io.BytesIO(content)
        try:
            yield pdf_file
        finally:
            pdf_file.close()

    def process_pdf(self, content: bytes) -> str:
        """Extract the text of every page of a PDF, joined by newlines.

        pdfplumber is tried first. If it raises, its partial output is
        discarded and the whole document is re-read with PyPDF2, so pages are
        never duplicated between the two extractors.

        Args:
            content: Raw PDF bytes.

        Returns:
            The non-empty page texts joined with ``'\\n'`` (an empty string
            when no page yields text).

        Raises:
            RuntimeError: If both extraction methods fail; the PyPDF2 error
                is chained as the cause.
        """
        try:
            with self.safe_pdf_processing(content) as pdf_file:
                with pdfplumber.open(pdf_file) as pdf:
                    page_texts = [page.extract_text() for page in pdf.pages]
        except Exception as plumber_err:
            logger.warning(
                "pdfplumber extraction failed: %s, trying PyPDF2", plumber_err
            )

            # Use a fresh stream for the fallback: the first parser may have
            # advanced the original one, and may have closed it as well.
            try:
                with self.safe_pdf_processing(content) as pdf_file:
                    reader = PdfReader(pdf_file)
                    page_texts = [page.extract_text() for page in reader.pages]
            except Exception as fallback_err:
                raise RuntimeError(
                    f"Both PDF extraction methods failed: {fallback_err}"
                ) from fallback_err

        # Pages without extractable text come back empty/None and are skipped.
        return '\n'.join(text for text in page_texts if text)

def process_files(contents: List[str], filenames: List[str]) -> FileProcessingResult:
    """
    Process multiple uploaded files and return their data URLs and text.

    Each entry of ``contents`` is expected to have the form
    ``<header>,<base64 payload>``. Every file is decoded, validated with
    ``PDFProcessor`` and converted to text (PDF extraction for ``.pdf``,
    UTF-8 decoding for ``.txt``). A file that fails at any step is logged and
    skipped; the remaining files are still processed.

    Args:
        contents: Data-URL style strings, one per file.
        filenames: Filenames matching ``contents`` position by position.

    Returns:
        A ``FileProcessingResult`` whose three lists hold, for each
        successfully processed file, a ``data:<mime>;base64,...`` URL, the
        extracted text and the filename. ``error`` is ``None`` when every
        file succeeded, otherwise a ``'; '``-joined summary of the failures.
    """
    if not contents or not filenames:
        return FileProcessingResult([], [], [])

    if len(contents) != len(filenames):
        # zip() below stops at the shorter list; make the truncation visible.
        logger.warning(
            "Got %d content entries but %d filenames; extra entries are ignored",
            len(contents),
            len(filenames),
        )

    processor = PDFProcessor()
    encoded_files: List[str] = []
    file_contents: List[str] = []
    processed_names: List[str] = []
    errors: List[str] = []

    for content_str, filename in zip(contents, filenames):
        try:
            # Validate and decode base64 content
            try:
                # The base64 alphabet contains no comma, so the payload is
                # whatever follows the last comma, however the header looks.
                _header, separator, payload = content_str.rpartition(',')
                if not separator:
                    raise ValueError("missing ',' between header and payload")
                decoded = base64.b64decode(payload)
            except Exception as e:
                raise ValueError(f"Invalid base64 content: {e}") from e

            # Validate file
            processor.validate_file_type(filename)
            processor.validate_file_size(decoded)

            lowered = filename.lower()
            if lowered.endswith('.pdf'):
                mime = 'application/pdf'
                text_content = processor.process_pdf(decoded)
            elif lowered.endswith('.txt'):
                mime = 'text/plain;charset=utf-8'
                text_content = decoded.decode('utf-8')
            else:
                # Reached only if validation accepted a name that neither
                # branch handles; report it instead of dropping it silently.
                raise FileTypeError(f"Unsupported file type: {filename}")

            # Re-encode the raw bytes so both file kinds produce a well-formed
            # base64 data URL (the payload only, never the incoming header).
            encoded = base64.b64encode(decoded).decode()

            # Append to all three lists together so they stay parallel.
            encoded_files.append(f'data:{mime};base64,{encoded}')
            file_contents.append(text_content)
            processed_names.append(filename)

        except Exception as e:
            # Best-effort batch: record the failure and move on to the next file.
            logger.error(f"Error processing file {filename}: {e}")
            errors.append(f"{filename}: {e}")

    return FileProcessingResult(
        encoded_files=encoded_files,
        file_contents=file_contents,
        file_names=processed_names,
        error='; '.join(errors) or None,
    )

def register_file_upload_callback(app, name):
    """Register the upload-handling callback for the component set ``name``.

    The callback is triggered by the ``contents`` property of
    ``upload-data-<name>``, reads that component's ``filename`` as state and
    writes to the ``data`` property of ``uploaded-file-<name>``.

    Args:
        app: Object exposing a ``callback`` decorator factory.
        name: Suffix used to build the component ids.
    """
    upload_id = f'upload-data-{name}'
    store_id = f'uploaded-file-{name}'

    # NOTE(review): a single Output is declared while the callback returns a
    # 3-tuple -- confirm the consumer of `data` expects all three lists.
    @app.callback(
        Output(store_id, component_property='data'),
        Input(upload_id, component_property='contents'),
        State(upload_id, component_property='filename'),
    )
    def upload_files(contents, filenames):
        # Nothing uploaded: hand back three empty lists.
        if not contents:
            return [], [], []

        # A lone upload arrives as bare values; wrap them so the processing
        # step always receives lists.
        if isinstance(contents, list):
            batch, names = contents, filenames
        else:
            batch, names = [contents], [filenames]

        outcome = process_files(batch, names)
        return outcome.encoded_files, outcome.file_contents, outcome.file_names

Using custom data configuration main-f9306ececa7c2eca


Downloading and preparing dataset None/main to /Users/gyf/.cache/huggingface/datasets/openai___parquet/main-f9306ececa7c2eca/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /Users/gyf/.cache/huggingface/datasets/openai___parquet/main-f9306ececa7c2eca/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/14946 [00:00<?, ?ex/s]