In [8]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to project code are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Partition Documents

In [9]:
from aerospace_chatbot.processing import DocumentProcessor
from aerospace_chatbot.services import EmbeddingService, RerankService, LLMService, DatabaseService
from aerospace_chatbot.processing import QAModel

from langchain_core.documents import Document

# Load environment variables
from dotenv import load_dotenv
load_dotenv(override=True)

# Logging setup: configure the root logger at INFO first, then create the
# logger used by this notebook.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [10]:
# Embedding configuration.
# The provider name lives in its own variable so that `embedding_service` only
# ever refers to the EmbeddingService instance (previously the same name was
# bound to a string and then rebound to the service object).
embedding_service_name = 'OpenAI'
embedding_model = 'text-embedding-3-large'

# Service wrapper for the chosen embedding provider/model.
embedding_service = EmbeddingService(
    model_service=embedding_service_name,
    model=embedding_model
)

# Document processor built on top of the embedding service above.
doc_processor = DocumentProcessor(
    embedding_service=embedding_service
)

In [11]:
# Buckets to scan for PDFs.
buckets = ["ams_pdfs", "esmats_pdfs"]

# One listing per bucket, in the same order as `buckets`
# (docs_all[i] is whatever list_bucket_pdfs returned for buckets[i]).
docs_all = [
    DocumentProcessor.list_bucket_pdfs(bucket_name)
    for bucket_name in buckets
]

INFO:aerospace_chatbot.processing.documents:Number of PDFs found: 47
INFO:aerospace_chatbot.processing.documents:PDFs found: ['gs://ams_pdfs/AMS_1966_reocr.pdf', 'gs://ams_pdfs/AMS_1967_reocr.pdf', 'gs://ams_pdfs/AMS_1968_reocr.pdf', 'gs://ams_pdfs/AMS_1969_reocr.pdf', 'gs://ams_pdfs/AMS_1970_reocr.pdf', 'gs://ams_pdfs/AMS_1971_reocr.pdf', 'gs://ams_pdfs/AMS_1972_reocr.pdf', 'gs://ams_pdfs/AMS_1973_reocr.pdf', 'gs://ams_pdfs/AMS_1974_reocr.pdf', 'gs://ams_pdfs/AMS_1976_reocr.pdf', 'gs://ams_pdfs/AMS_1977_reocr.pdf', 'gs://ams_pdfs/AMS_1978_reocr.pdf', 'gs://ams_pdfs/AMS_1979_reocr.pdf', 'gs://ams_pdfs/AMS_1980_reocr.pdf', 'gs://ams_pdfs/AMS_1981_reocr.pdf', 'gs://ams_pdfs/AMS_1982_reocr.pdf', 'gs://ams_pdfs/AMS_1983_reocr.pdf', 'gs://ams_pdfs/AMS_1984_reocr.pdf', 'gs://ams_pdfs/AMS_1985_reocr.pdf', 'gs://ams_pdfs/AMS_1986_reocr.pdf', 'gs://ams_pdfs/AMS_1987_reocr.pdf', 'gs://ams_pdfs/AMS_1988_reocr.pdf', 'gs://ams_pdfs/AMS_1989_reocr.pdf', 'gs://ams_pdfs/AMS_1990_reocr.pdf', 'gs://ams_

In [12]:
# Sanity check: docs_all holds one entry per bucket (each bucket's listing is
# appended as a single item), so this is the number of buckets, not the number of PDFs.
len(docs_all)

2

In [13]:
# Partition the documents of each bucket and collect the results per bucket,
# in the same order as `buckets`.
# zip() stops silently at the shorter input, so fail loudly if the bucket list
# and the per-bucket listings have drifted apart (e.g. cells run out of order).
assert len(buckets) == len(docs_all), (
    f"buckets ({len(buckets)}) and docs_all ({len(docs_all)}) are out of sync; "
    "re-run the bucket listing cell"
)

partitioned_docs_all = []
for bucket, docs in zip(buckets, docs_all):
    # partition_by_api=False: do not use the partitioning API;
    # the bucket the PDFs were listed from is passed as upload_bucket.
    partitioned_docs = doc_processor.load_and_partition_documents(
        docs,
        partition_by_api=False,
        upload_bucket=bucket,
    )
    partitioned_docs_all.append(partitioned_docs)

INFO:aerospace_chatbot.processing.documents:Loading 47 documents...
INFO:aerospace_chatbot.processing.documents:Checking document 1 of 47: gs://ams_pdfs/AMS_1966_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://ams_pdfs/AMS_1966_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: ams_pdfs
INFO:aerospace_chatbot.processing.documents:Blob name: AMS_1966_reocr.pdf


INFO:aerospace_chatbot.processing.documents:Checking document 2 of 47: gs://ams_pdfs/AMS_1967_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://ams_pdfs/AMS_1967_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: ams_pdfs
INFO:aerospace_chatbot.processing.documents:Blob name: AMS_1967_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Checking document 3 of 47: gs://ams_pdfs/AMS_1968_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://ams_pdfs/AMS_1968_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: ams_pdfs
INFO:aerospace_chatbot.processing.documents:Blob name: AMS_1968_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Checking document 4 of 47: gs://ams_pdfs/AMS_1969_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://ams_pdfs/AMS_1969_reocr.pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: ams_pdfs
INFO:aerospace_chatbot.processing

In [None]:
# Sanity check: one entry is appended to partitioned_docs_all per bucket in the
# partitioning loop, so this is expected to equal the number of buckets.
len(partitioned_docs_all)