# Setup

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import time
import copy
from aerospace_chatbot.processing import DocumentProcessor
from aerospace_chatbot.services import EmbeddingService, RerankService, LLMService, DatabaseService
from aerospace_chatbot.processing import QAModel

from langchain_core.documents import Document

# Load environment variables
# Runs before any of the service objects below are constructed.
# NOTE(review): override=True presumably lets values from the .env file replace
# variables already set in the process environment — confirm against python-dotenv.
from dotenv import load_dotenv
load_dotenv(override=True)

# Initialize logger
# The root logger is configured at INFO level; the INFO:aerospace_chatbot.* lines
# shown under the cells below are emitted at that level.
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [21]:
# Embedding configuration used for the partitioning stage.
# The provider name lives in its own variable so that `embedding_service` only
# ever refers to the EmbeddingService object, never to a plain string.
embedding_service_name = 'OpenAI'
embedding_model = 'text-embedding-3-large'

embedding_service = EmbeddingService(
    model_service=embedding_service_name,
    model=embedding_model
)

# Document processor with default chunking/work-dir settings; used below to
# partition the PDFs.
doc_processor = DocumentProcessor(
    embedding_service=embedding_service
)

In [22]:
# Buckets to ingest. Other bucket sets that have been used with this notebook:
#   ["ams_pdfs", "esmats_pdfs"]
#   ["processing-pdfs"]
buckets = ["design_criteria_pdfs"]

# One listing of PDFs per bucket, in the same order as `buckets`.
docs_all = [DocumentProcessor.list_bucket_pdfs(bucket_name) for bucket_name in buckets]

INFO:aerospace_chatbot.processing.documents:Number of PDFs found: 18
INFO:aerospace_chatbot.processing.documents:PDFs found: ['gs://design_criteria_pdfs/AFSPCMAN 91-710v3_Range Safety.pdf', 'gs://design_criteria_pdfs/EELV SIS Rev C Final 20170616 (LE-S-002).pdf', 'gs://design_criteria_pdfs/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware.pdf', 'gs://design_criteria_pdfs/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware.pdf', 'gs://design_criteria_pdfs/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration.pdf', 'gs://design_criteria_pdfs/MSFC-SPEC-626_Test Control Document for Assessment of Flexible Lines for Flow Induced Vibration.pdf', 'gs://design_criteria_pdfs/NASA-STD-5001B_w_change_2_Structural Design and Test Factors of Safety for Spaceflight Hardware.pdf', 'gs://design_criteria_pdfs/NASA-STD-5009_Nondestructive Evaluation Requirements For Fracture Critical Components.pdf', 'gs://d

In [23]:
# Sanity check: docs_all holds one entry per bucket (not one per PDF).
len(docs_all)

1

# Partition Documents

In [24]:
# Partition each bucket's PDFs with partition_by_api=False and no upload
# (upload_bucket=None). To upload the partitioned output back to the source
# bucket instead, pass upload_bucket=bucket.
partitioned_docs_all = [
    doc_processor.load_and_partition_documents(docs, partition_by_api=False, upload_bucket=None)
    for bucket, docs in zip(buckets, docs_all)
]

INFO:aerospace_chatbot.processing.documents:Loading 18 documents...
INFO:aerospace_chatbot.processing.documents:Checking document 1 of 18: gs://design_criteria_pdfs/AFSPCMAN 91-710v3_Range Safety.pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://design_criteria_pdfs/AFSPCMAN 91-710v3_Range Safety.pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: design_criteria_pdfs
INFO:aerospace_chatbot.processing.documents:Blob name: AFSPCMAN 91-710v3_Range Safety.pdf
INFO:aerospace_chatbot.processing.documents:Checking document 2 of 18: gs://design_criteria_pdfs/EELV SIS Rev C Final 20170616 (LE-S-002).pdf
INFO:aerospace_chatbot.processing.documents:Downloading PDF from GCS: gs://design_criteria_pdfs/EELV SIS Rev C Final 20170616 (LE-S-002).pdf
INFO:aerospace_chatbot.processing.documents:Bucket name: design_criteria_pdfs
INFO:aerospace_chatbot.processing.documents:Blob name: EELV SIS Rev C Final 20170616 (LE-S-002).pdf
INFO:aerospace_chatbot.processing.docume

# Chunk and Index Documents

## 2000-token chunks, 0 overlap, Voyage `voyage-3` embedding

In [25]:
# Reload the already-partitioned documents for every bucket from
# ./document_processing instead of partitioning the PDFs again.
partitioned_docs_all = [
    doc_processor.load_partitioned_documents(docs, partition_dir='./document_processing')
    for _bucket, docs in zip(buckets, docs_all)
]

INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
INFO:aerospace_chatbot.processing.d

./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json
./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json
./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
./document_processing/partitioned/MSFC-SPEC-626_Test Control Document for Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
./document_processing/partitioned/NASA-STD-5001B_w_change_2_Structural Design and Test Factors of Safety for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/NASA-STD-5009_Nondestructive Evaluation Requirements For Fracture Critical Components-partit

In [26]:
# Set up chunking and database parameters (Voyage embedding run)
db_type = 'Pinecone'

chunk_size = 2000
chunk_overlap = 0

# Provider names are kept in dedicated string variables so that
# `embedding_service` / `rerank_service` only ever refer to service objects.
# Alternative configuration: 'OpenAI' with 'text-embedding-3-large'.
embedding_service_name = "Voyage"
embedding_model = "voyage-3"
embedding_service = EmbeddingService(
    model_service=embedding_service_name,
    model=embedding_model
)

rerank_service_name = 'Cohere'
rerank_model = 'rerank-v3.5'
rerank_service = RerankService(
    model_service=rerank_service_name,
    model=rerank_model
)

# Working directory is keyed by chunk size and overlap so that different
# chunking configurations do not overwrite each other's chunked output.
dirname = f'./document_processing_{chunk_size}_{chunk_overlap}'
doc_processor = DocumentProcessor(
    embedding_service=embedding_service,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    work_dir=dirname
)

# Index name encodes the embedding model and chunk size.
index_name = f'{embedding_model}-dc-demo2-{chunk_size}'
db_service = DatabaseService(
    db_type=db_type,
    index_name=index_name,
    embedding_service=embedding_service,
    rerank_service=rerank_service,
)

In [27]:
# Chunk the partitioned documents of every bucket. chunk_documents returns a
# (chunk object, output paths) pair, which is split into two parallel lists.
chunk_results = [
    doc_processor.chunk_documents(partitioned_docs)
    for _bucket, partitioned_docs in zip(buckets, partitioned_docs_all)
]
chunk_obj_all = [chunked for chunked, _paths in chunk_results]
output_paths_all = [paths for _chunked, paths in chunk_results]

# Number of buckets that were chunked
print(len(chunk_obj_all))

INFO:aerospace_chatbot.processing.documents:Chunking documents...


Chunking ./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AFSPCMAN 91-710v3_Range Safety-chunked.json
Chunking ./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/EELV SIS Rev C Final 20170616 (LE-S-002)-chunked.json
Chunking ./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-chunked.json
Chunking ./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/JSC-65829_Loads and Structural Dynamics Requirements for Spacef

INFO:aerospace_chatbot.processing.documents:Total number of chunks: 2580
INFO:aerospace_chatbot.processing.documents:Output paths: ['./document_processing_2000_0/chunked/AFSPCMAN 91-710v3_Range Safety-chunked.json', './document_processing_2000_0/chunked/EELV SIS Rev C Final 20170616 (LE-S-002)-chunked.json', './document_processing_2000_0/chunked/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-chunked.json', './document_processing_2000_0/chunked/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-chunked.json', './document_processing_2000_0/chunked/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-chunked.json', './document_processing_2000_0/chunked/MSFC-SPEC-626_Test Control Document for Assessment of Flexible Lines for Flow Induced Vibration-chunked.json', './document_processing_2000_0/chunked/NASA-STD-5001B_w_change_2_Structural Design and Test Factors of Safety for Spaceflight Hard

Chunked data saved at ./document_processing_2000_0/chunked/SMC-S-025_Evalation and Test Reqirements for Liquid Rocket Engines-chunked.json
1


In [29]:
# Number of chunks produced for the first bucket. With more than one bucket,
# inspect chunk_obj_all[1].chunks etc. (or sum the lengths) in the same way.
first_bucket_chunks = chunk_obj_all[0].chunks
print(len(first_bucket_chunks))

2580


In [30]:
# Initialize the vector database for the current configuration.
# NOTE(review): clear=True appears to wipe any existing index of the same name
# (the log shows "Clearing Pinecone index ...") — confirm before running against
# an index that holds data you want to keep.
try:
    db_service.initialize_database(clear=True)
except ValueError as e:
    # Report the failure once, then re-raise with a bare `raise` so the original
    # traceback is preserved and the notebook run stops here.
    print(f"Database initialization failed: {str(e)}")
    raise

INFO:aerospace_chatbot.services.database:Validating index voyage-3-dc-demo2-2000
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:aerospace_chatbot.services.database:Clearing Pinecone index voyage-3-dc-demo2-2000
INFO:aerospace_chatbot.services.database:Not clearing database, but pinecone index voyage-3-dc-demo2-2000 not found, creating.
INFO:aerospace_chatbot.services.database:Pinecone index voyage-3-dc-demo2-2000 created


In [31]:
# Upsert each bucket's chunks into the index in batches of 50, pausing 5 seconds
# after every bucket.
# NOTE(review): the pause presumably gives the index time to settle between
# buckets — confirm it is still needed.
for bucket_chunks in chunk_obj_all:
    n_chunks = len(bucket_chunks.chunks)
    print(f'Upserting {n_chunks} chunks')
    db_service.index_data(bucket_chunks, batch_size=50)
    time.sleep(5)

Upserting 2580 chunks


INFO:aerospace_chatbot.services.database:No existing metadata found in voyage-3-dc-demo2-2000, adding metadata: {'chunk_size': 2000, 'chunk_overlap': 0, 'embedding_family': 'Voyage', 'embedding_model': 'voyage-3'}
INFO:aerospace_chatbot.services.database:Retry 1 of 15: Waiting for vectors to be indexed in Pinecone... Current count: 0, Expected: 1
INFO:aerospace_chatbot.services.database:Successfully verified 1 vectors in Pinecone index
INFO:aerospace_chatbot.services.database:Successfully added metadata to voyage-3-dc-demo2-2000
INFO:aerospace_chatbot.services.database:Initial vector count: 1
INFO:aerospace_chatbot.services.database:Upserting 2580 vectors
INFO:aerospace_chatbot.services.database:Upserting batch 1 of 52
INFO:aerospace_chatbot.services.database:Upserting 50 vectors for this batch...
INFO:aerospace_chatbot.services.database:Upserting batch 2 of 52
INFO:aerospace_chatbot.services.database:Upserting 50 vectors for this batch...
INFO:aerospace_chatbot.services.database:Upser

## 2000-token chunks, 0 overlap, OpenAI `text-embedding-3-large` embedding

In [32]:
# Reload the already-partitioned documents for every bucket from
# ./document_processing so this section can be run without re-partitioning.
partitioned_docs_all = [
    doc_processor.load_partitioned_documents(docs, partition_dir='./document_processing')
    for _bucket, docs in zip(buckets, docs_all)
]

INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
INFO:aerospace_chatbot.processing.d

./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json
./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json
./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
./document_processing/partitioned/MSFC-SPEC-626_Test Control Document for Assessment of Flexible Lines for Flow Induced Vibration-partitioned.json
./document_processing/partitioned/NASA-STD-5001B_w_change_2_Structural Design and Test Factors of Safety for Spaceflight Hardware-partitioned.json
./document_processing/partitioned/NASA-STD-5009_Nondestructive Evaluation Requirements For Fracture Critical Components-partit

In [33]:
# Set up chunking and database parameters (OpenAI embedding run)
db_type = 'Pinecone'

chunk_size = 2000
chunk_overlap = 0

# Provider names are kept in dedicated string variables so that
# `embedding_service` / `rerank_service` only ever refer to service objects.
# Alternative configuration: "Voyage" with "voyage-3".
embedding_service_name = 'OpenAI'
embedding_model = 'text-embedding-3-large'
embedding_service = EmbeddingService(
    model_service=embedding_service_name,
    model=embedding_model
)

rerank_service_name = 'Cohere'
rerank_model = 'rerank-v3.5'
rerank_service = RerankService(
    model_service=rerank_service_name,
    model=rerank_model
)

# Working directory is keyed by chunk size and overlap so that different
# chunking configurations do not overwrite each other's chunked output.
dirname = f'./document_processing_{chunk_size}_{chunk_overlap}'
doc_processor = DocumentProcessor(
    embedding_service=embedding_service,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    work_dir=dirname
)

# Index name encodes the embedding model and chunk size.
index_name = f'{embedding_model}-dc-demo2-{chunk_size}'
db_service = DatabaseService(
    db_type=db_type,
    index_name=index_name,
    embedding_service=embedding_service,
    rerank_service=rerank_service,
)

In [34]:
# Chunk the partitioned documents of every bucket. chunk_documents returns a
# (chunk object, output paths) pair, which is split into two parallel lists.
chunk_results = [
    doc_processor.chunk_documents(partitioned_docs)
    for _bucket, partitioned_docs in zip(buckets, partitioned_docs_all)
]
chunk_obj_all = [chunked for chunked, _paths in chunk_results]
output_paths_all = [paths for _chunked, paths in chunk_results]

# Number of buckets that were chunked
print(len(chunk_obj_all))

INFO:aerospace_chatbot.processing.documents:Chunking documents...


Chunking ./document_processing/partitioned/AFSPCMAN 91-710v3_Range Safety-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AFSPCMAN 91-710v3_Range Safety-chunked.json
Chunking ./document_processing/partitioned/EELV SIS Rev C Final 20170616 (LE-S-002)-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/EELV SIS Rev C Final 20170616 (LE-S-002)-chunked.json
Chunking ./document_processing/partitioned/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-chunked.json
Chunking ./document_processing/partitioned/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/JSC-65829_Loads and Structural Dynamics Requirements for Spacef

INFO:aerospace_chatbot.processing.documents:Total number of chunks: 2580
INFO:aerospace_chatbot.processing.documents:Output paths: ['./document_processing_2000_0/chunked/AFSPCMAN 91-710v3_Range Safety-chunked.json', './document_processing_2000_0/chunked/EELV SIS Rev C Final 20170616 (LE-S-002)-chunked.json', './document_processing_2000_0/chunked/JSC-65828B-Chg1_Structural Design Requirements and Factors of Safety for Spaceflight Hardware-chunked.json', './document_processing_2000_0/chunked/JSC-65829_Loads and Structural Dynamics Requirements for Spaceflight Hardware-chunked.json', './document_processing_2000_0/chunked/MSFC-DWG-20M02540_RevE_1_Assessment of Flexible Lines for Flow Induced Vibration-chunked.json', './document_processing_2000_0/chunked/MSFC-SPEC-626_Test Control Document for Assessment of Flexible Lines for Flow Induced Vibration-chunked.json', './document_processing_2000_0/chunked/NASA-STD-5001B_w_change_2_Structural Design and Test Factors of Safety for Spaceflight Hard

Chunked data saved at ./document_processing_2000_0/chunked/SMC-S-025_Evalation and Test Reqirements for Liquid Rocket Engines-chunked.json
1


In [36]:
# Number of chunks produced for the first bucket. With more than one bucket,
# inspect chunk_obj_all[1].chunks etc. (or sum the lengths) in the same way.
first_bucket_chunks = chunk_obj_all[0].chunks
print(len(first_bucket_chunks))

2580


In [37]:
# Initialize the vector database for the current configuration.
# NOTE(review): clear=True appears to wipe any existing index of the same name
# (the log shows "Clearing Pinecone index ...") — confirm before running against
# an index that holds data you want to keep.
try:
    db_service.initialize_database(clear=True)
except ValueError as e:
    # Report the failure once, then re-raise with a bare `raise` so the original
    # traceback is preserved and the notebook run stops here.
    print(f"Database initialization failed: {str(e)}")
    raise

INFO:aerospace_chatbot.services.database:Validating index text-embedding-3-large-dc-demo2-2000
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:aerospace_chatbot.services.database:Clearing Pinecone index text-embedding-3-large-dc-demo2-2000
INFO:aerospace_chatbot.services.database:Not clearing database, but pinecone index text-embedding-3-large-dc-demo2-2000 not found, creating.
INFO:aerospace_chatbot.services.database:Pinecone index text-embedding-3-large-dc-demo2-2000 created


In [38]:
# Upsert each bucket's chunks into the index in batches of 50, pausing 5 seconds
# after every bucket.
# NOTE(review): the pause presumably gives the index time to settle between
# buckets — confirm it is still needed.
for bucket_chunks in chunk_obj_all:
    n_chunks = len(bucket_chunks.chunks)
    print(f'Upserting {n_chunks} chunks')
    db_service.index_data(bucket_chunks, batch_size=50)
    time.sleep(5)

Upserting 2580 chunks


INFO:aerospace_chatbot.services.database:No existing metadata found in text-embedding-3-large-dc-demo2-2000, adding metadata: {'chunk_size': 2000, 'chunk_overlap': 0, 'embedding_family': 'OpenAI', 'embedding_model': 'text-embedding-3-large'}
INFO:aerospace_chatbot.services.database:Retry 1 of 15: Waiting for vectors to be indexed in Pinecone... Current count: 0, Expected: 1
INFO:aerospace_chatbot.services.database:Successfully verified 1 vectors in Pinecone index
INFO:aerospace_chatbot.services.database:Successfully added metadata to text-embedding-3-large-dc-demo2-2000
INFO:aerospace_chatbot.services.database:Initial vector count: 1
INFO:aerospace_chatbot.services.database:Upserting 2580 vectors
INFO:aerospace_chatbot.services.database:Upserting batch 1 of 52
INFO:aerospace_chatbot.services.database:Upserting 50 vectors for this batch...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:aerospace_chatbot.services.database:Upserting batch 2 of 52