# Setup

In [81]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up without restarting the kernel.
# Re-running this cell in a live kernel prints an "already loaded" notice; it is harmless.
%load_ext autoreload
# Mode 2: reload all modules before each execution (see the IPython autoreload docs).
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [82]:
import time
import copy
from aerospace_chatbot.processing import DocumentProcessor
from aerospace_chatbot.services import EmbeddingService, RerankService, LLMService, DatabaseService
from aerospace_chatbot.processing import QAModel

from langchain_core.documents import Document

# Environment: read variables from a .env file. override=True lets values from
# the file replace variables that are already set in the process environment.
from dotenv import load_dotenv
load_dotenv(override=True)

# Logging: configure the root logger at INFO level, then create the logger
# used by this notebook.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [83]:
# Embedding backend used for the initial document processor.
# The service name is passed as a literal so that `embedding_service` only
# ever refers to the EmbeddingService instance.
embedding_model = 'text-embedding-3-large'
embedding_service = EmbeddingService(model_service='OpenAI', model=embedding_model)

# Processor built with default chunking / working-directory settings.
doc_processor = DocumentProcessor(embedding_service=embedding_service)

In [84]:
# Storage buckets whose PDFs feed the pipeline (the listing log shows gs:// paths).
# Single-bucket alternative: buckets = ["processing-pdfs"]
buckets = ["ams_pdfs", "esmats_pdfs"]

# One PDF listing per bucket, in the same order as `buckets`
# (so docs_all is a list of lists, not a flat list of PDFs).
docs_all = [DocumentProcessor.list_bucket_pdfs(bucket_name) for bucket_name in buckets]

INFO:aerospace_chatbot.processing.documents:Number of PDFs found: 47
INFO:aerospace_chatbot.processing.documents:PDFs found: ['gs://ams_pdfs/AMS_1966_reocr.pdf', 'gs://ams_pdfs/AMS_1967_reocr.pdf', 'gs://ams_pdfs/AMS_1968_reocr.pdf', 'gs://ams_pdfs/AMS_1969_reocr.pdf', 'gs://ams_pdfs/AMS_1970_reocr.pdf', 'gs://ams_pdfs/AMS_1971_reocr.pdf', 'gs://ams_pdfs/AMS_1972_reocr.pdf', 'gs://ams_pdfs/AMS_1973_reocr.pdf', 'gs://ams_pdfs/AMS_1974_reocr.pdf', 'gs://ams_pdfs/AMS_1976_reocr.pdf', 'gs://ams_pdfs/AMS_1977_reocr.pdf', 'gs://ams_pdfs/AMS_1978_reocr.pdf', 'gs://ams_pdfs/AMS_1979_reocr.pdf', 'gs://ams_pdfs/AMS_1980_reocr.pdf', 'gs://ams_pdfs/AMS_1981_reocr.pdf', 'gs://ams_pdfs/AMS_1982_reocr.pdf', 'gs://ams_pdfs/AMS_1983_reocr.pdf', 'gs://ams_pdfs/AMS_1984_reocr.pdf', 'gs://ams_pdfs/AMS_1985_reocr.pdf', 'gs://ams_pdfs/AMS_1986_reocr.pdf', 'gs://ams_pdfs/AMS_1987_reocr.pdf', 'gs://ams_pdfs/AMS_1988_reocr.pdf', 'gs://ams_pdfs/AMS_1989_reocr.pdf', 'gs://ams_pdfs/AMS_1990_reocr.pdf', 'gs://ams_

In [85]:
# Sanity check: docs_all holds one PDF listing per bucket, so this is the
# number of buckets processed, not the number of PDFs.
len(docs_all)

2

# Partition Documents

In [None]:
# Partition every bucket's PDFs with partition_by_api=False and no upload
# target (upload_bucket=None). Results are kept per bucket, in bucket order.
# NOTE(review): passing upload_bucket=<bucket name> instead of None is the
# alternative previously used here — presumably it uploads the partitioned
# output to that bucket; confirm against DocumentProcessor.
partitioned_docs_all = [
    doc_processor.load_and_partition_documents(docs, partition_by_api=False, upload_bucket=None)
    for _bucket, docs in zip(buckets, docs_all)
]

# Chunk and Index Documents

In [6]:
# Load previously partitioned documents from ./document_processing, one result
# per bucket. This rebinds partitioned_docs_all, replacing whatever an earlier
# partitioning run stored under that name.
partitioned_docs_all = [
    doc_processor.load_partitioned_documents(docs, partition_dir='./document_processing')
    for _bucket, docs in zip(buckets, docs_all)
]

INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1966_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1967_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1968_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1969_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1970_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1971_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/AMS_1972_reocr-partitioned.json

./document_processing/partitioned/AMS_1966_reocr-partitioned.json
./document_processing/partitioned/AMS_1967_reocr-partitioned.json
./document_processing/partitioned/AMS_1968_reocr-partitioned.json
./document_processing/partitioned/AMS_1969_reocr-partitioned.json
./document_processing/partitioned/AMS_1970_reocr-partitioned.json
./document_processing/partitioned/AMS_1971_reocr-partitioned.json
./document_processing/partitioned/AMS_1972_reocr-partitioned.json
./document_processing/partitioned/AMS_1973_reocr-partitioned.json
./document_processing/partitioned/AMS_1974_reocr-partitioned.json
./document_processing/partitioned/AMS_1976_reocr-partitioned.json
./document_processing/partitioned/AMS_1977_reocr-partitioned.json
./document_processing/partitioned/AMS_1978_reocr-partitioned.json
./document_processing/partitioned/AMS_1979_reocr-partitioned.json
./document_processing/partitioned/AMS_1980_reocr-partitioned.json
./document_processing/partitioned/AMS_1981_reocr-partitioned.json
./document

INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_pinto_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_ponnusamy_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_rataj2_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_rataj_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_riva_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_schmalbach2_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2011_schm

./document_processing/partitioned/2011_pinto_reocr-partitioned.json
./document_processing/partitioned/2011_ponnusamy_reocr-partitioned.json
./document_processing/partitioned/2011_rataj2_reocr-partitioned.json
./document_processing/partitioned/2011_rataj_reocr-partitioned.json
./document_processing/partitioned/2011_riva_reocr-partitioned.json
./document_processing/partitioned/2011_schmalbach2_reocr-partitioned.json
./document_processing/partitioned/2011_schmalbach_reocr-partitioned.json
./document_processing/partitioned/2011_schmid_reocr-partitioned.json
./document_processing/partitioned/2011_schmidt_reocr-partitioned.json
./document_processing/partitioned/2011_schwarz_reocr-partitioned.json
./document_processing/partitioned/2011_sexton_reocr-partitioned.json
./document_processing/partitioned/2011_slade_reocr-partitioned.json
./document_processing/partitioned/2011_talvat_reocr-partitioned.json
./document_processing/partitioned/2011_tillier_reocr-partitioned.json
./document_processing/pa

INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_mirescu_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_navarro_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_neugebauer_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_ortega_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_perestrelo_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019_puyol_reocr-partitioned.json
INFO:aerospace_chatbot.processing.documents:Found existing partitioned file: ./document_processing/partitioned/2019

./document_processing/partitioned/2019_mirescu_reocr-partitioned.json
./document_processing/partitioned/2019_navarro_reocr-partitioned.json
./document_processing/partitioned/2019_neugebauer_reocr-partitioned.json
./document_processing/partitioned/2019_ortega_reocr-partitioned.json
./document_processing/partitioned/2019_perestrelo_reocr-partitioned.json
./document_processing/partitioned/2019_puyol_reocr-partitioned.json
./document_processing/partitioned/2019_raynal_reocr-partitioned.json
./document_processing/partitioned/2019_riemer_reocr-partitioned.json
./document_processing/partitioned/2019_ryszawa_reocr-partitioned.json
./document_processing/partitioned/2019_saudan_reocr-partitioned.json
./document_processing/partitioned/2019_scheidegger_reocr-partitioned.json
./document_processing/partitioned/2019_schulke_reocr-partitioned.json
./document_processing/partitioned/2019_schwarz_reocr-partitioned.json
./document_processing/partitioned/2019_seguinhenry_reocr-partitioned.json
./document_p

In [98]:
# Chunking and vector-database configuration for this run
db_type = 'Pinecone'
chunk_size, chunk_overlap = 2000, 0

# Embedding backend. Service names are passed as literals so that
# `embedding_service` / `rerank_service` only ever refer to service objects.
# Alternative backend: model_service="Voyage", model="voyage-3"
embedding_model = 'text-embedding-3-large'
embedding_service = EmbeddingService(model_service='OpenAI', model=embedding_model)

# Reranker
rerank_model = 'rerank-v3.5'
rerank_service = RerankService(model_service='Cohere', model=rerank_model)

# Rebuild the document processor with explicit chunk settings; the working
# directory name encodes chunk size and overlap.
dirname = f'./document_processing_{chunk_size}_{chunk_overlap}'
doc_processor = DocumentProcessor(
    work_dir=dirname,
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
    embedding_service=embedding_service,
)

# The index name is derived from the embedding model and chunk size.
index_name = f'{embedding_model}-mech-demo2-{chunk_size}'
db_service = DatabaseService(
    rerank_service=rerank_service,
    embedding_service=embedding_service,
    index_name=index_name,
    db_type=db_type,
)

In [46]:
# Chunk each bucket's partitioned documents. chunk_documents returns a
# (chunk object, output paths) pair; collect the pairs first, then split them
# into the two per-bucket lists.
chunk_results = [
    doc_processor.chunk_documents(bucket_partitions)
    for _bucket, bucket_partitions in zip(buckets, partitioned_docs_all)
]
chunk_obj_all = [bucket_chunks for bucket_chunks, _paths in chunk_results]
output_paths_all = [bucket_paths for _chunks, bucket_paths in chunk_results]

# Number of buckets that were chunked
print(len(chunk_obj_all))

INFO:aerospace_chatbot.processing.documents:Chunking documents...


Chunking ./document_processing/partitioned/AMS_1966_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1966_reocr-chunked.json
Chunking ./document_processing/partitioned/AMS_1967_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1967_reocr-chunked.json
Chunking ./document_processing/partitioned/AMS_1968_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1968_reocr-chunked.json
Chunking ./document_processing/partitioned/AMS_1969_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1969_reocr-chunked.json
Chunking ./document_processing/partitioned/AMS_1970_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1970_reocr-chunked.json
Chunking ./document_processing/partitioned/AMS_1971_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/AMS_1971_reocr-chunked.json
Chunking .

INFO:aerospace_chatbot.processing.documents:Total number of chunks: 21326
INFO:aerospace_chatbot.processing.documents:Output paths: ['./document_processing_2000_0/chunked/AMS_1966_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1967_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1968_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1969_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1970_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1971_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1972_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1973_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1974_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1976_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1977_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1978_reocr-chunked.json', './document_processing_2000_0/chunked/AMS_1979_reoc

Chunked data saved at ./document_processing_2000_0/chunked/AMS_2024-chunked.json
Chunking ./document_processing/partitioned/1999_arkwright_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/1999_arkwright_reocr-chunked.json
Chunking ./document_processing/partitioned/1999_astola_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/1999_astola_reocr-chunked.json
Chunking ./document_processing/partitioned/1999_baker_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/1999_baker_reocr-chunked.json
Chunking ./document_processing/partitioned/1999_bandera_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/1999_bandera_reocr-chunked.json
Chunking ./document_processing/partitioned/1999_barillot_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/1999_barillot_reocr-chunked.json
Chunking ./document_processing/partitioned/1999_blais_

INFO:aerospace_chatbot.processing.documents:Total number of chunks: 10517
INFO:aerospace_chatbot.processing.documents:Output paths: ['./document_processing_2000_0/chunked/1999_arkwright_reocr-chunked.json', './document_processing_2000_0/chunked/1999_astola_reocr-chunked.json', './document_processing_2000_0/chunked/1999_baker_reocr-chunked.json', './document_processing_2000_0/chunked/1999_bandera_reocr-chunked.json', './document_processing_2000_0/chunked/1999_barillot_reocr-chunked.json', './document_processing_2000_0/chunked/1999_blais_reocr-chunked.json', './document_processing_2000_0/chunked/1999_breguet_reocr-chunked.json', './document_processing_2000_0/chunked/1999_cacho_reocr-chunked.json', './document_processing_2000_0/chunked/1999_carre_reocr-chunked.json', './document_processing_2000_0/chunked/1999_carstens_reocr-chunked.json', './document_processing_2000_0/chunked/1999_chomicz_reocr-chunked.json', './document_processing_2000_0/chunked/1999_christiansen_reocr-chunked.json', './

Chunked data saved at ./document_processing_2000_0/chunked/2023_wade_reocr-chunked.json
Chunking ./document_processing/partitioned/2023_wygachiewicz_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/2023_wygachiewicz_reocr-chunked.json
Chunking ./document_processing/partitioned/2023_yotov_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/2023_yotov_reocr-chunked.json
Chunking ./document_processing/partitioned/2023_zemann_reocr-partitioned.json...
Chunked data saved at ./document_processing_2000_0/chunked/2023_zemann_reocr-chunked.json
2


In [99]:
# Report the chunk count of every bucket, followed by the grand total.
# Iterating over chunk_obj_all (rather than indexing [0] and [1]) keeps this
# cell working when `buckets` holds a single entry or more than two entries.
chunk_counts = [len(bucket_chunks.chunks) for bucket_chunks in chunk_obj_all]
for chunk_count in chunk_counts:
    print(chunk_count)
print(sum(chunk_counts))

21326
10517
31843


In [100]:
# Create the vector index for this run.
# WARNING: clear=True is destructive — per the service log it clears the
# existing index of the same name before (re)creating it.
try:
    db_service.initialize_database(clear=True)
except ValueError as e:
    # Report the failure once, then re-raise with a bare `raise` so the
    # original exception and traceback propagate unchanged.
    print(f"Database initialization failed: {e}")
    raise

INFO:aerospace_chatbot.services.database:Validating index text-embedding-3-large-mech-demo2-2000
INFO:pinecone_plugin_interface.logging:Discovering subpackages in _NamespacePath(['/Users/danmueller/Documents/GitHub/aerospace_chatbot/.venv/lib/python3.11/site-packages/pinecone_plugins'])
INFO:pinecone_plugin_interface.logging:Looking for plugins in pinecone_plugins.inference
INFO:pinecone_plugin_interface.logging:Installing plugin inference into Pinecone
INFO:aerospace_chatbot.services.database:Clearing Pinecone index text-embedding-3-large-mech-demo2-2000
INFO:aerospace_chatbot.services.database:Not clearing database, but pinecone index text-embedding-3-large-mech-demo2-2000 not found, creating.
INFO:aerospace_chatbot.services.database:Pinecone index text-embedding-3-large-mech-demo2-2000 created


In [101]:
# Disabled experiment (kept for reference, not executed): deep-copy the second
# bucket's chunk object twice, keep the first 5000 chunks in one copy and the
# remainder in the other, then index the two halves separately.
# test_chunks_1=copy.deepcopy(chunk_obj_all[1])
# test_chunks_2=copy.deepcopy(chunk_obj_all[1])
# test_chunks_1.chunks=test_chunks_1.chunks[:5000] # Truncate to a smaller array
# test_chunks_2.chunks=test_chunks_2.chunks[5000:] # Truncate to a smaller array
# db_service.index_data(test_chunks_1, batch_size=100)
# db_service.index_data(test_chunks_2, batch_size=100)

In [None]:
# Push every bucket's chunks to the vector index, one bucket at a time.
for chunk_obj in chunk_obj_all:
    chunk_total = len(chunk_obj.chunks)
    print(f'Upserting {chunk_total} chunks')
    # Vectors are sent in batches of 100 (the log reports "batch 1 of 214" for 21326 chunks).
    db_service.index_data(chunk_obj, batch_size=100)
    # Fixed 5 s pause after each bucket — presumably to let the index settle
    # before the next upload; TODO confirm it is still needed.
    time.sleep(5)

Upserting 21326 chunks


INFO:aerospace_chatbot.services.database:No existing metadata found in text-embedding-3-large-mech-demo2-2000, adding metadata: {'chunk_size': 2000, 'chunk_overlap': 0, 'embedding_family': 'OpenAI', 'embedding_model': 'text-embedding-3-large'}
INFO:aerospace_chatbot.services.database:Waiting for vectors to be indexed in Pinecone... Current count: 0, Expected: 1
INFO:aerospace_chatbot.services.database:Successfully verified 1 vectors in Pinecone index
INFO:aerospace_chatbot.services.database:Successfully added metadata to text-embedding-3-large-mech-demo2-2000
INFO:aerospace_chatbot.services.database:Initial vector count: 1
INFO:aerospace_chatbot.services.database:Upserting 21326 vectors
INFO:aerospace_chatbot.services.database:Upserting batch 1 of 214
INFO:aerospace_chatbot.services.database:Current vector count before batch: 1
INFO:aerospace_chatbot.services.database:Upserting 100 vectors for this batch...
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 20