# Setup

In [1]:
# Load IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import time
import copy
from aerospace_chatbot.processing import DocumentProcessor
from aerospace_chatbot.services import EmbeddingService, RerankService, LLMService, DatabaseService
from aerospace_chatbot.processing import QAModel

from langchain_core.documents import Document

from dotenv import load_dotenv
import logging

# Read variables from the .env file; override=True is passed so that .env
# values take precedence over variables already set in the process.
load_dotenv(override=True)

# Configure logging at INFO level, then create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
# Embedding backend used by the document processor for listing/partitioning.
embedding_model = 'text-embedding-3-large'
embedding_service = EmbeddingService(
    model_service='OpenAI',
    model=embedding_model,
)

# Document processor built with only the embedding service specified.
doc_processor = DocumentProcessor(embedding_service=embedding_service)

In [None]:
# Buckets to process; swap in the alternative below for a single-bucket run.
buckets = ["ams_pdfs", "esmats_pdfs"]
# buckets = ["processing-pdfs"]

# One listing per bucket, kept in the same order as `buckets` so the two
# lists can be zipped together later.
docs_all = [DocumentProcessor.list_bucket_pdfs(bucket) for bucket in buckets]

In [None]:
# docs_all holds one listing per bucket, so this displays the number of
# bucket listings, not the total number of PDFs.
len(docs_all)

# Partition Documents

In [None]:
# Partition each bucket's documents; results stay aligned with `buckets`.
# upload_bucket=None is used here; pass upload_bucket=bucket instead to use the
# bucket-specific variant (presumably uploads the partition output — confirm).
partitioned_docs_all = []
for bucket, bucket_docs in zip(buckets, docs_all):
    partitioned_docs_all.append(
        doc_processor.load_and_partition_documents(
            bucket_docs,
            partition_by_api=False,
            upload_bucket=None,
        )
    )

# Chunk and Index Documents

## 2000-token chunks, 0 overlap, Voyage voyage-3 embedding

In [None]:
# Reload previously partitioned documents from ./document_processing,
# one entry per bucket (zipped so the result stays aligned with `buckets`).
partitioned_docs_all = [
    doc_processor.load_partitioned_documents(
        bucket_docs,
        partition_dir='./document_processing',
    )
    for _, bucket_docs in zip(buckets, docs_all)
]

In [7]:
# Chunking and vector-database configuration for this run
db_type = 'Pinecone'
chunk_size = 2000
chunk_overlap = 0

# Embedding backend: Voyage "voyage-3"
# (alternative: service 'OpenAI' with model 'text-embedding-3-large')
embedding_model = "voyage-3"
embedding_service = EmbeddingService(
    model_service="Voyage",
    model=embedding_model,
)

# Reranker: Cohere rerank-v3.5
rerank_model = 'rerank-v3.5'
rerank_service = RerankService(
    model_service='Cohere',
    model=rerank_model,
)

# The work directory name encodes the chunking parameters.
dirname = f'./document_processing_{chunk_size}_{chunk_overlap}'
doc_processor = DocumentProcessor(
    embedding_service=embedding_service,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    work_dir=dirname,
)

# The index name encodes the embedding model and the chunk size.
index_name = f'{embedding_model}-mech-demo2-{chunk_size}'
db_service = DatabaseService(
    db_type=db_type,
    index_name=index_name,
    embedding_service=embedding_service,
    rerank_service=rerank_service,
)

In [None]:
# Chunk every bucket's partitioned documents, then split the
# (chunk object, output paths) pairs into two parallel lists.
chunk_results = [
    doc_processor.chunk_documents(bucket_partitions)
    for _, bucket_partitions in zip(buckets, partitioned_docs_all)
]
chunk_obj_all = [chunked for chunked, _ in chunk_results]
output_paths_all = [paths for _, paths in chunk_results]

print(len(chunk_obj_all))

In [None]:
# Print the chunk count for every bucket, then the combined total.
# Works for any number of buckets instead of assuming exactly two.
chunk_counts = [len(chunk_obj.chunks) for chunk_obj in chunk_obj_all]
for n_chunks in chunk_counts:
    print(n_chunks)
print(sum(chunk_counts))

In [None]:
# NOTE(review): clear=True is passed to initialize_database; presumably this
# wipes any existing index with the same name — confirm before re-running.
try:
    db_service.initialize_database(clear=True)
except ValueError as e:
    # Report the failure once, then re-raise the active exception unchanged.
    print(f"Database initialization failed: {e}")
    raise

In [None]:
# Upsert each bucket's chunks into the index, pausing after every bucket.
upsert_batch_size = 50
pause_seconds = 5  # presumably eases API rate limits — confirm
for chunk_obj in chunk_obj_all:
    print(f'Upserting {len(chunk_obj.chunks)} chunks')
    db_service.index_data(chunk_obj, batch_size=upsert_batch_size)
    time.sleep(pause_seconds)

## 2000-token chunks, 0 overlap, OpenAI text-embedding-3-large embedding

In [None]:
# Reload previously partitioned documents from ./document_processing,
# one entry per bucket (zipped so the result stays aligned with `buckets`).
partitioned_docs_all = [
    doc_processor.load_partitioned_documents(
        bucket_docs,
        partition_dir='./document_processing',
    )
    for _, bucket_docs in zip(buckets, docs_all)
]

In [14]:
# Chunking and vector-database configuration for this run
db_type = 'Pinecone'
chunk_size = 2000
chunk_overlap = 0

# Embedding backend: OpenAI 'text-embedding-3-large'
# (alternative: service "Voyage" with model "voyage-3")
embedding_model = 'text-embedding-3-large'
embedding_service = EmbeddingService(
    model_service='OpenAI',
    model=embedding_model,
)

# Reranker: Cohere rerank-v3.5
rerank_model = 'rerank-v3.5'
rerank_service = RerankService(
    model_service='Cohere',
    model=rerank_model,
)

# The work directory name encodes the chunking parameters.
dirname = f'./document_processing_{chunk_size}_{chunk_overlap}'
doc_processor = DocumentProcessor(
    embedding_service=embedding_service,
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    work_dir=dirname,
)

# The index name encodes the embedding model and the chunk size.
index_name = f'{embedding_model}-mech-demo2-{chunk_size}'
db_service = DatabaseService(
    db_type=db_type,
    index_name=index_name,
    embedding_service=embedding_service,
    rerank_service=rerank_service,
)

In [None]:
# Chunk every bucket's partitioned documents, then split the
# (chunk object, output paths) pairs into two parallel lists.
chunk_results = [
    doc_processor.chunk_documents(bucket_partitions)
    for _, bucket_partitions in zip(buckets, partitioned_docs_all)
]
chunk_obj_all = [chunked for chunked, _ in chunk_results]
output_paths_all = [paths for _, paths in chunk_results]

print(len(chunk_obj_all))

In [None]:
# Print the chunk count for every bucket, then the combined total.
# Works for any number of buckets instead of assuming exactly two.
chunk_counts = [len(chunk_obj.chunks) for chunk_obj in chunk_obj_all]
for n_chunks in chunk_counts:
    print(n_chunks)
print(sum(chunk_counts))

In [None]:
# NOTE(review): clear=True is passed to initialize_database; presumably this
# wipes any existing index with the same name — confirm before re-running.
try:
    db_service.initialize_database(clear=True)
except ValueError as e:
    # Report the failure once, then re-raise the active exception unchanged.
    print(f"Database initialization failed: {e}")
    raise

In [None]:
# Upsert each bucket's chunks into the index, pausing after every bucket.
upsert_batch_size = 50
pause_seconds = 5  # presumably eases API rate limits — confirm
for chunk_obj in chunk_obj_all:
    print(f'Upserting {len(chunk_obj.chunks)} chunks')
    db_service.index_data(chunk_obj, batch_size=upsert_batch_size)
    time.sleep(pause_seconds)