In [1]:
import sys, json, os

# Import local packages
sys.path.append('../src/aerospace_chatbot')
from aerospace_chatbot.services import EmbeddingService, DatabaseService
from aerospace_chatbot.core.config import get_secrets
from aerospace_chatbot.processing import DocumentProcessor
from aerospace_chatbot.core.config import setup_logging

# Send log output to <repo>/logs/db_make.log. The path is derived from the
# working directory rather than a user-specific absolute path, so the notebook
# is not tied to one machine.
# NOTE(review): assumes the kernel's working directory is one level below the
# repository root (the same layout the '../src/aerospace_chatbot' sys.path
# entry relies on) — confirm.
log_file = os.path.abspath(os.path.join('..', 'logs', 'db_make.log'))
# Create the logs directory if it is missing so a log file can be written
# there on a fresh checkout.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
# Set before calling setup_logging(); presumably that helper reads LOG_FILE — verify.
os.environ['LOG_FILE'] = log_file
logger = setup_logging()
logger.info("Starting to make databases")

2024-11-23 21:48:00,747 - INFO - Logging configured successfully
2024-11-23 21:48:00,748 - INFO - Starting to make databases


In [2]:
# Fetch secrets through the project helper; the result is kept in `secrets`.
secrets = get_secrets()

# Build settings are read from databases.json, resolved against the working directory.
json_file_path = "databases.json"
with open(json_file_path, "r") as json_file:
    setup_data = json.load(json_file)

# sb holds the entries of 'make_params' from databases.json, one per database to build.
sb = list(setup_data['make_params'])

# Make Databases

In [3]:
# For every parameter set in `sb`, walk its buckets: list the PDFs, build the
# services, initialize the database, chunk the documents and index the result.
for make_params in sb:
    print(f"Processing {make_params}")
    for bucket_name in make_params['bucket_names']:
        logger.info(f"Processing {bucket_name}")
        # PDFs listed for this bucket
        docs = DocumentProcessor.list_bucket_pdfs(bucket_name)

        # Services are constructed fresh on every bucket iteration
        logger.info("Initializing embedding service")
        embedding_service = EmbeddingService(
            model_service=make_params['embedding_service'],
            model=make_params['embedding_model'],
        )

        # Index name is "<embedding_model>-<index_appendix>"
        logger.info("Initializing database service")
        index_name = '-'.join(
            [make_params['embedding_model'], make_params['index_appendix']]
        )
        db_service = DatabaseService(
            db_type=make_params['index_type'],
            index_name=index_name,
            rag_type=make_params['rag_type'],
            embedding_service=embedding_service,
            doc_type='document',
        )

        # Chunking options are optional; absent keys are passed through as None.
        logger.info("Initializing document processor")
        chunk_params = make_params['chunk_params']
        doc_processor = DocumentProcessor(
            embedding_service=embedding_service,
            rag_type=make_params['rag_type'],
            chunk_method=chunk_params.get('chunk_method'),
            chunk_size=chunk_params.get('chunk_size'),
            chunk_overlap=chunk_params.get('chunk_overlap'),
            # config key is 'n_merge_pages'; the constructor argument is merge_pages
            merge_pages=chunk_params.get('n_merge_pages'),
        )

        # Database is initialized before any documents are processed
        logger.info("Initializing database")
        db_service.initialize_database()

        # Chunk the listed documents
        logger.info("Processing documents")
        chunking_result = doc_processor.process_documents(documents=docs)

        # Hand the chunking result to the database service with batch_size=500
        logger.info("Indexing documents")
        db_service.index_data(data=chunking_result, batch_size=500)

Processing {'index_appendix': 'mech-demo-500rec-nc', 'index_type': 'Pinecone', 'rag_type': 'Standard', 'embedding_service': 'Voyage', 'embedding_model': 'voyage-3', 'chunk_params': {'chunk_method': 'character_recursive', 'chunk_size': 500, 'chunk_overlap': 0}, 'bucket_names': ['ams_pdfs', 'esmats_pdfs']}
2024-11-23 21:48:00,759 - INFO - Processing ams_pdfs
2024-11-23 21:48:02,435 - INFO - Number of PDFs found: 47
2024-11-23 21:48:02,437 - INFO - PDFs found: ['gs://ams_pdfs/AMS_1966_reocr.pdf', 'gs://ams_pdfs/AMS_1967_reocr.pdf', 'gs://ams_pdfs/AMS_1968_reocr.pdf', 'gs://ams_pdfs/AMS_1969_reocr.pdf', 'gs://ams_pdfs/AMS_1970_reocr.pdf', 'gs://ams_pdfs/AMS_1971_reocr.pdf', 'gs://ams_pdfs/AMS_1972_reocr.pdf', 'gs://ams_pdfs/AMS_1973_reocr.pdf', 'gs://ams_pdfs/AMS_1974_reocr.pdf', 'gs://ams_pdfs/AMS_1976_reocr.pdf', 'gs://ams_pdfs/AMS_1977_reocr.pdf', 'gs://ams_pdfs/AMS_1978_reocr.pdf', 'gs://ams_pdfs/AMS_1979_reocr.pdf', 'gs://ams_pdfs/AMS_1980_reocr.pdf', 'gs://ams_pdfs/AMS_1981_reocr.pdf