In [3]:
import sys
import os
import glob
import json

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
import admin
import data_processing

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(), override=True)

True

# Setup

In [4]:
# Make a function to split chunking from upserting
def create_upsert(index_type,index_name,query_model,rag_type,chunker,summary_llm):
    """Create a database for already-chunked documents and upsert the chunks into it.

    The index name is extended for the special RAG types so that each variant
    gets its own database:

    * 'Parent-Child' -> '<index_name>-parent-child'
    * 'Summary'      -> '<index_name>-<summary model name, slugified>-summary'

    The database is initialized with clear=True and then filled from `chunker`.
    If either step fails, the (possibly partially built) index is deleted on a
    best-effort basis and the error is printed; the exception is not re-raised,
    so a caller looping over several configurations keeps going.

    Args:
        index_type: Database backend identifier, passed through to data_processing.
        index_name: Base name of the index, before any RAG-type suffix.
        query_model: Embedding/query model object passed to initialize_database.
        rag_type: RAG type string; 'Parent-Child' and 'Summary' alter the index name.
        chunker: Result of data_processing.chunk_docs for the same rag_type.
        summary_llm: LLM used for summaries. Must expose `model_name` when
            rag_type is 'Summary'; unused otherwise.

    Returns:
        None. Progress and errors are reported with print().

    Raises:
        ValueError: If rag_type is 'Summary' and summary_llm is None.
    """
    # Set index names for special databases
    if rag_type == 'Parent-Child':
        index_name = index_name + '-parent-child'
    elif rag_type == 'Summary':
        if summary_llm is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("summary_llm is required when rag_type is 'Summary'.")
        model_slug = summary_llm.model_name.replace('/', '-').replace(' ','-').lower()
        index_name = index_name + '-' + model_slug + '-summary'

    # Read once so creation, upsert and cleanup all target the same location.
    local_db_path = os.getenv('LOCAL_DB_PATH')

    try:
        vectorstore = data_processing.initialize_database(index_type,
                                            index_name,
                                            query_model,
                                            rag_type=rag_type,
                                            clear=True,
                                            local_db_path=local_db_path,
                                            init_ragatouille=True,
                                            show_progress=False)
        print(f"Database {index_name} created.")
        vectorstore, _ = data_processing.upsert_docs(index_type,
                                        index_name,
                                        vectorstore,
                                        chunker,
                                        batch_size=400,
                                        show_progress=False,
                                        local_db_path=local_db_path)
        print(f"Database {index_name} upserted chunks.")
    except Exception as e:  # If there is an error, be sure to delete the database
        try:
            data_processing.delete_index(index_type,
                                        index_name,
                                        rag_type,
                                        local_db_path=local_db_path)
            print(f"Database deleted: {index_name}")
        except Exception as cleanup_error:
            # Cleanup is best effort: a failure here must not hide the original
            # error or abort a caller that is iterating over configurations.
            print(f"Database cleanup failed for {index_name}: {cleanup_error}")
        print(f"Error: {e}")

## Secrets, Models, Docs, Params

In [5]:
# Set secrets
# Gather the API credentials from the environment into a single dict.
# os.getenv yields None for any variable that is not set.
sb = {}
secrets = {
    key_name: os.getenv(key_name)
    for key_name in (
        'OPENAI_API_KEY',
        'VOYAGE_API_KEY',
        'PINECONE_API_KEY',
        'HUGGINGFACEHUB_API_TOKEN',
    )
}

In [6]:
# Read setup data, assign models
# Load the JSON configuration and build one model object per configured entry.
json_file_path = "databases.json"
with open(json_file_path, "r") as json_file:
    setup_data = json.load(json_file)

# Settings dict handed to the admin helpers. Each entry's keys are merged into
# it before the corresponding model is created.
# NOTE(review): sb is never reset, so keys set by an earlier entry remain
# visible to later entries (including the LLM entries) - confirm this is intended.
sb = {}

query_params = setup_data['query_models']
query_models = []
for model in query_params:
    sb.update(model)
    query_models.append(admin.get_query_model(sb, secrets))

llm_params = setup_data['llms']
llms = []
for model in llm_params:
    sb.update(model)
    llms.append(admin.set_llm(sb, secrets))

chunk_params = setup_data['chunk_params']

Extra query model configurations that take a long time to run. To include them, add the following entries to the `databases.json` file:

{
    "id": "3",
    "index_type": "ChromaDB",
    "query_model": "Hugging Face",
    "embedding_name": "Dedicated Endpoint",
    "embedding_hf_endpoint": "https://d95tsnjp6nub114k.us-east4.gcp.endpoints.huggingface.cloud"
},
{
    "id": "4",
    "index_type": "RAGatouille",
    "embedding_name": "colbert-ir/colbertv2.0"
}

In [7]:
# Get docs
data_folder='../data/AMS'
# glob returns matches in a filesystem-dependent order; sort them so the
# documents are processed in the same order on every run and machine.
docs = sorted(glob.glob(os.path.join(data_folder,'*.pdf')))   # Only get the PDFs in the directory

# Make Databases

## All query models, standard

In [6]:
# Settings for this section: 'Standard' RAG type, with no LLM passed to chunking.
rag_type = 'Standard'
summary_llm = None

In [7]:
# Chunk the documents once per chunking configuration, then create and fill
# one database per query model from that same set of chunks.
for i_chunk, chunk_settings in enumerate(chunk_params):
    # Chunk the docs before creating and upserting into the database
    chunker = data_processing.chunk_docs(
        docs,
        rag_type=rag_type,
        n_merge_pages=chunk_settings['n_merge_pages'],
        chunk_method=chunk_settings['chunk_method'],
        chunk_size=chunk_settings['chunk_size'],
        llm=summary_llm,
        show_progress=False,
    )

    print(f"Created {len(chunker['chunks'])} chunks from {len(chunker['pages'])} pages.")

    for i_run, run_settings in enumerate(query_params):
        # Create and upsert database
        print(f"Creating and uploading database with these params: {run_settings}")

        # Index name: "<embedding name>-<n_merge_pages>merge-<chunk_size>", lower-cased,
        # with '/' and ' ' in the embedding name replaced by '-'.
        index_appendix = f"{chunk_settings['n_merge_pages']}merge-{chunk_settings['chunk_size']}"
        embedding_slug = run_settings['embedding_name'].replace('/', '-').replace(' ', '-')
        index_name = f"{embedding_slug}-{index_appendix}".lower()

        create_upsert(
            run_settings['index_type'],
            index_name,
            query_models[i_run],
            rag_type,
            chunker,
            summary_llm,
        )

Created 2222 chunks from 2222 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-2merge-0 created.
Database text-embedding-3-large-2merge-0 upserted chunks.
Creating and uploading database with these params: {'id': '2', 'index_type': 'Pinecone', 'query_model': 'Voyage', 'embedding_name': 'voyage-large-2'}
Database voyage-large-2-2merge-0 created.
Database voyage-large-2-2merge-0 upserted chunks.
Created 33478 chunks from 4439 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-0merge-400 created.
Database text-embedding-3-large-0merge-400 upserted chunks.
Creating and uploading database with these params: {'id': '2', 'index_type': 'Pinecone', 'query_model': 'Voyage', 'embedding_name': 'voyage-large-2

## OpenAI text-embedding-3-large, parent-child, 400 character-recursive chunk

In [8]:
# Settings for this section: 'Parent-Child' RAG type, with no LLM passed to chunking.
rag_type = 'Parent-Child'
summary_llm = None

# Positions in chunk_params / query_params of the configuration to build.
i_chunk = 1   # 400 character-recursive setting
i_run = 0     # OpenAI text-embedding-3-large

In [9]:
# Chunk the documents with the selected chunking configuration.
chunk_settings = chunk_params[i_chunk]
chunker = data_processing.chunk_docs(
    docs,
    rag_type=rag_type,
    n_merge_pages=chunk_settings['n_merge_pages'],
    chunk_method=chunk_settings['chunk_method'],
    chunk_size=chunk_settings['chunk_size'],
    llm=summary_llm,
    show_progress=False,
)

# NOTE(review): the recorded output of this cell reports "2 pages", which suggests
# chunker['pages'] is not a flat list of pages for this RAG type - verify before
# relying on the page count printed here.
print(f"Created {len(chunker['chunks'])} chunks from {len(chunker['pages'])} pages.")

# Create and upsert database
run_settings = query_params[i_run]
print(f"Creating and uploading database with these params: {run_settings}")

# Index name: "<embedding name>-<n_merge_pages>merge-<chunk_size>", lower-cased,
# with '/' and ' ' in the embedding name replaced by '-'.
index_appendix = f"{chunk_settings['n_merge_pages']}merge-{chunk_settings['chunk_size']}"
embedding_slug = run_settings['embedding_name'].replace('/', '-').replace(' ', '-')
index_name = f"{embedding_slug}-{index_appendix}".lower()

create_upsert(
    run_settings['index_type'],
    index_name,
    query_models[i_run],
    rag_type,
    chunker,
    summary_llm,
)

Created 36148 chunks from 2 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-0merge-400-parent-child created.
Database text-embedding-3-large-0merge-400-parent-child upserted chunks.


## OpenAI text-embedding-3-large, summary, 2 page merge, no chunk

In [8]:
# Settings for this section: 'Summary' RAG type, using one of the configured LLMs.
rag_type = 'Summary'
# NOTE(review): positional lookup - this depends on the order of the LLM entries
# loaded from the setup data; confirm index 4 is still the intended model.
summary_llm = llms[4] # mistralai/Mistral-7B-Instruct-v0.2 serverless

# Positions in chunk_params / query_params of the configuration to build.
i_chunk = 0   # 2 page merge, no chunk
i_run = 0     # OpenAI text-embedding-3-large

In [9]:
# Chunk the documents with the selected chunking configuration; the LLM is
# passed to chunk_docs for this RAG type.
chunk_settings = chunk_params[i_chunk]
chunker = data_processing.chunk_docs(
    docs,
    rag_type=rag_type,
    n_merge_pages=chunk_settings['n_merge_pages'],
    chunk_method=chunk_settings['chunk_method'],
    chunk_size=chunk_settings['chunk_size'],
    llm=summary_llm,
    show_progress=False,
)

print(f"Created {len(chunker['summaries'])} summaries from {len(chunker['pages'])} pages.")

# Create and upsert database
run_settings = query_params[i_run]
print(f"Creating and uploading database with these params: {run_settings}")

# Index name: "<embedding name>-<n_merge_pages>merge-<chunk_size>", lower-cased,
# with '/' and ' ' in the embedding name replaced by '-'.
index_appendix = f"{chunk_settings['n_merge_pages']}merge-{chunk_settings['chunk_size']}"
embedding_slug = run_settings['embedding_name'].replace('/', '-').replace(' ', '-')
index_name = f"{embedding_slug}-{index_appendix}".lower()

create_upsert(
    run_settings['index_type'],
    index_name,
    query_models[i_run],
    rag_type,
    chunker,
    summary_llm,
)

KeyboardInterrupt: 