In [14]:
import sys
import os
import glob
import json

# Import local packages
sys.path.append('../src/aerospace_chatbot')
import queries
import admin
import data_processing

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(), override=True)

True

# Setup

In [27]:
# Make a function to split chunking from upserting
def create_upsert(index_type,index_name,query_model,rag_type,chunker,summary_llm):
    """Create a fresh database and upsert already-chunked documents into it.

    Chunking is done by the caller so the same ``chunker`` result can be
    reused for several databases.

    Args:
        index_type: Database backend identifier, forwarded to ``data_processing``.
        index_name: Base index name. ``'-parent-child'`` is appended when
            ``rag_type == 'Parent-Child'``; a sanitized, lower-cased
            ``summary_llm.model_name`` plus ``'-summary'`` is appended when
            ``rag_type == 'Summary'``.
        query_model: Embedding/query model forwarded to
            ``data_processing.initialize_database``.
        rag_type: RAG mode; forwarded to the database calls and used for naming.
        chunker: Output of ``data_processing.chunk_docs``; forwarded to
            ``data_processing.upsert_docs``.
        summary_llm: LLM whose ``model_name`` is used for naming in ``'Summary'``
            mode. Only accessed when ``rag_type == 'Summary'``.

    Returns:
        None. Progress and errors are reported with ``print``.

    Notes:
        This is best-effort: any exception raised while creating or upserting
        is caught, the index is deleted, and the error is printed instead of
        being re-raised, so a loop over many configurations keeps going.
    """
    # Set index names for special databases
    if rag_type == 'Parent-Child':
        index_name = index_name + '-parent-child'
    if rag_type == 'Summary':
        index_name = index_name + '-' + summary_llm.model_name.replace('/', '-').replace(' ','-').lower() + '-summary'

    # Read the environment once so creation, upsert and cleanup all use the same path.
    local_db_path = os.getenv('LOCAL_DB_PATH')

    try:
        vectorstore = data_processing.initialize_database(index_type,
                                            index_name,
                                            query_model,
                                            rag_type=rag_type,
                                            clear=True,
                                            local_db_path=local_db_path,
                                            init_ragatouille=True,
                                            show_progress=False)
        print(f"Database {index_name} created.")
        vectorstore, _ = data_processing.upsert_docs(index_type,
                                        index_name,
                                        vectorstore,
                                        chunker,
                                        batch_size=400,
                                        show_progress=False,
                                        local_db_path=local_db_path)
        print(f"Database {index_name} upserted chunks.")
    except Exception as e:  # If there is an error, be sure to delete the database
        # Cleanup is guarded separately: a failing delete must not hide the
        # original error, which is always reported below.
        try:
            data_processing.delete_index(index_type,
                                        index_name,
                                        rag_type,
                                        local_db_path=local_db_path)
            print(f"Database deleted: {index_name}")
        except Exception as cleanup_error:
            print(f"Database could not be deleted: {index_name} ({cleanup_error})")
        print(f"Error: {e}")

## Secrets, Models, Docs, Params

In [16]:
# Set secrets: read each API key from the environment (None when a variable is unset)
sb = {}
secrets = {
    key_name: os.getenv(key_name)
    for key_name in (
        'OPENAI_API_KEY',
        'VOYAGE_API_KEY',
        'PINECONE_API_KEY',
        'HUGGINGFACEHUB_API_TOKEN',
    )
}

In [21]:
# Read setup data, assign models
json_file_path = "databases.json"
with open(json_file_path, "r") as json_file:
    setup_data = json.load(json_file)

# NOTE(review): `sb` is shared by every entry below, so keys set by an earlier
# entry remain visible to later ones unless overwritten — confirm this is intended.
sb = {}

# One query (embedding) model per entry in 'query_models'
query_params = setup_data['query_models']
query_models = []
for model in query_params:
    sb.update(model)
    query_models.append(admin.get_query_model(sb, secrets))

# One LLM per entry in 'llms'
llm_params = setup_data['llms']
llms = []
for model in llm_params:
    sb.update(model)
    llms.append(admin.set_llm(sb, secrets))

# Chunking configurations are used as-is by the database cells
chunk_params = setup_data['chunk_params']



Extra query models that take a long time to run. To include them, add the following entries to the `query_models` list in the `databases.json` file:

{
    "id": "3",
    "index_type": "ChromaDB",
    "query_model": "Hugging Face",
    "embedding_name": "Dedicated Endpoint",
    "embedding_hf_endpoint": "https://d95tsnjp6nub114k.us-east4.gcp.endpoints.huggingface.cloud"
},
{
    "id": "4",
    "index_type": "RAGatouille",
    "embedding_name": "colbert-ir/colbertv2.0"
}

In [18]:
# Get docs
data_folder='../data/AMS'
# Only get the PDFs in the directory. glob.glob returns matches in arbitrary
# order, so sort them to keep document/chunk order reproducible between runs.
docs = sorted(glob.glob(os.path.join(data_folder,'*.pdf')))

# Make Databases

## All query models, standard

In [19]:
# Standard mode: summary_llm is None, so no LLM is handed to the chunker
rag_type, summary_llm = 'Standard', None

In [20]:
for i_chunk, chunk_cfg in enumerate(chunk_params):
    # Chunk once per chunking configuration; the result is reused for every query model
    chunker = data_processing.chunk_docs(
        docs,
        rag_type=rag_type,
        n_merge_pages=chunk_cfg['n_merge_pages'],
        chunk_method=chunk_cfg['chunk_method'],
        chunk_size=chunk_cfg['chunk_size'],
        llm=summary_llm,
        show_progress=False,
    )

    print(f"Created {len(chunker['chunks'])} chunks from {len(chunker['pages'])} pages.")

    for i_run, (run_cfg, query_model) in enumerate(zip(query_params, query_models)):
        # Create and upsert database
        print(f"Creating and uploading database with these params: {run_cfg}")

        # Index name: sanitized embedding name + "<n_merge_pages>merge-<chunk_size>", lower-cased
        index_appendix = f"{chunk_cfg['n_merge_pages']}merge-{chunk_cfg['chunk_size']}"
        embedding_slug = run_cfg['embedding_name'].replace('/', '-').replace(' ', '-')
        index_name = f"{embedding_slug}-{index_appendix}".lower()

        create_upsert(
            run_cfg['index_type'],
            index_name,
            query_model,
            rag_type,
            chunker,
            summary_llm,
        )

Created 6 chunks from 6 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-2merge-0 created.
Database text-embedding-3-large-2merge-0 upserted with 6 chunks.
Creating and uploading database with these params: {'id': '2', 'index_type': 'Pinecone', 'query_model': 'Voyage', 'embedding_name': 'voyage-large-2'}
Database voyage-large-2-2merge-0 created.
Database voyage-large-2-2merge-0 upserted with 6 chunks.
Creating and uploading database with these params: {'id': '3', 'index_type': 'ChromaDB', 'query_model': 'Hugging Face', 'embedding_name': 'Dedicated Endpoint', 'embedding_hf_endpoint': 'https://d95tsnjp6nub114k.us-east4.gcp.endpoints.huggingface.cloud'}
Database dedicated-endpoint-2merge-0 created.
Database deleted: dedicated-endpoint-2merge-0
Error: RetryError[<Future at 0x3303b4610 state=finished raised KeyError>]
Creating and uploading data

100%|██████████| 2/2 [00:03<00:00,  1.65s/it]

[Jul 02, 20:21:10] [0] 		 avg_doclen_est = 184.15789794921875 	 len(local_sample) = 38
[Jul 02, 20:21:10] [0] 		 Creating 1,024 partitions.
[Jul 02, 20:21:10] [0] 		 *Estimated* 6,998 embeddings.
[Jul 02, 20:21:10] [0] 		 #> Saving the indexing plan to /Users/danmueller/Documents/GitHub/aerospace_chatbot/db/.ragatouille/colbert/indexes/colbert-ir-colbertv2.0-2merge-0/plan.json ..





used 18 iterations (0.336s) to cluster 6649 items into 1024 clusters
[0.034, 0.037, 0.032, 0.032, 0.035, 0.035, 0.033, 0.033, 0.035, 0.032, 0.034, 0.034, 0.033, 0.034, 0.032, 0.037, 0.028, 0.034, 0.032, 0.034, 0.031, 0.032, 0.032, 0.037, 0.031, 0.033, 0.034, 0.034, 0.035, 0.035, 0.035, 0.041, 0.037, 0.033, 0.036, 0.032, 0.036, 0.036, 0.034, 0.041, 0.035, 0.034, 0.031, 0.037, 0.034, 0.032, 0.033, 0.037, 0.035, 0.033, 0.031, 0.037, 0.036, 0.035, 0.03, 0.037, 0.041, 0.035, 0.043, 0.034, 0.032, 0.038, 0.035, 0.034, 0.034, 0.034, 0.037, 0.04, 0.03, 0.032, 0.036, 0.034, 0.033, 0.034, 0.034, 0.033, 0.035, 0.033, 0.038, 0.035, 0.035, 0.03, 0.034, 0.034, 0.032, 0.035, 0.036, 0.032, 0.034, 0.036, 0.032, 0.036, 0.033, 0.034, 0.033, 0.036, 0.037, 0.033, 0.035, 0.035, 0.032, 0.038, 0.038, 0.038, 0.037, 0.03, 0.034, 0.031, 0.033, 0.031, 0.035, 0.035, 0.035, 0.03, 0.035, 0.032, 0.033, 0.035, 0.033, 0.033, 0.035, 0.03, 0.036, 0.036, 0.031, 0.036, 0.034, 0.031]


0it [00:00, ?it/s]

[Jul 02, 20:21:10] [0] 		 #> Encoding 38 passages..


100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
1it [00:02,  2.38s/it]
100%|██████████| 1/1 [00:00<00:00, 1140.07it/s]

[Jul 02, 20:21:13] #> Optimizing IVF to store map from centroids to list of pids..
[Jul 02, 20:21:13] #> Building the emb2pid mapping..
[Jul 02, 20:21:13] len(emb2pid) = 6998



100%|██████████| 1024/1024 [00:00<00:00, 91444.54it/s]

[Jul 02, 20:21:13] #> Saved optimized IVF to /Users/danmueller/Documents/GitHub/aerospace_chatbot/db/.ragatouille/colbert/indexes/colbert-ir-colbertv2.0-2merge-0/ivf.pid.pt





Done indexing!
Database colbert-ir-colbertv2.0-2merge-0 upserted with 6 chunks.
Created 111 chunks from 12 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-0merge-400 created.
Database text-embedding-3-large-0merge-400 upserted with 111 chunks.
Creating and uploading database with these params: {'id': '2', 'index_type': 'Pinecone', 'query_model': 'Voyage', 'embedding_name': 'voyage-large-2'}
Database voyage-large-2-0merge-400 created.
Database voyage-large-2-0merge-400 upserted with 111 chunks.
Creating and uploading database with these params: {'id': '3', 'index_type': 'ChromaDB', 'query_model': 'Hugging Face', 'embedding_name': 'Dedicated Endpoint', 'embedding_hf_endpoint': 'https://d95tsnjp6nub114k.us-east4.gcp.endpoints.huggingface.cloud'}
Database dedicated-endpoint-0merge-400 created.
Database dedicated-endpoint-0merge-400 upserted wi

100%|██████████| 4/4 [00:03<00:00,  1.09it/s]

[Jul 02, 20:21:40] [0] 		 avg_doclen_est = 55.180179595947266 	 len(local_sample) = 111
[Jul 02, 20:21:40] [0] 		 Creating 1,024 partitions.
[Jul 02, 20:21:40] [0] 		 *Estimated* 6,124 embeddings.
[Jul 02, 20:21:40] [0] 		 #> Saving the indexing plan to /Users/danmueller/Documents/GitHub/aerospace_chatbot/db/.ragatouille/colbert/indexes/colbert-ir-colbertv2.0-0merge-400/plan.json ..





used 15 iterations (0.1901s) to cluster 5819 items into 1024 clusters
[0.04, 0.041, 0.035, 0.037, 0.038, 0.042, 0.038, 0.037, 0.037, 0.039, 0.039, 0.039, 0.035, 0.041, 0.035, 0.043, 0.035, 0.039, 0.037, 0.041, 0.038, 0.037, 0.037, 0.04, 0.037, 0.041, 0.041, 0.043, 0.041, 0.04, 0.037, 0.041, 0.041, 0.037, 0.038, 0.036, 0.045, 0.04, 0.041, 0.047, 0.04, 0.04, 0.041, 0.039, 0.041, 0.035, 0.036, 0.046, 0.04, 0.04, 0.034, 0.04, 0.042, 0.041, 0.038, 0.041, 0.042, 0.038, 0.042, 0.039, 0.036, 0.042, 0.039, 0.039, 0.042, 0.039, 0.042, 0.041, 0.034, 0.041, 0.041, 0.039, 0.036, 0.042, 0.04, 0.043, 0.04, 0.04, 0.04, 0.042, 0.037, 0.04, 0.041, 0.042, 0.037, 0.043, 0.041, 0.036, 0.038, 0.04, 0.038, 0.043, 0.04, 0.039, 0.041, 0.039, 0.042, 0.044, 0.038, 0.039, 0.04, 0.044, 0.045, 0.039, 0.045, 0.037, 0.037, 0.036, 0.041, 0.036, 0.037, 0.043, 0.041, 0.036, 0.04, 0.04, 0.035, 0.038, 0.04, 0.043, 0.038, 0.038, 0.039, 0.042, 0.037, 0.041, 0.04, 0.04]


0it [00:00, ?it/s]

[Jul 02, 20:21:40] [0] 		 #> Encoding 111 passages..


100%|██████████| 4/4 [00:03<00:00,  1.07it/s]
1it [00:03,  3.79s/it]
100%|██████████| 1/1 [00:00<00:00, 1508.20it/s]

[Jul 02, 20:21:44] #> Optimizing IVF to store map from centroids to list of pids..
[Jul 02, 20:21:44] #> Building the emb2pid mapping..
[Jul 02, 20:21:44] len(emb2pid) = 6125



100%|██████████| 1024/1024 [00:00<00:00, 116562.20it/s]

[Jul 02, 20:21:44] #> Saved optimized IVF to /Users/danmueller/Documents/GitHub/aerospace_chatbot/db/.ragatouille/colbert/indexes/colbert-ir-colbertv2.0-0merge-400/ivf.pid.pt
Done indexing!
Database colbert-ir-colbertv2.0-0merge-400 upserted with 111 chunks.





## OpenAI text-embedding-3-large, parent-child, 400 character-recursive chunk

In [12]:
# Parent-child mode: summary_llm is None, so no LLM is handed to the chunker
rag_type, summary_llm = 'Parent-Child', None

# Indices into chunk_params / query_params selecting the single configuration to build
i_chunk = 1   # 400 character-recursive setting
i_run = 0     # OpenAI text-embedding-3-large

In [13]:
# Look up the selected chunking and query-model configurations once
chunk_cfg = chunk_params[i_chunk]
run_cfg = query_params[i_run]

chunker = data_processing.chunk_docs(
    docs,
    rag_type=rag_type,
    n_merge_pages=chunk_cfg['n_merge_pages'],
    chunk_method=chunk_cfg['chunk_method'],
    chunk_size=chunk_cfg['chunk_size'],
    llm=summary_llm,
    show_progress=False,
)

print(f"Created {len(chunker['chunks'])} chunks from {len(chunker['pages'])} pages.")

# Create and upsert database
print(f"Creating and uploading database with these params: {run_cfg}")

# Index name: sanitized embedding name + "<n_merge_pages>merge-<chunk_size>", lower-cased
index_appendix = f"{chunk_cfg['n_merge_pages']}merge-{chunk_cfg['chunk_size']}"
embedding_slug = run_cfg['embedding_name'].replace('/', '-').replace(' ', '-')
index_name = f"{embedding_slug}-{index_appendix}".lower()

create_upsert(
    run_cfg['index_type'],
    index_name,
    query_models[i_run],
    rag_type,
    chunker,
    summary_llm,
)

Created 113 chunks from 2 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-0merge-400-parent-child created.
Database text-embedding-3-large-0merge-400-parent-child upserted with 113 chunks.


## OpenAI text-embedding-3-large, summary, 2 page merge, no chunk

In [28]:
# Summary mode: the fourth LLM built from the setup file is handed to the chunker
rag_type, summary_llm = 'Summary', llms[3]

# Indices into chunk_params / query_params selecting the single configuration to build
i_chunk = 0   # 2 page merge, no chunk
i_run = 0     # OpenAI text-embedding-3-large

In [29]:
# Look up the selected chunking and query-model configurations once
chunk_cfg = chunk_params[i_chunk]
run_cfg = query_params[i_run]

chunker = data_processing.chunk_docs(
    docs,
    rag_type=rag_type,
    n_merge_pages=chunk_cfg['n_merge_pages'],
    chunk_method=chunk_cfg['chunk_method'],
    chunk_size=chunk_cfg['chunk_size'],
    llm=summary_llm,
    show_progress=False,
)

# This cell reports the 'summaries' entry of the chunker output rather than 'chunks'
print(f"Created {len(chunker['summaries'])} summaries from {len(chunker['pages'])} pages.")

# Create and upsert database
print(f"Creating and uploading database with these params: {run_cfg}")

# Index name: sanitized embedding name + "<n_merge_pages>merge-<chunk_size>", lower-cased
index_appendix = f"{chunk_cfg['n_merge_pages']}merge-{chunk_cfg['chunk_size']}"
embedding_slug = run_cfg['embedding_name'].replace('/', '-').replace(' ', '-')
index_name = f"{embedding_slug}-{index_appendix}".lower()

create_upsert(
    run_cfg['index_type'],
    index_name,
    query_models[i_run],
    rag_type,
    chunker,
    summary_llm,
)

Created 6 summaries from 2 pages.
Creating and uploading database with these params: {'id': '1', 'index_type': 'ChromaDB', 'query_model': 'OpenAI', 'embedding_name': 'text-embedding-3-large'}
Database text-embedding-3-large-2merge-0-dedicated-endpoint-summary created.
Database deleted: text-embedding-3-large-2merge-0-dedicated-endpoint-summary
Error: 'chunks'
