In [1]:
import sys
import os
import glob

# Import local packages
# NOTE: this is a relative path, so it is resolved against the kernel's current
# working directory; the imports below only succeed when '../src/aerospace_chatbot'
# exists relative to where the notebook is run.
sys.path.append('../src/aerospace_chatbot')
import queries
import admin
import data_processing

# Load environment variables from the .env file located by find_dotenv().
# override=True lets values from that file replace variables that are already set
# in the process environment.
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv(), override=True)

True

# Setup

## Secrets, Models, Docs

In [2]:
# Gather the API credentials used by the notebook from the environment.
# A variable that is not set is stored as None (os.getenv's default).
secrets = {
    env_name: os.getenv(env_name)
    for env_name in (
        'OPENAI_API_KEY',
        'VOYAGE_API_KEY',
        'PINECONE_API_KEY',
        'HUGGINGFACEHUB_API_TOKEN',
    )
}

In [3]:
# One row per embedding/database combination.
# Row layout: [index_type, query_model, embedding_name, embedding_hf_endpoint]
query_params = [
    ['ChromaDB', 'OpenAI', 'text-embedding-3-large', None],
    ['Pinecone', 'Voyage', 'voyage-large-2', None],
    ['ChromaDB', 'Hugging Face', 'Dedicated Endpoint', 'url'],
]

In [4]:
# Build one embedding (query) model per row of query_params.
# sb is a single settings dict that is overwritten on every pass and handed to
# admin.get_query_model together with the secrets; after the loop it still holds
# the values of the last row.
sb = {}
query_model = []

for database in query_params:
    sb.update(
        index_type=database[0],             # e.g. 'RAGatouille', 'ChromaDB', 'Pinecone'
        query_model=database[1],            # e.g. 'OpenAI', 'Voyage', 'Hugging Face'
        embedding_name=database[2],         # e.g. 'text-embedding-3-large', 'voyage-large-2',
                                            #      'Dedicated Endpoint', 'colbert-ir/colbertv2.0'
        embedding_hf_endpoint=database[3],  # None for rows without a Hugging Face endpoint
    )
    query_model.append(admin.get_query_model(sb, secrets))

# Display the constructed embedding models as the cell output.
query_model

[OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x32cc49a10>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x32d0767d0>, model='text-embedding-3-large', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True),
 VoyageAIEmbeddings(model='voyage-large-2', batch_size=7, show_progress_bar=False, truncation=False, voyage_api_key=SecretStr('**********'), _client=<voyageai.client.Client object at 

In [5]:
# One row per LLM to evaluate.
# Row layout: [llm_source, llm_model, model_options, hf_endpoint]
# Every row gets its own options dict so that changing one does not affect the others.
query_list = [
    [
        'OpenAI',
        'gpt-4o',
        dict(temperature=0.5, output_level=1000),
        None,
    ],
    [
        'Hugging Face',
        'meta-llama/Meta-Llama-3-8B-Instruct',
        dict(temperature=0.5, output_level=1000),
        'https://api-inference.huggingface.co/v1',
    ],
    [
        'Hugging Face',
        'mistralai/Mistral-7B-Instruct-v0.2',
        dict(temperature=0.5, output_level=1000),
        'https://api-inference.huggingface.co/v1',
    ],
]

In [6]:
# Build one LLM per row of query_list.
# sb is not reset here: it is the settings dict created in the embedding-model
# cell, so it keeps that cell's keys and only the LLM-related keys are
# overwritten on each pass.
llm_query = []

for query in query_list:
    sb.update(
        llm_source=query[0],     # e.g. 'OpenAI', 'Hugging Face'
        llm_model=query[1],      # e.g. 'gpt-4o', 'mistralai/Mistral-7B-Instruct-v0.2',
                                 #      'meta-llama/Meta-Llama-3-8B-Instruct'
        model_options=query[2],
        hf_endpoint=query[3],
    )
    llm_query.append(admin.set_llm(sb, secrets))

# Display the constructed LLM objects as the cell output.
llm_query

[ChatOpenAI(tags=['gpt-4o'], client=<openai.resources.chat.completions.Completions object at 0x32d095690>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x32ca292d0>, model_name='gpt-4o', temperature=0.5, openai_api_key=SecretStr('**********'), openai_proxy='', max_tokens=1000),
 ChatOpenAI(tags=['meta-llama/Meta-Llama-3-8B-Instruct'], client=<openai.resources.chat.completions.Completions object at 0x32d0ac910>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x32d0b8390>, model_name='meta-llama/Meta-Llama-3-8B-Instruct', temperature=0.5, openai_api_key=SecretStr('**********'), openai_api_base='https://api-inference.huggingface.co/v1', openai_proxy='', max_tokens=1000),
 ChatOpenAI(tags=['mistralai/Mistral-7B-Instruct-v0.2'], client=<openai.resources.chat.completions.Completions object at 0x32d0bbcd0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x32d0c3710>, model_name='mistralai/Mistral-7B-Instruct

In [7]:
# Folder holding the source documents (relative to the kernel's working directory).
data_folder = '../data/AMS'

# Only get the PDFs directly inside the directory (the pattern is not recursive).
# glob.glob returns matches in arbitrary, filesystem-dependent order, so the list
# is sorted to give the same document order on every run and machine.
docs = sorted(glob.glob(os.path.join(data_folder, '*.pdf')))

## Chunk Params

In [8]:
# Chunking configurations.
# Row layout: [n_merge_pages, chunk_method, chunk_size]
# Note that the first row's chunk_method is the string 'None', not the None object.
chunk_params = [
    [2, 'None', 0],
    [None, 'character-recursive', 400],
]

# Make Databases

## All query models, standard

### Chunk

In [9]:
# RAG flavour for this section and the LLM handed to the chunker
# (no LLM is supplied for this run).
rag_type, summary_llm = 'Standard', None

In [10]:
# Chunk the documents using the first chunk_params row
# ([n_merge_pages, chunk_method, chunk_size]).
first_chunk_setup = chunk_params[0]

chunker = data_processing.chunk_docs(
    docs,
    rag_type=rag_type,
    n_merge_pages=first_chunk_setup[0],
    chunk_method=first_chunk_setup[1],
    chunk_size=first_chunk_setup[2],
    llm=summary_llm,
    show_progress=False,
)

# The result is indexed by 'chunks' and 'pages'; report how many of each were produced.
print(f"Created {len(chunker['chunks'])} chunks from {len(chunker['pages'])} pages.")

### Upload Database

In [15]:
# Which row of query_params (and matching entry of query_model) to upload.
i_run = 0

print(f"Creating and uploading database with these params: {query_params[i_run]}")

# Index name layout, lower-cased:
#   <embedding name with '/' and ' ' replaced by '-'>-<n_merge_pages>merge-<chunk_method>-<chunk_size>
# The chunking part always comes from the first chunk_params row, the same row
# the chunker cell uses.
index_appendix = '-'.join([
    str(chunk_params[0][0]) + 'merge',
    chunk_params[0][1],
    str(chunk_params[0][2]),
])
embedding_slug = query_params[i_run][2].replace('/', '-').replace(' ', '-')
index_name = '-'.join([embedding_slug, index_appendix]).lower()

# Set index names for special databases (rag_type holds a single value, so at
# most one suffix is applied). The Summary suffix reads summary_llm.model_name,
# so summary_llm must be a real model object for that RAG type.
if rag_type == 'Parent-Child':
    index_name += '-parent-child'
elif rag_type == 'Summary':
    index_name += summary_llm.model_name.replace('/', '-') + '-summary'

db_type = query_params[i_run][0]

# NOTE(review): clear=True presumably wipes an existing index of the same name
# before it is re-populated - confirm against data_processing.initialize_database.
vectorstore = data_processing.initialize_database(
    db_type,
    index_name,
    query_model[i_run],
    rag_type=rag_type,
    clear=True,
    local_db_path=os.getenv('LOCAL_DB_PATH'),
    init_ragatouille=True,
    show_progress=False,
)

# Push the chunked documents into the freshly initialised store; the second
# return value is not needed here.
vectorstore, _ = data_processing.upsert_docs(
    db_type,
    index_name,
    vectorstore,
    chunker,
    batch_size=400,
    show_progress=False,
    local_db_path=os.getenv('LOCAL_DB_PATH'),
)

['ChromaDB', 'OpenAI', 'text-embedding-3-large', None]
