In [1]:
# Confirm you're using the correct interpreter:
# prints the path of the Python executable that is running this kernel.
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-v1/bin/python


In [3]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315
# %pip install --upgrade docarray
# %pip install python-dotenv
# %pip install --upgrade wandb
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  # this requires python>=3.9
#

## 0. Imports and Configs


In [4]:
import streamlit as st
import openai
from langchain.embeddings import OpenAIEmbeddings
# Required for langchain.embeddings.OpenAIEmbeddings.
# The OpenAI client must be given the OpenAI key; the Qdrant key is only for QdrantClient.
# NOTE(review): assumes the Streamlit secrets file defines OPENAI_API_KEY — confirm.

openai.api_key = st.secrets["OPENAI_API_KEY"]

In [5]:

# Settings shared by the cells below: the embedding interface and the retrieval parameters.
config = {
    "embedding": OpenAIEmbeddings(),
    "search_type": "mmr",
    "fetch_k": 20,  # number of documents to pass to the search algorithm (e.g., mmr)
    "k": 5,  # number of documents from fetch to pass to the LLM for inference
    "lambda_mult": 0.7,  # 0 = max diversity, 1 = max relevance; default is 0.5
    "score_threshold": 0.5,  # for similarity score
    # Fixed: the value used to be "gpt-3.5-turbo-16k_" (stray trailing underscore),
    # which does not match either model id listed as an option here.
    "model": "gpt-3.5-turbo-16k",  # alternative: gpt-4
    "temperature": 0.7,
}

In [6]:
# Name of the Qdrant collection passed to the Qdrant vector store further down.
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
# NOTE(review): absolute, machine-specific path; no cell visible here reads qdrant_path — confirm it is still needed.
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
# qdrant_path = "/tmp/local_qdrant"

## 1. Connect to Vector Store


#### Initialize a Qdrant service entrypoint


In [7]:
from qdrant_client import QdrantClient
from langchain.vectorstores import Qdrant

'''
If you receive an error that the client is already open, first use these commands to close it and then run the code again

del qdrant
client.close()    # Release the database from this process
del client

'''


# Re-running this cell keeps whatever `client` is already defined in the kernel;
# a new QdrantClient is only built when no such global exists yet.
if 'client' in globals():
    print(f"Client already exists at {client}")
else:
    client = QdrantClient(
        url=st.secrets["QDRANT_URL"],
        api_key=st.secrets["QDRANT_API_KEY"],
    )
# Last expression of the cell: shows the collections the client reports.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ASK_vectorstore'), CollectionDescription(name='ask_pdf_docs')])

### Confirm client is initialized and location


In [8]:
from qdrant_client.local.qdrant_local import QdrantLocal
from qdrant_client.qdrant_remote import QdrantRemote


try:
    # Inspect the client's private backend object to tell local from URL mode
    backend = client._client
    if isinstance(backend, QdrantLocal):
        mode_message = "The client is running locally."
    elif isinstance(backend, QdrantRemote):
        mode_message = "The client is running via a URL."
    else:
        # Backend is neither QdrantLocal nor QdrantRemote
        mode_message = "Unable to determine the running mode of the Qdrant client."
    print(mode_message)
except Exception as e:
    # Any failure while inspecting the client is reported instead of raised
    print("Unable to determine the running mode of the Qdrant client. Error: ", str(e))

The client is running via a URL.


## 2. Creates a LangChain "vector store" object with entrypoint to your DB within it

qdrant = Qdrant(
client=client,
collection_name=qdrant_collection_name, # embedding here is a LC interface to the embedding model,
embeddings=config["embedding"],


In [10]:
# Creates a LangChain "vector store" object with an entrypoint to your DB within it.
# It wraps the already-initialized `client` and targets the collection named in
# `qdrant_collection_name`.

qdrant = Qdrant(
    client=client,
    collection_name=qdrant_collection_name,
    # `embeddings` here is a LangChain interface to the embedding model
    # (the object stored under config["embedding"])
    embeddings=config["embedding"],
)

In [11]:
# Builds a VectorStoreRetriever named `retriever` from the LangChain qdrant vector store

# Option 1: MMR search, with k / fetch_k / lambda_mult taken from `config`
retriever = qdrant.as_retriever(
    search_type="mmr",
    search_kwargs=dict(
        k=config["k"],
        fetch_k=config["fetch_k"],
        lambda_mult=config["lambda_mult"],
    ),
)

## 3. Initialize a Document Retriever


In [15]:
import ast
import re

from IPython.display import Markdown


# Regular expression pattern to match the text between the braces of a
# "metadata={...}" fragment (non-greedy: stops at the first closing brace)
metadata_pattern = re.compile(r"metadata=\{(.*?)\}")


def extract_metadata(doc_list):
    """Collect the metadata dictionaries of the items in ``doc_list``.

    Items that expose a ``metadata`` attribute holding a dict contribute a
    shallow copy of that dict. Reading the attribute directly avoids the
    string round-trip, which broke whenever the metadata contained nested
    braces.

    Any other item is converted to a string (if it is not one already) and
    scanned with ``metadata_pattern``; every match is parsed into a dict.

    Args:
        doc_list: iterable of document-like objects and/or strings.

    Returns:
        list of dict, in input order. An empty input yields an empty list.

    Raises:
        ValueError / SyntaxError: if a matched fragment is not a valid Python
            literal.
    """
    metadata_list = []
    for doc in doc_list:
        doc_metadata = getattr(doc, "metadata", None)
        if isinstance(doc_metadata, dict):
            metadata_list.append(dict(doc_metadata))
            continue

        # Fallback for plain strings and objects without a metadata dict
        text = doc if isinstance(doc, str) else str(doc)
        for match in metadata_pattern.findall(text):
            # literal_eval only accepts literals, so text embedded in a
            # document can no longer execute code the way eval() allowed.
            metadata_list.append(ast.literal_eval('{' + match + '}'))
    return metadata_list


# Render each metadata dictionary as one Markdown line
def display_selected_metadata_as_markdown(metadata_list):
    """Build a Markdown string listing the source and page of each entry.

    Args:
        metadata_list: iterable of dicts; the 'source' and 'page' keys are
            read, falling back to 'No Source' / 'No Page' when absent.

    Returns:
        str: one "Source: ..., Page: ..." line per entry, each ending with two
        spaces and a newline (a Markdown hard line break). Empty input gives
        an empty string.
    """
    line_template = "Source: {}, Page: {}  \n"
    return "".join(
        line_template.format(
            entry.get('source', 'No Source'),
            entry.get('page', 'No Page'),
        )
        for entry in metadata_list
    )

## 4. Run a Query and Check the results


In [17]:
# Run one test query through the retriever; the query text is a document title.
retrieved_docs = retriever.get_relevant_documents(
    "AUX-PL-001(A) RISK MANAGEMENT TRAINING REQUIREMENTS FOR THE COAST GUARD AUXILIARY")

# Extract the metadata dictionaries of the retrieved documents
metadata_list = extract_metadata(retrieved_docs)

# Format the source/page of each metadata dictionary as Markdown lines
markdown_string = display_selected_metadata_as_markdown(metadata_list)

# Render the Markdown string as rich output in the notebook
display(Markdown(markdown_string))

Source: ./raw_pdfs/BSX Policy Letter_AUX-PL-001-B_19-01_UPDATED RISK MANAGEMENT TRAINING REQUIREMENTS FOR THE COAST GUARD AUXILIARY.pdf, Page: 3  
Source: References/Gold Side/Flotilla_Procedures_Guide_FINAL_ESIGNED_23MAR23.pdf, Page: 0  
Source: For_injestion/2023 Surface Operations_Workshop Rev1.8.pptx.pdf, Page: 29  
Source: For_injestion/2023 Telecomms_TCO_Workshop Rev 1.4.pptx.pdf, Page: 8  
Source: References/Auxiliary Manual CIM_16790_1G.pdf, Page: 362  
