### This notebook creates two collections, each with support for dense (semantic) and sparce (keyword) vectors

In [9]:
import streamlit as st
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, SparseVectorParams, SparseIndexParams, PayloadSchemaType
from qdrant_client.http import exceptions as qdrant_exceptions

# CONFIG: qdrant
api_key = st.secrets["QDRANT_API_KEY"]
url = st.secrets["QDRANT_URL"]

client = QdrantClient(
    url=url,
    api_key=api_key,
)

qdrant_collection_name = "ask_pdf_pages"  # ask_pdf_docs

In [10]:
try:
    collections = client.get_collections()
    print(collections)
except qdrant_exceptions.UnexpectedResponse as e:
    # Check if the error is a 404 Not Found
    if "404" in str(e):
        print("The server returned a 404 Not Found error, which indicates the server is active but could not find the requested URL or endpoint. This might be due to a wrong URL, an incorrect path, or a resource that doesn't exist.")
    else:
        # Re-raise the error if it's not a 404
        raise
except Exception as e:
    # Handle any other exceptions that may occur
    print(f"An unexpected error occurred: {e}")

collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ask_pdf_docs'), CollectionDescription(name='ASK_vectorstore')]


## Create the PDF Documents collection

In [15]:
dense_vectors_config = {
    "text-dense": VectorParams(size=1536, distance=Distance.COSINE, on_disk=True)}

sparse_vectors_config = {
    "text-sparse": SparseVectorParams(index=SparseIndexParams(on_disk=False))}

In [16]:
client.recreate_collection(
    collection_name="ask_pdf_docs",
    vectors_config=dense_vectors_config,
    sparse_vectors_config=sparse_vectors_config,  # allows option for hybrid search later
    shard_number=1,  
    replication_factor=1,  
)

  client.recreate_collection(


True

In [17]:
# Define payload schema fields that you want to index for fast filtering

document_fields_to_index = [
    # Name of the document. Defaults to file name without the extension if none.
    ("title", PayloadSchemaType.TEXT),
    # 1_National, 2_Area, 3_District, 4_Region, 5_Division, 5_Sector, 6_Flotilla, 6_Station, 6_Other_unit.
    ("leadership_scope", PayloadSchemaType.TEXT),
    # Uses existing PDF date, otherwise defaults to the ingestion date. RFC 3339 timestamp
    ("creation_date", PayloadSchemaType.DATETIME),
    # Date the document became effective, defaults to ingestion date if none. RFC 3339 timestamp
    ("effective_date", PayloadSchemaType.DATETIME),
    # Date uploaded to the vector database. RFC 3339 timestamp
    ("upsert_date", PayloadSchemaType.DATETIME),
    # Defaults to effective date + 10 years per COMDINST M5215.6I if no cancellation date is given. RFC 3339 timestamp
    ("expiration_date", PayloadSchemaType.DATETIME),
    # True if the document specifically applies to the Auxiliary.
    ("aux_specific", PayloadSchemaType.BOOL),
    # True if the document is available on the public internet.
    ("public_release", PayloadSchemaType.BOOL),
    # Identification number of the directive or document, underscores for spaces (e.g., COMDTINST_M1000.6A).
    ("publication_number", PayloadSchemaType.TEXT),
    # Web domain source of the document (e.g., uscg.mil, cgaux.org).
    ("source", PayloadSchemaType.TEXT),
    # Can track the Coast Guard directive originator using SDL or Auxiliary Unit Number (e.g., CG-BSX-1).
    ("organization", PayloadSchemaType.TEXT),
    # Last name of the Auxiliarist who curated the document (currently blank).
    ("curator", PayloadSchemaType.TEXT),
    ("file_name", PayloadSchemaType.TEXT),  # Name of the PDF file.
]


# Create indexes for relevant fields
for field_name, field_type in document_fields_to_index:
    client.create_payload_index(
        collection_name="ask_pdf_docs",
        field_name=field_name,
        field_schema=field_type
    )

## Create the PDF Pages collection

In [18]:
client.recreate_collection(
    collection_name="ask_pdf_pages",
    vectors_config=dense_vectors_config,
    sparse_vectors_config=sparse_vectors_config, # allows option for hybrid search later
    shard_number=1, 
    replication_factor=1,  
)

  client.recreate_collection(


True

In [22]:

# Define payload schema fields to index for fast filtering
page_fields_to_index = [
    ("content", PayloadSchemaType.TEXT),  # Content of the page
    ("page_number", PayloadSchemaType.INTEGER),
    ("pdf_doc_id", PayloadSchemaType.UUID),
    ("publication_number", PayloadSchemaType.TEXT),
    ("title", PayloadSchemaType.TEXT),  # Linked from the PDF_document
]

# Create indexes for relevant fields
for field_name, field_type in page_fields_to_index:
    client.create_payload_index(
        collection_name="ask_pdf_pages",
        field_name=field_name,
        field_schema=field_type,
    )

### Add a payload field

In [20]:
client.create_payload_index(
    collection_name="ask_pdf_pages", field_name="content", field_schema=PayloadSchemaType.TEXT)

UpdateResult(operation_id=11, status=<UpdateStatus.COMPLETED: 'completed'>)

In [21]:
client.delete_payload_index(
    collection_name="ask_pdf_pages", field_name="content")

UpdateResult(operation_id=13, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Define the query vector for semantic search
query_vector = [0.1] * 1536  # Replace with your actual query vector

# Define the keyword filter for ask_pdf_docs
keyword_filter_docs = {
    "must": [
        {
            "key": "publication_number",
            "match": {
                "value": "your_keyword"  # Replace with your actual keyword
            }
        }
    ]
}

# Perform the search on ask_pdf_docs
search_results_docs = client.search(
    collection_name="ask_pdf_docs",
    query_vector=query_vector,
    query_filter=keyword_filter_docs,
    top=10  # Number of results to return
)

# Extract the document IDs from the search results
doc_ids = [result.id for result in search_results_docs]

# Define the filter for ask_pdf_pages using the document IDs
keyword_filter_pages = {
    "must": [
        {
            "key": "pdf_doc_id",
            "match": {
                "value": doc_id
            }
        } for doc_id in doc_ids
    ]
}

# Perform the search on ask_pdf_pages
search_results_pages = client.search(
    collection_name="ask_pdf_pages",
    query_vector=query_vector,
    query_filter=keyword_filter_pages,
    top=10  # Number of results to return
)

print(search_results_pages)