In [4]:
import streamlit as st
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams, PayloadSchemaType
from qdrant_client.http import exceptions as qdrant_exceptions

# CONFIG: qdrant
api_key = st.secrets["QDRANT_API_KEY"]
url = st.secrets["QDRANT_URL"]

client = QdrantClient(
    url=url,
    api_key=api_key,
)

qdrant_collection_name = "ask_pdf_pages"  # ask_pdf_docs

In [5]:
try:
    collections = client.get_collections()
    print(collections)
except qdrant_exceptions.UnexpectedResponse as e:
    # Check if the error is a 404 Not Found
    if "404" in str(e):
        print("The server returned a 404 Not Found error, which indicates the server is active but could not find the requested URL or endpoint. This might be due to a wrong URL, an incorrect path, or a resource that doesn't exist.")
    else:
        # Re-raise the error if it's not a 404
        raise
except Exception as e:
    # Handle any other exceptions that may occur
    print(f"An unexpected error occurred: {e}")

collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ask_pdf_docs'), CollectionDescription(name='ASK_vectorstore')]


### Create the PDF Document collection

In [6]:
pdf_document_vectors_config = VectorParams(size=1536, distance=Distance.COSINE)

# Create the PDF_document collection
client.create_collection(
    collection_name="ask_pdf_docs",
    vectors_config=pdf_document_vectors_config,
    shard_number=1,  # Adjust based on your sharding needs
    replication_factor=1,  # Adjust if you need replication in a distributed setup
    on_disk_payload=True,
)  # Store payload on disk to save RAM

True

In [12]:
# Define payload schema fields that you want to index for fast filtering

document_fields_to_index = [
    # Name of the document. Defaults to file name without the extension if none.
    ("title", PayloadSchemaType.TEXT),
    # 1_National, 2_Area, 3_District, 4_Region, 5_Division, 5_Sector, 6_Flotilla, 6_Station, 6_Other_unit.
    ("leadership_scope", PayloadSchemaType.TEXT),
    # Uses existing PDF date, otherwise defaults to the ingestion date.
    ("creation_date", PayloadSchemaType.DATETIME),
    # Date the document became effective, defaults to ingestion date if none.
    ("effective_date", PayloadSchemaType.DATETIME),
    # Date uploaded to the vector database.
    ("upsert_date", PayloadSchemaType.DATETIME),
    # Defaults to effective date + 10 years per COMDINST M5215.6I if no cancellation date is given.
    ("expiration_date", PayloadSchemaType.DATETIME),
    # True if the document specifically applies to the Auxiliary.
    ("aux_specific", PayloadSchemaType.BOOL),
    # True if the document is available on the public internet.
    ("public_release", PayloadSchemaType.BOOL),
    # Identification number of the directive or document, underscores for spaces (e.g., COMDTINST_M1000.6A).
    ("publication_number", PayloadSchemaType.TEXT),
    # Web domain source of the document (e.g., uscg.mil, cgaux.org).
    ("source", PayloadSchemaType.TEXT),
    # Can track the Coast Guard directive originator using SDL or Auxiliary Unit Number (e.g., CG-BSX-1).
    ("organization", PayloadSchemaType.TEXT),
    # Last name of the Auxiliarist who curated the document (currently blank).
    ("curator", PayloadSchemaType.TEXT),
    ("file_name", PayloadSchemaType.TEXT),  # Name of the PDF file.
]


# Create indexes for relevant fields
for field_name, field_type in document_fields_to_index:
    client.create_payload_index(
        collection_name="ask_pdf_docs",
        field_name=field_name,
        field_schema=field_type
    )

### Create the PDF Page collection

In [None]:
pdf_page_vectors_config = VectorParams(size=1536, distance=Distance.COSINE)

# Create the PDF_document_page collection
client.create_collection(
    collection_name="ask_pdf_pages",
    vectors_config=pdf_page_vectors_config,
    shard_number=1,  # Adjust based on your needs
    on_disk_payload=True  # Store payload on disk to save RAM
)

In [21]:

# Define payload schema fields to index for fast filtering
page_fields_to_index = [
    ("content", PayloadSchemaType.TEXT),  # Content of the page
    ("page_number", PayloadSchemaType.INTEGER),
    ("pdf_doc_id", PayloadSchemaType.UUID),
    ("publication_number", PayloadSchemaType.TEXT),
    ("title", PayloadSchemaType.TEXT),  # Linked from the PDF_document
]

# Create indexes for relevant fields
for field_name, field_type in page_fields_to_index:
    client.create_payload_index(
        collection_name="ask_pdf_pages",
        field_name=field_name,
        field_schema=field_type,
    )

### Add a payload field

In [42]:
client.create_payload_index(
    collection_name="ask_pdf_pages", field_name="content", field_schema=PayloadSchemaType.TEXT)

UpdateResult(operation_id=157, status=<UpdateStatus.COMPLETED: 'completed'>)

In [43]:
client.delete_payload_index(
    collection_name="ask_pdf_pages", field_name="content")

UpdateResult(operation_id=159, status=<UpdateStatus.COMPLETED: 'completed'>)