# Tools for auditing the Qdrant DB, finding and deleting documents


#### This is the **R** and **D** in the **CRUD** acronym. This notebook assumes LangChain `Document` objects. See database_schema.md for more details. The Qdrant API documentation is located [here](https://qdrant.github.io/qdrant/redoc/index.html#tag/points/operation/scroll_points). The link is included because it is hard to find by searching online.


In [None]:
# Confirm you're using the correct interpreter:
# print the path of the Python executable running this notebook.
import sys
print(sys.executable)

## 0. Imports and configs


In [None]:
import pandas as pd
from datetime import datetime
import streamlit as st


# CONFIG: qdrant
# The API key and URL are read from Streamlit secrets rather than hard-coded.
api_key = st.secrets["QDRANT_API_KEY"]
url = st.secrets["QDRANT_URL"]  # for cloud
qdrant_collection_name = "ASK_vectorstore"

# For a local instance. On Mac, the local instance is /private/tmp/local_qdrant
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"

## 1. Create Qdrant client and connection


In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import exceptions as qdrant_exceptions

# Build the Qdrant client directly, WITHOUT LANGCHAIN.
# Author's timing note: about 22.5 sec for the cloud instance.
# Author's note: running this places a lock file in the qdrant directory.
client = QdrantClient(
    url=url,
    api_key=api_key,
    # prefer_grpc=True,
    # path=qdrant_path
)

# Smoke-test the connection by listing the collections on the server.
try:
    collection = client.get_collections()
    print(collection)
except qdrant_exceptions.UnexpectedResponse as e:
    # Only a 404 is explained to the user; any other response error propagates.
    if "404" not in str(e):
        raise
    print("The server returned a 404 Not Found error, which indicates the server is active but could not find the requested URL or endpoint. This might be due to a wrong URL, an incorrect path, or a resource that doesn't exist.")
except Exception as e:
    # Any other failure is reported but does not stop the notebook.
    print(f"An unexpected error occurred: {e}")

## 3. Define a range index so you can sort on it
NOTE: expressions that use this index will not return records that are missing a page key

In [None]:
from qdrant_client.http import models

# Create a range index for `metadata.page`, so this indexed field can be used
# to order results by page number.
# NOTE: expressions that use this index will not return records that are
# missing a page key.
client.create_payload_index(
    collection_name=qdrant_collection_name,
    field_name="metadata.page",
    # Assuming page numbers are stored as integers
    field_schema=models.PayloadSchemaType.INTEGER,
)

## 4. Find and delete records

1. Do an initial search on `metadata.source` based on the file name.
2. Review the initial search result to see if you get exactly what you want.
3. Iterate using `metadata.source` and `metadata.page_content` until you are sure.
4. Confirm it's giving you the doc, the whole doc, and nothing but the doc.
5. Delete away!


In [None]:
# Count and display the points that match a filter (`scroll_filter`).
#
# NOTE (from the author): the filter is case-sensitive and uses the
#       `MatchText` condition.
# NOTE: records lacking a page field, or whose page field lives in a
#       different location, are not caught because results are ordered
#       by `metadata.page`.
# `order_by` requires qdrant-client 1.8 or later.

from qdrant_client.http import models

scroll_filter = models.Filter(
    must=[
        models.FieldCondition(
            # Alternative keys to filter on:
            #   "page_content"            only in upsert>01NOV2024
            #   "metadata.pdf_id"         only in upsert>01NOV2024
            #   "metadata.pdf_file_name"  only in upsert>01NOV2024
            #   "metadata.page_content"   pre-02NOV2024
            key="metadata.source",  # only useful in pre-02NOV2024
            # Author's note: leave the text blank to get all (TODO confirm).
            match=models.MatchText(text="Auxiliary_Division_Procedures_Guide_COMDTPUB"),
        ),
    ]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=scroll_filter,
    limit=10000,
    # Sort by page number; use "desc" for descending order.
    order_by=models.OrderBy(key="metadata.page", direction="asc"),
    with_payload=True,   # include each point's payload
    with_vectors=False,  # leave the vectors out of the result
)

# scroll() returns a tuple; element 0 is the list of matching points.
print(f"Number of points found: {len(records[0])}")
records[0]

#### Then uses the filter to **<span style="color:red">DELETE</span>** based on `metadata.source`


In [None]:
from qdrant_client.http import models

# DESTRUCTIVE: deletes the points in the collection that match `scroll_filter`.
# `scroll_filter` is not defined in this cell; it is whatever the most recently
# executed cell assigned to that name, so review the search results first.
client.delete(
    collection_name=qdrant_collection_name,
    points_selector=scroll_filter,
)

## OTHER EXAMPLES


### Example: Find points based on `source`


In [None]:
from qdrant_client.http import models

# `metadata.source` is the pdf_filepath in upsert<01NOV2024; afterwards it is
# the URL of the pdf.
source_lookup_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.source",
            match=models.MatchText(text="AUX-PL-017(A)"),
        ),
    ]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=source_lookup_filter,
    limit=100000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
records

### Example: **<span style="color:red">Delete points based on the `source` as defined in THIS cell</span>**


In [None]:
from qdrant_client.http import models

# Filter selecting the points to delete.
# BE CAREFUL: CHECK THAT THIS TEXT MATCHES ONLY WHAT YOU WANT TO DELETE.
delete_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.source",
            match=models.MatchText(text="_______"),
        ),
    ]
)

# Count the matching points BEFORE deleting them, so the number reported below
# belongs to this filter and not to an unrelated earlier `records` result.
points_to_delete = client.count(
    collection_name=qdrant_collection_name,
    count_filter=delete_filter,
    exact=True,
).count

client.delete(
    collection_name=qdrant_collection_name,
    points_selector=delete_filter,
)

print(f"Number of points deleted: {points_to_delete}")

### Example: Find points based on `id`


In [None]:
from qdrant_client.http import models

# Select points by their point id.
first_id_filter = models.Filter(
    must=[models.HasIdCondition(has_id=["3d14ab7a-c7de-4ca6-a22e-e7bfed3ba562"])]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=first_id_filter,
    limit=100000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
print(records)

In [None]:
from qdrant_client.http import models

# Same lookup as a point-id search, for a different point id.
second_id_condition = models.HasIdCondition(
    has_id=["544ed8b5-67dd-4d21-8c87-895dde459658"]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=models.Filter(must=[second_id_condition]),
    limit=100000,
    with_vectors=False,  # set to True to see the vectors
    with_payload=True,   # set to False to hide the payload
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
print(records)

### Example: Find points based on `pdf_id`

In [None]:
from qdrant_client.http import models

# `metadata.pdf_id` exists only in upsert>01NOV2024.
pdf_id_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.pdf_id",
            match=models.MatchText(text="4e61e002-e351-4759-98b6-b881fcb5f440"),
        ),
    ]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=pdf_id_filter,
    limit=1000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")

### Example: Find points that are missing the `page` field




In [None]:
from qdrant_client.http import models

# Match points whose `metadata.page` payload field is empty.
missing_metadata_page_filter = models.Filter(
    must=[
        models.IsEmptyCondition(
            is_empty=models.PayloadField(key="metadata.page"),
        ),
    ]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=missing_metadata_page_filter,
    limit=10000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
records

### Example: Find points where `page` field does not exist or is inside the metadata dictionary!




In [None]:
from qdrant_client.http import models

# Match points whose top-level `page` payload field is empty. Note the key is
# "page", not "metadata.page".
missing_top_level_page_filter = models.Filter(
    must=[
        models.IsEmptyCondition(
            is_empty=models.PayloadField(key="page"),
        ),
    ]
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=missing_top_level_page_filter,
    limit=10000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
records

### Example: Find points based on both `page` AND `source`




In [None]:
from qdrant_client.http import models

# First condition: a text match on one payload field.
# Alternative keys:
#   "metadata.source"  pdf_filepath in upsert<01NOV2024, afterwards the URL of the pdf
#   "page_content"     only in upsert>01NOV2024
text_condition = models.FieldCondition(
    key="metadata.page_content",
    match=models.MatchText(text=""),
)

# Second condition: an exact match on the page number.
page_condition = models.FieldCondition(
    key="metadata.page",
    match=models.MatchValue(value=0),
)

records = client.scroll(
    collection_name=qdrant_collection_name,
    # Both conditions are listed under `must`.
    scroll_filter=models.Filter(must=[text_condition, page_condition]),
    limit=1000,
    with_payload=True,   # set to False to hide the payload
    with_vectors=False,  # set to True to see the vectors
)

# Element 0 of the scroll result is the list of matching points.
print(f"Number of points found: {len(records[0])}")
records

### Example: Create a list of all unique file names 
(assuming they are not missing the page key)


In [None]:
from qdrant_client.http import models

# Build a sorted list of the unique file names stored in the collection.
# Only points with `metadata.page == 0` are scanned, so documents whose points
# are missing the page key are not listed.
first_page_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.page",
            match=models.MatchValue(value=0),
        ),
    ]
)

# Page through ALL matching points instead of stopping after the first few;
# scroll() returns (points, next_page_offset) and the offset is None when done.
first_page_points = []
next_offset = None
while True:
    batch, next_offset = client.scroll(
        collection_name=qdrant_collection_name,
        scroll_filter=first_page_filter,
        with_payload=True,   # the payload is needed to read the file name
        with_vectors=False,
        limit=1000,
        offset=next_offset,
    )
    first_page_points.extend(batch)
    if next_offset is None:
        break

# Keep `records` in the same (points, next_offset) shape scroll() returns.
records = (first_page_points, None)

# NOTE(review): assumes the file name lives in `metadata.pdf_file_name`
# (upsert>01NOV2024) or, for older records, in `metadata.source` - confirm
# against database_schema.md.
unique_file_names = set()
for point in first_page_points:
    metadata = (point.payload or {}).get("metadata") or {}
    file_name = metadata.get("pdf_file_name") or metadata.get("source")
    if file_name:
        unique_file_names.add(file_name)
unique_file_names = sorted(unique_file_names)

print(f"Number of points found: {len(records[0])}")
print(f"Number of unique file names: {len(unique_file_names)}")
unique_file_names