###  This is the process for auditing the Qdrant DB and outputting a list of documents

In [1]:
import pandas as pd
from datetime import datetime
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())


# CONFIG: qdrant
# Credentials and endpoint are read from the environment (populated from a .env
# file by load_dotenv above), so no secrets are stored in the notebook itself.
api_key = os.environ.get("QDRANT_API_KEY")
url = os.environ.get("QDRANT_URL")  # for cloud
qdrant_collection_name = "ASK_vectorstore"
# Directory for a local instance (e.g. /private/tmp/local_qdrant).
# Can be overridden with the QDRANT_PATH environment variable so the notebook is
# not tied to a single machine; the default is the original developer path.
qdrant_path = os.environ.get(
    "QDRANT_PATH",
    "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant",
)

In [2]:

# Load an instance of the client. WITHOUT LANGCHAIN
# 22.5 sec for cloud
# Running this places a lock file in the qdrant directory
from qdrant_client import QdrantClient

# Build the client exactly once. (An earlier argument-less QdrantClient() call
# was removed: its result was overwritten immediately by this one.)
client = QdrantClient(
    url=url,
    # prefer_grpc=True,
    api_key=api_key,
    # path=qdrant_path
)

# Usage example: fetch the list of collections and keep it in `content`.
content = client.get_collections()

In [3]:
# Fetch the info object for the configured collection; as the cell's last
# expression, the result is displayed as the cell output.
client.get_collection(collection_name=qdrant_collection_name)

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=10907, indexed_vectors_count=9362, points_count=10894, segments_count=2, config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), shard_number=1, replication_factor=1, write_consistency_factor=1, on_disk_payload=True), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec=5, max_optimization_threads=1), wal_config=WalConfig(wal_capacity_mb=32, wal_segments_ahead=0), quantization_config=None), payload_schema={})

In [None]:
# Gets all points (i.e., records) and returns them as a 2-tuple
# Tuple[List[types.Record], Optional[types.PointId]]
# 7.1 sec for cloud

all_records = client.scroll(
    collection_name=qdrant_collection_name,
    limit=100000,  # upper bound on returned points; must exceed the collection size to get everything
    with_payload=True,  # set to False to omit the payload
    with_vectors=False  # change to True to see the vectors
)

# Unpack the 2-tuple so each part can be described and previewed separately.
records, next_offset = all_records

print(
    f"all_records is a {type(all_records)} containing {len(all_records)} elements.\n"
    f"    The first element is a {type(records)} of length {len(records)}.\n"
    f"    The second element is a {type(next_offset)} (the offset for the next page, if any)."
)

# Preview the first few points instead of dumping the whole list into the output.
records[:5]

##  Find and print individual points in the Qdrant DB

##### Find points based on Filename
 

In [None]:
from qdrant_client.http import models

# scroll() receives its filter through `scroll_filter`; `points_selector` is the
# keyword used by delete() and is not accepted by scroll().
# NOTE(review): the payloads shown elsewhere in this notebook carry
# `metadata.source` and `metadata./FileName` but no `metadata.filename` --
# confirm the key name against the stored payloads.
client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.filename",
                match=models.MatchText(
                    text="CI_16130_2G")
            ),
        ]
    ),
)

##### Find points based on keyword values, numbers, bool
 

In [None]:
# Exact-value lookup: MatchValue is used for keyword values, numbers and bools.

from qdrant_client.http import models

# Single condition: the payload field metadata.page must equal 246.
page_condition = models.FieldCondition(
    key="metadata.page",
    match=models.MatchValue(value=246),
)
page_filter = models.Filter(must=[page_condition])

client.scroll(collection_name=qdrant_collection_name, scroll_filter=page_filter)

##### Find points based on both page AND source
 

In [182]:
from qdrant_client.http import models

# Two conditions listed under `must`: page equal to 0, and source text-matching "IAMSAR".
on_page_zero = models.FieldCondition(
    key="metadata.page",
    match=models.MatchValue(value=0),
)
source_is_iamsar = models.FieldCondition(
    key="metadata.source",
    match=models.MatchText(text="IAMSAR"),
)
page_and_source = models.Filter(must=[on_page_zero, source_is_iamsar])

client.scroll(
    collection_name=qdrant_collection_name,
    with_payload=False,  # change to True to see the payload
    with_vectors=False,  # change to True to see the vectors
    scroll_filter=page_and_source,
)

([Record(id='62119d2b-4535-45e1-84e4-158a9a148685', payload=None, vector=None)],
 None)

In [187]:
from qdrant_client.http import models

# NOTE: this cell issues the same query as the preceding cell
# (metadata.page == 0 together with metadata.source text-matching "IAMSAR").
iamsar_first_page = models.Filter(
    must=[
        models.FieldCondition(key="metadata.page", match=models.MatchValue(value=0)),
        models.FieldCondition(key="metadata.source", match=models.MatchText(text="IAMSAR")),
    ]
)

client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=iamsar_first_page,
    with_payload=False,  # set True to include the payload
    with_vectors=False,  # set True to include the vectors
)

([Record(id='62119d2b-4535-45e1-84e4-158a9a148685', payload=None, vector=None)],
 None)

##### Find points based on page_content
 

In [118]:
# Uses MatchText to find points whose page_content field matches the given text.

from qdrant_client.http import models

content_condition = models.FieldCondition(
    key="page_content",
    match=models.MatchText(text="CMINST"),
)
content_filter = models.Filter(must=[content_condition])

client.scroll(collection_name=qdrant_collection_name, scroll_filter=content_filter)

([], None)

##### Find points based on source field
 

In [None]:
# Uses MatchText to find points whose metadata.source field matches the given text.

from qdrant_client.http import models

source_text = "027_20_TEMPORARY_SUSPENSION_OF_ONLINE_PROCTOR_REQUIREMENT_FOR_AUXOP_SPECIALTY_COURSE_EXAMS"
source_condition = models.FieldCondition(
    key="metadata.source",
    match=models.MatchText(text=source_text),
)

client.scroll(
    collection_name=qdrant_collection_name,
    scroll_filter=models.Filter(must=[source_condition]),
)

## Find and delete points
This just sets a filter that you can use in either a scroll or delete function

In [175]:
# Build a reusable filter object; it can be passed to scroll() or to delete().

from qdrant_client.http import models

source_condition = models.FieldCondition(
    key="metadata.source",
    match=models.MatchText(text="169796.3D"),
)
scroll_filter = models.Filter(must=[source_condition])

# Preview which points the filter selects.
client.scroll(collection_name=qdrant_collection_name, scroll_filter=scroll_filter)

([], None)

This uses the filter to find the points

This deletes the points based on the filter above

In [129]:
from qdrant_client.http import models

# Delete the points selected by `scroll_filter`. That name is not assigned in
# this cell, so the cell defining it must have been run first in the same kernel.
# NOTE(review): deletion is presumably irreversible -- preview the selection
# with client.scroll(..., scroll_filter=scroll_filter) before running this.
client.delete(
    collection_name=qdrant_collection_name,
    points_selector=scroll_filter,
)

UpdateResult(operation_id=189, status=<UpdateStatus.COMPLETED: 'completed'>)

### All-in-one delete function based on metadata source value

In [174]:
# One-step delete: build the source filter and hand it straight to delete().
# NOTE: `models` is not imported in this cell; an earlier cell must have run
# `from qdrant_client.http import models`.
source_to_remove = models.FieldCondition(
    key="metadata.source",
    match=models.MatchText(text="011_20_WEBINAR_DELIVERY_OF_THE_TCT_REFRESHER___COVID_19"),
)
delete_selector = models.Filter(must=[source_to_remove])

client.delete(collection_name=qdrant_collection_name, points_selector=delete_selector)

UpdateResult(operation_id=209, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
# Display the value held in `content` (assigned from client.get_collections()
# in the client-creation cell).
content

In [None]:
[Record(id='00034c283b0e46c9acb88308fee2001c', payload={'page_content': "*All ALAUX's are posted on the Chief Director of Auxiliary web site located at:   CHDIRAUX \nALAUX    \n \nIf you have a question regarding this ALAUX, please seek resolution within your Chain of \nLeadership and Management (COLM) including up to your servicing District Director of \nAuxiliary (DIRAUX). If your questi on still cannot be resolved after that, then please email \nCGAUX@uscg.mil .", 'metadata': {'source': '/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/PDF_metadata_complete/ALAUXs/012_22_AUXDATA_II_SECURITY_UPDATES_FINAL.pdf', 'page': 1, '/Producer': 'pypdf', '/Title': '012 22 AUXDATA II SECURITY UPDATES FINAL', '/LeadershipScope': '1_National', '/PageCount': '2', '/CreationDate': '2022-03-11', '/EffectiveDate': '2022-03-11', '/IngestDate': '2023-10-30', '/ExpirationDate': '2032-03-10', '/AuxSpecific': 'True', '/PublicRelease': 'True', '/PublicationNumber': 'ALAUX_012 22', '/Source': 'cgaux.org', '/Organization': 'nan', '/Curator': 'Drew_Wilkins', '/DocId': '2a91c994e8ac98ff0a2222a7b7df14ea', '/FileName': '012_22_AUXDATA_II_SECURITY_UPDATES_FINAL'}}, vector=None),
 Record(id='002794e80e7545a486d5ad536e2250bf', payload={'page_content': 'U.S COAST GUARD AUXILIARY  - UNCLASSIFIED \nThe Risk Management (RM) Instruction includes: \n•A 5-step process \n•The PEACE and STAAR models \n•Risk Assessment Matrix (RAM) \n•Mandates the use of GAR 2.0 \n•Standardizes RM training for all communities (surface, \nair, and shore) \nResponse Directorate - Telecommunications Division 7Risk Management', 'metadata': {'source': '/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/PDF_metadata_complete/Aux National/Telecomms_TCO_Workshop Rev 1.4_2023.pdf', 'page': 6, '/Producer': 'pypdf', '/Title': '2023 Telecomms Workshop', '/LeadershipScope': '1_National', '/PageCount': '65', '/CreationDate': '2023-10-30', '/EffectiveDate': '2023-10-30', '/IngestDate': '2023-10-30', '/ExpirationDate': 'nan', '/AuxSpecific': 'True', '/PublicRelease': 'True', '/PublicationNumber': 'nan', '/Source': 'cgaux.org', '/Organization': 'nan', '/Curator': 'Drew_Wilkins', '/DocId': '7a54199383189945b5d30bffb6ccd438', '/FileName': 'Telecomms_TCO_Workshop Rev 1'}}, vector=None)]

In [None]:
# Read the page number out of a payload-shaped mapping.
# NOTE(review): `data` is not assigned anywhere in the visible notebook, so this
# raises NameError on a fresh kernel. Presumably it is meant to be one record's
# payload (a dict with a "metadata" key) -- confirm and define it first.
page_number = data["metadata"]["page"]

### Example Payload from QdrantCloud

In [None]:
{
  "metadata": {
    "page": 162,
    "source": "References/Not catalogued/MSM Vol III Personnel CMINST 16000.8B Change 2.pdf"
  },
  "page_content": "COMDTINST M16000.8B  \nUSCG Marine Safety Manual, Vol. III:  Marine Industry Personnel  \nPART A: MARINER CREDENTIALING  \nCHAPTER 12:  LICENSING FOR ENGINEERING OFFICERS  \n \nA12-2 \n B. MMD Endorsements Accompanying Licenses.    \nEngineers holding licenses that authorize service on inspected vessels of more than 2000 \nhorsepower are entitled by 46 CFR 12.02- 11(d)(2) to an MMD endorsed for any unlicensed \nrating in t he engine department. Such license holders should be encouraged to obtain an MMD \nwhen the license is issued.  In many cases an MMD is required to legally serve aboard a vessel.  (See 46 CFR 12.02- 7). \n \nC. Creditable Service.  \n \n1. Minimum Vessel Size.    \nAll servic e must be on vessels of at least 100 gross tons.  This is consistent with the \nrequirements for ratings as a qualified member of the engine department.  See section 12.C.8 for exceptions for designated duty engineers.  \n \n2. Watchstanding Requirements.    \nTraditio nally, the Coast Guard has held watchstanding experience to be an important part \nof the professional development of third and second assistant engineers. Technological and design advances over the last several years have led to the development of ships tha t \ncan operate with unattended engine rooms.  The certificates of inspection for such vessels still require licensed engineers to be on board (the engineer being on call as needed).  Since the engineers are on call, such service shall be treated as though t he license holder \nwere in charge of a watch and will be creditable for a raise in grade"
}