## Create an Excel of unique records from Qdrant
### To generate the Excel file in this notebook's directory:
1. Choose local or cloud
2. Select the Qdrant collection name
3. Click `Run All` in the notebook

## 0. Imports and Configs


In [None]:
%pip install qdrant_client -q --upgrade

In [1]:
import os


# Qdrant connection settings. Cloud credentials are read from the environment
# so that no secret is stored in the notebook itself.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
# QDRANT_URL = st.secrets["QDRANT_URL"]
# QDRANT_API_KEY = st.secrets["QDRANT_API_KEY"]

# Location of the on-disk Qdrant database used in local mode. Set the
# QDRANT_PATH environment variable to override the machine-specific default.
QDRANT_PATH = os.getenv(
    "QDRANT_PATH",
    "/Users/drew_wilkins/Drews_Files/Drew/Python/Localcode/Drews_Tools/qdrant_ASK_lib_tools/qdrant_db",
)


# Configure access to push changes to Github
GITHUB_ACCESS_TOKEN_ASK_LIBRARYCATALOG = os.getenv(
    "GITHUB_ACCESS_TOKEN_ASK_LIBRARYCATALOG")

## 1. Choose Local or Cloud

In [2]:
from qdrant_client import QdrantClient
import library_utils as lib

# Your choice of parameters here affects the rest of the notebook.
# If you change it here, be sure to click through the rest of the cells
# to reset the variables
#
# Cloud mode passes url + api_key; for local mode, comment those two lines out
# and uncomment `path` instead.

client = QdrantClient(
    url=QDRANT_URL,  # for cloud
    api_key=QDRANT_API_KEY,  # for cloud
    # path=QDRANT_PATH,  # for local
)


# Project helpers from library_utils; the recorded output of this cell shows
# the Qdrant location followed by the available collection names.
lib.which_qdrant(client)
lib.list_collections(client)

qdrant location: cloud

collections:
ASK_vectorstore
ask_pdf_docs
ASK_vectorstore-backup21APR2025
ask_pdf_pages


## 2. Select the Qdrant collection

In [13]:
# Name of the collection that every later cell reads from and writes to.
collection_name = "ASK_vectorstore"

In [None]:
from qdrant_client.http import models

# Fetch the points whose metadata.scope matches "national" in a single scroll
# call. client.scroll returns a (points, next_page_offset) tuple, so the list
# of points is records[0].
records = client.scroll(
    collection_name=collection_name,
    with_payload=True,  # set to False to omit the payload
    with_vectors=False,  # change to True to see the vectors
    limit=100000,  # 👈 don't forget to set this; it caps how many points come back

    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.scope",
                match=models.MatchText(
                    text="national"),
            ),
        ]
    ),
)

print(f"Number of points found: {len(records[0])}")

Number of points found: 11609


In [5]:
# Display the first returned point: records[0] is the list of points.
records[0][0]

Record(id='00043af2-745b-4672-b128-98e122e66eb8', payload={'page_content': 'May 2014  \nE. Implement FAA air space closure and air space de-\nconfliction plans to conduct operations as required.  \nF. Oversee creation of air task orders or flight schedules to \nmitigate safety risk of aircraft operations in confined or \nsaturated air space.  \nG. Coordinate airspace use with the FAA.  \nH. Request declaration or cancellation of Temporary Flight Restrictions (TFRs) in accordance with applicable \nFederal Aviat ion Regulations and post Notice to Airmen \n(NOTAM).  \nI. Attend the Tactics and Planning Meetings  to exchange \ninformation for development of  the Air Operations \nSummary (ICS 220- CG) and to confirm the number and \ntype of aircraft needed for the next operational period.  \nJ. Participate in preparation of the IAP through the OSC , \nensuring that the air operations portion includes  the Air \nTraffic Control (ATC) requirements of assigned aircraft.  \nK. Coordinate with t

## 3. Add a field to the metadata

In [None]:
from collections import OrderedDict
import time
from qdrant_client import QdrantClient, models


def _with_unit_after_scope(metadata: dict, default_unit_value: str) -> OrderedDict:
    """Return a copy of ``metadata`` with 'unit' placed right after 'scope'."""
    # Keep a value that is already stored so re-running never wipes real data.
    unit_value = metadata.get("unit", default_unit_value)

    reordered = OrderedDict()
    for key, value in metadata.items():
        if key == "unit":
            # Re-inserted below at its target position.
            continue
        reordered[key] = value
        if key == "scope":
            reordered["unit"] = unit_value

    if "unit" not in reordered:
        # No "scope" key was present: append "unit" at the end instead.
        reordered["unit"] = unit_value
    return reordered


def update_payload_with_unit_in_batches(
    client: QdrantClient,
    collection_name: str,
    default_unit_value: str = "",
    batch_size: int = 1000,
    scroll_timeout: int = 30
) -> None:
    """
    Update every record in the collection by adding a new metadata field 'unit'
    inserted immediately after the 'scope' field by updating only the payload.
    The records are processed in batches using client.batch_update_points.

    The collection is paged with client.scroll, which returns a
    (records, next_page_offset) tuple; the offset is fed back into the next
    call until it comes back as None. A record that already has a 'unit'
    value keeps it.

    Parameters:
      - client: A QdrantClient instance.
      - collection_name: The name of the Qdrant collection.
      - default_unit_value: The default value to assign to the new 'unit' field.
      - batch_size: How many records to process per batch.
      - scroll_timeout: Timeout (in seconds) for each scroll call.
    """
    total_updated = 0
    next_page_offset = None  # None asks scroll to start from the beginning

    while True:
        # Retrieve a batch of records (payload only) and the cursor for the
        # following batch.
        records, next_page_offset = client.scroll(
            collection_name=collection_name,
            with_payload=True,
            with_vectors=False,
            limit=batch_size,
            offset=next_page_offset,
            timeout=scroll_timeout
        )

        if not records:
            break

        update_ops = []
        for record in records:
            # A record may have no payload or no metadata; treat both as empty.
            metadata = (record.payload or {}).get("metadata") or {}
            # NOTE(review): whether the server preserves key order inside the
            # stored payload is not visible from this notebook - confirm.
            new_metadata = _with_unit_after_scope(metadata, default_unit_value)

            update_ops.append(
                models.SetPayloadOperation(
                    set_payload=models.SetPayload(
                        payload={"metadata": new_metadata},
                        points=[record.id],
                    )
                )
            )

        if update_ops:
            # Use batch_update_points to update the payloads in this batch.
            client.batch_update_points(
                collection_name=collection_name,
                update_operations=update_ops
            )
            total_updated += len(update_ops)
            print(f"Updated {total_updated} records so far.")

        if next_page_offset is None:
            # scroll signals the last page with a None offset.
            break

        time.sleep(0.5)  # Optional delay to reduce load

    print(
        f"Completed updating records. Total records updated: {total_updated}")


# Example usage:
# client = QdrantClient(url="your_qdrant_url", api_key="your_api_key")
# collection_name = "ASK_vectorstore"
#
# Running this cell writes to the collection selected in the earlier cells.
update_payload_with_unit_in_batches(
    client, collection_name, default_unit_value="")

## 4. Modify a metadata field

Example 1: populate missing expiration date

In [None]:
from qdrant_client.http.models import SetPayload, SetPayloadOperation
import time
import pandas as pd  # for isna() handling

# Value written to metadata["expiration_date"] for records that have none.
DEFAULT_EXPIRATION = "2099-12-31T00:00:00Z"


def populate_missing_expiration():
    """Give every record without an expiration date the default one.

    Pages through ``collection_name`` using the notebook-level ``client``. A
    record qualifies when its ``metadata["expiration_date"]`` is missing
    according to ``pd.isna`` or is an empty string; its metadata is then
    rewritten with ``DEFAULT_EXPIRATION``. Updates are sent once per page with
    ``batch_update_points`` and progress is printed along the way.
    """
    page_size = 1000
    request_timeout = 30
    cursor = None
    total_updated = 0

    while True:
        page, cursor = client.scroll(
            collection_name=collection_name,
            with_payload=True,
            with_vectors=False,
            limit=page_size,
            offset=cursor,
            timeout=request_timeout
        )

        if not page:
            break

        pending_ops = []

        for record in page:
            record_metadata = record.payload.get("metadata", {})
            existing = record_metadata.get("expiration_date", None)

            has_value = not (pd.isna(existing) or existing in ("", None))
            if has_value:
                continue

            # The whole metadata dict is written back with the date filled in.
            record_metadata["expiration_date"] = DEFAULT_EXPIRATION
            operation = SetPayloadOperation(
                set_payload=SetPayload(
                    payload={"metadata": record_metadata},
                    points=[record.id],
                )
            )
            pending_ops.append(operation)
            print(f"🔄 Set default expiration_date for record {record.id}")

        if pending_ops:
            client.batch_update_points(
                collection_name=collection_name,
                update_operations=pending_ops
            )
            total_updated += len(pending_ops)
            print(f"✅ Updated {total_updated} records so far.")

        # A None cursor means scroll has reached the last page.
        if cursor is None:
            break

        time.sleep(0.5)

    print(f"🎉 Modification complete. Total records updated: {total_updated}")


# Run the migration
populate_missing_expiration()

Example 2: change text

In [15]:
from qdrant_client.http.models import PointStruct, SetPayload, SetPayloadOperation
from collections import OrderedDict
from dateutil import parser
from typing import Optional
import time


def convert_metadata_fields(metadata: dict, record_id: str) -> Optional[dict]:
    """Rename the scope value "1_national" to "national".

    Parameters:
      - metadata: The record's metadata mapping; it is not modified.
      - record_id: Accepted for the caller's convenience; not used here.

    Returns:
      An OrderedDict copy of ``metadata`` (same key order) with the scope
      rewritten, or None when the scope is not "1_national" and nothing needs
      to change.
    """
    needs_rename = any(
        name == "scope" and current == "1_national"
        for name, current in metadata.items()
    )
    if not needs_rename:
        return None

    return OrderedDict(
        (name, "national" if name == "scope" else current)
        for name, current in metadata.items()
    )


def migrate_collection():
    """Apply ``convert_metadata_fields`` to every record in the collection.

    Pages through ``collection_name`` using the notebook-level ``client``.
    Records for which the converter returns new metadata are queued and
    written once per page with ``batch_update_points``. An exception raised
    while handling one record is printed and that record is skipped.
    """
    page_size = 1000
    request_timeout = 30
    cursor = None
    total_updated = 0

    while True:
        page, cursor = client.scroll(
            collection_name=collection_name,
            with_payload=True,
            with_vectors=False,
            limit=page_size,
            offset=cursor,
            timeout=request_timeout
        )

        if not page:
            break

        pending_ops = []

        for record in page:
            record_metadata = record.payload.get("metadata", {})
            try:
                converted = convert_metadata_fields(
                    record_metadata, str(record.id))

                # None (or an empty mapping) means there is nothing to write.
                if not converted:
                    continue

                print(f"🔄 Updating record {record.id}")
                operation = SetPayloadOperation(
                    set_payload=SetPayload(
                        payload={"metadata": converted},
                        points=[record.id],
                    )
                )
                pending_ops.append(operation)
            except Exception as e:
                print(f"❌ Exception while processing record {record.id}: {e}")

        if pending_ops:
            client.batch_update_points(
                collection_name=collection_name,
                update_operations=pending_ops
            )
            total_updated += len(pending_ops)
            print(f"✅ Updated {total_updated} records so far.")

        # A None cursor means scroll has reached the last page.
        if cursor is None:
            break

        time.sleep(0.5)

    print(f"🎉 Migration complete. Total records updated: {total_updated}")


# Run the scope rename against the selected collection (this writes to Qdrant).
migrate_collection()

🔄 Updating record 00043af2-745b-4672-b128-98e122e66eb8
🔄 Updating record 000bdeac-dc0f-4bdf-84a7-d4adaa4cb327
🔄 Updating record 000ee01d-2811-4500-b847-ac8486430938
🔄 Updating record 000f377e-8aa9-4f4d-ae0d-725215542ed3
🔄 Updating record 000f6bee-0e97-442c-b9d7-82e75ed553a6
🔄 Updating record 000f7189-21ec-4e40-a8bd-b44aee74c37c
🔄 Updating record 0010b690-6584-4502-acee-540c5af267c3
🔄 Updating record 0014bfc7-8d49-49a0-b466-709123ffa9b3
🔄 Updating record 001e8dd0-16e1-4187-beda-83821d929b40
🔄 Updating record 002b30ca-0d8e-4d20-851b-e4ca24711930
🔄 Updating record 002e9c5f-e517-4d50-8deb-7d139b3759e1
🔄 Updating record 002f70a1-142b-4914-99e5-b8f2fd9b5875
🔄 Updating record 0039348f-98f9-4049-bd37-f162c6870aa6
🔄 Updating record 003a298d-4e34-491f-bb90-7a197020f438
🔄 Updating record 003dfc02-4e02-477e-a796-753a4e4eda68
🔄 Updating record 004d9f1f-e7d0-4eff-85fd-e0d2fa7f73e0
🔄 Updating record 004f0e6e-3207-4b33-b589-311cb704de45
🔄 Updating record 0059b2e4-ebb7-4fdc-9d1d-cb66278134da
🔄 Updating

In [None]:
from datetime import datetime, timezone

# Count points whose metadata.expiration_date is greater than the current UTC
# time expressed as a Unix timestamp.
now = datetime.now(timezone.utc)
now_ts = now.timestamp()  # ✅ float

# NOTE(review): the filter compares against a number, while DEFAULT_EXPIRATION
# in this notebook is an ISO-8601 string - confirm how expiration_date is
# actually stored before trusting the count.
# NOTE(review): no `limit` is passed, so the client's default page size
# applies and the count below may cover only the first page - confirm.
records, _ = client.scroll(
    collection_name=collection_name,
    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.expiration_date",
                range=models.Range(gt=now_ts)
            )
        ]
    ),
    with_payload=True
)

print(f"Found {len(records)} documents with expiration_date in the future.")

## 5. Check a record

In [None]:
# Look up a single point by its Qdrant point id.
# Here `records` is indexed directly as a list of points (records[0]), unlike
# the scroll cells where it holds a (points, offset) tuple.


records = client.retrieve(
    collection_name=collection_name,
    ids=["000bdeac-dc0f-4bdf-84a7-d4adaa4cb327"],
    with_payload=True,
    with_vectors=False
)

# Print the stored expiration_date together with its Python type.
metadata = records[0].payload["metadata"]
print("expiration_date:", metadata.get("expiration_date"),
      type(metadata.get("expiration_date")))
# Last expression of the cell: displays the full metadata dict.
records[0].payload["metadata"]

In [None]:
# uses pdf_id

from qdrant_client.http import models

# scroll returns a (points, next_page_offset) tuple. `records` keeps that shape
# because the following cells index it as records[0][0].
records = client.scroll(
    collection_name=collection_name,
    with_payload=True,  # set to False to omit the payload
    with_vectors=False,  # change to True to see the vectors
    limit=20000,  # 👈 don't forget to set this

    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.pdf_id",  # only in upsert>14NOV2024
                match=models.MatchText(
                    text="0aaa5c4d-8f8d-5ea3-8d45-418441a2f6a3"),
            ),
        ]
    ),
)

print(f"Number of points found: {len(records[0])}")

if records[0]:
    # records[0] is the list of points; take the first point's metadata.
    metadata = records[0][0].payload["metadata"]
    print("expiration_date:", metadata.get("expiration_date"),
          type(metadata.get("expiration_date")))
else:
    print("No points matched this pdf_id.")

In [None]:
# Display the metadata dict of the first point returned by the scroll above.
records[0][0].payload["metadata"]

In [None]:
# Print the value and Python type of each date field on the first point;
# a field that is absent from the metadata prints as None.
for field in ["issue_date", "expiration_date", "upsert_date"]:
    value = records[0][0].payload["metadata"].get(field)
    print(f"{field}: {value} (type: {type(value)})")

### Utility to close Qdrant client

In [None]:
# Project helper from library_utils; presumably closes the client connection
# (its implementation is not shown in this notebook).
lib.close_qdrant(client)

### REFERENCES

#### How to access different parts of a record

```python
all_records  # a tuple
records = all_records[0]  # records is a list
records[0].id  # a string
records[0].payload  # a dict
records[0].payload["metadata"]  # a dict
records[0].payload["metadata"]["pdf_id"]  # a string