# Upsert PDFs to qdrant

#### **HOW IT WORKS**: It operate in PDF-wise fashion, unlike my previous notebooks that loaded and chunked an entire PDF in one go. This one loads and chunks PDFs one at a time to enable for checking the id and enabling some better error handling.

To upsert a single pdf:
1. Set the qdrant collection name 
2. Choose local or cloud
3. Run cells individually until you generate a pdf_id.
4. Add the pdf medata into the spreadsheet
5. Run the rest of the cells to upsert the pdf

To upsert a folder of pdfs:
1. Set the qdrant collection name 
2. Choose local or cloud
3. Run the cells individually, stopping at `qdrant.add_documents(chunks)`. This initialize all the functions and config variables needed for the batch process. Yes, you will need to specify a specific PDF file.

In [24]:
# %pip install -qU pip
# %pip install -qU langchain-openai openai langchain-community langchain-qdrant qdrant-client pympler pypdf==5.0.1

# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf this requies python>=3.9

## 0. Imports and Configs


In [1]:
import os, sys
from dotenv import load_dotenv

# Confirm correct interpreter is used
print(sys.executable)

# Add parent directory to sys.path to import modules from a subdirectory
sys.path.append(os.path.abspath('..'))
import library_utils as lib

/Users/drew_wilkins/Drews_Files/Drew/Python/Localcode/.venv-311/bin/python


In [2]:
ENV_PATH = "/Users/drew_wilkins/Drews_Files/Drew/Python/Localcode/.env"
load_dotenv(ENV_PATH)

# Config LangSmith observability
# LANGCHAIN_API_KEY = os.environ["LANGCHAIN_API_KEY"]
# os.environ["LANGCHAIN_TRACING_V2"] = "false"
# os.environ["LANGCHAIN_PROJECT"] = "ASK_main_upsert_notebook"


# Config Qdrant
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
# QDRANT_URL = st.secrets["QDRANT_URL"]
# QDRANT_API_KEY = st.secrets["QDRANT_API_KEY"]
QDRANT_PATH = "./qdrant_db"


# Config langchain_openai
from langchain_openai import OpenAIEmbeddings
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_ASK")
# OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]


# Misc configs for tracing
CONFIG = {
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,
    "separators": ["}"],  # [" ", ",", "\n"]
    "qdrant_collection_name": "ASK_vectorstore",
    "embedding_model": "text-embedding-ada-002",  # alt: text-embedding-3-large
    "embedding_dims": 1536,  # alt: 1024
    "vector_name": "text-dense",
    "sparse_vector_name": "None",
    "sparse_embedding": "None",
    "search_type": "mmr",
    "k": 5,
    'fetch_k': 20,   # fetch 30 docs then select 5
    'lambda_mult': .7,    # 0= max diversity, 1 is min. default is 0.5
    "score_threshold": 0.5,
    "generation_model": "gpt-3.5-turbo-16k",
    "temperature": 0.7,
}

## 1. Initialize the Qdrant and LC Vectorstore objects

In [3]:
from qdrant_client import QdrantClient

client = QdrantClient(
    url=QDRANT_URL,  # for cloud
    api_key=QDRANT_API_KEY,  # for cloud
    prefer_grpc=True,
    # path=QDRANT_PATH,  # for local
)


lib.which_qdrant(client)
lib.list_collections(client)

qdrant location: cloud

Available collections:
ASK_vectorstore
ask_pdf_docs
ASK_vectorstore-backup21APR2025
ask_pdf_pages


In [4]:
collection_info = client.get_collection(
    collection_name="ASK_vectorstore")
print(collection_info)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=10566 points_count=11615 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=True, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'text-sparse': SparseVectorParams(index=SparseIndexParams(full_scan_threshold=None, on_disk=False, datatype=None), modifier=None)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_in

In [5]:
from langchain_qdrant import QdrantVectorStore

# Initialize a LangChain vectorstore object
qdrant = QdrantVectorStore(client=client,
                           collection_name=CONFIG["qdrant_collection_name"],
                           # embedding here is LC interface to the embedding model
                           embedding=OpenAIEmbeddings(
                               model=CONFIG["embedding_model"]),
                           validate_collection_config=True  # Skip validation
                           )


qdrant

<langchain_qdrant.qdrant.QdrantVectorStore at 0x319329490>

## 2. Specify the file locations

In [10]:
pdf_filename = "D7-SOP-AD-002_Proctor_Designation.pdf"  #
pdf_source_directory = "./pdfs_backlog"
pdf_path = os.path.join(pdf_source_directory, pdf_filename)

metadata_source_path = "./library_catalog_2025-04-21.xlsx"

## 3. Generate PDF ID

In [11]:
pdf_id = lib.get_pdf_id(pdf_path)
print(f"UUID: {pdf_id}   for file: '{pdf_filename}'")

UUID: a9794b42-9ad4-5992-8dbe-29576740d623   for file: 'D7-SOP-AD-002_Proctor_Designation.pdf'


NOTE: Don't forget to add this UUID into the Excel spreadsheet and SAVE IT so it can be accessed below.

## 4. Check if PDF is already in Qdrant

In [14]:
'''usage'''
lib.is_pdf_id_in_qdrant(client, CONFIG, pdf_id)

False

## 5. Retrieve the PDF file's metadata into a dictionary; edit and add stuff to it

In [15]:
planned_metadata = lib.get_planned_metadata_for_single_record(
    pdf_id, metadata_source_path)

print("\nMetadata:\n")
print(f"{'Key':<25} {'Type':<12} {'Value'}")
print("-" * 70)

for k, v in planned_metadata.items():
    value_type = type(v).__name__
    print(
        f"{k:<25} {value_type:<12} {str(v)[:40]}{'...' if len(str(v)) > 40 else ''}")

Successfully accessed metadata for pdf: a9794b42-9ad4-5992-8dbe-29576740d623
Successfully added upsert date 2025-04-22T21:05:15Z to metadata

Metadata:

Key                       Type         Value
----------------------------------------------------------------------
title                     str          Proctor Request and Designation Process ...
pdf_id                    str          a9794b42-9ad4-5992-8dbe-29576740d623
publication_number        str          D7-SOP-AD-002
organization              str          DCO
scope                     str          District
unit                      str          7
issue_date                str          2025-04-18T00:00:00Z
upsert_date               str          2025-04-22T21:05:15Z
expiration_date           str          2099-12-31T00:00:00Z
aux_specific              bool         True
public_release            bool         True
pdf_file_name             str          D7-SOP-AD-002_Proctor_Designation.pdf
embedding                 str          tex

In [16]:
# Just some type checking for debugging purposes

print("Data types found by key in payload metadata:")
for key, value in planned_metadata.items():
    value_type = type(value).__name__
    print(f"  - Key '{key}' has data type: {value_type}")

Data types found by key in payload metadata:
  - Key 'title' has data type: str
  - Key 'pdf_id' has data type: str
  - Key 'publication_number' has data type: str
  - Key 'organization' has data type: str
  - Key 'scope' has data type: str
  - Key 'unit' has data type: str
  - Key 'issue_date' has data type: str
  - Key 'upsert_date' has data type: str
  - Key 'expiration_date' has data type: str
  - Key 'aux_specific' has data type: bool
  - Key 'public_release' has data type: bool
  - Key 'pdf_file_name' has data type: str
  - Key 'embedding' has data type: str


## 6. Merge the PDF content and metadata dict into LangChain Document objects

In [17]:
import os
import pypdf
from langchain_community.document_loaders import PyPDFLoader


def pdf_to_Docs_via_pypdf(pdf_path, planned_validated_metadata):  # pdf_dict is optional
    """Extrracts text and metadata for one PDF and turns them into a list of page-level Document objects ("docs_pages")."""

    docs_pages = []

    try:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()  # This returns a list of LC page document objects

        with open(pdf_path, "rb") as pdf_file_obj:
            reader = pypdf.PdfReader(pdf_file_obj)
            # in case you want metadata from the orignal PDF
            original_pdf_metadata = reader.metadata
            enriched_metadata = {
                'page_count': len(reader.pages),
            }
            planned_metadata.update(enriched_metadata)
            planned_metadata.pop('pdf_file_name', None)

        for doc in docs:
            doc.metadata.update(planned_validated_metadata)
            docs_pages.append(doc)

        print(f"Processed {pdf_path}")
        print("number of pages processed:", len(docs_pages))
    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")

    return docs_pages


'''usage'''
docs_pages = pdf_to_Docs_via_pypdf(pdf_path, planned_metadata)

Processed ./pdfs_backlog/D7-SOP-AD-002_Proctor_Designation.pdf
number of pages processed: 5


In [18]:
docs_pages[0].metadata

{'source': './pdfs_backlog/D7-SOP-AD-002_Proctor_Designation.pdf',
 'page': 0,
 'title': 'Proctor Request and Designation Process Standard Operating Procedures',
 'pdf_id': 'a9794b42-9ad4-5992-8dbe-29576740d623',
 'publication_number': 'D7-SOP-AD-002',
 'organization': 'DCO',
 'scope': 'District',
 'unit': '7',
 'issue_date': '2025-04-18T00:00:00Z',
 'upsert_date': '2025-04-22T21:05:15Z',
 'expiration_date': '2099-12-31T00:00:00Z',
 'aux_specific': True,
 'public_release': True,
 'embedding': 'text-embedding-ada-002',
 'page_count': 5}

In [19]:
docs_pages[0].page_content

'D7-SOP-AD-002 1 18 April 2025    \n \n \n \n \nSeventh District Coast Guard Auxiliary \nOffice of the District Commodore \nSeventh Coast Guard District - (dpa) \n909 SE 1st Avenue, Suite 446 \nMiami, FL 33131 US \n \n \nDistrict 7 \nStandard Operating Procedure \n \nSOP Title:  Proctor Request and Designation Process \nNumber:  D7-SOP-AD-002 \nReference:  (a) Auxiliary Manual, COMDINST M16790.1 (series)  \n    (b) Auxiliary Training Handbook ‚Äì Boat Crew, ATH 16794.51 (series) \n \n1. PURPOSE. This Standard Operating Procedure (SOP) establishes the standardized process \nwithin District Seven for requesting, appointing, and designating proctors for Coast Guard \nAuxiliary course examinations, including online and in-person formats. In accordance with \nguidance from the Chief Director of the Auxiliary, the authority to control and administer \ncourse examinations resides with the Director of Auxiliary (DIRAUX), who may authorize \neligible personnel to serve as proctors.  \n \n2. PRO



### Example enriched Langchain Page Document object

```python
page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus nunc sapien' metadata={'source': './raw_pdfs/lorem_ipsum.pdf', 'page': 1, 'page_count': 13, 'pdf_id': 'df6b2344-b73b-5c11-9f3e-aa2a370b1696'}
```


  

## 7. Chunk the Page-level Documents into chunked Documents


In [20]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk(docs_pages):
    # Turns a list of full-page Document objects ("docs_pages") into a list of smaller Document objects (i.e., "docs_chunks")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CONFIG["chunk_size"],
        chunk_overlap=CONFIG["chunk_overlap"],
        length_function=CONFIG["length_function"],
        separators=CONFIG["separators"]
    )

    docs_chunks = text_splitter.split_documents(docs_pages)

    return docs_chunks


'''usage'''
docs_chunks = chunk(docs_pages)
print("number of chunks:", len(docs_chunks))
docs_chunks[0]

number of chunks: 5


Document(metadata={'source': './pdfs_backlog/D7-SOP-AD-002_Proctor_Designation.pdf', 'page': 0, 'title': 'Proctor Request and Designation Process Standard Operating Procedures', 'pdf_id': 'a9794b42-9ad4-5992-8dbe-29576740d623', 'publication_number': 'D7-SOP-AD-002', 'organization': 'DCO', 'scope': 'District', 'unit': '7', 'issue_date': '2025-04-18T00:00:00Z', 'upsert_date': '2025-04-22T21:05:15Z', 'expiration_date': '2099-12-31T00:00:00Z', 'aux_specific': True, 'public_release': True, 'embedding': 'text-embedding-ada-002', 'page_count': 5}, page_content='D7-SOP-AD-002 1 18 April 2025    \n \n \n \n \nSeventh District Coast Guard Auxiliary \nOffice of the District Commodore \nSeventh Coast Guard District - (dpa) \n909 SE 1st Avenue, Suite 446 \nMiami, FL 33131 US \n \n \nDistrict 7 \nStandard Operating Procedure \n \nSOP Title:  Proctor Request and Designation Process \nNumber:  D7-SOP-AD-002 \nReference:  (a) Auxiliary Manual, COMDINST M16790.1 (series)  \n    (b) Auxiliary Training Ha

## 8. Add the chunked Documents to Qdrant


#### **Option 1**: Load a single PDF <span style="color:orange"><b>SKIP THIS CELL for batch load</b></span>

In [15]:
qdrant.add_documents(docs_chunks)

['a81bfe59a09f4fcaa4a9c97510f9feeb',
 'fac7ed896e6e43b49cf5922f97dd0597',
 'c67151252ef845f4856c61dc878e9277',
 '9c3c09b5677f43f489e95952913d650a',
 '4bd8cc3e59cf486583a7539dc1eb9a2a',
 'f76b51c80ba6489ea8f93d6f006aabee',
 '6bbaa8f038ad4cb9ae8c1af496af2936',
 '12397763a55c435dbc6de58a9d685448',
 'b259740b24b048bd9f32689eaf8312b0',
 'ab10cd7af0bf47f29c14af353625af71',
 '129477b4b6af45e2ae09c8aaa9b2939d',
 '29023c5e34064f0692a2c8937c502928',
 'f30f02ca3bd2414e8a7127ac86c3a1c2',
 'fe30a463f1294b4b8a0d87367964b9d7',
 'd2a17378979045ad982824bfb733235e',
 'c139a7806dcc4e788039a37e8c4f5b27',
 'ded08327e4984dd59e53ddff1d0821fb',
 '6b2ed4c2689b4afc95845b69003fc2f1',
 'bdbc387375884a66be4132d9927d8fce',
 'cfae666619ef4e289f0961d497019669',
 '61b265ec0c984ca5b88be142e11c35f7',
 '89b32558d2184fe4bfc3954db8cf64e6',
 'da740145004242a0a95adaa11f0f3cd1',
 'cd003d77c563458697cfab097fd3008f',
 'e4db3306168144918518dd203d2b03e0',
 '3a47b58ea3b04063aaaf7a4d1c0498a9',
 '8a097a9685644e46aa43413d61847d54',
 

#### **Option 2**: Load an entire folder of PDFs

In [21]:
pdf_source_directory = pdf_source_directory
metadata_source_path = metadata_source_path


def bulk_file_processor(pdf_source_directory, metadata_source_path):
    print("Processing...")
    uploaded_files = []
    rejected_files = []
    failed_files = []

    for dirpath, dirnames, filenames in os.walk(pdf_source_directory):
        for filename in filenames:
            if filename.lower().endswith(".pdf"):
                pdf_path = os.path.join(dirpath, filename)
                print(f"{filename}")
                try:
                    pdf_id = lib.get_pdf_id(pdf_path)
                    if lib.is_pdf_id_in_qdrant(client, CONFIG, pdf_id):
                        # print(
                        #    f"\033[91müí• pdf_id {pdf_id} already in Qdrant. Delete first.\033[0m")
                        rejected_files.append(filename)
                        continue  # Skip to the next file
                    planned_metadata = lib.get_planned_metadata_for_single_record(
                        pdf_id, metadata_source_path)
                    docs_pages = pdf_to_Docs_via_pypdf(
                        pdf_path, planned_metadata)
                    docs_chunks = chunk(docs_pages)
                    qdrant.add_documents(docs_chunks)
                    uploaded_files.append(filename)
                except Exception as e:
                    print(f"üòà Error processing {filename}: {e}")
                    failed_files.append(filename)
                print("\n‚úÖ Uploaded files:")
                for item in uploaded_files:
                    print(item)
                print("\nüòà Failed files:")
                for item in failed_files:
                    print(item)
                print("\nüí• Rejected files:")
                for item in rejected_files:
                    print(item)
                print("\n\n‚úÖ Number of files successfully uploaded: ",
                      len(uploaded_files))
                print("üòà Number of files failed during processing: ",
                      len(failed_files))
                print("üí• Number of files rejected as duplicate: ",
                      len(rejected_files))


'''usage'''
bulk_file_processor(pdf_source_directory, metadata_source_path)

Processing...
D7-SOP-AD-002_Proctor_Designation.pdf
Successfully accessed metadata for pdf: a9794b42-9ad4-5992-8dbe-29576740d623
Successfully added upsert date 2025-04-22T21:05:30Z to metadata
Processed ./pdfs_backlog/D7-SOP-AD-002_Proctor_Designation.pdf
number of pages processed: 5

‚úÖ Uploaded files:
D7-SOP-AD-002_Proctor_Designation.pdf

üòà Failed files:

üí• Rejected files:


‚úÖ Number of files successfully uploaded:  1
üòà Number of files failed during processing:  0
üí• Number of files rejected as duplicate:  0
D7-AUX-ID-Card-Request-Procedures.pdf
Successfully accessed metadata for pdf: 47781899-a0a9-503d-8063-f929eb6518b5
Successfully added upsert date 2025-04-22T21:05:31Z to metadata
Processed ./pdfs_backlog/D7-AUX-ID-Card-Request-Procedures.pdf
number of pages processed: 1

‚úÖ Uploaded files:
D7-SOP-AD-002_Proctor_Designation.pdf
D7-AUX-ID-Card-Request-Procedures.pdf

üòà Failed files:

üí• Rejected files:


‚úÖ Number of files successfully uploaded:  2
üòà Number

## <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>


### Simple Utility to see record in Qdrant

In [None]:
from qdrant_client.http import models

UUID = "0764d16-7066-5d44-9047-2cfa8a8fb888"

records = client.scroll(
    collection_name=CONFIG["qdrant_collection_name"],
    with_payload=True,  # change to True to see the payload
    with_vectors=False,  # change to True to see the vectors
    limit=1000000,

    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(key="metadata.pdf_id",
                                  match=models.MatchText(text=UUID),),
            # models.HasIdCondition(has_id=[UUID]),
        ]
    ),
)

print(f"Number of points found: {len(records[0])}")
records[0]

Number of points found: 709


Record(id='f871e049-3c72-4460-9573-9cba59a95276', payload={'metadata': {'upsert_date': '2025-04-21T23:45:47Z', 'embedding': 'text-embedding-ada-002', 'expiration_date': '2021-08-17T00:00:00Z', 'scope': 'national', 'organization': 'CG-BSX', 'page_count': 711, 'issue_date': '2011-08-17T00:00:00Z', 'aux_specific': True, 'public_release': True, 'publication_number': 'COMDTINST M16790.1G', 'pdf_id': '20764d16-7066-5d44-9047-2cfa8a8fb888', 'unit': '', 'source': './pdfs_backlog/CIM_16790_1G_EO_14151_version_Feb2025.pdf', 'title': 'Auxiliary Manual, COMDTINST M16790.1G, Exec. Order No.14151', 'page': 182}, 'page_content': 'COMDTINST M16790.1G  \n   A.4.Designation Auxiliary flotillas are designated by Arabic numerals.  The first numeral indicates the division, followed by the flotilla number.  The name of the city or town in which the flotilla is located completes the designation.  For example, Flotilla 32, St. Louis, is Division 3, Flotilla 2, located at St. Louis, Missouri.  Different flotil

In [18]:
print(
    f"Number of points in {CONFIG['qdrant_collection_name']} on {client._client}: {client.get_collection(CONFIG['qdrant_collection_name']).points_count}")

Number of points in ASK_vectorstore on <qdrant_client.local.qdrant_local.QdrantLocal object at 0x335b0a850>: 11406


### <span style="color:red"><b>Delete</b></span> a PDFs records in Qdrant

In [13]:
from qdrant_client.http import models

client.delete(
    collection_name=CONFIG["qdrant_collection_name"],
    points_selector=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.pdf_id",
                match=models.MatchText(
                    text="a9794b42-9ad4-5992-8dbe-29576740d623"),  # BE CAREFUL CHECK THIS IS WHAT YOUR
            ),
        ]
    ),
)

UpdateResult(operation_id=23236, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
client.count(
    collection_name=CONFIG["qdrant_collection_name"],
    count_filter=models.Filter(
        must_not=[
            models.HasIdCondition(has_id=[1]),
        ]
    ),
    exact=True,
)

### <span style="color:red"><b>Delete ALL </b></span> records Qdrant collection

In [None]:
from qdrant_client.http import models

client.delete(
    collection_name=CONFIG["qdrant_collection_name"],
    points_selector=models.Filter()
)

In [None]:
import matplotlib.font_manager as font_manager

def list_fonts():
    """Lists all fonts available on the system."""
    fonts = font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
    print("Available Fonts:")
    for font in fonts:
        print(font)

if __name__ == "__main__":
    list_fonts()

### Close Qdrant client

In [None]:
lib.close_qdrant(client)

deleting qdrant
closing client


### **REFERENCE**: Example record

```python

Record(
    id='01999d49fe0044478a1bffbb078938e8', 
    payload={
        'page_content': 'I Want a VSC\n‚Ä¢Revised ‚ÄúI Want a VSC‚Äù form is online with a new look.\n‚Ä¢Major changes are:\n‚Ä¢The originator no longer sees the selected VEs‚Äô emails, only \nselected VEs see the emails.\n‚Ä¢Originator gets an email indicating they‚Äôll be contacted.\n‚Ä¢Its‚Äô important VEs ‚ÄúReply All‚Äù they‚Äôve made contact or aren‚Äôt \navailable.\n‚Ä¢If the originator does not receive a response -they wil be instructed \nto initiate another request.\n‚ÄúI Want a VSC‚Äù can be found at:https://wow.uscgaux.info/i_want_a_vsc/index.php\n3\nŸ†The new ‚ÄúI Want A VSC‚Äù program is on-line.\n‚Ä¢Major changes are:\n‚Ä¢Originator no longer see VEs‚Äô emails.\n‚Ä¢Originator receives an email indicating they‚Äôll be contacted.\n‚Ä¢It‚Äôs important VEs ‚ÄúReply All‚Äù they‚Äôve made contact with originator or that \nthey‚Äôre not available.\n‚Ä¢If the originator hears nothing they‚Äôre instructed to initiate another request.\n‚Ä¢The new form is on line and can be found on the V-Directorate web page or with the \nQR code or web site.', 
        'metadata': {'source': './test_pdfs_copies_delete_after_use/VE_workshop_Dec_4_22_2023.pdf', 'page': 2, 'title': 'Auxiliary Vessel Examiner VE Workshop 2023', 'pdf_id': 'a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e', 'publication_number': nan, 'organization': nan, 'scope': '1_national', 'issue_date': '2024-02-16T00:00:00Z', 'upsert_date': nan, 'expiration_date': '2034-02-16T00:00:00Z', 'aux_specific': True, 'public_release': True, 'embedding': 'text-embedding-ada-002', 'page_count': 31}
        }, 
    vector=None, 
    shard_key=None, 
    order_value=None
),
