# Upsert PDFs to qdrant

#### **HOW IT WORKS**: It operates PDF by PDF, unlike my previous notebooks that loaded and chunked an entire PDF in one go. This one loads and chunks PDFs one at a time, which allows checking the ID and enables better error handling.

To upsert a single pdf:
1. Set the qdrant collection name 
2. Choose local or cloud
3. Run cells individually until you generate a pdf_id.
4. Add the PDF metadata to the spreadsheet
5. Run the rest of the cells to upsert the pdf

To upsert a folder of pdfs:
1. Set the qdrant collection name 
2. Choose local or cloud
3. Run the cells individually, stopping at `qdrant.add_documents(chunks)`. This initializes all the functions and config variables needed for the batch process. Yes, you will need to specify a specific PDF file.

In [None]:
# %pip install -qU pip
# %pip install -qU langchain-openai openai langchain-community langchain-qdrant qdrant-client pympler pypdf==5.0.1

# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  (this requires python>=3.9)
# %pip install gspread google-auth

## 0. Imports and Configs


In [2]:
import os
import sys

from dotenv import load_dotenv

# Confirm the notebook kernel is running the expected interpreter.
print(sys.executable)

# Make the parent directory importable so that `library_utils` (presumably
# located there) can be imported. The membership check keeps sys.path free of
# duplicate entries when this cell is re-run.
_parent_dir = os.path.abspath('..')
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

import library_utils as lib  # noqa: E402  (must follow the sys.path tweak)

/Users/drew_wilkins/Drews_Files/Drew/Python/Repositories/uscgaux/.venv-main/bin/python


In [2]:
# Load environment variables (API keys, service URLs) from a local .env file.
# NOTE: this is an absolute, machine-specific path; adjust it when running
# the notebook anywhere else.
ENV_PATH = "/Users/drew_wilkins/Drews_Files/Drew/Python/Localcode/.env"
load_dotenv(ENV_PATH)

# Config LangSmith observability (currently disabled)
# LANGCHAIN_API_KEY = os.environ["LANGCHAIN_API_KEY"]
# os.environ["LANGCHAIN_TRACING_V2"] = "false"
# os.environ["LANGCHAIN_PROJECT"] = "ASK_main_upsert_notebook"


# Config Qdrant
# os.getenv returns None when a variable is not set, so a missing entry in
# the .env file only surfaces later, when the client is created or used.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
# Alternative source when running under Streamlit:
# QDRANT_URL = st.secrets["QDRANT_URL"]
# QDRANT_API_KEY = st.secrets["QDRANT_API_KEY"]
# On-disk location used when the client is pointed at a local database.
QDRANT_PATH = "./qdrant_db"


# Config langchain_openai
from langchain_openai import OpenAIEmbeddings
# NOTE(review): the key is read from OPENAI_API_KEY_ASK, but the
# OpenAIEmbeddings(...) call later in this notebook does not pass it
# explicitly; confirm the key reaches the client as intended.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY_ASK")
# OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]


# Misc configs for tracing
# Single dictionary of chunking, storage, retrieval and generation settings.
CONFIG = {
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,  # the built-in function object, not a string
    "separators": ["}"],  # alt: [" ", ",", "\n"]
    "qdrant_collection_name": "ASK_vectorstore",
    "embedding_model": "text-embedding-ada-002",  # alt: text-embedding-3-large
    "embedding_dims": 1536,  # alt: 1024
    "vector_name": "text-dense",
    # NOTE: the two values below are the string "None", not the None object.
    "sparse_vector_name": "None",
    "sparse_embedding": "None",
    "search_type": "mmr",
    "k": 5,
    'fetch_k': 20,   # fetch 20 docs then select k=5
    'lambda_mult': .7,    # 0= max diversity, 1 is min. default is 0.5
    "score_threshold": 0.5,
    "generation_model": "gpt-3.5-turbo-16k",
    "temperature": 0.7,
}

## 1. Initialize the Qdrant and LC Vectorstore objects

In [3]:
from qdrant_client import QdrantClient

# Connection settings for the cloud deployment. For a local on-disk
# database, replace the "url" and "api_key" entries with
# "path": QDRANT_PATH.
_connection_kwargs = {
    "url": QDRANT_URL,
    "api_key": QDRANT_API_KEY,
    "prefer_grpc": True,
}
client = QdrantClient(**_connection_kwargs)

# Report which Qdrant location the client points at, then list the
# collections it can see.
lib.which_qdrant(client)
lib.list_collections(client)

qdrant location: cloud

Available collections:
ASK_vectorstore
ask_pdf_docs
ASK_vectorstore-backup21APR2025
ask_pdf_pages


In [4]:
# Inspect the collection this notebook upserts into. The name is read from
# CONFIG rather than hard-coded, so this check follows the configured
# collection when "qdrant_collection_name" is changed.
collection_info = client.get_collection(
    collection_name=CONFIG["qdrant_collection_name"]
)
print(collection_info)

status=<CollectionStatus.GREEN: 'green'> optimizer_status=<OptimizersStatusOneOf.OK: 'ok'> vectors_count=None indexed_vectors_count=10566 points_count=11615 segments_count=2 config=CollectionConfig(params=CollectionParams(vectors=VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=True, datatype=None, multivector_config=None), shard_number=1, sharding_method=None, replication_factor=1, write_consistency_factor=1, read_fan_out_factor=None, on_disk_payload=True, sparse_vectors={'text-sparse': SparseVectorParams(index=SparseIndexParams(full_scan_threshold=None, on_disk=False, datatype=None), modifier=None)}), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=False, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_in

In [5]:
from langchain_qdrant import QdrantVectorStore

# Wrap the Qdrant client in a LangChain vector store bound to the configured
# collection. `embedding` is LangChain's interface to the embedding model
# named in CONFIG.
qdrant = QdrantVectorStore(
    client=client,
    collection_name=CONFIG["qdrant_collection_name"],
    embedding=OpenAIEmbeddings(model=CONFIG["embedding_model"]),
    # True requests validation of the collection configuration.
    # NOTE(review): set to False if skipping validation is actually intended.
    validate_collection_config=True,
)

# Bare expression so the notebook displays the object's repr.
qdrant

<langchain_qdrant.qdrant.QdrantVectorStore at 0x319329490>

## 2. Specify the file locations

In [None]:
# Local location of the PDF to process: the source folder, the file name,
# and the joined path.
pdf_source_directory = "./pdfs_backlog"
pdf_filename = "D7-SOP-AD-002_Proctor_Designation.pdf"
pdf_path = os.path.join(pdf_source_directory, pdf_filename)

# Identifier of the library catalog.
# NOTE(review): presumably a Google Sheets ID (the setup cell mentions
# gspread); confirm.
LIBRARY_CATALOG_ID = "16F5tRIvuHncofRuXCsQ20A7utZWRuEgA2bvj4nQQjek"

# Identifiers of the live, backlog and deleted PDF folders.
# NOTE(review): presumably Google Drive folder IDs; confirm.
PDF_LIVE_FOLDER_ID = "1-vyQQp30mKzudkTOk7YJLmmVDirBOIpg"
PDF_BACKLOG_FOLDER_ID = "1993TlUkd9_4XqWCutyY5oNTpmBdnxefc"
PDF_DELETED_FOLDER_ID = "1FYUFxenYC6nWomzgv6j1O4394Zv6Bs5F"