# Upsert PDFs to qdrant
#### IT's designed to operate in PDF-wise fashion, unlike my previous ones that loaded and chunked an entire PDF in one go. THis one loads and chunks PDFs one at a time to enable for checking the id and enabling some better error handling.

In [None]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315
# %pip install langchain-community
%pip install langchain-qdrant
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf this requies python>=3.9
#

## 0. Imports and Configs


In [1]:
# Confirm you're using the correct interpreter
#
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-main/bin/python


In [2]:
import streamlit as st
import os

# Add the parent directory to sys.path so you can import library_utils from a subdirectory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [None]:
# LANGSMITH CONFIG
# These have to be set as environmental variables to be accessed behind the scenes

os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ASK_main_upsert_notebook"

In [None]:
from langchain_openai import OpenAIEmbeddings
# for langchain_openai.OpenAIEmbeddings
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

CONFIG = {
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,
    "separators": ["}"],  # [" ", ",", "\n"]
    "embedding": OpenAIEmbeddings(),
    "embedding_dims": 1536,
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
}

## 2. Initialize the Qdrant and LC Vectorstore objects

In [25]:
# CONFIG: qdrant
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
qdrant_url = st.secrets["QDRANT_URL"]
qdrant_api_key = st.secrets["QDRANT_API_KEY"]

In [None]:
from qdrant_client import QdrantClient

client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

In [22]:
from langchain_qdrant import QdrantVectorStore

qdrant = QdrantVectorStore(client=client,
                           collection_name=qdrant_collection_name,
                           # embedding here is LC interface to the embedding model
                           embedding=CONFIG["embedding"],
                           )


qdrant

<langchain_qdrant.qdrant.QdrantVectorStore at 0x168a90f50>

## 3. Load PDFs into LangChain Document objects

In [None]:
from qdrant_client.http import models


def is_pdf_id_in_qdrant(pdf_id: str) -> bool:
    '''Helper function checks if pdf_id is already in Qdrant'''

    response = client.count(
        collection_name=qdrant_collection_name,
        count_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="metadata.pdf_id",
                    match=models.MatchText(text=pdf_id),
                ),
            ]
        ),
        exact=True,  # Ensures accurate count
    )

    return response.count > 0


# usage
is_pdf_id_in_qdrant("df6b2344-b73b-5c11-9f3e-aa2a370b1696")

False

In [10]:
import os
import pypdf
from langchain_community.document_loaders import PyPDFLoader
import library_utils as lib

source_directory = "./raw_pdfs"


def load_pdf(source_directory, filename):
    """Loads a PDF and adds enriched metadata.

    Args:
        source_directory (str): The directory where the PDF is located.
        filename (str): The name of the PDF file.

    Returns:
        list: A list of Langchain page document objects with enriched metadata.
    """
    pdf_path = os.path.join(source_directory, filename)
    pages = []

    try:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()  # This returns a list of LC page document objects

        with open(pdf_path, "rb") as pdf_file_obj:
            reader = pypdf.PdfReader(pdf_file_obj)
            enriched_metadata = {
                # "original_metadata": reader.metadata,
                "page_count": len(reader.pages),
                "pdf_id": lib.compute_pdf_id(pdf_path)
            }

        # Check if the PDF ID is already in Qdrant
        if is_pdf_id_in_qdrant(enriched_metadata["pdf_id"]):
            raise ValueError(
                f"PDF with ID {enriched_metadata['pdf_id']} is already in Qdrant.")

        for doc in docs:
            doc.metadata.update(enriched_metadata)
            pages.append(doc)

        print(f"Processed {filename}")
        print("number of pages processed:", len(pages))
    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")

    return pages


pages = load_pdf(source_directory,
                 "037_24_CHIEF_DIRECTORS_FINAL_ACTION_ON_NATIONAL_BOARD_RECOMMENDATIONS_AT_NACON_2024_01NOV24.pdf")

Processed 037_24_CHIEF_DIRECTORS_FINAL_ACTION_ON_NATIONAL_BOARD_RECOMMENDATIONS_AT_NACON_2024_01NOV24.pdf
number of pages processed: 3




### Example enriched Langchain Page Document object

```python
page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus nunc sapien' metadata={'source': './raw_pdfs/lorem_ipsum.pdf', 'page': 1, 'page_count': 13, 'pdf_id': 'df6b2344-b73b-5c11-9f3e-aa2a370b1696'}
```


## 3. Chunk the Pages


In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


# chunks at the page break
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CONFIG["chunk_size"],
    chunk_overlap=CONFIG["chunk_overlap"],
    length_function=CONFIG["length_function"],
    separators=CONFIG["separators"]
)


'''usage'''
chunks = text_splitter.split_documents(pages)
'''"chunks" is a list of objects of the class langchain.schema.document.Document'''
print("number of chunks:", len(chunks))
chunks[0]

number of chunks: 3


Document(metadata={'source': './raw_pdfs/037_24_CHIEF_DIRECTORS_FINAL_ACTION_ON_NATIONAL_BOARD_RECOMMENDATIONS_AT_NACON_2024_01NOV24.pdf', 'page': 0, 'page_count': 3, 'pdf_id': '5e7e959c-02d4-5cc1-9f05-340917525134'}, page_content=" \nPage 1 of 3  \n01 NOV 24 \nFM:  CHDIRAUX  \nTO:  ALAUX  \nALAUX 037/24   Subj:  CHIEF DIRECTOR â€™S FINAL ACTION ON NATIONAL BOARD RECOMMENDATIONS \nAT NACON 2024 \n \n 1.  At the 2024 Auxiliary National Convention (NACON), recently held in Orlando, Florida, three recommendations were placed before the National Board. They are copied below along with the Chief Director's final actions.   \n  \na.  Subject: Recommendation to Have All Auxiliary Members Be Also Members of the Auxiliary Association (Originator: Commodore Gus Formato / 29 May 2024)  \n  \n(1) Recommendation:  It is recommended that all members who have enrolled from 20 JUL 2022 to date and going forward all new members, also become members of the Auxiliary Association, formerly known as AuxA. 

## 4. Add chunks to Qdrant


In [None]:
# .from_documents  creates the Qdrant vectorstore object with the documents/chunks, embeddings, and metadata and automatically configures the collection.  It is a Langchain class method used to initialize a new Qdrant instance with a collection of documents all at once.

qdrant.from_documents(
    chunks,
    embedding=CONFIG["embedding"],  
    collection_name=qdrant_collection_name,  
    url=qdrant_url,
    api_key=qdrant_api_key, 
    # keep this to false so you don[t overwrite your collection
    force_recreate=False,
)

<langchain_qdrant.qdrant.QdrantVectorStore at 0x168a207d0>

## <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>
