# Upsert PDFs to qdrant
#### It's designed to operate in a PDF-wise fashion, unlike my previous ones that loaded and chunked an entire PDF in one go. This one loads and chunks PDFs one at a time, which allows checking each id and enables better error handling.

In [None]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315
# %pip install langchain-community
%pip install langchain-qdrant
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  # this requires python>=3.9
#

## 0. Imports and Configs


In [37]:
# Confirm you're using the correct interpreter
#
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-main/bin/python


In [38]:
import pandas as pd
import streamlit as st
import os

# Add the parent directory to sys.path so you can import library_utils from a subdirectory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [39]:
import library_utils as lib

In [40]:
# LANGSMITH CONFIG
# These have to be set as environment variables so they can be accessed behind the scenes

os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ASK_main_upsert_notebook"

In [41]:
from langchain_openai import OpenAIEmbeddings
# for langchain_openai.OpenAIEmbeddings
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

# Central settings for chunking and embedding used by the cells below.
CONFIG = {
    # NOTE(review): this label says "CharacterTextSplitter", but chunk() in
    # this notebook builds a RecursiveCharacterTextSplitter — confirm which
    # one is intended and keep the label in sync.
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,
    # NOTE(review): "}" is the only separator, so text without a "}" cannot
    # be split further — confirm this is intended. Alternatives previously
    # tried: [" ", ",", "\n"]
    "separators": ["}"],
    # Pass the key explicitly so the embedding client does not depend on an
    # OPENAI_API_KEY environment variable being set elsewhere.
    "embedding": OpenAIEmbeddings(
        model="text-embedding-ada-002",
        api_key=OPENAI_API_KEY,
    ),
    "embedding_dims": 1536,
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
}

## 1. Initialize the Qdrant and LC Vectorstore objects

In [42]:
# CONFIG: qdrant
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
qdrant_url = st.secrets["QDRANT_URL"]
qdrant_api_key = st.secrets["QDRANT_API_KEY"]

In [43]:
from qdrant_client import QdrantClient

client = QdrantClient(
    url=qdrant_url,
    api_key=qdrant_api_key,
)

In [44]:
from langchain_qdrant import QdrantVectorStore

qdrant = QdrantVectorStore(client=client,
                           collection_name=qdrant_collection_name,
                           # embedding here is LC interface to the embedding model
                           embedding=CONFIG["embedding"],
                           )


qdrant

<langchain_qdrant.qdrant.QdrantVectorStore at 0x17e275010>

## 2. Specify the file locations

In [45]:
pdf_filename = "VE_workshop_Dec_4_22_2023.pdf"  #
source_directory = "./test_pdfs_copies_delete_after_use"
pdf_path = os.path.join(source_directory, pdf_filename)

metadata_source_path = "./v2__planned_library_catalog_2024-11-03.xlsx"

## 3. Generate PDF ID

In [46]:
def get_pdf_id(pdf_path):
    """Return the identifier of the PDF at ``pdf_path`` as a string.

    The identifier is whatever ``lib.compute_pdf_id`` produces for the path,
    converted with ``str``.
    """
    return str(lib.compute_pdf_id(pdf_path))


'''usage'''
pdf_id = get_pdf_id(pdf_path)
pdf_id

'a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e'

## 4. Check if PDF is already in Qdrant

In [47]:
from qdrant_client.http import models


def is_pdf_id_in_qdrant(pdf_id: str) -> bool:
    """Report whether any point in the collection already carries ``pdf_id``.

    Args:
        pdf_id: Identifier stored under the ``metadata.pdf_id`` payload key.

    Returns:
        True if at least one point matches, otherwise False.
    """
    # MatchValue is an exact match. MatchText is a full-text condition and
    # can match on substrings/tokens, which is not what an id lookup wants.
    pdf_id_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.pdf_id",
                match=models.MatchValue(value=pdf_id),
            ),
        ]
    )

    response = client.count(
        collection_name=qdrant_collection_name,
        count_filter=pdf_id_filter,
        exact=True,  # Ensures accurate count
    )

    return response.count > 0


'''usage'''
is_pdf_id_in_qdrant(pdf_id)

True

## 5. Get metadata if it exists

In [48]:
# Let's create a function to get the metadata as strings and then put this into library_utils


def get_planned_metadata_for_single_record(pdf_id, metadata_source_path):
    """Look up the catalog row for ``pdf_id`` in the metadata spreadsheet.

    Args:
        pdf_id: Identifier to look for in the spreadsheet's ``pdf_id`` column.
            Comparison ignores case and surrounding whitespace.
        metadata_source_path: Path to the Excel file read with
            ``pd.read_excel``.

    Returns:
        dict | None: The matching row as a ``{column: value}`` dict (empty
        cells come through as NaN), or None when the file cannot be read, no
        row matches, or more than one row matches. Problems are printed, not
        raised, so a caller looping over many PDFs can continue.
    """
    try:
        df = pd.read_excel(metadata_source_path)

        # Cast to str *before* using the .str accessor: the accessor raises
        # on a column that pandas did not read as strings.
        normalized_ids = df['pdf_id'].astype(str).str.strip().str.lower()
        matches = df[normalized_ids == str(pdf_id).strip().lower()]

        if matches.empty:
            raise ValueError(f"No metadata found for pdf: {pdf_id}")

        # Ensure no duplicate pdf_ids in the metadata
        if len(matches) > 1:
            raise ValueError(
                f"Found duplicates for pdf_id: '{pdf_id}', number of results: {len(matches)}")

        document_metadata = matches.iloc[0].to_dict()
        print(f"Successfully accessed metadata for pdf: {pdf_id}")
        return document_metadata

    except Exception as e:
        print(f"Error retrieving metadata for {pdf_id}: {e}")
        # A single None (not a truthy (None, None) tuple) so callers can
        # test the result with a plain `is None` / truthiness check.
        return None


planned_metadata = get_planned_metadata_for_single_record(
    pdf_id, metadata_source_path)

planned_metadata

Successfully accessed metadata for pdf: a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e


{'title': 'Auxiliary Vessel Examiner VE Workshop 2023',
 'pdf_id': 'a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e',
 'publication_number': nan,
 'organization': nan,
 'scope': '1_national',
 'issue_date': '2024-02-16T00:00:00Z',
 'upsert_date': nan,
 'expiration_date': '2034-02-16T00:00:00Z',
 'aux_specific': True,
 'public_release': 1.0,
 'pdf_file_name': 'VE_workshop_Dec_4_22_2023.pdf',
 'embedding': 'text-embedding-ada-002'}

## 6. Load PDFs into LangChain Document objects

In [49]:
import os
from datetime import datetime, timezone
import pypdf
from langchain_community.document_loaders import PyPDFLoader


def _metadata_flag(value):
    """Coerce a catalog cell to bool, treating missing values (None/NaN) as False."""
    import math

    if value is None:
        return False
    # bool(float('nan')) is True, so a blank spreadsheet cell must be
    # checked explicitly or it would be stored as True.
    if isinstance(value, float) and math.isnan(value):
        return False
    return bool(value)


def load_pdf(pdf_path, pdf_id, planned_metadata):
    """Load a PDF into per-page LangChain documents with enriched metadata.

    Args:
        pdf_path: Path of the PDF to load.
        pdf_id: Identifier stored on every page under ``pdf_id``.
        planned_metadata: Dict of catalog metadata for this PDF. It is not
            modified; a merged copy is applied to each page.

    Returns:
        list: LangChain page documents whose metadata combines the loader's
        own fields, ``planned_metadata`` (minus ``pdf_file_name``) and the
        computed fields ``pdf_id``, ``upsert_date``, ``aux_specific``,
        ``public_release``, ``page_count`` and ``embedding``. Empty if the
        file is not found.

    Raises:
        TypeError: If ``planned_metadata`` is not a dict (for example when
            the metadata lookup failed).
    """
    # Fail up front with a clear message instead of an AttributeError from
    # `.get` halfway through loading.
    if not isinstance(planned_metadata, dict):
        raise TypeError(
            f"planned_metadata must be a dict, got {type(planned_metadata).__name__}")

    pages = []

    try:
        loader = PyPDFLoader(pdf_path)
        docs = loader.load()  # This returns a list of LC page document objects

        with open(pdf_path, "rb") as pdf_file_obj:
            page_count = len(pypdf.PdfReader(pdf_file_obj).pages)

        enriched_metadata = {
            'pdf_id': pdf_id,
            'upsert_date': datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            'aux_specific': _metadata_flag(planned_metadata.get('aux_specific', 0)),
            'public_release': _metadata_flag(planned_metadata.get('public_release', 0)),
            'page_count': page_count,
            'embedding': "text-embedding-ada-002",
        }
        # Merge into a new dict so the caller's planned_metadata is untouched.
        document_metadata = {**planned_metadata, **enriched_metadata}
        document_metadata.pop('pdf_file_name', None)

        for doc in docs:
            doc.metadata.update(document_metadata)
            pages.append(doc)

        print(f"Processed {pdf_path}")
        print("number of pages processed:", len(pages))
    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")

    return pages


'''usage'''
pages = load_pdf(pdf_path, pdf_id, planned_metadata)

Processed ./test_pdfs_copies_delete_after_use/VE_workshop_Dec_4_22_2023.pdf
number of pages processed: 31


In [50]:
pages[0]

Document(metadata={'source': './test_pdfs_copies_delete_after_use/VE_workshop_Dec_4_22_2023.pdf', 'page': 0, 'title': 'Auxiliary Vessel Examiner VE Workshop 2023', 'pdf_id': 'a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e', 'publication_number': nan, 'organization': nan, 'scope': '1_national', 'issue_date': '2024-02-16T00:00:00Z', 'upsert_date': '2024-11-14T20:09:34Z', 'expiration_date': '2034-02-16T00:00:00Z', 'aux_specific': True, 'public_release': True, 'embedding': 'text-embedding-ada-002', 'page_count': 31}, page_content='2023 VESSEL EXAMINER WORKSHOP\nfor both USCG Auxiliary and US Power Squadrons  \nPrepared by the \nUNITED STATES COAST GUARD AUXILIARY\n DIRECTORATE FOR VESSEL EXAMINATION \nAND PARTNER VISITATION\n1\n٠Welcome to the 2023 Vessel Examiner Workshop, for both US Power Squadrons \n(America’s Boating Club) and USCG Auxiliary.\n٠It includes topics VEs requested to be included and information based on help desk \nquestions received.\n٠It can be done either in a group facilitated 



### Example enriched Langchain Page Document object

```python
page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus nunc sapien' metadata={'source': './raw_pdfs/lorem_ipsum.pdf', 'page': 1, 'page_count': 13, 'pdf_id': 'df6b2344-b73b-5c11-9f3e-aa2a370b1696'}
```


## 7. Chunk the Pages


In [51]:
from langchain.text_splitter import RecursiveCharacterTextSplitter


def chunk(pages):
    """Split page documents into chunk documents using the CONFIG settings.

    Args:
        pages: Documents to split (the output of ``load_pdf``).

    Returns:
        The list of Document objects returned by ``split_documents``.
    """
    splitter_settings = {
        "chunk_size": CONFIG["chunk_size"],
        "chunk_overlap": CONFIG["chunk_overlap"],
        "length_function": CONFIG["length_function"],
        "separators": CONFIG["separators"],
    }
    return RecursiveCharacterTextSplitter(
        **splitter_settings).split_documents(pages)


'''usage'''
chunks = chunk(pages)
print("number of chunks:", len(chunks))
chunks[0]

number of chunks: 31


Document(metadata={'source': './test_pdfs_copies_delete_after_use/VE_workshop_Dec_4_22_2023.pdf', 'page': 0, 'title': 'Auxiliary Vessel Examiner VE Workshop 2023', 'pdf_id': 'a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e', 'publication_number': nan, 'organization': nan, 'scope': '1_national', 'issue_date': '2024-02-16T00:00:00Z', 'upsert_date': '2024-11-14T20:09:34Z', 'expiration_date': '2034-02-16T00:00:00Z', 'aux_specific': True, 'public_release': True, 'embedding': 'text-embedding-ada-002', 'page_count': 31}, page_content='2023 VESSEL EXAMINER WORKSHOP\nfor both USCG Auxiliary and US Power Squadrons  \nPrepared by the \nUNITED STATES COAST GUARD AUXILIARY\n DIRECTORATE FOR VESSEL EXAMINATION \nAND PARTNER VISITATION\n1\n٠Welcome to the 2023 Vessel Examiner Workshop, for both US Power Squadrons \n(America’s Boating Club) and USCG Auxiliary.\n٠It includes topics VEs requested to be included and information based on help desk \nquestions received.\n٠It can be done either in a group facilitated 

## 8. Add chunks to Qdrant


In [57]:
# Chunks are added through the `qdrant` vector store created in section 1.
# `.from_documents` is a constructor: it builds a brand-new vector store
# (new connection included) every time, so it is not used for adding to a
# collection that is already open.
def upsert_to_qdrant(chunks):
    """Embed ``chunks`` and add them to the existing Qdrant collection.

    Args:
        chunks: LangChain Document objects to store.

    Returns:
        list: The ids reported by ``qdrant.add_documents``; an empty list
        when ``chunks`` is empty (nothing is sent in that case).
    """
    if not chunks:
        print("💥 No chunks to upsert; nothing was sent to Qdrant")
        return []

    point_ids = qdrant.add_documents(chunks)
    print("🗄️ Upserted to Qdrant")
    return point_ids

### Option 1: Use this cell if loading a single PDF

In [None]:
'''usage'''
pdf_chunks = upsert_to_qdrant(chunks)

### Option 2: use this cell to load an entire folder of PDFs using the code above

In [None]:
source_directory = source_directory
metadata_source_path = metadata_source_path

uploaded_files = []
failed_files = []


def mac_daddy_function(source_directory, metadata_source_path):
    """Upsert every PDF found under ``source_directory`` into Qdrant.

    Walks the directory tree and, for each ``.pdf`` file, computes its id,
    skips it if that id is already in Qdrant, looks up its catalog metadata,
    loads and chunks it, and upserts the chunks.

    Outcomes are recorded in the module-level lists ``uploaded_files`` and
    ``failed_files`` (file names only). A problem with one file is printed
    and does not stop the remaining files.

    Args:
        source_directory: Root folder to search for PDFs.
        metadata_source_path: Path of the metadata spreadsheet.
    """
    for dirpath, _dirnames, filenames in os.walk(source_directory):
        for filename in filenames:
            if not filename.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(dirpath, filename)
            print(f"Processing {filename}")
            try:
                pdf_id = get_pdf_id(pdf_path)
                if is_pdf_id_in_qdrant(pdf_id):
                    # ANSI escape code for red text
                    print(
                        f"\033[91m💥 pdf_id {pdf_id} is already in Qdrant. Delete first before proceeding.\033[0m")
                    failed_files.append(filename)
                    continue  # Skip to the next file

                planned_metadata = get_planned_metadata_for_single_record(
                    pdf_id, metadata_source_path)
                # The lookup signals failure with a non-dict result; stop
                # here rather than relying on a later step to crash.
                if not isinstance(planned_metadata, dict):
                    print(
                        f"\033[91m💥 No usable metadata for {filename}. Skipping.\033[0m")
                    failed_files.append(filename)
                    continue

                pages = load_pdf(pdf_path, pdf_id, planned_metadata)
                chunks = chunk(pages)
                # Nothing to store means nothing was uploaded.
                if not chunks:
                    print(
                        f"\033[91m💥 No chunks produced for {filename}. Skipping.\033[0m")
                    failed_files.append(filename)
                    continue

                upsert_to_qdrant(chunks)
                uploaded_files.append(filename)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                failed_files.append(filename)


'''usage'''
mac_daddy_function(source_directory, metadata_source_path)
print("🗄️ Uploaded files:", uploaded_files)
print("💥 Failed files:", failed_files)

Processing USCGAUX_Social_Media_SOP_FINAL_ESIGNED_12JUN23.pdf
[91m💥 pdf_id 31fba5d9-53a8-5d3c-81cc-358d15f0b6e4 is already in Qdrant. Delete first before proceeding.[0m
Processing VE_workshop_Dec_4_22_2023.pdf
[91m💥 pdf_id a0d00ff2-7ad5-5ea7-baa0-0d738380cd3e is already in Qdrant. Delete first before proceeding.[0m

🗄️ Uploaded files: []
💥 Failed files: ['USCGAUX_Social_Media_SOP_FINAL_ESIGNED_12JUN23.pdf', 'VE_workshop_Dec_4_22_2023.pdf']


## <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>


## Simple Utility to see record in Qdrant

In [11]:
from qdrant_client.http import models

# Fetch the stored points for one PDF, selected by the pdf_id in the filter.
# The result is indexed below as records[0] to get the list of points.
# NOTE(review): presumably at most `limit` points come back from one call —
# confirm if a PDF could have more than 1000 chunks.
records = client.scroll(
    collection_name=qdrant_collection_name,
    with_payload=True,  # set to False to hide the payload
    with_vectors=False,  # change to True to see the vectors
    limit=1000,

    scroll_filter=models.Filter(
        must=[
            models.FieldCondition(
                key="metadata.pdf_id",  # only in upsert>14NOV2024
                match=models.MatchText(
                    text="31fba5d9-53a8-5d3c-81cc-358d15f0b6e4"),
            ),
        ]
    ),
)

print(f"Number of points found: {len(records[0])}")
records[0]

Number of points found: 34


[Record(id='01c69c14-8dbe-4113-b99c-bd9acfc84f2b', payload={'page_content': '                                                                                                                          AUX- SOP-012(A)  \n                                                                                                                12 Jun 2023  \n \n11 \n (13)  Unmonitored or abandoned social media accounts can be the target of hackers. The  \nunit commander  has a choice of permanent versus temporary removal. If the page is \nsimply deactivated or unpublished, the page can be reactivated or published in the \nfuture without losing posts and followers.  \n \n(14)  DSO- PAs, with cooperation from unit leaders, will reinforce  social media policy, \nsecurity, and pro cedure by providing unit or individual training in f lotillas , divisions \nand within districts.  \n \n4.  Rules of Engagement. \n \na.  All Coast Guard personnel, including Auxiliarists, utilizing social media tools whether in

## Simple Utility to <span style="color:red"><b>delete</b></span> record in Qdrant

In [70]:
from qdrant_client.http import models

# Delete every point whose payload carries the pdf_id below.
# BE CAREFUL: CHECK THIS IS THE pdf_id YOU REALLY WANT TO DELETE.
pdf_id_to_delete = "31fba5d9-53a8-5d3c-81cc-358d15f0b6e4"

# Exact match on the id, so the destructive filter cannot match on a
# substring of another value.
delete_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="metadata.pdf_id",
            match=models.MatchValue(value=pdf_id_to_delete),
        ),
    ]
)

# Count with the same filter first, so the message reports what this cell
# removes rather than the result of whichever scroll cell ran last.
points_to_delete = client.count(
    collection_name=qdrant_collection_name,
    count_filter=delete_filter,
    exact=True,
).count

client.delete(
    collection_name=qdrant_collection_name,
    points_selector=delete_filter,
)

print(f"Number of points deleted: {points_to_delete}")

Number of points deleted: 34
