# Upsert PDFs to qdrant
#### This is an entire workflow from PDF to RAG.
#### It uses a now-deprecated version of the LangChain Qdrant vectorstore object, which does not allow metadata to be adjusted during the upsert process. As a result, it relies on adding metadata to the PDFs before chunking them. This is less than ideal because the PDF standard prefixes each metadata field name with a '/', which makes those fields unusable as filter keys in Qdrant.

In [1]:
# Confirm you're using the correct interpreter
#
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-main/bin/python


In [2]:
import streamlit as st
import os

# Add the parent directory to sys.path so you can import library_utils from a subdirectory
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

## 0. Installs and Imports


In [None]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315
# %pip install langchain-community
# %pip install --upgrade docarray
# %pip install python-dotenv
# %pip install --upgrade wandb
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  # this requires python>=3.9
#

### Langsmith
accessible [here](https://smith.langchain.com/o/3941ecea-6957-508c-9f4f-08ed62dc7d61/projects/p/0aea481f-080e-45eb-bae1-2ae8ee246bd9)

In [3]:
# LANGSMITH CONFIG
#
# These have to be set as environmental variables to be accessed behind the scenes
import os
from dotenv import load_dotenv, find_dotenv

env_path = find_dotenv()
load_dotenv(env_path)

# os.environ["LANGCHAIN_TRACING_V2"] = st.secrets["LANGCHAIN_TRACING_V2"]
# os.environ["LANGCHAIN_PROJECT"] = st.secrets["LANGCHAIN_PROJECT"]

os.environ["LANGCHAIN_API_KEY"] = st.secrets["LANGCHAIN_API_KEY"]
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "ASK_main_upsert_notebook"

In [4]:
from langchain_openai import OpenAIEmbeddings
# for langchain_openai.OpenAIEmbeddings
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

In [5]:

# Central settings read by the chunking, summary and collection-creation cells of this notebook.
CONFIG = {
    # Label printed by print_document_load_summary only.
    # NOTE(review): the chunking cell builds a RecursiveCharacterTextSplitter,
    # so this label does not match the splitter actually used - confirm which is intended.
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,  # measured with "length_function" below (len -> characters)
    "chunk_overlap": 200,
    "length_function": len,
    # Passed to the text splitter as its separator list; alternative kept for reference:
    "separators": ["}"],  # [" ", ",", "\n"]
    # LangChain interface to the embedding model (instantiated when this cell runs)
    "embedding": OpenAIEmbeddings(),
    # Vector size used for the memory estimate and when creating a new collection
    "embedding_dims": 1536,
    # Used to select the tiktoken encoding for the token count in the load summary
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
}

OPTIONAL: Langchain debugging


In [6]:
from langchain.globals import set_debug

set_debug(False)

In [None]:
# CONFIG: qdrant
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
# qdrant_path = "/tmp/local_qdrant"


In [None]:
source_directory = "./raw_pdfs"
filename = "lorem_ipsum.pdf"

THESE WILL END UP BEING HELPER FUNCTIONS IN LIBRARY UTILS

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import exceptions as qdrant_exceptions





collections=[CollectionDescription(name='ask_pdf_pages'), CollectionDescription(name='ASK_vectorstore'), CollectionDescription(name='ask_pdf_docs')]


In [None]:
from qdrant_client.http import models


def is_pdf_id_in_qdrant(pdf_id: str) -> bool:
    '''Check whether any point in the Qdrant collection already carries this pdf_id.

    Args:
        pdf_id: The PDF identifier stored under the ``metadata.pdf_id`` payload key.

    Returns:
        True if at least one point in ``qdrant_collection_name`` has exactly
        this ``pdf_id``, otherwise False.

    usage
    is_pdf_id_in_qdrant("df6b2344-b73b-5c11-9f3e-aa2a370b1696")
    '''
    client = QdrantClient(
        url=st.secrets["QDRANT_URL"], api_key=st.secrets["QDRANT_API_KEY"])

    try:
        response = client.count(
            collection_name=qdrant_collection_name,
            count_filter=models.Filter(
                must=[
                    models.FieldCondition(
                        key="metadata.pdf_id",
                        # An ID lookup must be an exact match. MatchText is a
                        # full-text / substring match and can report IDs that
                        # merely contain or share tokens with this one.
                        match=models.MatchValue(value=pdf_id),
                    ),
                ]
            ),
            exact=True,  # Ensures accurate count
        )
    finally:
        # A new client is created on every call, so release it every time.
        client.close()

    return response.count > 0



False

## 2. Load PDFs into LangChain Document objects

In [None]:
import os
import pypdf
from langchain_community.document_loaders import PyPDFLoader
import library_utils as lib


def load_pdf(source_directory, filename):
    """Loads a PDF and adds enriched metadata.

    The duplicate check against Qdrant runs before the PDF is parsed into
    page documents, so an already-ingested file is rejected cheaply.

    Args:
        source_directory (str): The directory where the PDF is located.
        filename (str): The name of the PDF file.

    Returns:
        list: A list of Langchain page document objects with enriched metadata
            (``page_count`` and ``pdf_id`` added to each page). Empty if the
            file could not be found.

    Raises:
        ValueError: If a PDF with the same ``pdf_id`` is already in Qdrant.
    """
    pdf_path = os.path.join(source_directory, filename)

    # Guard explicitly: PyPDFLoader may reject a missing path with ValueError
    # rather than FileNotFoundError, which would bypass the handler below.
    if not os.path.isfile(pdf_path):
        print(f"Error: Could not find {pdf_path}")
        return []

    try:
        pdf_id = lib.compute_pdf_id(pdf_path)

        # Check if the PDF ID is already in Qdrant before doing the expensive load
        if is_pdf_id_in_qdrant(pdf_id):
            raise ValueError(f"PDF with ID {pdf_id} is already in Qdrant.")

        with open(pdf_path, "rb") as pdf_file_obj:
            page_count = len(pypdf.PdfReader(pdf_file_obj).pages)

        # This returns a list of LC page document objects
        docs = PyPDFLoader(pdf_path).load()
    except FileNotFoundError:
        # The file disappeared between the guard above and reading it.
        print(f"Error: Could not find {pdf_path}")
        return []

    enriched_metadata = {
        "page_count": page_count,
        "pdf_id": pdf_id,
    }
    for doc in docs:
        doc.metadata.update(enriched_metadata)

    print(f"Processed {filename}")
    return docs

In [31]:
pages = load_pdf(source_directory, filename)

Processed lorem_ipsum.pdf




### Example enriched Langchain Page Document object

```python
page_content='Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus nunc sapien' metadata={'source': './raw_pdfs/lorem_ipsum.pdf', 'page': 1, 'page_count': 13, 'pdf_id': 'df6b2344-b73b-5c11-9f3e-aa2a370b1696'}
```


In [38]:
print(pages[1])

page_content='peeesqueebsedooacusbeoospeeesquedaelectus. In hac habitasse platea dictumst. Phasellus mauris enim, posuere eget    luctus ac, iaculis et quam. Vivamus et nibh diam, elementum egestas tellus. Aenean    vulputate malesuada est. Sed posuere porta diam a sodales. Proin eu sem non velit    facilisis venenatis sed a turpis.    Pellentesque sed risus a ante vulputate lobortis sit amet eu nisl. Suspendisse ut eros mi, a rhoncus lacus. Curabitur fermentum vehicula tellus, a ornare mi    condimentum vel. Integer molestie volutpat viverra. Integer posuere euismod    venenatis. Proin ac mauris sed nulla pharetra porttitor. Duis vel dui in risus    sodales auctor sit amet non enim. Maecenas mollis lacus at ligula faucibus    sodales. Cras vel neque arcu. Sed tincidunt tortor pretium nisi interdum quis    dictum arcu laoreet. Morbi pretium ultrices feugiat. Maecenas convallis augue nec    felis malesuada malesuada scelerisque mauris placerat. Sed at magna enim, at    fringilla dolor. 

## 3. Chunk


## <span style="color:YELLOW"><b>HAVEN'T DONE ANYTHING WITH THIS YET</b></span>

In [None]:
def extract_metadata_from_pdfs(pdfs_source_dir):
    """Collect page documents for one PDF file or for every PDF under a directory.

    Args:
        pdfs_source_dir (str): Path to a directory (walked recursively) or to
            a single ``.pdf`` file.

    Returns:
        list: The page documents appended by ``process_pdf``; empty when the
            path is neither a directory nor a PDF file.
    """
    collected_pages = []
    seen_files = []
    byte_total = 0

    # Directory: walk it and hand every *.pdf (case-insensitive) to process_pdf
    if os.path.isdir(pdfs_source_dir):
        print("Loading PDFs from directory...")
        for root, _subdirs, names in os.walk(pdfs_source_dir):
            for name in names:
                if not name.lower().endswith('.pdf'):
                    continue
                process_pdf(os.path.join(root, name),
                            seen_files, collected_pages, byte_total)
        return collected_pages

    # Single PDF file
    if os.path.isfile(pdfs_source_dir) and pdfs_source_dir.lower().endswith('.pdf'):
        print("Loading a single PDF file...")
        process_pdf(pdfs_source_dir, seen_files, collected_pages, byte_total)
        return collected_pages

    # Anything else is reported and yields no pages
    print(
        f"Error: The path '{pdfs_source_dir}' is not a valid directory or PDF file!")
    return collected_pages


def process_pdf(pdf_path, file_list, pages, total_size):
    """Load one PDF, copy its document-level metadata onto every page, and collect the pages.

    Args:
        pdf_path (str): Path to the PDF file.
        file_list (list): Mutated in place; receives the PDF's file name once
            if at least one page was loaded.
        pages (list): Mutated in place; receives one page document per page.
        total_size (int): Accepted for interface compatibility only. An int
            argument cannot be updated for the caller, so it is not used.

    Returns:
        None. Results are delivered through ``file_list`` and ``pages``.
    """
    filename = os.path.basename(pdf_path)

    try:
        documents = PyPDFLoader(pdf_path).load()

        # The document-level metadata is the same for every page, so read it
        # once instead of re-opening and re-parsing the PDF per page.
        with open(pdf_path, "rb") as pdf_file_obj:
            # reader.metadata can be None when the PDF has no info dictionary
            pdf_metadata = pypdf.PdfReader(pdf_file_obj).metadata or {}
            # Look values up by key while the file is still open
            extra_metadata = {key: pdf_metadata[key]
                              for key in pdf_metadata.keys()}
    except FileNotFoundError:
        # Return here: falling through would reference state that was never set
        print(f"Error: Could not find {pdf_path}")
        return

    if not documents:
        return

    for doc in documents:
        doc.metadata.update(extra_metadata)
        pages.append(doc)

    file_list.append(filename)
    print(f"Processed {filename}")


'''usage'''
pages = extract_metadata_from_pdfs(source_directory)
if pages:
    last_page = pages[-1]
else:
    print("No pages were processed.")

#### Create chunks


In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Build the splitter from CONFIG. `pages` holds one Document per PDF page and
# split_documents is applied to that list, so chunking starts from page-level documents;
# within a page the text is split on CONFIG["separators"].
# NOTE(review): CONFIG["splitter_type"] names CharacterTextSplitter, but the
# recursive splitter is what is actually instantiated here.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CONFIG["chunk_size"],
    chunk_overlap=CONFIG["chunk_overlap"],
    length_function=CONFIG["length_function"],
    separators=CONFIG["separators"]
)


'''usage'''
# concat.pages_to_page(pages) #concatenates all the pages of the pdf into one
# Split every page document into chunks; `chunks` is read by the later summary and upsert cells.
chunks = text_splitter.split_documents(pages)
'''"chunks" is a list of objects of the class langchain.schema.document.Document'''
# Display the first chunk as a quick sanity check
chunks[0]

In [None]:
def print_document_load_summary(source_directory):
    """Print a summary of the current load-and-chunk run.

    Reads the notebook-level ``pages``, ``chunks`` and ``CONFIG`` objects and
    prints page/chunk counts, splitter settings, a token count and size
    estimates.

    Args:
        source_directory (str): Shown as the source folder in the summary.
    """
    import tiktoken
    from pympler import asizeof

    bytes_per_mb = 1024 * 1024
    chunk_count = len(chunks)

    # Token count of the stringified chunk list, using the encoding for CONFIG["model"]
    tokenizer = tiktoken.encoding_for_model(CONFIG["model"])
    token_count = len(tokenizer.encode(str(chunks)))

    # Qdrant's formula is memory_size in bytes = number_of_vectors * vector_dimension * 4 bytes * 1.5
    estimated_bytes = chunk_count * CONFIG["embedding_dims"] * 4 * 1.5
    estimated_mb = estimated_bytes / bytes_per_mb
    pages_mb = asizeof.asizeof(pages) / bytes_per_mb

    print(f"""
        Source folder: {source_directory}
        Pages processed: {len(pages)}
        Text splitter: {CONFIG["splitter_type"]}
        Chunk size: {CONFIG["chunk_size"]} characters
        Chunk overlap: {CONFIG["chunk_overlap"]} characters
        Chunks (vectors) created: {chunk_count} 
        Dictionary size: {pages_mb:.2f} MB
        Vectorstore tokens: {token_count}
        Estimated memory size (Qdrant): {estimated_mb:.2f} MB
    """)

    ''' TODO These variables are now in a function so not accessible.    
        Document(s)loaded: {len(file_list)}
        Load size: {total_size / (1024 * 1024):.2f} MB
        '''


print_document_load_summary(source_directory)

## 4. Add chunks to Qdrant


In [None]:
from qdrant_client import QdrantClient

# Connect to the same Qdrant Cloud instance that the duplicate check and the
# upsert below use. A bare QdrantClient() does not target that instance, so
# the later get_collections()/count() calls would report on a different server.
client = QdrantClient(
    url=st.secrets["QDRANT_URL"],
    api_key=st.secrets["QDRANT_API_KEY"],
)

In [None]:
from langchain_qdrant import QdrantVectorStore

# LangChain vector-store handle bound to `client` and the target collection.
# Which Qdrant instance it talks to is decided entirely by how `client` was created.
qdrant = QdrantVectorStore(client=client,
                collection_name=qdrant_collection_name,
                # embedding here is LC interface to the embedding model
                embedding=CONFIG["embedding"],
                )


qdrant

In [None]:
# .from_documents is a LangChain class method: it embeds the documents/chunks and stores them,
# together with their metadata, in the named collection, configuring the collection automatically.
# NOTE(review): it is invoked on the `qdrant` instance here, but as a class method it takes its
# connection details from the url/api_key arguments below, not from that instance.

qdrant.from_documents(
    chunks,
    embedding=CONFIG["embedding"],  # yes this is required here too
    # path=qdrant_path,  # Only required for local instance
    collection_name=qdrant_collection_name,  # yes this is required here too
    url=st.secrets["QDRANT_URL"],
    api_key=st.secrets["QDRANT_API_KEY"],  # Only required for Qdrant Cloud
    # keep this set to False so you don't overwrite your collection
    force_recreate=False,
)

In [None]:
print(client.get_collections())
print(
    f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")

## <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>


## OPTIONAL: Create NEW vector store and add documents into it


#### Combo Create + Add Docs

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores.qdrant import Qdrant


def create_localdb_and_add_chunks():
    """Create the local vector DB and load the chunks into it (first-time use only).

    ``Qdrant.from_documents`` is a LangChain class method that creates the
    vector store from the documents/chunks, their embeddings and metadata,
    and configures the collection automatically, all in one step. It opens
    its own client from ``path``, so no separate client or ``Qdrant``
    instance is needed here.

    The local database is closed before returning so it is released for
    other use.

    Returns:
        None.
    """
    vectorstore = Qdrant.from_documents(
        chunks,
        embedding=CONFIG["embedding"],
        path=qdrant_path,  # Only required for local instance
        collection_name=qdrant_collection_name,
        # url=os.environ.get("QDRANT_URL"),
        # Only required for Qdrant Cloud
        # api_key=os.environ.get("QDRANT_API_KEY"),
        force_recreate=False,  # don't use if db doesn't already exist
    )

    # Release the on-disk database explicitly instead of relying on garbage collection.
    vectorstore.client.close()


check_me = create_localdb_and_add_chunks()

#### Create new Qdrant DB / Collection.

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
# this may not work

from qdrant_client import QdrantClient
from qdrant_client.http import models


# Open the local, on-disk Qdrant database stored at `qdrant_path`.
# `path` is only used for a local instance; a remote service would be reached via url/api_key instead.
client = QdrantClient(
    path=qdrant_path
)

# Create an empty collection sized for the embedding model, using cosine distance.
# NOTE(review): `new_collection_name` is not defined anywhere in the visible part of this
# notebook - define it before running this cell.
client.create_collection(
    collection_name=new_collection_name,
    vectors_config=models.VectorParams(
        size=CONFIG["embedding_dims"], distance=models.Distance.COSINE)
)
# You may need to delete the lock file to access this afterwards

#### Add Documents with Timer


In [None]:
import time


def add_chunks_to_existingdb_with_delay(batch_size, delay):
    """Add the chunks to an existing local collection in batches, pausing between batches.

    Unlike ``.from_documents``, which is a class method of ``Qdrant``, this
    uses ``.add_documents``, an instance method, so the ``Qdrant`` instance
    has to be set up first. Separating the two steps makes it possible to
    insert a delay between batches. The local database is always released
    afterwards, even if a batch fails.

    USAGE: Aim for ~800K tokens and then have the timer delay until 60 sec is reached

    Args:
        batch_size (int): Number of chunks sent per ``add_documents`` call.
        delay (float): Seconds to sleep between consecutive batches.
    """

    from qdrant_client import QdrantClient
    from langchain.vectorstores import Qdrant

    # Local on-disk instance; `path` is only used for a local database
    client = QdrantClient(path=qdrant_path)

    try:
        # Creates a LangChain "vector store" object with entrypoint to your DB within it
        qdrant = Qdrant(client=client,
                        collection_name=new_collection_name,
                        # embedding here is LC interface to the embedding model
                        embeddings=CONFIG["embedding"],
                        )

        total = len(chunks)
        # generate indices starting from 0. increment by batch_size until len(chunks)
        for start in range(0, total, batch_size):
            batch = chunks[start:start + batch_size]  # Create a batch of chunks
            qdrant.add_documents(documents=batch)  # Add the batch of chunks
            # Pause only between batches; sleeping after the last one just wastes time.
            if start + batch_size < total:
                time.sleep(delay)
    finally:
        # Release the database from this process, even when a batch raised
        client.close()


add_chunks_to_existingdb_with_delay(1700, 45)

## OPTIONAL: Close Client