# Upsert PDFs to qdrant
#### This is an entire workflow from PDF to RAG.
#### It uses a now-deprecated version of the LangChain Qdrant vectorstore object, which does not allow metadata to be adjusted during the upsert process. As a result, it relies on adding metadata to the PDFs before chunking them. This is less than ideal because the PDF standard prefixes each field name with a '/', which renders the fields inaccessible as filter keys in Qdrant.

In [1]:
# Confirm you're using the correct interpreter
#
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-v1/bin/python


## 0. Installs and Imports


In [None]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315 
# %pip install --upgrade docarray
# %pip install python-dotenv
# %pip install --upgrade wandb
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  # this requires python>=3.9
#

In [6]:
import streamlit as st
import openai
from langchain.embeddings import OpenAIEmbeddings
# Required for langchain.embeddings.OpenAIEmbeddings, which calls the OpenAI API.
# The OpenAI key is a different credential from the Qdrant key: the Qdrant key
# is only passed to Qdrant further down in this notebook.
# NOTE(review): assumes the Streamlit secret is named "OPENAI_API_KEY" — confirm
# against your secrets file.

openai.api_key = st.secrets["OPENAI_API_KEY"]

## 1. Configs


In [None]:

# Shared settings for chunking, embedding and the printed load summary.
config = {
    # Label printed in the load summary. It has to name the splitter class that
    # is actually instantiated further down (RecursiveCharacterTextSplitter).
    "splitter_type": "RecursiveCharacterTextSplitter",
    # Sizes are measured with length_function (len), i.e. in characters.
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,
    "separators": ["}"],  # [" ", ",", "\n"]
    # LangChain interface to the embedding model.
    "embedding": OpenAIEmbeddings(),
    # Vector size used for the Qdrant memory estimate and for collection creation.
    "embedding_dims": 1536,
    # Also selects the tiktoken encoding used to count tokens in the summary.
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
    "temperature": 0.7,
    "chain_type": "stuff",
}

OPTIONAL: Langchain debugging


In [8]:
from langchain.globals import set_debug

# LangChain's global debug flag is set to False here (no debug output).
set_debug(False)

In [None]:
# Name of the Qdrant collection used by the upsert cells below.
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
# qdrant_path = "/tmp/local_qdrant"

# Directory (or single PDF file) handed to extract_metadata_from_pdfs().
# NOTE(review): the recorded cell outputs in this notebook show './enriched_pdfs'
# as the source — confirm which directory is intended.
source_directory = "./raw_pdfs"

## 2. Chunk


In [10]:
import os
import pypdf
from langchain.document_loaders import PyPDFLoader


def extract_metadata_from_pdfs(pdfs_source_dir):
    """Collect the pages of every PDF found at ``pdfs_source_dir``.

    ``pdfs_source_dir`` may be a directory, which is walked recursively, or the
    path of a single ``.pdf`` file. Each matching file is handed to
    ``process_pdf``, which appends that file's pages to the list returned here.

    Returns:
        The list of page documents gathered by ``process_pdf``. The list is
        empty when the path is neither a directory nor a PDF file; in that
        case an error message is printed.
    """
    collected_pages = []
    seen_files = []
    byte_total = 0

    def is_pdf(name):
        # Case-insensitive extension check shared by both branches.
        return name.lower().endswith('.pdf')

    if os.path.isdir(pdfs_source_dir):
        print("Loading PDFs from directory...")
        for root, _subdirs, names in os.walk(pdfs_source_dir):
            for pdf_name in filter(is_pdf, names):
                process_pdf(os.path.join(root, pdf_name),
                            seen_files, collected_pages, byte_total)
        return collected_pages

    if os.path.isfile(pdfs_source_dir) and is_pdf(pdfs_source_dir):
        print("Loading a single PDF file...")
        process_pdf(pdfs_source_dir, seen_files, collected_pages, byte_total)
        return collected_pages

    print(
        f"Error: The path '{pdfs_source_dir}' is not a valid directory or PDF file!")
    return collected_pages


def process_pdf(pdf_path, file_list, pages, total_size):
    """Load one PDF, copy its document metadata onto each page, and collect the pages.

    Args:
        pdf_path: Path of the PDF file to load.
        file_list: List that receives the file's base name once the file has
            produced at least one page (mutated in place).
        pages: List that receives one document per loaded page (mutated in
            place).
        total_size: Running byte total accumulated so far.

    Returns:
        ``total_size`` plus this file's size on disk when the file yielded
        pages, otherwise ``total_size`` unchanged. Integers are immutable, so
        the updated total can only reach the caller through this return value.
    """
    file_name = os.path.basename(pdf_path)
    # Initialised before the try block so the final check can never hit an
    # unbound name when loading fails early.
    file_processed = False

    try:
        documents = PyPDFLoader(pdf_path).load()

        # Document metadata is read once per distinct source file instead of
        # reopening and reparsing the PDF for every page.
        metadata_by_source = {}

        for doc in documents:
            source = doc.metadata["source"]
            if source not in metadata_by_source:
                with open(source, "rb") as pdf_file_obj:
                    # reader.metadata can be None when the PDF carries no
                    # document information; treat that as "no extra fields".
                    pdf_metadata = pypdf.PdfReader(pdf_file_obj).metadata or {}
                    # Index through the mapping (rather than copying it) so the
                    # values are looked up the same way as before.
                    metadata_by_source[source] = {
                        key: pdf_metadata[key] for key in pdf_metadata}

            doc.metadata.update(metadata_by_source[source])
            pages.append(doc)

            if not file_processed:
                file_list.append(file_name)
                total_size += os.path.getsize(pdf_path)
                file_processed = True

    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")

    if file_processed:
        print(f"Processed {file_name}")

    return total_size


# Usage: load every page from the configured source and keep a handle on the
# last one; report when nothing was loaded.
pages = extract_metadata_from_pdfs(source_directory)
if not pages:
    print("No pages were processed.")
else:
    last_page = pages[-1]

Loading PDFs from directory...
Processed AUXCA_SOP_005_B__20AUG24_ESIGN.pdf


#### Create chunks


In [11]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


# chunks at the page break
# The splitter is configured entirely from `config` (chunk size, overlap,
# length function and separators).
# NOTE(review): presumably "at the page break" holds because each page is its
# own document and the only separator is "}" — confirm.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config["chunk_size"],
    chunk_overlap=config["chunk_overlap"],
    length_function=config["length_function"],
    separators=config["separators"]
)


'''usage'''
# concat.pages_to_page(pages) #concatenates all the pages of the pdf into one
# Split the loaded pages into chunks; `chunks` is read by the summary and
# upsert cells further down.
chunks = text_splitter.split_documents(pages)
'''"chunks" is a list of objects of the class langchain.schema.document.Document'''
# Last expression of the cell: shows the first chunk for inspection.
chunks[0]

Document(page_content='United States Coast Guard Auxiliary  \n \n \n \n \n \n \n \nAuxiliary Culinary Assistan ce (AUX CA) \nProgram  \n \nStandard Operating Procedures', metadata={'source': './enriched_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf', 'page': 0, '/Producer': 'pypdf', '/title': 'AUXILIARY CULINARY ASSISTANCE (AUXCA) PROGRAM STANDARD OPERATING PROCEDURES', '/leadership_scope': '1_National', '/page_count': '30', '/creation_date': '2024-08-22T00:00:00Z', '/effective_date': '2024-11-02T17:17:07Z', '/upsert_date': '2024-11-02T17:17:07Z', '/expiration_date': '2034-11-03T05:17:07Z', '/lifecycle': 'none', '/aux_specific': 'True', '/public_release': 'True', '/publication_number': 'nan', '/source': 'cgaux.org', '/originator': 'CG-BSX-1', '/curator': 'Wilkins,CA', '/pdf_id': 'b69af3d6-96ee-5a1b-8a1e-9a6feca305b2', '/pdf_file_name': 'AUXCA_SOP_005_B__20AUG24_ESIGN'})

In [12]:
def print_document_load_summary(source_directory):
    """Print a summary of the pages and chunks that were loaded.

    Reads the module-level ``config``, ``pages`` and ``chunks``. The
    ``source_directory`` argument is only echoed in the report.
    """
    from pympler import asizeof
    import tiktoken

    tokenizer = tiktoken.encoding_for_model(config["model"])
    # Token count of the string form of the whole chunk list.
    num_vectorestore_tokens = len(tokenizer.encode(str(chunks)))
    num_chunks = len(chunks)
    # Qdrant's formula is memory_size in bytes = number_of_vectors * vector_dimension * 4 bytes * 1.5
    memory_size = num_chunks * config["embedding_dims"] * 4 * 1.5

    summary = f"""
        Source folder: {source_directory}
        Pages processed: {len(pages)}
        Text splitter: {config["splitter_type"]}
        Chunk size: {config["chunk_size"]} characters
        Chunk overlap: {config["chunk_overlap"]} characters
        Chunks (vectors) created: {num_chunks} 
        Dictionary size: {asizeof.asizeof(pages) / (1024 * 1024):.2f} MB
        Vectorstore tokens: {num_vectorestore_tokens}
        Estimated memory size (Qdrant): {memory_size / (1024 * 1024):.2f} MB
    """
    print(summary)

    # TODO These variables are now in a function so not accessible.
    #     Document(s)loaded: {len(file_list)}
    #     Load size: {total_size / (1024 * 1024):.2f} MB


print_document_load_summary(source_directory)


        Source folder: ./enriched_pdfs
        Pages processed: 30
        Text splitter: CharacterTextSplitter
        Chunk size: 2000 characters
        Chunk overlap: 200 characters
        Chunks (vectors) created: 30 
        Dictionary size: 0.48 MB
        Vectorstore tokens: 24627
        Estimated memory size (Qdrant): 0.26 MB
    


## 4. Add chunks to Qdrant


In [13]:
from qdrant_client import QdrantClient

# Connect to the same Qdrant instance that from_documents() writes to further
# down, so the collection listing and point count later in this notebook
# inspect the server that actually received the chunks. A bare QdrantClient()
# would target the client's default location instead.
client = QdrantClient(
    url=st.secrets["QDRANT_URL"],
    api_key=st.secrets["QDRANT_API_KEY"],  # Only required for Qdrant Cloud
)

In [None]:
# Creates a LangChain vector store object bound to `client` and the target collection.
# langchain.vectorstores.Qdrant was deprecated since 0.1.2. Works here but legacy
from langchain.vectorstores.qdrant import Qdrant

qdrant = Qdrant(client=client,
                collection_name=qdrant_collection_name,
                # embedding here is LC interface to the embedding model
                embeddings=config["embedding"],
                )


# Last expression of the cell: shows the object's repr in the notebook.
qdrant

<langchain.vectorstores.qdrant.Qdrant at 0x16661cd50>

In [None]:
# .from_documents creates the Qdrant vector store object with the documents/chunks,
# embeddings, and metadata, and automatically configures the collection. It is a
# LangChain class method used to initialize a new Qdrant instance with a collection
# of documents all at once.
# NOTE(review): because it is a class method, calling it on the `qdrant` instance
# still works from the arguments given here; the returned object is not assigned.

qdrant.from_documents(
    chunks,
    embedding=config["embedding"],  # yes this is required here too
    # path=qdrant_path,  # Only required for local instance
    collection_name=qdrant_collection_name,  # yes this is required here too
    url=st.secrets["QDRANT_URL"],
    api_key=st.secrets["QDRANT_API_KEY"],  # Only required for Qdrant Cloud
    # keep this set to False so you don't overwrite your collection
    force_recreate=False,
)

<langchain.vectorstores.qdrant.Qdrant at 0x14354cfd0>

In [None]:
# Sanity check: list the collections and count the points in the target collection.
# NOTE(review): this queries the module-level `client`; confirm it is connected to
# the same Qdrant instance that received the upsert.
print(client.get_collections())
print(
    f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")

## <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>


## OPTIONAL: Create NEW vector store and add documents into it


#### Combo Create + Add Docs

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores.qdrant import Qdrant


def create_localdb_and_add_chunks():
    """Use only to create the vector DB and load docs the first time.

    .from_documents creates the Qdrant vector store object with the
    documents/chunks, embeddings, and metadata and automatically configures the
    collection. It is a LangChain class method used to initialize a new Qdrant
    instance with a collection of documents all at once.

    It overcomes limitations in LangChain by releasing the vector DB afterwards.
    NOTE(review): presumably the release happens because the object returned by
    from_documents is not kept once this function returns — confirm.

    Reads the module-level ``chunks``, ``config``, ``qdrant_path`` and
    ``qdrant_collection_name``. There is no return statement, so the function
    returns None.
    """

    client = QdrantClient()

    # Creates a LangChain "vector store" object with entrypoint to your DB within it
    qdrant = Qdrant(client=client,
                    collection_name=qdrant_collection_name,
                    # embedding here is LC interface to the embedding model
                    embeddings=config["embedding"],
                    )
    # from_documents is a class method; it is given its own path and collection
    # name here rather than relying on the instance built above.
    qdrant.from_documents(
        chunks,
        embedding=config["embedding"],  # yes this is required here too
        path=qdrant_path,  # Only required for local instance
        collection_name=qdrant_collection_name,  # yes this is required here too
        # url=os.environ.get("QDRANT_URL"),
        # Only required for Qdrant Cloud
        # api_key=os.environ.get("QDRANT_API_KEY"),
        force_recreate=False,  # don't use if db doesn't already exist
    )
    # print(client.get_collections())
    # print(
    # f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")


check_me = create_localdb_and_add_chunks()

#### Create new Qdrant DB / Collection.

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
# this may not work

from qdrant_client import QdrantClient
from qdrant_client.http import models


# Entry point for communicating with Qdrant; `path` is only required for a
# local instance.
client = QdrantClient(
    path=qdrant_path
)

# Create the collection with vectors of size config["embedding_dims"] compared
# by cosine distance.
# NOTE(review): new_collection_name is not assigned anywhere in the visible part
# of this notebook — define it before running this cell.
client.create_collection(
    collection_name=new_collection_name,
    vectors_config=models.VectorParams(
        size=config["embedding_dims"], distance=models.Distance.COSINE)
)
# You may need to delete the lock file to access this afterwards

#### Add Documents with Timer


In [None]:
import time


def add_chunks_to_existingdb_with_delay(batch_size, delay):
    """Add ``chunks`` to an existing local Qdrant collection in throttled batches.

    Unlike ``.from_documents``, which is a class method of ``Qdrant`` that
    creates and loads the store in one step, this uses ``.add_documents``,
    an instance method, so the ``Qdrant`` instance has to be set up first.
    Separating the two steps makes it possible to pause between batches.
    The client is always closed at the end, even when a batch fails, so the
    local database is released from this process.

    Reads the module-level ``chunks``, ``config``, ``qdrant_path`` and
    ``new_collection_name``.
    NOTE(review): ``new_collection_name`` is not assigned anywhere in the
    visible part of this notebook — define it before calling.

    USAGE: Aim for ~800K tokens and then have the timer delay until 60 sec is
    reached. (The original note records about 7 min for a full load.)

    Args:
        batch_size: Number of chunks sent per ``add_documents`` call.
        delay: Seconds to wait between consecutive batches.
    """
    from qdrant_client import QdrantClient
    from langchain.vectorstores import Qdrant

    # `path` is only required for a local instance.
    client = QdrantClient(
        path=qdrant_path
    )

    try:
        # Creates a LangChain "vector store" object with entrypoint to your DB within it
        qdrant = Qdrant(client=client,
                        collection_name=new_collection_name,
                        # embedding here is LC interface to the embedding model
                        embeddings=config["embedding"],
                        )

        # Batch start indices: 0, batch_size, 2 * batch_size, ... up to len(chunks).
        for start in range(0, len(chunks), batch_size):
            if start:
                # Pause only between batches; waiting after the final batch
                # would add `delay` seconds for nothing.
                time.sleep(delay)
            qdrant.add_documents(documents=chunks[start:start + batch_size])
    finally:
        # Release the database from this process, also on failure, so the
        # local store is not left locked.
        client.close()


add_chunks_to_existingdb_with_delay(1700, 45)

## OPTIONAL: Close Client