# Upsert PDFs to qdrant
#### This is an entire workflow from PDF to RAG.
#### It uses a now deprecated version of the langchain qdrant vectorstore object which does not allow for adjustment to metadata during the upsert process. As a result, it relied on a process of adding metadata to the pdfs prior to chunking them. This was less than ideal as the pdf standard adds a '/' to each field which renders them inaccessible as filter keys in Qdrant

In [6]:
# Confirm you're using the correct interpreter
#
import sys
print(sys.executable)

/Users/drew_wilkins/Drews_Files/Drew/Python/ASK/.venv-v1/bin/python


## 0. Installs and Imports


In [7]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions
# %pip install openai==0.27.8
# %pip install langchain==0.0.315
# %pip install --upgrade docarray
# %pip install python-doten
# %pip install --upgrade wandb
# %pip install qdrant-client==1.6.3
# %pip install pympler==1.1.3
# %pip install pypdf==5.0.1
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf this requies python>=3.9
#

In [8]:
import streamlit as st
import openai
from langchain.embeddings import OpenAIEmbeddings
# required for langchain.embeddings.OpenAIEmbeddings. If this form of the key doesn't work, try OPENAI_API_KEY = st.secrets["QDRANT_API_KEY"]

openai.api_key = st.secrets["QDRANT_API_KEY"]

## 1. Configs


In [9]:

config = {
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,
    "chunk_overlap": 200,
    "length_function": len,
    "separators": ["}"],  # [" ", ",", "\n"]
    "embedding": OpenAIEmbeddings(),
    "embedding_dims": 1536,
    "search_type": "mmr",
    'fetch_k': 20,   # number of documents to pass to the search alg (eg., mmr)
    "k": 5,  # number of document from fetch to pass to the LLM for inference
    'lambda_mult': .7,    # 0= max diversity, 1 is max relevance. default is 0.5
    "score_threshold": 0.5,  # for similarity score
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
    "temperature": 0.7,
    "chain_type": "stuff",
}

OPTIONAL: Langchain debugging


In [10]:
from langchain.globals import set_debug

set_debug(False)

In [11]:
qdrant_collection_name = "ASK_vectorstore"
# Only required for local instance (actual location is MacHD: private tmp local_qdrant)
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
# qdrant_path = "/tmp/local_qdrant"

source_directory = "./raw_pdfs"

## 2. Chunk


In [50]:
import os
import pypdf
from langchain.document_loaders import PyPDFLoader


def extract_metadata_from_pdfs(pdfs_source_dir):
    file_list = []
    pages = []
    total_size = 0

    # Check if the path is a directory or a file
    if os.path.isdir(pdfs_source_dir):
        print("Loading PDFs from directory...")
        for foldername, subfolders, filenames in os.walk(pdfs_source_dir):
            for file in filenames:
                if file.lower().endswith('.pdf'):
                    process_pdf(os.path.join(foldername, file),
                                file_list, pages, total_size)
    elif os.path.isfile(pdfs_source_dir) and pdfs_source_dir.lower().endswith('.pdf'):
        print("Loading a single PDF file...")
        process_pdf(pdfs_source_dir, file_list, pages, total_size)
    else:
        print(
            f"Error: The path '{pdfs_source_dir}' is not a valid directory or PDF file!")

    return pages


def process_pdf(pdf_path, file_list, pages, total_size):
    try:
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()
        file_processed = False  # Flag to track if the file has been processed

        for doc in documents:
            with open(doc.metadata["source"], "rb") as pdf_file_obj:
                reader = pypdf.PdfReader(pdf_file_obj)
                pdf_metadata = reader.metadata
                doc.metadata.update(
                    {key: pdf_metadata[key] for key in pdf_metadata.keys()})

            pages.append(doc)
            if not file_processed:
                file_list.append(pdf_path.split('/')[-1])
                total_size += os.path.getsize(pdf_path)
                file_processed = True  # Set flag to True after processing the file

    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")

    if file_processed:
        print(f"Processed {pdf_path.split('/')[-1]}")


'''usage'''
pages = extract_metadata_from_pdfs(source_directory)
if pages:
    last_page = pages[-1]
else:
    print("No pages were processed.")

Loading PDFs from directory...
Processed USCG_Addendum_to_US_NSS_to_IAMSAR-CI_16130_2G-2022-10-01.pdf


#### Creat chunks

In [51]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


# chunks at the page break
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config["chunk_size"],
    chunk_overlap=config["chunk_overlap"],
    length_function=config["length_function"],
    separators=config["separators"]
)


'''usage'''
# concat.pages_to_page(pages) #concatenates all the pages of the pdf into one
chunks = text_splitter.split_documents(pages)
'''"chunks" is a list of objects of the class langchain.schema.document.Document'''
chunks[0]

Document(page_content='U.S. Department of \nHomeland Security  \nUnited States \nCoast Guard\nU. S. COAST GUARD \nADDENDUM  \nTO THE  \nUNITED STATES  \nNATIONAL SEARCH AND RESCUE SUPPLEMENT (NSS)  \nTo The  \nInternational Aeronautical and Maritime Search and Rescue Manual \n(IAMSAR)  \nCOMDTINST 16130.2 G \nOctober  2022', metadata={'source': './raw_pdfs/USCG_Addendum_to_US_NSS_to_IAMSAR-CI_16130_2G-2022-10-01.pdf', 'page': 0, '/Author': 'USCG', '/CreationDate': "D:20221028162404-04'00'", '/Creator': 'Adobe Acrobat Pro DC (32-bit) 22.2.20191', '/Keywords': 'CI_16130_2G_THE U.S. COAST GUARD ADDENDUM TO THE UNITED STATES NATIONAL SEARCH AND RESCUE SUPPLEMENT (NSS) TO THE INTERNATIONAL AERONAUTICAL AND MARITIME SEARCH AND RESCUE MANUAL (IAMSAR)', '/ModDate': "D:20221102090321-04'00'", '/Producer': 'Adobe Acrobat Pro DC (32-bit) 22.2.20191', '/Subject': 'CG-5R, (202) 372-2010', '/Title': 'THE U.S. COAST GUARD ADDENDUM TO THE UNITED STATES NATIONAL SEARCH AND RESCUE SUPPLEMENT (NSS) TO TH

In [52]:
def print_document_load_summary(source_directory):
    from pympler import asizeof
    import tiktoken

    encoding = tiktoken.encoding_for_model(config["model"])
    vectorstore_tokens = encoding.encode(str(chunks))
    num_vectorestore_tokens = len(vectorstore_tokens)
    num_chunks = len(chunks)
    # Qudrant's formula is memory_size in bytes = number_of_vectors * vector_dimension * 4 bytes * 1.5
    memory_size = num_chunks * config["embedding_dims"] * 4 * 1.5

    print(f"""
        Source folder: {source_directory}
        Pages processed: {len(pages)}
        Text splitter: {config["splitter_type"]}
        Chunk size: {config["chunk_size"]} characters
        Chunk overlap: {config["chunk_overlap"]} characters
        Chunks (vectors) created: {num_chunks} 
        Dictionary size: {asizeof.asizeof(pages) / (1024 * 1024):.2f} MB
        Vectorstore tokens: {num_vectorestore_tokens}
        Estimated memory size (Qdrant): {memory_size / (1024 * 1024):.2f} MB
    """)

    ''' TODO These variables are now in a function so not accessible.    
        Document(s)loaded: {len(file_list)}
        Load size: {total_size / (1024 * 1024):.2f} MB
        '''


print_document_load_summary(source_directory)


        Source folder: ./raw_pdfs
        Pages processed: 697
        Text splitter: CharacterTextSplitter
        Chunk size: 2000 characters
        Chunk overlap: 200 characters
        Chunks (vectors) created: 698 
        Dictionary size: 7.44 MB
        Vectorstore tokens: 646289
        Estimated memory size (Qdrant): 6.13 MB
    


In [47]:
print(chunks[0])

page_content='COAST GUARD AUXILIARY \nOPERATIONAL EXCELLENCE PROGRAM  \n \n \n \n \n       \n \n     Handbook 16794.4 \nMarch  2024' metadata={'source': './raw_pdfs/USCG_Operational_Excellence_Program_Handbook_16794_4.pdf', 'page': 0, '/Author': 'Wyman, Kevin S BMCS', '/Comments': '', '/Company': 'Department of Defense', '/ContentTypeId': '0x010100CB76551212F59C40ACA5AFB2CA9DF8AB', '/CreationDate': "D:20240302095401-05'00'", '/Creator': 'Acrobat PDFMaker 23 for Word', '/Keywords': '', '/MediaServiceImageTags': '', '/ModDate': "D:20240302155800-05'00'", '/Order': '565000.000000000', '/Producer': 'Adobe PDF Library 23.8.53', '/SourceModified': '', '/Subject': '', '/TemplateUrl': '', '/Title': '', '/URL': ', ', '/_dlc_DocId': '65FQPP6MHWJT-513687613-6591', '/_dlc_DocIdItemGuid': '2f10abfc-926b-49a2-9f1d-8c4d7c4987bc', '/_dlc_DocIdPersistId': '0', '/_dlc_DocIdUrl': 'https://cg.portal.uscg.mil/units/cg731/1.HQ/_layouts/DocIdRedir.aspx?ID=65FQPP6MHWJT-513687613-6591, 65FQPP6MHWJT-513687613-6

## 3. OPTIONAL: Enrich the chunks with metadata
### <span style="color:red">WARNING: Ship this for now. TODO: 1. preserve the page key from the original chunk. 2. Calculate doc_id, check for duplicates in qdrant, and insert into metadata. This calls into question the structure of this notebook which currently splits and chunks the entire pdf folder at once, making this complicated. Maybe reqrite to split and chunk on a per-pdf basis  </span>

#### This produces a list of chunks, where each list item is an instane of the `Document` class from `langchain.schema.document`. Each chunk contains the text of the pdf, the metadata of the pdf, and the pdf's id.

##### The metadata for the first chunk is `chunks[0].metadata`
```python
{'source': './raw_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf',
 'page': 0,
 '/CreationDate': "D:20240822072431-04'00'",
 '/ModDate': "D:20240822072431-04'00'"}
 ```

There will be many chunks for many documents. We'll use the 'source' field in the existing chunk metadata to match with a dictionary of metadata for each pdf. Then we'll replace the metadata in each chunk with the new metadata.


In [None]:
dict_of_metadatas_to_insert = {'./raw_pdfs/CI_5400_7G.pdf': {'title': 'ORGANIZATION MANAGEMENT, COMDTINST 5400.7G', 'leadership_scope': '1_National', 'page_count': 37, 'creation_date': '2023-12-01T00:00:00Z', 'effective_date': '2024-11-02T22:04:43Z', 'upsert_date': '2024-11-02T22:04:43Z', 'expiration_date': '2034-11-03T10:04:43Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': True, 'publication_number': 'none', 'source': 'cgaux.org', 'originator': 'CG-BSX-1', 'curator': 'Hamilton,A', 'pdf_id': '7ea37b80-a7ab-58b1-8cb5-afc1ccee61a5', 'pdf_file_name': 'CI_5400_7G', 'pdf_path': './raw_pdfs/CI_5400_7G.pdf'},
                               './raw_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf': {'title': 'AUXCA SOP 005 B  20AUG24 ESIGN', 'leadership_scope': '1_National', 'page_count': 30, 'creation_date': '2024-08-22T00:00:00Z', 'effective_date': '2024-11-02T22:04:44Z', 'upsert_date': '2024-11-02T22:04:44Z', 'expiration_date': '2034-11-03T10:04:44Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': True, 'publication_number': 'none', 'source': 'cgaux.org', 'organization': 'CG-BSX', 'curator': 'Wilkins,CA', 'pdf_id': 'b69af3d6-96ee-5a1b-8a1e-9a6feca305b2', 'pdf_file_name': 'AUXCA_SOP_005_B__20AUG24_ESIGN', 'pdf_path': './raw_pdfs/AUXCA_SOP_005_B__20AUG24_ESIGN.pdf'}}

In [None]:
# iterates through each text chunk, checking if the chunk’s source file path matches a key in dict_of_metadatas_to_insert. If a match is found, it replaces the chunk’s metadata with the comprehensive metadata for that document. If no match exists, it issues a warning indicating missing metadata for that source.

for chunk in chunks:
    source_path = chunk.metadata.get("source")
    if source_path in dict_of_metadatas_to_insert:
        # Update the chunk's metadata with the matched metadata dictionary
        chunk.metadata = dict_of_metadatas_to_insert[source_path]
    else:
        print(f"Warning: No metadata found for {source_path}")

In [None]:
# check a chunk

chunks[0].metadata
print(chunks[0])

page_content='ORGANIZATION \nMANAGEMENT \nCOMDTINST 5400.7G \nJanuary 2024' metadata={'title': 'ORGANIZATION MANAGEMENT, COMDTINST 5400.7G', 'leadership_scope': '1_National', 'page_count': 37, 'creation_date': '2023-12-01T00:00:00Z', 'effective_date': '2024-11-02T22:04:43Z', 'upsert_date': '2024-11-02T22:04:43Z', 'expiration_date': '2034-11-03T10:04:43Z', 'lifecycle': 'none', 'aux_specific': True, 'public_release': True, 'publication_number': 'none', 'source': 'cgaux.org', 'originator': 'CG-BSX-1', 'curator': 'Hamilton,A', 'pdf_id': '7ea37b80-a7ab-58b1-8cb5-afc1ccee61a5', 'pdf_file_name': 'CI_5400_7G', 'pdf_path': './raw_pdfs/CI_5400_7G.pdf'}


## 4. Add chunks to Qdrant


In [41]:
from qdrant_client import QdrantClient

client = QdrantClient()

In [53]:
# creates a langchain vectorstore object
# langchain.vectorstores.Qdrant was deprecated since 0.1.2. Works here but legacy
from langchain.vectorstores.qdrant import Qdrant

qdrant = Qdrant(client=client,
                collection_name=qdrant_collection_name,
                # embedding here is LC interface to the embedding model
                embeddings=config["embedding"],
                )

qdrant

<langchain.vectorstores.qdrant.Qdrant at 0x3cb0c4d50>

In [54]:
# .from_documents  creates the Qdrant vectorstore object with the documents/chunks, embeddings, and metadata and automatically configures the collection.  It is a Langchain class method used to initialize a new Qdrant instance with a collection of documents all at once.

qdrant.from_documents(
    chunks,
    embedding=config["embedding"],  # yes this is required here too
    # path=qdrant_path,  # Only required for local instance
    collection_name=qdrant_collection_name,  # yes this is required here too
    url=st.secrets["QDRANT_URL"],
    api_key=st.secrets["QDRANT_API_KEY"],  # Only required for Qdrant Cloud
    # keep this to falsse so you don[t overwrite your collection
    force_recreate=False,
)

<langchain.vectorstores.qdrant.Qdrant at 0x16556ac90>

In [None]:
print(client.get_collections())
print(
    f"""number of points in collection {client.count(collection_name=new_collection_name,)}""")

### <span style="color:green"><b>CONGRATULATIONS: You're done</b></span>


## OPTIONAL: Create NEW vector store and add documents into it


##THIS DOES NOT CURENTLY INJECT NEW METADATA

#### Combo Create + Add Docs

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores.qdrant import Qdrant


def create_localdb_and_add_chunks():
    """Use only to create the vectore db and load docs the first time. 

    .from_documents  creates the Qdrant vectorstore object with the documents/chunks, embeddings, and metadata and automatically configures the collection.  It is a Langchain class method used to initialize a new Qdrant instance with a collection of documents all at once.

    It overcomes limitations in Langchain by releaseing the vecDB afterwards"""

    client = QdrantClient()

    # Creates a LangChain "vector store" object with entrypoint to your DB within it
    qdrant = Qdrant(client=client,
                    collection_name=qdrant_collection_name,
                    # embedding here is LC interface to the embedding model
                    embeddings=config["embedding"],
                    )
    qdrant.from_documents(
        chunks,
        embedding=config["embedding"],  # yes this is required here too
        path=qdrant_path,  # Only required for local instance
        collection_name=qdrant_collection_name,  # yes this is required here too
        # url=os.environ.get("QDRANT_URL"),
        # Only required for Qdrant Cloud
        # api_key=os.environ.get("QDRANT_API_KEY"),
        force_recreate=False,  # don't use if db doesn't already exist
    )
    # print(client.get_collections())
    # print(
    # f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")


check_me = create_localdb_and_add_chunks()

#### Create new Qdrant DB / Collection.

#### <span style="color:red">WARNING: This will overwrite existing Qdrant collection</span>


In [None]:
# this may not work

from qdrant_client import QdrantClient
from qdrant_client.http import models


client = QdrantClient(
    path=qdrant_path
)  # Only required for local instance) #Initializes an entry point to communicate with Qdrant service via REST or gPRC API

client.create_collection(
    collection_name=new_collection_name,
    vectors_config=models.VectorParams(
        size=1536, distance=models.Distance.COSINE)
)
# You may need to delete the lock file to access this afterwards

#### Add Documents with Timer


In [None]:
import time


def add_chunks_to_existingdb_with_delay(batch_size, delay):
    """
    Use only to create the vectore db and load docs the first time. (7min)
    This version loads the chunks into the vector store with a delay.

    Unlike .from_documents, which is a class method of Qdrant, 
    this uses  .add_documents which is an instance method of DocArrayInMemorySearch.
    This means you have to set up th Qdrant instance first before using it. 
    By separating the two we can insert a timer delay. It releases the vecDB afterwards.

    USAGE: Aim for ~800K tokens and then have the timer delay until 60 sec is reached
    """

    from qdrant_client import QdrantClient
    from langchain.vectorstores import Qdrant

    client = QdrantClient(
        path=qdrant_path
    )  # Only required for local instance) #Initializes an entry point to communicate with Qdrant service via REST or gPRC API

    # Creates a LangChain "vector store" object with entrypoint to your DB within it
    qdrant = Qdrant(client=client,
                    collection_name=new_collection_name,
                    # embedding here is LC interface to the embedding model
                    embeddings=config["embedding"],
                    )

    # generate indices starting from 0. increment by batch_size until len(chunks)
    for i in range(0, len(chunks), batch_size):
        batch = chunks[i:i+batch_size]  # Create a batch of chunks
        qdrant.add_documents(documents=batch)  # Add the batch of chunks
        # pause time probably don't need to be changed since tokens usually hit limit by 18 sec.
        time.sleep(delay)

    del qdrant
    client.close()    # Release the database from this process
    del client


add_chunks_to_existingdb_with_delay(1700, 45)

In [None]:
print(client.get_collections())

print(
    f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")

## 5. Connect to Vector Store


#### Init Qdrant Cloud service entrypoint


In [None]:
del qdrant
client.close()    # Release the database from this process
del client

In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores import Qdrant


if 'client' not in globals():
    client = QdrantClient(url=st.secrets["QDRANT_URL"],  # for local instance substitute (path=qdrant_path)
                          api_key=st.secrets["QDRANT_API_KEY"])  # not needed for local instance
else:
    print(f"Client already exists at {client}")
client.get_collections()

### Confirm client is initialized and location


In [None]:
from qdrant_client.local.qdrant_local import QdrantLocal
from qdrant_client.qdrant_remote import QdrantRemote


try:
    # Check if the client is running locally or via a URL
    if isinstance(client._client, QdrantLocal):
        print("The client is running locally.")
    elif isinstance(client._client, QdrantRemote):
        print("The client is running via a URL.")
    else:
        # This else block handles cases where client._client is neither QdrantLocal nor QdrantRemote
        print("Unable to determine the running mode of the Qdrant client.")
except Exception as e:
    # This block catches any other exceptions that might occur
    print("Unable to determine the running mode of the Qdrant client. Error: ", str(e))

In [None]:
# Creates a LangChain "vector store" object with entrypoint to your DB within it

qdrant = Qdrant(
    client=client,
    collection_name=qdrant_collection_name,
    # embedding here is a LC interface to the embedding model,
    embeddings=config["embedding"],
)

## 6. Initialize a Document Retriever


#### Define a Retriever


In [None]:
# Initializes a VectorStoreRetriever called retriever from the LC qdrant vector store object

# Option 1 using MMR search
retriever = qdrant.as_retriever(
    search_type="mmr",
    search_kwargs={'k': config["k"], "fetch_k": config["fetch_k"],
                   "lambda_mult": config["lambda_mult"]},
)

### Test the retriever is functioning


In [None]:
from IPython.display import Markdown
import re

retrieved_docs = retriever.get_relevant_documents(
    "AUX-PL-001(A) RISK MANAGEMENT TRAINING REQUIREMENTS FOR THE COAST GUARD AUXILIARY")


# Regular expression pattern to match metadata inside parentheses
metadata_pattern = re.compile(r"metadata=\{(.*?)\}")

# Function to extract metadata


def extract_metadata(doc_list):
    metadata_list = []
    for doc in doc_list:
        # Convert doc to string if it's not already a string
        if not isinstance(doc, str):
            doc = str(doc)

        matches = metadata_pattern.findall(doc)
        for match in matches:
            # Convert the matched string to a dictionary
            metadata_dict = eval('{' + match + '}')
            metadata_list.append(metadata_dict)
    return metadata_list


# Extracting metadata
metadata_list = extract_metadata(retrieved_docs)

# Print each metadata dictionary as a Markdown list item


def display_selected_metadata_as_markdown(metadata_list):
    # Start with an empty string
    markdown_string = ""

    # Iterate over each metadata dictionary
    for metadata in metadata_list:
        # Extract the /Title and page values
        title = metadata.get('/Title', 'No Title')
        source = metadata.get('source', 'No Source')
        page = metadata.get('page', 'No Page')

        # Add them as a list item in the markdown string
        markdown_string += "Title: {}, Source: {}, Page: {}  \n".format(
            title, source, page)

    # Display the markdown string
    display(Markdown(markdown_string))


# Assuming metadata_list is your list of metadata dictionaries
display_selected_metadata_as_markdown(metadata_list)