## 0. Installs and Imports


In [None]:
# %pip install --upgrade pip
# %pip list # See what's installed and versions


# %pip install --upgrade langchain
# %pip install --upgrade docarray
# %pip install python-dotenv
# %pip install --upgrade wandb
# %pip install qdrant-client # applies to all qdrant implementations
# %pip install pypdf
# %pip install git+https://github.com/pikepdf/pikepdf.git#egg=pikepdf  # this requires python>=3.9

In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load variables from a .env file into the process environment; later cells
# read QDRANT_URL and QDRANT_API_KEY through os.environ.
load_dotenv(find_dotenv())

True

## 1. Set the model parameters


In [None]:
from langchain.embeddings import OpenAIEmbeddings

# Shared settings read by the chunking, summary, vector-store and retriever
# cells further down the notebook.
config = {
    # NOTE(review): the chunking cell instantiates RecursiveCharacterTextSplitter,
    # not the splitter named here; this value is only printed in the load summary.
    "splitter_type": "CharacterTextSplitter",
    "chunk_size": 2000,  # reported as characters in the load summary
    "chunk_overlap": 200,  # reported as characters in the load summary
    "length_function": len,
    "separators": ["}"],  # alternative kept by the author: [" ", ",", "\n"]
    # Embedding model interface; instantiated once here and reused everywhere.
    "embedding": OpenAIEmbeddings(),
    # Vector size used for the Qdrant memory estimate in the load summary.
    "embedding_dims": 1536,
    "search_type": "mmr",
    'fetch_k': 20,   # number of documents to pass to the search alg (eg., mmr)
    "k": 5,  # number of documents from the fetch to pass to the LLM for inference
    'lambda_mult': .7,    # 0= max diversity, 1 is max relevance. default is 0.5
    "score_threshold": 0.5,  # for similarity score
    # Also selects the tokenizer used by the load summary.
    "model": "gpt-3.5-turbo-16k",  # gpt-4, gpt-3.5-turbo-16k
    "temperature": 0.7,
    "chain_type": "stuff",
}

OPTIONAL: Langchain debugging

In [None]:
from langchain.globals import set_debug

# LangChain debug mode is left switched off; change the argument to True when
# troubleshooting.
set_debug(False)

In [None]:
# Collection name shared by every Qdrant cell in this notebook.
qdrant_collection_name = "ASK_vectorstore"
# On-disk location of the local Qdrant store. Only required for a local
# instance (author's note: actual location is MacHD: private tmp local_qdrant).
qdrant_path = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/qdrant"
# qdrant_path = "/tmp/local_qdrant"

## 3. Chunk 'n' Load


In [None]:
import os
import pypdf
from langchain.document_loaders import PyPDFLoader


def extract_metadata_from_pdfs(path_to_ingest_files):
    """Load the PDF(s) at a path and return their pages with PDF metadata attached.

    Args:
        path_to_ingest_files: Path to a single ``.pdf`` file, or to a directory
            that is walked recursively for files whose name ends in ``.pdf``
            (case-insensitive).

    Returns:
        A list of the loaded page documents, in the order the files were
        visited. The list is empty when the path is neither a directory nor a
        PDF file, or when nothing could be loaded.
    """
    file_list = []
    pages = []
    total_size = 0

    if os.path.isdir(path_to_ingest_files):
        print("Loading PDFs from directory...")
        for foldername, _subfolders, filenames in os.walk(path_to_ingest_files):
            for file in filenames:
                if file.lower().endswith('.pdf'):
                    # An int argument cannot be updated in place by the
                    # callee, so accumulate the size it reports instead.
                    total_size += process_pdf(
                        os.path.join(foldername, file),
                        file_list, pages, total_size)
    elif os.path.isfile(path_to_ingest_files) and path_to_ingest_files.lower().endswith('.pdf'):
        print("Loading a single PDF file...")
        total_size += process_pdf(
            path_to_ingest_files, file_list, pages, total_size)
    else:
        print(
            f"Error: The path '{path_to_ingest_files}' is not a valid directory or PDF file!")

    if file_list:
        # file_list and total_size never leave this function, so report them here.
        print(
            f"Loaded {len(file_list)} PDF file(s), {total_size / (1024 * 1024):.2f} MB")

    return pages


def process_pdf(pdf_path, file_list, pages, total_size):
    """Load one PDF, copy its document metadata onto every page, and collect the pages.

    Nothing is appended to ``pages`` or ``file_list`` unless the whole file
    was loaded successfully.

    Args:
        pdf_path: Path of the PDF file to load.
        file_list: List that receives the file's base name once it is loaded.
        pages: List that receives the loaded page documents.
        total_size: Kept for backward compatibility only. Integers are
            immutable, so the caller's total cannot be updated through this
            argument; use the return value instead.

    Returns:
        The file size in bytes when at least one page was loaded, otherwise 0.
    """
    file_name = os.path.basename(pdf_path)
    try:
        documents = PyPDFLoader(pdf_path).load()
        if not documents:
            return 0

        # The document-level metadata is the same for every page, so read it
        # once per file instead of reopening the PDF for each page.
        with open(documents[0].metadata["source"], "rb") as pdf_file_obj:
            pdf_metadata = pypdf.PdfReader(pdf_file_obj).metadata
            # Copy the values while the file is still open. The reader may
            # report no metadata at all (None), which must not crash the load.
            extra_metadata = (
                {key: pdf_metadata[key] for key in pdf_metadata}
                if pdf_metadata else {})
        file_size = os.path.getsize(pdf_path)
    except FileNotFoundError:
        print(f"Error: Could not find {pdf_path}")
        return 0

    for doc in documents:
        doc.metadata.update(extra_metadata)
        pages.append(doc)
    file_list.append(file_name)
    print(f"Processed {file_name}")
    return file_size


'''usage'''
# Machine-specific absolute path of the folder (or single PDF) to ingest.
path_to_ingest_files = "/Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/original_library_documents/CG_Auxiliary-specific"
pages = extract_metadata_from_pdfs(path_to_ingest_files)
if pages:
    # Keep a handle on the final loaded page (not used by the visible cells).
    last_page = pages[-1]
else:
    print("No pages were processed.")

#### Create chunks

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Build the splitter from the shared config (chunk size, overlap, length
# function and separators).
# NOTE(review): config["splitter_type"] names "CharacterTextSplitter", but the
# recursive splitter is what is instantiated here — confirm which is intended.
# Author's note: chunks at the page break (presumably because each page is a
# separate document — TODO confirm).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config["chunk_size"],
    chunk_overlap=config["chunk_overlap"],
    length_function=config["length_function"],
    separators=config["separators"]
)


'''usage'''
# concat.pages_to_page(pages) #concatenates all the pages of the pdf into one
# Split the loaded pages into the chunks that later cells embed and upload.
chunks = text_splitter.split_documents(pages)
'''"chunks" is a list of objects of the class langchain.schema.document.Document'''
# Notebook display of the first chunk as a quick sanity check.
chunks[0]

In [None]:
def print_document_load_summary():
    """Print a report on the loaded pages and the chunks produced from them.

    Reads the notebook globals ``config``, ``chunks``, ``pages`` and
    ``path_to_ingest_files``. Takes no arguments and returns nothing.
    """
    from pympler import asizeof
    import tiktoken

    bytes_per_mb = 1024 * 1024

    # Token count of the stringified chunk list, using the tokenizer that
    # matches the configured model.
    tokenizer = tiktoken.encoding_for_model(config["model"])
    token_total = len(tokenizer.encode(str(chunks)))

    chunk_total = len(chunks)
    # Qdrant's formula: memory_size in bytes =
    #   number_of_vectors * vector_dimension * 4 bytes * 1.5
    estimated_bytes = chunk_total * config["embedding_dims"] * 4 * 1.5

    print(f"""
        Target folder: {path_to_ingest_files}
        Pages processed: {len(pages)}
        Text splitter: {config["splitter_type"]}
        Chunk size: {config["chunk_size"]} characters
        Chunk overlap: {config["chunk_overlap"]} characters
        Chunks (vectors) created: {chunk_total} 
        Dictionary size: {asizeof.asizeof(pages) / bytes_per_mb:.2f} MB
        Vectorstore tokens: {token_total}
        Estimated memory size (Qdrant): {estimated_bytes / bytes_per_mb:.2f} MB
    """)

    # TODO: file_list and total_size are local to the loader functions, so
    # these two lines cannot be reported from here yet:
    #     Document(s)loaded: {len(file_list)}
    #     Load size: {total_size / (1024 * 1024):.2f} MB


print_document_load_summary()

## 4. OPTIONAL: Create NEW vector store and add documents into it


#### Combo Create + Add Docs

In [None]:
from qdrant_client import QdrantClient

# NOTE(review): no url or path is passed, so this client uses the library
# defaults (presumably a local server) rather than QDRANT_URL — confirm that
# is intended before relying on it in the cells below.
client = QdrantClient()

In [None]:
from langchain.vectorstores.qdrant import Qdrant

# LangChain vector-store object wrapping the client and collection name
# defined in earlier cells.
qdrant = Qdrant(client=client,
                collection_name=qdrant_collection_name,
                # embedding here is LC interface to the embedding model
                embeddings=config["embedding"],
                )


# Notebook display of the vector-store object.
qdrant

In [None]:
# Load `chunks` into the collection at QDRANT_URL.
# NOTE(review): this call is given its own embedding, collection name and
# connection details and its return value is discarded, so it presumably
# builds a separate store rather than using the `qdrant` object it is called
# on — confirm.
qdrant.from_documents(
    chunks,
    embedding=config["embedding"],  # yes this is required here too
    # path=qdrant_path,  # Only required for local instance
    collection_name=qdrant_collection_name,  # yes this is required here too
    url=os.environ.get("QDRANT_URL"),
    api_key=os.environ.get("QDRANT_API_KEY"),  # Only required for Qdrant Cloud
    force_recreate=False,  # don't use if db doesn't already exist
)

In [None]:
# Sanity check: list the collections and count the points in ours.
# NOTE(review): `client` is the notebook-global client, which was not created
# with the url/api_key used for the upload above — confirm both refer to the
# same server.
print(client.get_collections())
print(
    f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")

In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores.qdrant import Qdrant


def create_localdb_and_add_docs():
    """Create the local on-disk vector store and load ``chunks`` into it.

    Use only to create the vector DB and load documents the first time.
    Reads the notebook globals ``chunks``, ``config``,
    ``qdrant_collection_name`` and ``qdrant_path``. The store's client is
    closed before returning so that this process releases the local database.

    Returns:
        None.
    """
    # from_documents is handed the path, collection name and embedding itself,
    # so no separately constructed client or wrapper object is needed.
    vector_store = Qdrant.from_documents(
        chunks,
        embedding=config["embedding"],
        path=qdrant_path,  # Only required for local instance
        collection_name=qdrant_collection_name,
        # For Qdrant Cloud use url=os.environ.get("QDRANT_URL") and
        # api_key=os.environ.get("QDRANT_API_KEY") instead of path.
        force_recreate=False,  # don't use if db doesn't already exist
    )
    # Release the database from this process, as the docstring promises.
    vector_store.client.close()


# The function returns nothing, so check_me is always None.
check_me = create_localdb_and_add_docs()

#### Create new Qdrant DB / Collection. 
#### <span style="color:red">WARNING: This will overwrite existing one</span>

In [None]:
# this may not work

from qdrant_client import QdrantClient
from qdrant_client.http import models


# Entry point for talking to the local, on-disk Qdrant instance
# (path is only required for a local instance).
client = QdrantClient(
    path=qdrant_path
)

client.create_collection(
    collection_name=qdrant_collection_name,
    vectors_config=models.VectorParams(
        # Take the vector size from the shared config so it cannot drift from
        # the value used elsewhere in the notebook.
        size=config["embedding_dims"], distance=models.Distance.COSINE)
)
# You may need to delete the lock file to access this afterwards

#### Add Documents with Timer

In [None]:
import time


def add_docs_to_existingdb_with_delay(batch_size, delay):
    """Add ``chunks`` to the existing local collection in paced batches.

    Batches are added one at a time with a pause between consecutive batches.
    Author's guidance: aim for ~800K tokens per batch and let the delay run
    until 60 seconds have passed; the pause probably does not need changing
    since tokens usually hit the limit by 18 seconds.

    Reads the notebook globals ``chunks``, ``config``,
    ``qdrant_collection_name`` and ``qdrant_path``. The client is always
    closed on exit, including when a batch fails, so the local database is
    released from this process.

    Args:
        batch_size: Number of chunks added per batch.
        delay: Seconds to wait between consecutive batches.
    """
    from qdrant_client import QdrantClient
    from langchain.vectorstores import Qdrant

    # Entry point to the local on-disk instance (path is only required for a
    # local instance).
    client = QdrantClient(path=qdrant_path)
    try:
        # LangChain vector-store object with an entry point to the DB in it.
        qdrant = Qdrant(client=client,
                        collection_name=qdrant_collection_name,
                        # embedding here is LC interface to the embedding model
                        embeddings=config["embedding"],
                        )

        # Batch start indices: 0, batch_size, 2 * batch_size, ... < len(chunks)
        for start in range(0, len(chunks), batch_size):
            if start:
                # Pause only between batches; waiting after the final batch
                # would just delay the return.
                time.sleep(delay)
            qdrant.add_documents(documents=chunks[start:start + batch_size])
    finally:
        # Release the database even if adding a batch raised.
        client.close()


add_docs_to_existingdb_with_delay(1700, 45)

In [None]:
# Sanity check: list the collections and count the points in ours.
# NOTE(review): this uses the notebook-global `client`, not the client the
# batch-loading function opens and closes internally.
print(client.get_collections())

print(
    f"""number of points in collection {client.count(collection_name=qdrant_collection_name,)}""")

## 4. Connect to Vector Store


#### Option A: Init Qdrant Cloud service entrypoint


In [4]:
from qdrant_client import QdrantClient
from langchain.vectorstores import Qdrant


# Reuse a `client` left over from an earlier cell; otherwise connect with the
# URL and API key taken from the environment.
# NOTE(review): an existing client is reused whatever it points at (local or
# remote) — confirm it is the cloud client before continuing.
if 'client' not in globals():
    client = QdrantClient(url=os.environ.get("QDRANT_URL"),
                          api_key=os.environ.get("QDRANT_API_KEY"))
else:
    print(f"Client already exists at {client}")
# Notebook display of the collections visible to the client.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='ASK_vectorstore')])

#### Option B: init Qdrant Local service entrypoint


In [None]:
from qdrant_client import QdrantClient
from langchain.vectorstores.qdrant import Qdrant
import psutil  # NOTE(review): not used in this cell — confirm it is still needed

# Reuse a `client` left over from an earlier cell; otherwise open the local
# on-disk store at qdrant_path.
if 'client' not in globals():
    # path is only required for a local instance
    client = QdrantClient(path=qdrant_path)
else:
    print(f"Client already exists at {client}")
# Notebook display of the collections visible to the client.
client.get_collections()

### Confirm client is initialized

In [6]:
from qdrant_client.local.qdrant_local import QdrantLocal
from qdrant_client.qdrant_remote import QdrantRemote


try:
    # Look at the backend object wrapped by the client to tell a local
    # instance from one reached through a URL.
    _backend = client._client
    if isinstance(_backend, QdrantLocal):
        _mode_message = "The client is running locally."
    elif isinstance(_backend, QdrantRemote):
        _mode_message = "The client is running via a URL."
    else:
        # The backend is neither QdrantLocal nor QdrantRemote.
        _mode_message = "Unable to determine the running mode of the Qdrant client."
    print(_mode_message)
except Exception as e:
    # Anything else that goes wrong while inspecting the client ends up here.
    print("Unable to determine the running mode of the Qdrant client. Error: ", str(e))

The client is running via a URL.


In [7]:
# Creates a LangChain "vector store" object with an entry point to your DB
# within it. It wraps whichever `client` was initialized in Option A or B.

qdrant = Qdrant(
    client=client,
    collection_name=qdrant_collection_name,
    # embeddings is the LangChain interface to the embedding model
    embeddings=config["embedding"],
)

## 5. Initialize a Document Retriever


#### Define a Retriever

In [8]:
# Initializes a VectorStoreRetriever called retriever from the LC qdrant vector store object

# Option 1 using MMR search
# NOTE(review): search_type is written literally here although
# config["search_type"] holds the same value ("mmr") — confirm whether the
# config entry is meant to drive this.
retriever = qdrant.as_retriever(
    search_type="mmr",
    # Per the config comments: fetch_k documents go to the search algorithm,
    # k of them go on to the LLM, and lambda_mult trades diversity (0)
    # against relevance (1).
    search_kwargs={'k': config["k"], "fetch_k": config["fetch_k"],
                   "lambda_mult": config["lambda_mult"]},
)

### Test the retriever is functioning

In [9]:
from IPython.display import Markdown
import re

# Smoke-test query: fetch the documents the retriever returns for a known
# policy-letter title.
retrieved_docs = retriever.get_relevant_documents(
    "AUX-PL-001(A) RISK MANAGEMENT TRAINING REQUIREMENTS FOR THE COAST GUARD AUXILIARY")


# Fallback pattern: captures the body of a "metadata={...}" fragment in a
# document's string form. Only used for inputs without a metadata attribute.
metadata_pattern = re.compile(r"metadata=\{(.*?)\}")


def extract_metadata(doc_list):
    """Collect the metadata dictionary of every document in ``doc_list``.

    Args:
        doc_list: Iterable of documents. An object exposing a ``metadata``
            dict is read directly. Anything else is converted to a string
            and scanned for ``metadata={...}`` fragments.

    Returns:
        A list of metadata dictionaries, in input order.

    Raises:
        ValueError, SyntaxError: If a ``metadata={...}`` fragment found in a
            string is not a valid literal dictionary.
    """
    import ast

    metadata_list = []
    for doc in doc_list:
        metadata = getattr(doc, "metadata", None)
        if isinstance(metadata, dict):
            # Read the attribute rather than parsing the string form, which
            # breaks on nested braces or a "}" inside a value and can match
            # text that merely appears in the page content. Copy so the
            # result stays independent of the document.
            metadata_list.append(dict(metadata))
            continue

        text = doc if isinstance(doc, str) else str(doc)
        for match in metadata_pattern.findall(text):
            # literal_eval accepts only Python literals, so text coming from
            # a document cannot execute code the way eval() would.
            metadata_list.append(ast.literal_eval('{' + match + '}'))
    return metadata_list


# Pull the metadata dictionaries out of the retrieved documents.
metadata_list = extract_metadata(retrieved_docs)


def display_selected_metadata_as_markdown(metadata_list):
    """Show the title, source and page of each metadata dict as Markdown.

    Args:
        metadata_list: Iterable of metadata dictionaries. Missing '/Title',
            'source' or 'page' keys fall back to 'No Title', 'No Source' and
            'No Page'.

    Returns:
        None. The text is rendered through the notebook's ``display``.
    """
    # One output line per dictionary; each line ends in two spaces and a
    # newline.
    rendered_lines = [
        f"Title: {entry.get('/Title', 'No Title')}, "
        f"Source: {entry.get('source', 'No Source')}, "
        f"Page: {entry.get('page', 'No Page')}  \n"
        for entry in metadata_list
    ]
    display(Markdown("".join(rendered_lines)))


# Render the metadata gathered above.
display_selected_metadata_as_markdown(metadata_list)

Title: Memo-Standard, Source: /Users/drew_wilkins/Drews_Files/Drew/Python/VSCode/ASK/data/original library documents/BSX_Policy_Letters/new_adds/AUX-PL-001(A)_BSX_Policy_Letter_19-01_RISK_MANAGEMENT_TRAINING_FOR_THE_COAST_GUARD_AUXILIARY.pdf, Page: 3  
Title: No Title, Source: References/Gold Side/Flotilla_Procedures_Guide_FINAL_ESIGNED_23MAR23.pdf, Page: 0  
Title: No Title, Source: For_injestion/2023 Surface Operations_Workshop Rev1.8.pptx.pdf, Page: 29  
Title: No Title, Source: For_injestion/2023 Telecomms_TCO_Workshop Rev 1.4.pptx.pdf, Page: 8  
Title: No Title, Source: References/Auxiliary Manual CIM_16790_1G.pdf, Page: 363  
