## Creating an index and populating it with documents using Milvus

A simple example of how to ingest PDF documents, and then web page content, into a Milvus vector store.

Requirements:
- A Milvus instance, either standalone or cluster.
- Connection credentials to Milvus must be available as environment variables: MILVUS_USERNAME and MILVUS_PASSWORD

### Needed packages and imports

In [None]:
!pip install einops==0.7.0 langchain==0.1.9 pypdf==4.0.2 pymilvus==2.3.6 sentence-transformers==2.4.0

In [None]:
import requests  # Used to download the PDF Documents
import os # Used for OS Commands to create folders
import shutil # Used to help delete folders

# Langchain is used to help process the PDF data and upload it to the VectorDB
from langchain.document_loaders import PyPDFDirectoryLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus
from langchain_community.document_loaders import PyPDFLoader

### Base parameters, the Milvus connection info

In [None]:
# Connection settings -- replace the values according to your Milvus deployment.
MILVUS_HOST = "vectordb-milvus"
MILVUS_PORT = 19530
# Credentials are taken from the MILVUS_USERNAME / MILVUS_PASSWORD environment
# variables when they are set; the literals below are only fallbacks so the
# notebook still runs unchanged where those variables are not defined.
MILVUS_USERNAME = os.environ.get("MILVUS_USERNAME", "root")
MILVUS_PASSWORD = os.environ.get("MILVUS_PASSWORD", "Milvus")
MILVUS_COLLECTION = "state_driving_content"

# Local folder the PDFs are downloaded into. The download cell deletes and
# recreates this folder on every run.
pdf_folder_path = "handbooks"

# Documents to ingest: state name -> {document name -> download URL}.
# The state name is later stored in each chunk's metadata and used as a
# search filter.
pdfs = {
    "Missouri": {
        "Drivers Guide": "https://dor.mo.gov/forms/Driver%20Guide.pdf",
    },
    "California": {
        "Drivers Handbook": "https://www.dmv.ca.gov/portal/file/california-driver-handbook-pdf/",
    },
}

## Initial index creation and document ingestion

#### Download and load pdfs

In [None]:
# Download every configured PDF, parse it into per-page documents, and tag
# each page with its public URL and its state. The result is collected in
# `pdf_docs`.

# Start from an empty download folder so files left over from a previous run
# are never mixed in.
if os.path.exists(pdf_folder_path):
    print(f"Directory, '{pdf_folder_path}', exists Cleaning up")
    shutil.rmtree(pdf_folder_path)
else:
    print("Directory does not exist")

# Create Download Folder
os.makedirs(pdf_folder_path)

# Accumulates the parsed pages of all PDFs, in download order.
pdf_docs = []

# Loop over each defined state
for state, pdf_dict in pdfs.items():
    # Loop over each URL defined for the state
    for name, url in pdf_dict.items():
        # Local location the PDF is saved to
        file_path = f"{pdf_folder_path}/{state}-{name}.pdf"
        print(f"Downloading URL: {url} to {file_path}")

        # Download the PDF. The timeout keeps the cell from hanging forever on
        # an unresponsive server, and raise_for_status() stops here on an HTTP
        # error instead of saving an error page as a ".pdf" that would only
        # fail later, inside the PDF parser.
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Write the downloaded bytes to disk so the loader can read them
        with open(file_path, 'wb') as f:
            f.write(response.content)

        # Process the PDF through Langchain's PDF Loader
        # This chunks the data into a list of pages with metadata about the PDF.
        loader = PyPDFLoader(file_path)
        pdf_doc = loader.load()

        # Loop through the pages and update the metadata
        for doc in pdf_doc:
            # Point the source at the public URL rather than the local file path.
            doc.metadata["source"] = url
            # Record the state so that searches can be filtered by source state.
            doc.metadata["state"] = state

        # Append this PDF's pages to the overall list (in place, so the list
        # is not rebuilt on every iteration).
        pdf_docs.extend(pdf_doc)

print(f"Done Processing PDFs with a page count of {len(pdf_docs)}")

#### Split documents into chunks with some overlap

In [None]:
# Cut every loaded page into chunks, with consecutive chunks sharing some
# overlap so text that straddles a chunk boundary is not lost.
splitter_settings = {"chunk_size": 1024, "chunk_overlap": 128}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)
all_splits = text_splitter.split_documents(pdf_docs)

#### Create the index and ingest the documents

In [None]:
# torch is already installed as a dependency of sentence-transformers.
import torch

# Run the embedding model on the GPU when one is available and fall back to
# the CPU otherwise, so the cell works on both kinds of hosts without edits.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    show_progress=True
)

# BEWARE: `drop_old` is set to True, so if the collection already existed it will be deleted first.
db = Milvus(
    embedding_function=embeddings,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
    collection_name=MILVUS_COLLECTION,
    collection_description="State Based DMV Data",
    # Document metadata (source, state, ...) is stored in this single field,
    # which is what the search filter later refers to as metadata["state"].
    metadata_field="metadata",
    # Optional: uncomment to partition the collection by state.
    #partition_key_field="state",
    #partition_names=["Missouri", "California"],
    # `index_params` takes a dict of index settings or None; None leaves the
    # choice of index to the vector store's default. (It was previously set to
    # True, which is not a valid index definition.)
    index_params=None,
    text_field="page_content",
    auto_id=True,
    drop_old=True,
    )

In [None]:
# Send every chunk to the vector store, then report how many entries the
# store returned for the upload.
results = db.add_documents(all_splits)
uploaded_count = len(results)
print(f"Uploaded {uploaded_count} embeddings to the VectorDB")

#### Test query

Run the below cell as is with Missouri as the state, and run it again replacing `Missouri` with `California`.

You should notice that the results differ between the two queries. This shows how an application can change the filter on a query to control which vectors are returned.

In [None]:
# Set your Query and State.
# The query text is embedded as-is, so it is spelled correctly and carries no
# stray whitespace.
query = "What are the consequences of a DUI?"
state = "Missouri"

# Create the embeddings function (the CPU is sufficient to embed one query)
embeddings_search = HuggingFaceEmbeddings(
    model_kwargs={'device': 'cpu'},
    show_progress=True
)

# Create the Milvus Object to Search.
# The collection name, metadata field and text field are the same values that
# were used when the collection was created.
db_search = Milvus(
    embedding_function=embeddings_search,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT, "user": MILVUS_USERNAME, "password": MILVUS_PASSWORD},
    collection_name=MILVUS_COLLECTION,
    metadata_field="metadata",
    text_field="page_content"
    )

# Obtain the Retriever Object that is used to perform the search,
#   and give boundaries to the search. In our case limit to the
#   State selected.
# NOTE: `state` is interpolated directly into the filter expression; only use
#   trusted values here.
retriever = db_search.as_retriever(
    search_kwargs = {
        "expr": f'metadata["state"] == "{state}"'
    }
)

# Invoke the Query against the Milvus Database
result = retriever.invoke(input=query)

# Loop over the results and show the output
for doc in result:
    print("-" * 20)
    print("State: ", doc.metadata["state"])
    print(doc.page_content)