# Vector databases

Creating and configuring a Vector Database to Store Document Embeddings
  
Later we will run similarity searches against it.  


### Vector databases vs traditional databases like SQL
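Traditional SQL databases look rows up by exact values or ranges. Vector databases instead index embeddings and can quickly find the vectors most similar to a query vector using similarity algorithms.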

In [12]:
# importing necessary libraries
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded
from pathlib import Path

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings

In [2]:
# Download the sample company-policies text and load it as LangChain documents.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/BYlUHaillwM8EUItaIytHQ/companypolicies.txt"

filename = 'data/companypolicies.txt'
# urlretrieve does not create missing folders, so make sure data/ exists
# before downloading (otherwise a fresh checkout fails with FileNotFoundError).
Path(filename).parent.mkdir(parents=True, exist_ok=True)
urllib.request.urlretrieve(url, filename)

# Load from the same path we just downloaded to, rather than repeating the literal.
loader = TextLoader(filename)
data = loader.load()

# Split the loaded documents into small, slightly overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,       # upper bound on chunk length, measured with length_function
    chunk_overlap=20,     # amount shared between neighbouring chunks
    length_function=len,  # len => lengths are counted in characters
)

chunks = text_splitter.split_documents(data)
print("Length of chunks: ", len(chunks))

# Embedding model used later to turn each chunk into a vector.
model_name = "sentence-transformers/all-mpnet-base-v2"
huggingface_embedding = HuggingFaceEmbeddings(model_name=model_name)

Length of chunks:  215


  from .autonotebook import tqdm as notebook_tqdm


## Vector store

### ChromaDB
We start by creating a list of IDs that assigns each chunk a unique identifier, so that chunks can be tracked later in the vector database. The list must be the same length as the list of chunks.  
  
The IDs should be in string format.
  

We then use the embedding model to create embeddings for each chunk and then store them in the Chroma database.  
  
We can then use the method `._collection.get()` to print some of the chunks indexed by their IDs.  
Although the chunks are stored in the database in embedding format, when you retrieve and print them by their IDs, the database will return the chunk text information instead of the embedding vectors.

   

FAISS is another vector database that is supported by LangChain.


In [3]:
# One string id per chunk ("0", "1", ...), in the same order as `chunks`.
ids = list(map(str, range(len(chunks))))

In [4]:
# Build the Chroma store straight from the chunks: the embedding model is
# passed in and applied while the store is created, so there is no separate
# embedding step to run first.
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=huggingface_embedding,
    ids=ids,
)
vectordb

<langchain_community.vectorstores.chroma.Chroma at 0x253cd05d7f0>

In [5]:
# Show the first three stored chunks, looked up by their string ids.
for chunk_id in map(str, range(3)):
    print(vectordb._collection.get(ids=chunk_id))

{'ids': ['0'], 'embeddings': None, 'documents': ['1.\tCode of Conduct'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'data/companypolicies.txt'}]}
{'ids': ['1'], 'embeddings': None, 'documents': ['Our Code of Conduct outlines the fundamental principles and ethical standards that guide every'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'data/companypolicies.txt'}]}
{'ids': ['2'], 'embeddings': None, 'documents': ['that guide every member of our organization. We are committed to maintaining a workplace that is'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'data/companypolicies.txt'}]}


In [7]:
# Several entries can be fetched in a single call by passing a list of ids.
selected_ids = ["1", "2", "6"]
print(vectordb._collection.get(ids=selected_ids))

{'ids': ['1', '2', '6'], 'embeddings': None, 'documents': ['Our Code of Conduct outlines the fundamental principles and ethical standards that guide every', 'that guide every member of our organization. We are committed to maintaining a workplace that is', 'clients, or the broader community. We respect and protect sensitive information, and we avoid'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'data/companypolicies.txt'}, {'source': 'data/companypolicies.txt'}, {'source': 'data/companypolicies.txt'}]}


In [8]:
# Number of entries held in the collection; this is expected to equal len(chunks).
stored_count = vectordb._collection.count()
stored_count

215

# Similarity search
Similarity search in a vector database involves finding items that are most similar to a given query item based on their vector representations.  

The search algorithm identifies and retrieves the vectors of the data objects with the closest vector distances to the query, enabling efficient and accurate identification of similar items in large datasets.

LangChain supports similarity search in vector stores using the method .similarity_search().



In [20]:
# Text we want to find related chunks for.
query = "Email policy"

# similarity_search returns the four closest chunks when k is not given;
# here k=2 limits the result to the two best matches.
docs = vectordb.similarity_search(query=query, k=2)
docs

[Document(metadata={'source': 'data/companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'data/companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication')]

In [None]:
# k controls how many results come back; ask for only the single closest chunk.
vectordb.similarity_search(query=query, k=1)

[Document(metadata={'source': 'data/companypolicies.txt'}, page_content='3.\tInternet and Email Policy')]

# Managing vector store: Adding, updating, and deleting entries

In [None]:
# Adding a document: wrap the text in a Document, then hand it to the store.
text = "This is a study notebook about building a LLM."
new_metadata = {"source": "emma.com", "page": 1}
new_chunk = Document(page_content=text, metadata=new_metadata)

# add_documents expects a list, even for a single document
new_chunks = [new_chunk]

# The original chunks were given ids "0", "1", ... in order; with the 215
# chunks loaded above, "215" is the next unused id.
vectordb.add_documents(new_chunks, ids=["215"])

# the collection should now hold one more entry than before
vectordb._collection.count()

216

In [None]:
# Fetch the entry we just added to confirm it is stored under id '215'.
added_entry = vectordb._collection.get(ids=['215'])
print(added_entry)

{'ids': ['215'], 'embeddings': None, 'documents': ['This is a study notebook about building a LLM.'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'page': 1, 'source': 'ibm.com'}]}


In [17]:
# Updating a document: build the replacement, then overwrite the entry by id.
update_chunk = Document(
    page_content="I am updating this document",
    metadata={"source": "emmajayne.com", "page": 1},
)

vectordb.update_document(document_id='215', document=update_chunk)

# read the entry back to see the new text and metadata
print(vectordb._collection.get(ids=['215']))

{'ids': ['215'], 'embeddings': None, 'documents': ['I am updating this document'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'source': 'emmajayne.com', 'page': 1}]}


In [18]:
# Deleting a document: remove the entry with id '215', then look it up again
# to confirm that nothing is returned for that id any more.
removed_ids = ['215']
vectordb._collection.delete(ids=removed_ids)
print(vectordb._collection.get(ids=removed_ids))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}
