# Load data into a vector database

Load chunked context data embedding vectors into an indexed database.  

- **Data**: ReadtheDocs documentation pages.
- **Vector database**: FAISS and Milvus

Demo in progress...


In [1]:
# Import common libraries.
import time, os
import numpy as np

# Import langchain.
#!pip install langchain 
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

We need to first download the Milvus documentation to a local directory:

In [None]:
# Uncomment to download readthedocs page locally.

# DOCS_PAGE="https://pymilvus.readthedocs.io/en/latest/"
# !echo $DOCS_PAGE

# # Specify encoding to handle non-unicode characters in documentation.
# !wget -r -A.html -P rtdocs --header="Accept-Charset: UTF-8" $DOCS_PAGE

In [4]:
# For now, manually move all .html files directly under rtdocs/
!ls rtdocs/*

rtdocs/about.html      rtdocs/faq.html        rtdocs/param.html
rtdocs/api.html        rtdocs/genindex.html   rtdocs/results.html
rtdocs/changes.html    rtdocs/index.html      rtdocs/search.html
rtdocs/contribute.html rtdocs/install.html    rtdocs/tutorial.html


In [5]:
# TODO:  Milvus docs HTML header says they are utf-8, but apparently not!
# Figure out how to download Milvus ReadtheDocs as utf-8 encoded!

loader = ReadTheDocsLoader("rtdocs", features="html.parser")
docs = loader.load()

num_documents = len(docs)
print(f"loaded {num_documents} documents")
print(f"type: {type(docs)}, len: {len(docs)}, type: {type(docs[0])}")
docs[0]

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb8 in position 585: invalid start byte

In [8]:
# # Try a different docs page - this one works fine!

# DOCS_PAGE="https://api.python.langchain.com/en/latest/"
# !echo $DOCS_PAGE

# # Specify encoding to handle non-unicode characters in documentation.
# !wget -r -A.html -P rtdocs2 --header="Accept-Charset: UTF-8" $DOCS_PAGE

loader = ReadTheDocsLoader("rtdocs2", features="html.parser")
docs = loader.load()

num_documents = len(docs)
print(f"loaded {num_documents} documents")
print(f"type: {type(docs)}, len: {len(docs)}, type: {type(docs[0])}")
docs[0]

loaded 34 documents
type: <class 'list'>, len: 34, type: <class 'langchain.schema.document.Document'>


Document(page_content='langchain_experimental API Reference¶\nlangchain_experimental.autonomous_agents¶\nClasses¶\nautonomous_agents.autogpt.agent.AutoGPT(...)\nAgent class for interacting with Auto-GPT.\nautonomous_agents.autogpt.memory.AutoGPTMemory\nMemory for AutoGPT.\nautonomous_agents.autogpt.output_parser.AutoGPTAction(...)\nAction returned by AutoGPTOutputParser.\nautonomous_agents.autogpt.output_parser.AutoGPTOutputParser\nOutput parser for AutoGPT.\nautonomous_agents.autogpt.output_parser.BaseAutoGPTOutputParser\nBase Output parser for AutoGPT.\nautonomous_agents.autogpt.prompt.AutoGPTPrompt\nPrompt for AutoGPT.\nautonomous_agents.autogpt.prompt_generator.PromptGenerator()\nA class for generating custom prompt strings.\nautonomous_agents.baby_agi.baby_agi.BabyAGI\nController model for the BabyAGI agent.\nautonomous_agents.baby_agi.task_creation.TaskCreationChain\nChain generating tasks.\nautonomous_agents.baby_agi.task_execution.TaskExecutionChain\nChain to execute tasks.\nau

In [9]:
# Chunk the data using Langchain's RecursiveCharacterTextSplitter.
start_time = time.time()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap  = 50,
    length_function = len,
)

chunks = text_splitter.create_documents(
    [doc.page_content for doc in docs], 
    metadatas=[doc.metadata for doc in docs])

end_time = time.time()
print(f"chunking time: {end_time - start_time}")
# print(f"type: {type(chunks)}, len: {len(chunks)}, type: {type(chunks[0])}")
print(f"type: {type(chunks)}, len: {len(chunks)}") 

print()
print("Looking at a sample chunk...")
print(chunks[0].metadata)
print(chunks[0].page_content[:100])

chunking time: 0.011813163757324219
type: <class 'list'>, len: 700

Looking at a sample chunk...
{'source': 'rtdocs2/api.python.langchain.com/en/latest/experimental_api_reference.html'}
langchain_experimental API Reference¶
langchain_experimental.autonomous_agents¶
Classes¶
autonomous_


In [10]:
# Clean up the metadata urls
for doc in chunks:
    new_url = doc.metadata["source"]
    new_url = new_url.replace("rtdocs2", "https:/")
    doc.metadata.update({"source": new_url})

print(chunks[0].metadata)
print(chunks[0].page_content[:500])

{'source': 'https://api.python.langchain.com/en/latest/experimental_api_reference.html'}
langchain_experimental API Reference¶
langchain_experimental.autonomous_agents¶
Classes¶
autonomous_agents.autogpt.agent.AutoGPT(...)
Agent class for interacting with Auto-GPT.
autonomous_agents.autogpt.memory.AutoGPTMemory
Memory for AutoGPT.
autonomous_agents.autogpt.output_parser.AutoGPTAction(...)
Action returned by AutoGPTOutputParser.
autonomous_agents.autogpt.output_parser.AutoGPTOutputParser
Output parser for AutoGPT.
autonomous_agents.autogpt.output_parser.BaseAutoGPTOutputParser


## Chunk and embed data

**First, choose an embedding model** <br>
Most tutorials default to the OpenAI embedding model, which costs money.  You don't have to do that.

In the code below, we will use an open source SentenceTransformer embedding model, hosted on HuggingFace.  [SentenceTransformers](https://www.sbert.net/) is a python package that can generate text and image embeddings, originating from Sentence-BERT.

SentenceTransformers embeddings are called using the HuggingFaceEmbeddings integration. We have also added an alias for SentenceTransformerEmbeddings for users who are more familiar with directly using that package.


In [11]:
# Import torch.
#!pip install torch
import torch

# Initialize torch settings and get DEVICE.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 415
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda:3' if torch.cuda.is_available() else 'cpu')
print(f"device: {DEVICE}")

device: cpu


In [22]:
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {"device": DEVICE}
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
type(hf)
# type: langchain.embeddings.huggingface.HuggingFaceEmbeddings


langchain.embeddings.huggingface.HuggingFaceEmbeddings

** TODO: **
- Find a better question to ask!
- Double-check chunk size and sequence lengths best practices.

In [36]:
# TODO:  How to inspect the embedding model parameters?
# In this cell, I'll import directly and inspect.

# Import HuggingFace and SentenceTransformers.
#!pip install transformers sentence-transformers
from sentence_transformers import SentenceTransformer

# load the retriever model from huggingface model hub
model_name = "BAAI/bge-base-en-v1.5"
# model_name = "thenlper/gte-large"
# model_name = "intfloat/e5-base-v2"
retriever = SentenceTransformer(model_name, device=DEVICE)
print(type(retriever))
print(retriever)

# Define a query for estimation purposes.
query = 'Which index does Langchain Milvus default to?'
QUERY_LENGTH = len(query)
TOKENIZER_EOS = 4 # HuggingFace default is 4 bytes

# Save params for later.
max_seq_length = retriever.get_max_seq_length() #128
CHUNK_SIZE = max_seq_length - QUERY_LENGTH - 2*TOKENIZER_EOS
chunk_overlap = np.round(CHUNK_SIZE * 0.10, 0) #19
EMBEDDING_LENGTH = retriever.get_sentence_embedding_dimension() #384
print(f"embedding vector length: {EMBEDDING_LENGTH}, max_seq_length: {max_seq_length}")
print(f"CHUNK_SIZE: {CHUNK_SIZE}, chunk_overlap: {chunk_overlap}")

# model_name = "BAAI/bge-base-en-v1.5"
# embedding vector length: 768, chunk size: 512, chunk_overlap: 51.0

# model_name = "thenlper/gte-large"
# embedding vector length: 1024, chunk size: 512, chunk_overlap: 51.0



<class 'sentence_transformers.SentenceTransformer.SentenceTransformer'>
SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': True}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)
embedding vector length: 768, max_seq_length: 512
CHUNK_SIZE: 459, chunk_overlap: 46.0


In [35]:
# Define a query for estimation purposes.
query = 'Which index does Langchain Milvus default to?'
QUERY_LENGTH = len(query)
print(QUERY_LENGTH)

45


## Save embeddings in a vector store.

In [38]:
# Upload data to FAISS vectorstore.
# !python -m pip install faiss-cpu
from langchain.vectorstores import FAISS

print("Embed and create vector index")
start_time = time.time()
vectorstore = FAISS.from_documents(chunks, embedding=hf)

# Persist the vector store.
vectorstore.save_local("faiss_index")

end_time = time.time()
print(f"FAISS insert time: {end_time - start_time}")
print(f"type: {type(vectorstore)}")
vectorstore

Embed and create vector index
FAISS insert time: 60.16475701332092
type: <class 'langchain.vectorstores.faiss.FAISS'>


<langchain.vectorstores.faiss.FAISS at 0x308522440>

### Now try Milvus

[In a previous notebook](milvus_connect.ipynb), I showed how to connect to milvus and invoke Python APIs from scratch.

The code below uses the [Langchain Milvus](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.milvus.Milvus.html) adapter.  Defaults are:

- embedding_function (Embeddings) – Function used to embed the text.
- collection_name (str) – Which Milvus collection to use. Defaults to “LangChainCollection”.
- connection_args (Optional[dict[str, any]]) – The connection args used for this class comes in the form of a dict.
- consistency_level (str) – The consistency level to use for a collection. Defaults to “Session”.
- index_params (Optional[dict]) – Which index params to use. Defaults to HNSW/AUTOINDEX depending on service.
- search_params (Optional[dict]) – Which search params to use. Defaults to default of index.
- drop_old (Optional[bool]) – Whether to drop the current collection. Defaults to False.
- primary_field (str) – Name of the primary key field. Defaults to “pk”.
- text_field (str) – Name of the text field. Defaults to “text”.
- vector_field (str) – Name of the vector field. Defaults to “vector”.

In [31]:
# Upload data to Milvus vectorstore.

#!pip install pymilvus
from langchain.vectorstores import Milvus
MILVUS_PORT = 19530
MILVUS_HOST = "127.0.0.1"

print("Embed and create vector index")
start_time = time.time()
vector_store = Milvus.from_documents(
    chunks,
    embedding=hf,
    connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
)

end_time = time.time()
print(f"vector database insert time: {end_time - start_time}")
print(f"type: {type(vector_store)}")
vector_store

# Inserting into Milvus, a full-fledged database is just as fast as FAISS!

Embed and create vector index
vector database insert time: 59.87231993675232
type: <class 'langchain.vectorstores.milvus.Milvus'>


<langchain.vectorstores.milvus.Milvus at 0x30845b8e0>

## Query the vectors

This is the "Retrieval" part of "RAG" (Retrieval Augmented Generator).

In [41]:
print(query)

Which index does Langchain Milvus default to?


In [47]:
# RETRIEVAL USING FAISS

# Always use the exact same LLM for both doc and query embeddings.
query_embedding = HuggingFaceEmbeddings(model_name=model_name)

db = FAISS.load_local("faiss_index", query_embedding)
documents = db.similarity_search(query=query, k=1)
print(f"source: {docs[0].metadata}")
print([doc.page_content for doc in documents])

source: {'source': 'https://api.python.langchain.com/en/latest/api_reference.html'}
['retrievers.self_query.milvus.MilvusTranslator()\nTranslate Milvus internal query language elements to valid filters.\nretrievers.self_query.myscale.MyScaleTranslator([...])\nTranslate MyScale internal query language elements to valid filters.\nretrievers.self_query.pinecone.PineconeTranslator()\nTranslate Pinecone internal query language elements to valid filters.\nretrievers.self_query.qdrant.QdrantTranslator(...)\nTranslate Qdrant internal query language elements to valid filters.']


In [48]:
# RETRIEVAL USING MILVUS
docs = vector_store.similarity_search(query)
print(f"source: {docs[0].metadata}")
print([doc.page_content for doc in documents])

# Retrieval using Milvus takes 0.2s vs 1.2s for FAISS !!

source: {'source': 'https://api.python.langchain.com/en/latest/api_reference.html'}
['retrievers.self_query.milvus.MilvusTranslator()\nTranslate Milvus internal query language elements to valid filters.\nretrievers.self_query.myscale.MyScaleTranslator([...])\nTranslate MyScale internal query language elements to valid filters.\nretrievers.self_query.pinecone.PineconeTranslator()\nTranslate Pinecone internal query language elements to valid filters.\nretrievers.self_query.qdrant.QdrantTranslator(...)\nTranslate Qdrant internal query language elements to valid filters.']


## Generate the chat answer using retrieved texts.

This is the "Generator" part of "RAG" (Retrieval Augmented Generator).


In [32]:
from langchain.chains import RetrievalQA

In [49]:
# Props to Sebastian Raschka for this handy watermark.
# !pip install watermark

%load_ext watermark
%watermark -a 'Christy Bergman' -v -p torch,transformers,pymilvus,langchain --conda

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Author: Christy Bergman

Python implementation: CPython
Python version       : 3.10.12
IPython version      : 8.15.0

torch       : 2.0.1
transformers: 4.33.2
pymilvus    : 2.3.0
langchain   : 0.0.292

conda environment: py310

