# Atlas Vector Search - LlamaIndex Integration

This notebook is a companion to the [LlamaIndex](https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/llamaindex/) page. Refer to that page for setup steps and detailed explanations.

In [None]:
pip install --quiet --upgrade llama-index llama-index-vector-stores-mongodb llama-index-embeddings-openai pymongo

In [None]:
import pymongo, pprint
from pymongo.operations import SearchIndexModel
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.settings import Settings
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.vector_stores import MetadataFilter, MetadataFilters, ExactMatchFilter, FilterOperator
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch

In [None]:
# Credentials used by the rest of the notebook.
# Replace both placeholders with your own values before running the cells below.
# NOTE(review): avoid saving or committing the notebook with real secrets filled in.
OPENAI_API_KEY = "<api-key>"
ATLAS_CONNECTION_STRING = "<connection-string>"

In [None]:
# Configure LlamaIndex Settings
# Pass the API key explicitly: OPENAI_API_KEY is only a Python variable in this
# notebook, so the OpenAI clients would otherwise not see it and would fall back
# to looking for an OPENAI_API_KEY environment variable.
Settings.llm = OpenAI(api_key=OPENAI_API_KEY)
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-ada-002",
    api_key=OPENAI_API_KEY,
)

# Chunking parameters used when documents are split into nodes
Settings.chunk_size = 100
Settings.chunk_overlap = 10

In [None]:
# Load the sample data
mkdir -p 'data/'
wget 'https://query.prod.cms.rt.microsoft.com/cms/api/am/binary/RE4HkJP' -O 'data/atlas_best_practices.pdf'

In [None]:
# Read the downloaded PDF with LlamaIndex's directory reader
pdf_reader = SimpleDirectoryReader(
    input_files=["./data/atlas_best_practices.pdf"],
)
sample_data = pdf_reader.load_data()

# Show the first loaded document as the cell output
sample_data[0]

In [None]:
# Open a client connection to the Atlas cluster
mongo_client = pymongo.MongoClient(ATLAS_CONNECTION_STRING)

# Point the vector store at the target database, collection, and search index
atlas_vector_store = MongoDBAtlasVectorSearch(
    mongo_client,
    db_name="llamaindex_db",
    collection_name="test",
    vector_index_name="vector_index",
)

# Wrap the vector store in a storage context so an index can be built on it
vector_store_context = StorageContext.from_defaults(
    vector_store=atlas_vector_store,
)

In [None]:
# Build the index from the loaded documents, using the Atlas-backed storage
# context and showing a progress bar while it runs
vector_store_index = VectorStoreIndex.from_documents(
    sample_data,
    storage_context=vector_store_context,
    show_progress=True,
)

In [None]:
import time  # local import: only this cell needs it, for the readiness polling below

# Specify the collection for which to create the index
collection = mongo_client["llamaindex_db"]["test"]

# Create your index model, then create the search index
search_index_model = SearchIndexModel(
  definition={
    "fields": [
      {
        "type": "vector",
        "path": "embedding",
        "numDimensions": 1536,
        "similarity": "cosine"
      },
      {
        "type": "filter",
        "path": "metadata.page_label"
      }
    ]
  },
  name="vector_index",
  type="vectorSearch",
)

# Only create the index when it does not exist yet, so that re-running this
# cell does not fail on a duplicate index name.
if not list(collection.list_search_indexes("vector_index")):
    collection.create_search_index(model=search_index_model)

# Index builds are asynchronous: wait until Atlas reports the index as
# queryable, otherwise the queries in the following cells can come back empty.
print("Polling to check if the index is ready. This may take up to a minute.")
poll_deadline = time.monotonic() + 300  # give up after 5 minutes
while True:
    existing_indexes = list(collection.list_search_indexes("vector_index"))
    if existing_indexes and existing_indexes[0].get("queryable") is True:
        break
    if time.monotonic() > poll_deadline:
        raise TimeoutError("Search index 'vector_index' did not become queryable in time.")
    time.sleep(5)
print("vector_index is ready for querying.")

## Semantic Search Query

In [None]:
# Retrieve the three most similar nodes for the search phrase
search_phrase = "MongoDB Atlas security"
retriever = vector_store_index.as_retriever(similarity_top_k=3)
nodes = retriever.retrieve(search_phrase)

# Print each retrieved node on its own line
for node in nodes:
    print(node)

## Semantic Search with Filtering

In [None]:
# Restrict results to nodes whose page label is exactly "17"
page_filter = ExactMatchFilter(key="metadata.page_label", value="17")
metadata_filters = MetadataFilters(filters=[page_filter])

# Run the same semantic search as before, now with the filter applied
retriever = vector_store_index.as_retriever(
    similarity_top_k=3,
    filters=metadata_filters,
)
nodes = retriever.retrieve("MongoDB Atlas security")

# Print each retrieved node on its own line
for node in nodes:
    print(node)

## Basic RAG

In [None]:
# Build a retriever over the index that returns the top five matches
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    similarity_top_k=5,
)

# The query engine is built on top of this retriever
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Ask the question
question = 'How can I secure my MongoDB Atlas cluster?'
response = query_engine.query(question)

# Show the answer, followed by the source nodes attached to the response
print(response)
print("\nSource documents: ")
pprint.pprint(response.source_nodes)

## RAG with Filters

In [None]:
# Restrict retrieval to nodes whose page label is exactly "17"
page_filter = ExactMatchFilter(key="metadata.page_label", value="17")
metadata_filters = MetadataFilters(filters=[page_filter])

# Build a filtered retriever over the index that returns the top five matches
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    filters=metadata_filters,
    similarity_top_k=5,
)

# The query engine is built on top of this retriever
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Ask the question
question = 'How can I secure my MongoDB Atlas cluster?'
response = query_engine.query(question)

# Show the answer, followed by the source nodes attached to the response
print(response)
print("\nSource documents: ")
pprint.pprint(response.source_nodes)