In [1]:
!pip3 install langchain langchain_community langchain_core langchain_openai langchain_mongodb pymongo pypdf

Collecting langchain
  Downloading langchain-0.3.4-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.3-py3-none-any.whl.metadata (2.8 kB)
Collecting langchain_core
  Downloading langchain_core-0.3.12-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain_openai
  Downloading langchain_openai-0.2.3-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain_mongodb
  Downloading langchain_mongodb-0.2.0-py3-none-any.whl.metadata (2.4 kB)
Collecting pymongo
  Downloading pymongo-4.10.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (22 kB)
Collecting pypdf
  Downloading pypdf-5.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.137-py3-none-any.whl.metadata (13 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community

In [None]:
# Connection settings. Prefer environment variables so real secrets never get
# saved in the notebook; the placeholders below are used only when the
# variables are unset.
# NOTE(review): the cells below read these values as `key_param.MONGODB_URI` and
# `key_param.LLM_API_KEY`, so this content presumably belongs in a local
# `key_param.py` module next to the notebook - confirm.
import os

MONGODB_URI = os.environ.get("MONGODB_URI", "<your_atlas_connection_string>")
LLM_API_KEY = os.environ.get("LLM_API_KEY", "<your_llm_api_key>")

In [None]:
import key_param
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_transformers.openai_functions import (
    create_metadata_tagger,
)
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from pymongo import MongoClient

In [None]:
# Target database and collection that will hold the chunked documents.
dbName = "book_mongodb_chunks"
collectionName = "chunked_data"

# Connect using the URI from key_param, then resolve the collection handle
# that the vector store writes into.
client = MongoClient(key_param.MONGODB_URI)
collection = client[dbName][collectionName]

In [None]:
# Load the sample PDF into `pages`.
# The path uses forward slashes: the previous backslash form relied on the
# invalid escape sequences "\s" and "\m" (a warning on recent Python versions)
# and only resolves as a path on Windows. Forward slashes work on every platform.
loader = PyPDFLoader("./sample_files/mongodb.pdf")
pages = loader.load()
# Accumulator for the pages that pass the length filter in the next cell.
cleaned_pages = []

In [None]:
# Pages with this many space-separated tokens or fewer are dropped.
MIN_WORDS_PER_PAGE = 20

# Rebuild the list from `pages` on every run instead of appending to a list
# created in an earlier cell, so re-running this cell cannot duplicate pages.
cleaned_pages = [
    page
    for page in pages
    if len(page.page_content.split(" ")) > MIN_WORDS_PER_PAGE
]

# Splitter used later to cut the tagged pages into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150)

In [None]:
# Chat model used by the metadata tagger; temperature 0 is requested for the
# tagging calls.
llm = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=key_param.LLM_API_KEY,
)

In [None]:
# Metadata the LLM should extract for every page. `schema` was previously never
# defined in this notebook, so this cell raised NameError on a fresh kernel.
# `hasCode` is the field the vector search index declares as a filter and that
# the query code pre-filters on; `title` and `keywords` are additional
# descriptive tags.
schema = {
    "properties": {
        "title": {"type": "string"},
        "keywords": {"type": "array", "items": {"type": "string"}},
        "hasCode": {"type": "boolean"},
    },
    "required": ["title", "keywords", "hasCode"],
}

# Build the tagger that asks the LLM to fill in the schema above.
document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

# Tag the filtered pages first, then split them into chunks.
docs = document_transformer.transform_documents(cleaned_pages)

split_docs = text_splitter.split_documents(docs)

In [None]:
# Embedding model used to vectorize the chunks before they are stored.
embeddings = OpenAIEmbeddings(
    openai_api_key=key_param.LLM_API_KEY,
)

In [None]:
# Embed every chunk and store the results in the MongoDB collection resolved above.
vectorStore = MongoDBAtlasVectorSearch.from_documents(
    documents=split_docs,
    embedding=embeddings,
    collection=collection,
)

## Create the following vector search index, named vector_index, on the chunked_data collection in your Atlas Cluster:

```json
{
  "fields": [
    {
      "numDimensions": 1536,
      "path": "embedding",
      "similarity": "cosine",
      "type": "vector"
    },
    {
      "path": "hasCode",
      "type": "filter"
    }
  ]
}
```

## Use the following `rag.py` script to query the data

In [None]:
import key_param
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Names of the Atlas database, collection and vector search index to query.
index = "vector_index"
dbName = "book_mongodb_chunks"
collectionName = "chunked_data"

# Vector store bound to the "<database>.<collection>" namespace, using OpenAI
# embeddings for the query vectors.
vectorStore = MongoDBAtlasVectorSearch.from_connection_string(
    key_param.MONGODB_URI,
    f"{dbName}.{collectionName}",
    OpenAIEmbeddings(openai_api_key=key_param.LLM_API_KEY, disallowed_special=()),
    index_name=index,
)


def query_data(query):
    """Answer ``query`` using context retrieved from the module-level ``vectorStore``.

    A retriever is built over ``vectorStore`` with ``k=3`` and a pre-filter that
    keeps only documents whose ``hasCode`` field equals ``False``. The page
    contents of the retrieved documents are joined with blank lines, inserted
    into the prompt together with the question, sent to the chat model, and the
    model output is parsed to a string.

    Args:
        query: The question to answer.

    Returns:
        The parsed string output of the chain.
    """
    # NOTE(review): "score_threshold" is passed together with
    # search_type="similarity"; confirm the vector store honors it for this
    # search type.
    search_options = {
        "k": 3,
        "pre_filter": {"hasCode": {"$eq": False}},
        "score_threshold": 0.01,
    }
    retriever = vectorStore.as_retriever(
        search_type="similarity", search_kwargs=search_options
    )

    def join_page_contents(docs):
        """Concatenate the page contents of ``docs``, separated by blank lines."""
        return "\n\n".join(doc.page_content for doc in docs)

    template = """
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Do not answer the question if there is no given context.
    Do not answer the question if it is not related to the context.
    Do not give recommendations to anything other than MongoDB.
    Context:
    {context}
    Question: {question}
    """

    # The incoming query is used twice: to retrieve context and as the question.
    chain_inputs = {
        "context": retriever | join_page_contents,
        "question": RunnablePassthrough(),
    }

    rag_chain = (
        chain_inputs
        | PromptTemplate.from_template(template)
        | ChatOpenAI(openai_api_key=key_param.LLM_API_KEY, temperature=0)
        | StrOutputParser()
    )

    return rag_chain.invoke(query)


# Example: ask a sample question and print the generated answer.
answer = query_data("When did MongoDB begin supporting multi-document transactions?")
print(answer)