In [None]:
import yaml
import add_packages
from pprint import pprint
import os
import pandas as pd
from tqdm.auto import tqdm

from toolkit.langchain import (
	document_loaders, text_splitters, text_embedding_models, stores, 
	prompts, utils, output_parsers, agents, documents, models,
	runnables, tools, chains
)

# [Vector stores](https://python.langchain.com/v0.2/docs/integrations/vectorstores/mongodb_atlas/)

In [None]:
from typing import List, Dict, Any, Optional
from pymongo import MongoClient
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings
from langchain.schema import Document
from langchain.tools.retriever import create_retriever_tool
from loguru import logger
from tqdm import tqdm
from pymongo.mongo_client import MongoClient
from pymongo.operations import SearchIndexModel


In [None]:
# Atlas namespace used throughout this notebook: database, collection and the
# name of the vector-search index. All three are handed to MongoStore when the
# store is instantiated.
MONGODB_DB_NAME = "db_langchain"
MONGODB_COLLECTION_NAME = "coll_langchain"
MONGODB_ATLAS_VECTOR_SEARCH_INDEX_NAME = "idx_langchain"

In [None]:
class MongoStore:
		"""Convenience wrapper around a MongoDB Atlas collection used as a LangChain vector store.

		On construction it opens a ``MongoClient``, builds a
		``MongoDBAtlasVectorSearch`` over the target collection, derives a
		retriever from it and, when both a tool name and description are given,
		wraps that retriever in a LangChain retriever tool.

		Attributes:
				client: The ``MongoClient`` connected to the given URI.
				db_name / collection_name / index_name: Names supplied by the caller.
				dimensions: Embedding size written into the index definition.
				collection: ``client[db_name][collection_name]``.
				embeddings: Embedding model used by the vector store.
				vector_store: The ``MongoDBAtlasVectorSearch`` instance.
				retriever: Retriever created from ``vector_store``.
				retriever_tool: Retriever tool, or ``None`` when no name/description
						pair was supplied.
		"""

		def __init__(
				self,
				mongodb_atlas_cluster_uri: str,
				db_name: str,
				collection_name: str,
				index_name: str,
				dimensions: int,
				embeddings: Optional[Any] = None,
				configs: Optional[Dict] = None,
				default_search_type: str = "similarity",
				default_search_kwargs: Optional[Dict] = None,
				retriever_tool_name: str = "",
				retriever_tool_description: str = "",
		):
				"""Connect to the cluster and assemble vector store, retriever and tool.

				Args:
						mongodb_atlas_cluster_uri: Connection string passed to ``MongoClient``.
						db_name: Database that holds the collection.
						collection_name: Collection that stores the embedded documents.
						index_name: Name of the Atlas vector-search index.
						dimensions: Number of embedding dimensions (used by ``create_index``).
						embeddings: Embedding model; when ``None`` an
								``OpenAIEmbeddings(disallowed_special=())`` is created.
						configs: Currently unused; kept so existing callers keep working.
						default_search_type: ``search_type`` given to ``as_retriever``.
						default_search_kwargs: ``search_kwargs`` given to ``as_retriever``.
								``None`` (the default) means ``{"k": 6}``.
						retriever_tool_name: Name of the retriever tool.
						retriever_tool_description: Description of the retriever tool.
				"""
				self.client = MongoClient(mongodb_atlas_cluster_uri)
				self.db_name = db_name
				self.collection_name = collection_name
				self.index_name = index_name
				self.dimensions = dimensions
				self.collection = self.client[db_name][collection_name]

				# Compare against None explicitly so a caller-supplied embeddings
				# object is never discarded just because it happens to be falsy.
				if embeddings is not None:
						self.embeddings = embeddings
				else:
						self.embeddings = OpenAIEmbeddings(disallowed_special=())

				self.vector_store = MongoDBAtlasVectorSearch(
						collection=self.collection,
						index_name=self.index_name,
						embedding=self.embeddings,
						relevance_score_fn="cosine",
				)

				# Build a fresh dict per instance: a dict literal in the signature
				# would be shared by every MongoStore, and a caller's dict must not
				# be mutated through the retriever.
				if default_search_kwargs is None:
						search_kwargs = {"k": 6}
				else:
						search_kwargs = dict(default_search_kwargs)

				self.retriever = self.vector_store.as_retriever(
						search_type=default_search_type,
						search_kwargs=search_kwargs,
				)

				# The tool is only created when both name and description are given.
				if retriever_tool_name and retriever_tool_description:
						self.retriever_tool = create_retriever_tool(
								retriever=self.retriever,
								name=retriever_tool_name,
								description=retriever_tool_description,
						)
				else:
						self.retriever_tool = None

		def create_index(self):
				"""Create the Atlas vector-search index on ``self.collection``.

				Not available in free-tier.
				https://www.mongodb.com/docs/atlas/atlas-vector-search/ai-integrations/langchain/#create-the-atlas-vector-search-index

				The index has a cosine-similarity vector field on ``embedding`` with
				``self.dimensions`` dimensions, plus a filter field on ``page``.

				Returns:
						The value returned by ``create_search_index`` on success, or
						``None`` when creation failed (including the case where the
						index already exists). Failures are logged, not raised.
				"""
				definition = {
						"fields": [
								{
										"type": "vector",
										"numDimensions": self.dimensions,
										"path": "embedding",
										"similarity": "cosine",
								},
								{
										"type": "filter",
										"path": "page",
								},
						]
				}
				try:
						search_index_model = SearchIndexModel(
								definition=definition,
								name=self.index_name,
								type="vectorSearch",
						)
						result = self.collection.create_search_index(model=search_index_model)
						logger.info(f"Index '{self.index_name}' created successfully. Result: {result}")
						return result
				except Exception as e:
						# Best-effort by design: an existing index is not an error,
						# anything else is logged and reported to the caller as None.
						if "already exists" in str(e):
								logger.info(f"Index '{self.index_name}' already exists.")
						else:
								logger.error(f"Error creating index: {e}")
						return None

		def add_documents(self, docs: "List[Document]", batch_size: int = 1) -> None:
				"""Insert documents into the vector store, showing a progress bar.

				Args:
						docs: Documents to embed and store.
						batch_size: Number of documents sent per ``add_documents`` call.
								The default of 1 keeps the original one-call-per-document
								behaviour; larger values reduce the number of round trips.

				Raises:
						ValueError: If ``batch_size`` is smaller than 1.
				"""
				if batch_size < 1:
						raise ValueError("batch_size must be a positive integer")

				# Materialise once so any iterable can be sliced into batches.
				pending = list(docs)
				for start in tqdm(range(0, len(pending), batch_size)):
						self.vector_store.add_documents(pending[start:start + batch_size])

		def invoke_retriever(self, query, **kwargs) -> "List[Document]":
				"""Run ``query`` through the retriever and return the matching texts.

				Extra keyword arguments are forwarded to ``retriever.invoke``. Each
				hit is rebuilt as a new ``Document`` holding only ``page_content``,
				so metadata from the store is intentionally not carried over.
				"""
				hits = self.retriever.invoke(query, **kwargs)
				return [Document(hit.page_content) for hit in hits]

In [None]:
# Read the connection string with os.environ[...] rather than os.getenv():
# getenv returns None when the variable is unset, and MongoClient(None) would
# then silently connect to localhost instead of the Atlas cluster. A KeyError
# naming the missing variable is the clearer failure.
mongo_store = MongoStore(
	mongodb_atlas_cluster_uri=os.environ["MONGODB_ATLAS_CLUSTER_URI"],
	db_name=MONGODB_DB_NAME,
	collection_name=MONGODB_COLLECTION_NAME,
	index_name=MONGODB_ATLAS_VECTOR_SEARCH_INDEX_NAME,
	dimensions=1536,  # Set this to match your embedding size
	retriever_tool_name="retriever_mongodb",
	retriever_tool_description="Useful for retrieving information from MongoDB Atlas vector store."
)


In [None]:
# Read the GPT-4 technical report PDF from the project's data directory.
loader = document_loaders.UnstructuredPDFLoader(
	f"{add_packages.APP_PATH}/data/gpt-4-technical-report.pdf"
)
doc = loader.load()

# Split into overlapping chunks (1000 characters, 150 overlap) and keep only
# the first 100 chunks for this demo.
text_splitter = text_splitters.RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=150,
)
docs = text_splitter.split_documents(doc)[:100]


In [None]:
# mongo_store.add_documents(docs)

In [None]:
# Sample question used to exercise the retriever.
query = "What were the compute requirements for training GPT 4"

# invoke_retriever returns Document objects rebuilt from page_content only.
results = mongo_store.invoke_retriever(query)

In [None]:
# Pretty-print the retrieved documents for inspection.
pprint(results)

# [Self-querying retrievers](https://python.langchain.com/v0.2/docs/integrations/retrievers/self_query/mongodb_atlas/)

# [Memory](https://python.langchain.com/v0.2/docs/integrations/memory/mongodb_chat_message_history/)

# [Document loaders](https://python.langchain.com/v0.2/docs/integrations/document_loaders/mongodb/)

# [MongoDBCache](https://python.langchain.com/v0.2/docs/integrations/providers/mongodb_atlas/)