In [1]:
import getpass, os, pymongo, pprint
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pymongo import MongoClient

In [2]:
from sentence_transformers import SentenceTransformer

In [3]:
# model = SentenceTransformer('BAAI/bge-large-en-v1.5')



In [4]:
# model.save('./model')

In [5]:
# Load the embedding model from the local ./model directory. The commented-out
# cells above show how that copy was obtained: 'BAAI/bge-large-en-v1.5' was
# instantiated once and written out with model.save('./model').
model = SentenceTransformer(
    './model',
)

In [6]:

# Placeholder settings -- fill these in before running the cells below.
# Avoid committing a real connection string: it embeds credentials.
MONGO_URI: str = "<connetion string>"
# Database and collection names (empty until filled in).
DB_NAME: str = ""
COLLECTION_NAME: str = ""

In [7]:

# Ask for the Atlas connection string without echoing what is typed.
# The prompt is a fixed label: using MONGO_URI as the prompt text would print
# that value (potentially a real connection string) into the cell output.
ATLAS_CONNECTION_STRING = getpass.getpass("MongoDB Atlas connection string: ")

In [8]:
# Connect to your Atlas cluster.
# Prefer the string entered at the getpass prompt; if that was left blank,
# fall back to MONGO_URI so a notebook configured that way keeps working.
mongodb_client = pymongo.MongoClient(ATLAS_CONNECTION_STRING or MONGO_URI)

# Define collection and index name.
# Reuse the names from the configuration cell instead of redefining them as
# empty literals here, so they only have to be filled in once.
db_name = DB_NAME
collection_name = COLLECTION_NAME
atlas_collection = mongodb_client[db_name][collection_name]
# Name of the Atlas Vector Search index handed to the vector store below.
vector_search_index = "vector_index"

In [9]:
# Input file and chunking parameters, named so they are easy to spot and tune.
PDF_PATH = "../input/ucce_b_ucce_soldg-for-unified-cce-1262-pages-61-120-pages-1.pdf"
CHUNK_SIZE = 200
CHUNK_OVERLAP = 20

# Read the PDF through PyPDFLoader.
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

# Break the loaded content into smaller documents for embedding.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(data)

# Show the first chunk as a quick sanity check.
docs[0]

Document(page_content='CHAPTER 3\nContact Center Enterprise Solutions Overview\n•ContactCenterSolutionsArchitecture ,onpage35\n•CoreComponents ,onpage37\n•OptionalCiscoComponents ,onpage67\n•Third-Party Components ,onpage72', metadata={'source': '../input/ucce_b_ucce_soldg-for-unified-cce-1262-pages-61-120-pages-1.pdf', 'page': 0})

In [10]:
class CustomEmbedder:
    """Adapter that exposes an embedding model through ``embed_documents``
    and ``embed_query``.

    The wrapped ``model`` must provide ``encode(...)`` that accepts either a
    single string or a list of strings and returns an object with
    ``.tolist()`` (e.g. a NumPy array).
    """

    def __init__(self, model):
        """Store the model whose ``encode`` method produces the embeddings."""
        self.model = model

    def embed_documents(self, documents):
        """Embed a collection of texts.

        Args:
            documents: Iterable of strings to embed.

        Returns:
            A list with one embedding (list of numbers) per input text, in
            input order. An empty input yields an empty list.
        """
        # Materialise first so generators work and emptiness can be checked.
        texts = list(documents)
        if not texts:
            return []
        # One batched encode() call instead of one call per document: the
        # model can then process the texts in batches rather than one by one.
        return self.model.encode(texts).tolist()

    def embed_query(self, query):
        """Embed a single query string and return it as a plain list."""
        return self.model.encode(query).tolist()

# Wrap the loaded model so it can be handed to the vector store as its embedder.
embedder = CustomEmbedder(model=model)

In [11]:
# Build the vector store from the PDF chunks in `docs`, using `embedder` for
# the embeddings and `atlas_collection` / `vector_search_index` as the target.
# NOTE(review): re-running this cell presumably inserts the chunks again --
# confirm before re-executing against a populated collection.
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=embedder,
    collection=atlas_collection,
    index_name=vector_search_index,
)

In [None]:
# Placeholder search text -- replace with a real question before running.
query = ""

# Plain similarity search against the vector store.
results = vector_search.similarity_search(query)

# Pretty-print whatever the search returned.
pprint.pprint(results)


In [None]:
# Semantic search that also returns a score per hit, limited to k=3 results.
query = ""  # placeholder -- replace with a real question
results = vector_search.similarity_search_with_score(query=query, k=3)
pprint.pprint(results)

In [None]:
# Semantic search with scores, pre-filtered on the `page` field so that only
# entries whose page equals 8 are considered (k=3 results).
# NOTE(review): Atlas presumably needs `page` declared as a filter field in
# the vector index definition -- confirm.
query = ""  # placeholder -- replace with a real question
results = vector_search.similarity_search_with_score(
    query=query,
    k=3,
    pre_filter={"page": {"$eq": 8}},
)
pprint.pprint(results)