In [1]:
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Open a connection to the local MongoDB deployment (direct connection, 2000 ms
# server-selection timeout, as encoded in the URI).
client = MongoClient("mongodb://localhost:62834/?directConnection=true&serverSelectionTimeoutMS=2000&appName=local-rag")
# Resolve the sample_airbnb database, then its listingsAndReviews collection.
collection = client.get_database("sample_airbnb").get_collection("listingsAndReviews")

In [3]:
# Location of the model inside a local Hugging Face cache directory.
# NOTE(review): model_path is not referenced anywhere else in the visible notebook and
# is a machine-specific absolute path — confirm whether it is still needed.
model_path = "/Users/dharmendra.kumar/.cache/huggingface/hub/models--mixedbread-ai--mxbai-embed-large-v1"
# Embedding model used by the rest of the notebook, loaded by its model id.
model = SentenceTransformer('mixedbread-ai/mxbai-embed-large-v1')



In [8]:
# Print the configuration of the transformer wrapped by the embedding model.
# NOTE(review): _first_module() is an underscore-prefixed accessor, i.e. not part of the
# public API by convention — it may change between library versions.
print(model._first_module().auto_model.config)

BertConfig {
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "dtype": "float32",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": false,
  "vocab_size": 30522
}



In [10]:
# Transformer wrapped by the embedding model (reached through the underscore-prefixed
# _first_module() accessor).
transformer = model._first_module().auto_model
# Printed one per line, in this order:
#   1. full transformer model
#   2. transformer config (num_layers, hidden_size etc.)
#   3. number of parameters
print(transformer, transformer.config, transformer.num_parameters(), sep="\n")

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [11]:
# Define function to generate embeddings
def get_embedding(text):
    """Encode ``text`` with the notebook-level ``model`` and return the result as a list.

    The value returned by ``model.encode`` is converted with ``.tolist()`` so the
    caller receives built-in Python values rather than the model's array type.
    """
    vector = model.encode(text)
    return vector.tolist()

In [12]:
# Query filter: listings that have a usable summary but no stored embedding yet.
# NOTE: this name shadows the built-in filter(); it is kept because the update loop
# in the next cell reads it.
filter = {
    '$and': [
        # summary must exist and be neither null nor the empty string
        {'summary': {'$exists': True, "$nin": [None, ""]}},
        # skip documents that already carry an embeddings field
        {'embeddings': {'$exists': False}},
    ]
}

In [13]:
# Embed the summary of up to 50 matching listings and store each vector on its own
# document under the "embeddings" field.
updated_doc_count = 0
for document in collection.find(filter).limit(50):
    text = document['summary']
    embedding = get_embedding(text)
    # No upsert here: the document was just read by _id, so an update that matches
    # nothing means it disappeared in the meantime. With upsert=True it would be
    # re-created as a stub containing only _id and embeddings.
    result = collection.update_one({ '_id': document['_id'] }, { "$set": { 'embeddings': embedding } })
    # Count what the server reports as modified instead of counting attempts.
    updated_doc_count += result.modified_count
print(f"Documents updated: {updated_doc_count}")

Documents updated: 50


In [14]:
from pymongo.operations import SearchIndexModel

In [15]:
# Describe the vector field to index: 1024-dimensional vectors stored under
# "embeddings", compared with cosine similarity.
vector_field = {
    "type": "vector",
    "numDimensions": 1024,
    "path": "embeddings",
    "similarity": "cosine",
}
search_index_model = SearchIndexModel(
    definition={"fields": [vector_field]},
    name="vector_index",
    type="vectorSearch",
)
# NOTE(review): unlike the later index-creation cell in this notebook, nothing here
# waits for the index to report itself as queryable before the next cell runs.
# The call is the last expression of the cell, so its return value is displayed.
collection.create_search_index(model=search_index_model)


'vector_index'

In [16]:
# Function to get the results of a vector search query
def get_query_results(query):
    """Run a vector search for ``query`` and return the hits as a list of documents.

    The query text is embedded with ``get_embedding`` and searched against the
    ``embeddings`` field through the ``vector_index`` index (``exact`` search,
    ``limit`` of 5). Each returned document carries ``summary``, ``listing_url``
    and the ``vectorSearchScore`` metadata as ``score``; ``_id`` is excluded.
    """
    vector_search_stage = {
        "$vectorSearch": {
            "index": "vector_index",
            "queryVector": get_embedding(query),
            "path": "embeddings",
            "exact": True,
            "limit": 5,
        }
    }
    projection_stage = {
        "$project": {
            "_id": 0,
            "summary": 1,
            "listing_url": 1,
            "score": {"$meta": "vectorSearchScore"},
        }
    }

    cursor = collection.aggregate([vector_search_stage, projection_stage])
    # Drain the cursor so the caller gets a plain list.
    return list(cursor)

In [17]:
import pprint
# Try the search helper with a sample query and pretty-print the returned documents.
pprint.pprint(get_query_results("beach house"))


[{'listing_url': 'https://www.airbnb.com/rooms/10186755',
  'score': 0.8083036541938782,
  'summary': 'Near to underground metro station. Walking distance to seaside. '
             '2 floors 1 entry. Husband, wife, girl and boy is living.'},
 {'listing_url': 'https://www.airbnb.com/rooms/10266175',
  'score': 0.8018995523452759,
  'summary': 'A beautiful and comfortable 1 Bedroom Air Conditioned Condo in '
             'Makaha Valley - stunning Ocean & Mountain views All the '
             'amenities of home, suited for longer stays. Full kitchen & large '
             "bathroom.  Several gas BBQ's for all guests to use & a large "
             'heated pool surrounded by reclining chairs to sunbathe.  The '
             'Ocean you see in the pictures is not even a mile away, known as '
             'the famous Makaha Surfing Beach. Golfing, hiking,snorkeling  '
             'paddle boarding, surfing are all just minutes from the front '
             'door.'},
 {'listing_url': 'https:/

In [None]:

from gpt4all import GPT4All

In [19]:
# Location of the local .gguf model file handed to GPT4All.
# NOTE(review): machine-specific absolute path — the notebook will not run elsewhere
# without editing this line.
local_llm_path = "/Users/dharmendra.kumar/.cache/huggingface/hub/mistral-7b-openorca.gguf2.Q4_0.gguf/mistral-7b-openorca.gguf2.Q4_0.gguf"
# Local LLM instance used below to generate answers.
local_llm = GPT4All(local_llm_path)

In [21]:
question = "What is HDFC Bank."
documents = get_query_results(question)

# Turn every retrieved listing into one context entry, then glue the entries together.
context_entries = []
for doc in documents:
    summary = doc.get("summary", "")
    link = doc.get("listing_url", "")
    context_entries.append(f"Summary: {summary} Link: {link}. \n")
text_documents = "".join(context_entries)

# Prompt = instruction, retrieved context, then the question.
prompt = f"""Use the following pieces of context to answer the question at the end.
    {text_documents}
    Question: {question}
"""

response = local_llm.generate(prompt)
# Replace literal backslash-n sequences in the reply with real line breaks.
cleaned_response = response.replace('\\n', '\n')
print(cleaned_response)

Answer: The question does not provide any information about HDFC Bank, as it only provides summaries of AirBnB listings and their links.


In [29]:
conda activate local-mongodb-rag-chatbot
conda install langchain -c conda-forge  

SyntaxError: invalid syntax (179765028.py, line 1)

In [None]:

conda install langchain-community -c conda-forge

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;32m2[0m[1;32m channel Terms of Service accepted[0m
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.7.0
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /opt/miniconda3/envs/local-mongodb-rag-chatbot

  added / updated specs:
    - langchain-community


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    dataclasses-json-0.6.7     |     pyhd8ed1ab_1          30 KB  conda-forge
    httpx-sse-0.4.3            |     pyhd8ed1ab_0          14 KB  conda-forge
    langchain-community-0.3.30 |  py311hca03da5_0         3.7 MB
    marshmallow-3.26.1         |     pyhd8ed1ab_0          92 KB  conda-forge
    mypy_extensions-1.1.0      |     pyha770c72_0          11 KB  conda-forge
    p

In [None]:
# Load and split the sample data with LangChain:
# - load a PDF
# - split the data into chunks, specifying the chunk size (number of characters) and the chunk overlap (number of overlapping characters between consecutive chunks)
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [37]:


# Requires pypdf; if it is not present: conda install  pypdf -c conda-forge
pdf_url = "https://www.idfcfirstbank.com/content/dam/idfcfirstbank/pdf/financial-results/Investor-Presentation-Q2FY26-181025-Final.pdf"

# Point the loader at the PDF, then load its contents into `data`.
loader = PyPDFLoader(pdf_url)
data = loader.load()

In [38]:
# Splitter configured with a chunk size of 400 and an overlap of 20 between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
)
# Split the loaded PDF data into chunks.
documents = text_splitter.split_documents(data)

In [39]:
# Build one record per chunk: the chunk text plus its embedding vector.
chunk_texts = [chunk.page_content for chunk in documents]
docs_to_insert = [
    {"text": chunk_text, "embedding": get_embedding(chunk_text)}
    for chunk_text in chunk_texts
]

In [40]:
from pymongo import MongoClient

# Connect to the MongoDB deployment again and point `collection` at local_rag_db1.test.
# NOTE: this rebinds the notebook-wide `client` and `collection` names, so code that
# runs after this cell talks to this collection instead of the Airbnb sample data.
client = MongoClient("mongodb://localhost:62834/?directConnection=true&serverSelectionTimeoutMS=2000&appName=local-rag")
rag_database = client["local_rag_db1"]
collection = rag_database["test"]

# Insert the prepared chunk records; `result` keeps the value returned by insert_many.
result = collection.insert_many(docs_to_insert)

In [43]:
from pymongo.operations import SearchIndexModel
import time

# Create your index model, then create the search index
index_name = "vector_index1"
search_index_model = SearchIndexModel(
  definition = {
    "fields": [
      {
        "type": "vector",
        "numDimensions": 1024,
        "path": "embedding",
        "similarity": "cosine"
      }
    ]
  },
  name = index_name,
  type = "vectorSearch"
)
collection.create_search_index(model=search_index_model)

# Wait for initial sync to complete
print("Polling to check if the index is ready. This may take up to a minute.")


def predicate(index):
    """Return True once the index description reports itself as queryable."""
    return index.get("queryable") is True


# Bounded polling: the original `while True` loop would spin forever if the index
# never became queryable (for example after a failed build).
POLL_INTERVAL_SECONDS = 5
MAX_POLL_ATTEMPTS = 60  # 60 attempts * 5 s = 5 minutes

for _attempt in range(MAX_POLL_ATTEMPTS):
    indices = list(collection.list_search_indexes(index_name))
    if indices and predicate(indices[0]):
        break
    time.sleep(POLL_INTERVAL_SECONDS)
else:
    # The loop finished without a break, so the index never reported queryable.
    raise TimeoutError(
        f"{index_name} was not queryable after "
        f"{MAX_POLL_ATTEMPTS * POLL_INTERVAL_SECONDS} seconds"
    )
print(index_name + " is ready for querying.")

Polling to check if the index is ready. This may take up to a minute.
vector_index1 is ready for querying.


In [44]:
# Define a function to run vector search queries
def get_query_results(query):
    """Return the documents matched by a vector search for ``query``.

    The query is embedded with ``get_embedding`` and matched against the
    ``embedding`` field through the ``vector_index1`` index (``exact`` search,
    ``limit`` of 5). Only the ``text`` field of each hit is returned; ``_id`` is
    excluded by the projection.
    """
    search_stage = {
        "$vectorSearch": {
            "index": "vector_index1",
            "queryVector": get_embedding(query),
            "path": "embedding",
            "exact": True,
            "limit": 5,
        }
    }
    project_stage = {"$project": {"_id": 0, "text": 1}}

    hits = collection.aggregate([search_stage, project_stage])
    # Materialize the cursor into a list before returning it.
    return list(hits)

# Test the function with a sample query and pretty-print the returned documents
import pprint
pprint.pprint(get_query_results("AI technology"))

[{'text': 'Our Vision\n'
          'BUILDING A WORLD CLASS BANK\n'
          'GUIDED BY\n'
          'ETHICS\n'
          'POWERED BY\n'
          'TECHNOLOGY\n'
          'BE A FORCE FOR\n'
          'SOCIAL GOOD\n'
          '2'},
 {'text': 'forward-looking statements by terminology such as “aim”, '
          '“anticipate”, “believe”, “continue”, “could”, “estimate”, “expect”, '
          '“intend”, “may”, “objective”, “goal”, \n'
          '“plan”, “potential”, “proforma”, “project”, “pursue”, “shall”, '
          '“should”, “will”, “would”, or other words or phrases of similar '
          'import. These forward-looking'},
 {'text': 'Branch embedded vicinity basis sourcing\n'
          'Strong portfolio management framework \n'
          'leading to healthy book\n'
          'Section 2: Building a Universal Bank\n'
          '13'},
 {'text': 'errors, systems malfunctions, or cyber security incidents; (f) '
          'volatility in interest rates and other market conditions; and(g) '

In [45]:
question = "What is Your Bank Name? and For whihc yesr this result is published"
documents = get_query_results(question)

# Build the context block for the prompt, one entry per retrieved chunk.
text_documents = ""
for doc in documents:
    summary = doc.get("text", "")
    # The search helper in this notebook projects "_id": 0, so "_id" is normally
    # absent. Only mention it when it is really there; otherwise every entry would
    # carry a dangling "with _id: ." fragment into the prompt.
    link = doc.get("_id")
    id_suffix = f" with _id: {link}" if link is not None else ""
    string = f"Summary: {summary}{id_suffix}. \n"
    text_documents += string

# Prompt = instruction, retrieved context, then the question.
prompt = f"""Use the following pieces of context to answer the question at the end.
    {text_documents}
    Question: {question}
"""

response = local_llm.generate(prompt)
# Replace literal backslash-n sequences in the reply with real line breaks.
cleaned_response = response.replace('\\n', '\n')

pprint.pprint(cleaned_response)

('\n'
 'Answer: The bank name is not mentioned in the given context. However, it can '
 'be inferred that the results are for FY25 (Fiscal Year 2024-2025).')
