MongoDB, a widely adopted NoSQL document database, plays a pivotal role in the RAG system. It efficiently stores and retrieves embeddings and associated text data. MongoDB's adaptability and scalability make it an ideal choice for managing large text corpora and their corresponding embeddings in the RAG system.

In a RAG system integrated with MongoDB, the embeddings and text data are stored in a MongoDB collection. Each document in the collection represents a text passage or document. The embeddings can be stored within the document as arrays or binary data. This storage method enables efficient nearest-neighbour search through a MongoDB Atlas Vector Search index defined on the embedding field, a crucial aspect of the RAG system's functionality.

During the retrieval phase, the input text is encoded into an embedding, and MongoDB Atlas Vector Search (the `$vectorSearch` aggregation stage) is used to find the nearest-neighbour embeddings in the database, effectively retrieving the most relevant documents or passages.

In [None]:
# !pip install datasets pandas openai pymongo

In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
# Load only the first 10 rows of the "train" split. `load_dataset` has no
# `limit` argument (extra keyword arguments are forwarded to the dataset
# builder configuration), so the row cap is expressed with split slicing.
dataset = load_dataset("MongoDB/embedded_movies", split="train[:10]")

# Convert the dataset to a pandas dataframe
df = pd.DataFrame(dataset)

df.head(5)

In [None]:
# Quick overview of the dataframe: column names, shape, summary statistics
# and the number of missing values per column. The expressions are evaluated
# directly; there is no need to route fixed code through eval().
overview = (df.columns, df.shape, df.describe(), df.isnull().sum())

for item in overview:
    print(item, '\n')

In [None]:
# Keep only the rows that have a plot, and discard the `plot_embedding`
# column that ships with the dataset (a new embedding column,
# `plot_embedding_optimised`, is computed from the plot text further down).
df = df.dropna(subset=['plot']).drop(columns=['plot_embedding'])


df

In [None]:
from openai import OpenAI
import os

import openai

# Resolve the API key before building the client, so that a missing key is
# reported by the explicit error below instead of by a failure raised inside
# the OpenAI client constructor.
api_key = os.getenv("OPENAI_API_KEY")

if not api_key:
    raise RuntimeError("API key is required. Set OPENAI_API_KEY environment variable.")

# Kept for any code that still reads the module-level key.
openai.api_key = api_key

# Client used by the embedding and chat-completion calls below.
client = OpenAI(api_key=api_key)


In [None]:
def get_embedding(text, model="text-embedding-3-small"):
    """Generate an embedding for the given text using OpenAI's API.

    Args:
        text (str): The text to embed. Anything that is not a non-empty
            string is rejected.
        model (str): Name of the OpenAI embedding model to use. Stored
            vectors and query vectors must come from the same model to be
            comparable.

    Returns:
        The embedding returned by the API for ``text``, or ``None`` when the
        input is invalid or the API call fails.
    """

    # Check the type before the truthiness: objects such as numpy arrays or
    # pandas.NA raise when evaluated as a boolean, and they are not valid
    # input anyway.
    if not isinstance(text, str) or not text:
        return None

    try:
        # Call OpenAI API to get the embedding
        response = client.embeddings.create(input=text, model=model)
        return response.data[0].embedding
    except Exception as e:
        # Best effort: report the failure and let the caller decide what to
        # do with a missing embedding.
        print(f"Error in get_embedding: {e}")
        return None

# Embed every plot and store the result in a new column. Rows for which
# get_embedding returns None keep None in that column.
plots = df['plot']
df["plot_embedding_optimised"] = plots.apply(get_embedding)

df.head(5)

In [None]:
import pymongo
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

import os

# Read the connection string from the environment instead of embedding it in
# the source: a MongoDB URI carries the database username and password.
# NOTE(review): any credentials that were ever committed in this file should
# be considered exposed and rotated.
uri = os.getenv("MONGODB_URI")

if not uri:
    raise RuntimeError("MongoDB URI is required. Set MONGODB_URI environment variable.")

# Create a new client and connect to the server
mongo_client = MongoClient(uri, server_api=ServerApi('1'))

# Send a ping to confirm a successful connection
try:
    mongo_client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Report the connection problem and carry on; later operations on
    # mongo_client will fail if the deployment is really unreachable.
    print(e)

In [None]:
# Namespace that holds the movie documents and their embeddings.
database_name = 'movies'
collection_name = 'movie_collection'

db = mongo_client[database_name]
collection = db[collection_name]

In [None]:
# Start from an empty collection: an empty filter matches every document.
collection.delete_many(filter={})

In [None]:
# Turn each dataframe row into a dict (column name -> value) and insert all
# of them into the collection in one call.
records = df.to_dict(orient='records')
collection.insert_many(records)

print("Data ingestion into MongoDB completed")

In [None]:
def vector_search(user_query, collection, limit=5, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of matches to return.
    num_candidates (int): Number of candidate matches to consider before
        the top `limit` are returned.

    Returns:
    list: A list of matching documents, each with the `plot`, `title`,
    `genres` and `score` fields. The list is empty when no embedding could
    be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        # Always return a list so callers can iterate over the result; the
        # reason for the empty result is reported instead of being returned.
        print("Invalid query or embedding generation failed.")
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "plot_embedding_optimised",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return the top matches only
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "plot": 1,  # Include the plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {
                    "$meta": "vectorSearchScore"  # Include the search score
                },
            }
        },
    ]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))

In [None]:
def handle_user_query(query, collection, model="gpt-3.5-turbo"):
    """
    Answer a user query with a chat model, using vector search results as context.

    Args:
    query (str): The user's question.
    collection (MongoCollection): The MongoDB collection to search for context.
    model (str): Name of the OpenAI chat model that writes the answer.

    Returns:
    tuple: The message object of the first completion choice, and the
    context string (one "Title: ..., Plot: ..." line per retrieved document)
    that was sent to the model.
    """
    get_knowledge = vector_search(query, collection)

    # Guard against a non-list result (e.g. an error message string):
    # iterating over it would yield characters, which have no .get().
    matches = get_knowledge if isinstance(get_knowledge, list) else []

    search_result = "".join(
        f"Title: {match.get('title', 'N/A')}, Plot: {match.get('plot', 'N/A')}\n"
        for match in matches
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a movie recommendation system."},
            {"role": "user", "content": "Answer this user query: " + query + " with the following context: " + search_result},
        ],
    )

    answer = completion.choices[0].message

    print(answer)
    print(search_result)

    return answer, search_result

In [None]:
# Example run: ask for a recommendation and show the model's answer.
query = "What is the best sci-fi and adventure movie to watch?"
answer_and_sources = handle_user_query(query, collection)
response, source_information = answer_and_sources

print(f"Response: {response}")
# print(f"Source Information: \n{source_information}")