In [7]:
# Load Dataset
from datasets import load_dataset
import pandas as pd
import textwrap

In [2]:
# dataset = load_dataset("MongoDB/embedded_movies")
# # Convert the dataset to a pandas DataFrame
# dataset_df = pd.DataFrame(dataset["train"])

In [3]:
# dataset_df = dataset_df.dropna(subset=["plot"])
# dataset_df = dataset_df.drop(columns=["plot_embedding"])

In [2]:
from dotenv import load_dotenv
import os
import openai

# Load secrets and endpoints from the project-level .env file; override=True
# lets values in that file replace variables already set in the process.
load_dotenv("../.env", override=True)

# Embeddings endpoint derived from the configured base URL. When the variable
# is missing we store None instead of failing on `None + str` with an opaque
# TypeError.
_openai_base_url = os.getenv("OPENAI_BASE_URL")
OPENAI_ENDPOINT = (
    _openai_base_url + "/embeddings" if _openai_base_url is not None else None
)

# Read the API key once and share it between the openai module and the
# LangChain wrappers that take it as an explicit argument.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("OPENAI_API_KEY not set in environment variables")
openai.api_key = api_key

# MongoDB database / collection / Atlas index names used by the notebook.
DB_NAME = "New_Movies"
COLLECTION_NAME = "movie_collection"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

# OpenAI model used to embed text.
EMBEDDING_MODEL = "text-embedding-3-small"


def get_embedding(text):
    """Return the OpenAI embedding for ``text``, or ``None`` when unavailable.

    Args:
        text: The string to embed.

    Returns:
        The embedding taken from the first item of the API response, or
        ``None`` if ``text`` is empty / not a string or the API call raised.
    """
    # Guard clause: only non-empty strings are sent to the API.
    if not (text and isinstance(text, str)):
        return None

    try:
        api_response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
        first_item = api_response.data[0]
        return first_item.embedding
    except Exception as e:
        # Best effort: report the problem and signal failure with None.
        print(f"Error in get_embedding: {e}")
        return None


# dataset_df["new_embedding"] = dataset_df['plot'].apply(get_embedding)

# dataset_df.head(2)

Create a database and a vector search index

In [4]:
import pymongo


def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify that the server is reachable.

    Args:
        mongo_uri: MongoDB connection string passed to ``pymongo.MongoClient``.

    Returns:
        A connected ``pymongo.MongoClient``, or ``None`` if the server could
        not be reached (``pymongo.errors.ConnectionFailure``).

    Other PyMongo errors (for example an invalid URI or an authentication
    failure reported by the server) are not caught here and propagate.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily, so constructing it proves nothing.
        # A cheap "ping" forces a real round trip to the server.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the background monitoring resources of the dead client.
            client.close()
        return None

    print("Connection to MongoDB successful")
    return client


# Connection string for the MongoDB deployment, read from the environment.
mongo_uri = os.getenv("MONGO_BASE_URL")

if not mongo_uri:
    # Name the variable that is actually read above.
    print("MONGO_BASE_URL not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

if mongo_client is None:
    # Fail with a clear message instead of "'NoneType' object is not subscriptable".
    raise RuntimeError("Could not connect to MongoDB; check MONGO_BASE_URL")

# Select the database and collection used for ingestion and vector search.
db = mongo_client[DB_NAME]

collection = db[COLLECTION_NAME]

Connection to MongoDB successful


In [6]:
# documents = dataset_df.to_dict("records")
# collection.insert_many(documents)
# print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


In [17]:
def vector_search(user_query, collection, limit=4, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of documents to return (default 4).
    num_candidates (int): Number of candidate matches the search considers
        before picking the top ``limit`` (default 150).

    Returns:
    list: A list of matching documents, each holding the projected ``plot``,
    ``title`` and ``genres`` fields plus a ``score``. The list is empty when
    no embedding could be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    if query_embedding is None:
        # Keep the documented list contract so callers can always iterate
        # over the result; returning a message string here made callers
        # iterate over characters and fail on `.get`.
        print("Invalid query or embedding generation failed.")
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": ATLAS_VECTOR_SEARCH_INDEX_NAME,
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Candidate matches to consider
                "limit": limit,  # Return the top `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "plot": 1,  # Include the plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                # Include the search score
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # Execute the search and materialise the cursor
    return list(collection.aggregate(pipeline))

In [18]:
def handle_user_query(query, collection, model="gpt-3.5-turbo"):
    """Answer ``query`` with a chat model, grounded on vector-search results.

    Args:
        query (str): The user's question.
        collection: MongoDB collection passed through to ``vector_search``.
        model (str): Chat completion model name (default ``"gpt-3.5-turbo"``).

    Returns:
        tuple: ``(answer, search_result)`` where ``answer`` is the content of
        the first completion choice and ``search_result`` is the context text
        sent to the model, one "Title: ..., Plot: ..." line per match.
    """
    get_knowledge = vector_search(query, collection)

    # vector_search may report failure with a message string rather than a
    # list of documents; treat that as "no context" instead of crashing on
    # `.get` while iterating over its characters.
    if isinstance(get_knowledge, str):
        get_knowledge = []

    # One line per match, terminated by a real newline (the previous "\\n"
    # emitted a literal backslash followed by "n").
    search_result = "".join(
        f"Title: {result.get('title', 'N/A')}, Plot: {result.get('plot', 'N/A')}\n"
        for result in get_knowledge
    )

    completion = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a movie recommendation system."},
            {
                "role": "user",
                "content": "Answer this user query: "
                + query
                + " with the following context: "
                + search_result,
            },
        ],
    )

    return (completion.choices[0].message.content), search_result


In [27]:
# Example question; `query` is reused by later cells.
query = "What is the best romantic movie to watch and why?"

# Unpack the model's answer and the context text that was sent with it.
(response, source_information) = handle_user_query(
    query,
    collection,
)

In [28]:
# Show the answer wrapped to 100 columns.
wrapped_response = textwrap.fill(response, 100)
print(f"Response: \n\n{wrapped_response}")

# Blank separator between the two sections.
print("\n")

# Show the retrieved context that was sent to the model, wrapped the same way.
wrapped_sources = textwrap.fill(source_information, 100)
print(f"Source Information: \n\n{wrapped_sources}")

Response: 

Based on the context provided, the best romantic movie to watch would be "Gorgeous". This movie
follows the story of a romantic girl who travels to Hong Kong in search of certain love but ends up
meeting a kind-hearted professional fighter with whom she begins to fall for instead. It offers a
mix of romance and adventure, providing a unique and engaging storyline for viewers to enjoy. So, if
you are looking for a romantic movie filled with excitement and charm, "Gorgeous" would be a great
choice to watch.


Source Information: 

Title: Run, Plot: This action movie is filled with romance and adventure. As Abhisek fights for his
life against the forces of crime and injustice, he meets Bhoomika, who captures his heart.\nTitle:
China Girl, Plot: A modern day Romeo & Juliet story is told in New York when an Italian boy and a
Chinese girl become lovers, causing a tragic conflict between ethnic gangs.\nTitle: Gorgeous, Plot:
A romantic girl travels to Hong Kong in search of certai

## Using LCEL

In [5]:
from langchain_openai import OpenAIEmbeddings
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI
from langchain.prompts import (
    PromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    ChatPromptTemplate,
)
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

from langchain_core.output_parsers import StrOutputParser

# Embed queries with EMBEDDING_MODEL (the same model get_embedding calls),
# not text-embedding-ada-002 as an earlier version of this comment claimed.
embeddings = OpenAIEmbeddings(openai_api_key=api_key, model=EMBEDDING_MODEL)

# Vector Store Creation
# The namespace is "<database>.<collection>", built from the shared constants
# so it cannot drift from the names used by the pymongo cells.
vector_store = MongoDBAtlasVectorSearch.from_connection_string(
    connection_string=mongo_uri,
    namespace=f"{DB_NAME}.{COLLECTION_NAME}",
    embedding=embeddings,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
    text_key="fullplot",
)

# Retriever returning the 5 most similar documents for a query.
retriever = vector_store.as_retriever(
    search_type="similarity", search_kwargs={"k": 5})

# Create QA chain
# Generate context using the retriever, and pass the user question through. Assigning question to a RunnablePassthrough object ensures the question gets passed unchanged to the next step in the chain.


def _format_docs(docs):
    """Join the page content of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Pipe the retriever through _format_docs so the "context" value is plain
# text rather than the raw retriever output.
setup_and_retrieval = RunnableParallel(
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
)

# System prompt. The {context} placeholder is required: without it the
# retrieved documents are never inserted into the prompt and the model answers
# from its own knowledge instead of the movie collection.
review_template_str = """You are a movie recommendation system. Use the following context to answer questions. Be as detailed as possible, but don't make up any information that's not from the context. If you don't know an answer, say you don't know.

{context}
"""

review_system_prompt = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["context"],
        template=review_template_str,
    )
)

# The user's question is passed through verbatim as the human message.
review_human_prompt = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=["question"],
        template="{question}",
    )
)
messages = [review_system_prompt, review_human_prompt]

# Chat prompt taking both the retrieved context and the question.
review_prompt_template = ChatPromptTemplate(
    input_variables=["context", "question"],
    messages=messages,
)

# template = """Answer the question based only on the following context: \
# {context}

# Question: {question}
# """


# # Defining the chat prompt
# prompt_template = ChatPromptTemplate.from_template(template)


# Chat model used for completion (temperature 0).
model = ChatOpenAI(
    temperature=0,
    openai_api_key=api_key,
)

# Converts the chat model's message output into a plain string.
parse_output = StrOutputParser()

# Naive RAG chain: retrieval -> prompt -> chat model -> string output.
naive_rag_chain = (
    setup_and_retrieval
    | review_prompt_template
    | model
    | parse_output
)

In [8]:
# Ask the naive RAG chain a question and print the answer wrapped to 60 columns.
rag_answer = naive_rag_chain.invoke(
    """What is the best romantic movie to watch and why?"""
)
print(textwrap.fill(rag_answer, 60))

One highly recommended romantic movie to watch is "The
Notebook." This film, based on the novel by Nicholas Sparks,
tells the story of a young couple, Noah and Allie, who fall
in love one summer but are separated by their social
differences. The movie beautifully captures the enduring
power of love and the challenges that come with it. The
chemistry between the lead actors, Ryan Gosling and Rachel
McAdams, is palpable, drawing viewers into their emotional
journey. The film is known for its heartfelt storytelling,
poignant moments, and memorable quotes that have made it a
classic in the romance genre. If you enjoy a heartfelt love
story with emotional depth and strong performances, "The
Notebook" is a must-watch.


Building a chatbot

In [51]:
from langchain.chains import RetrievalQA

# Retrieval QA chain of type "stuff" built on the shared retriever.
qa_llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
reviews_vector_chain = RetrievalQA.from_chain_type(
    llm=qa_llm, chain_type="stuff", retriever=retriever
)

# Replace the chain's default prompt with the movie-recommendation prompt.
reviews_vector_chain.combine_documents_chain.llm_chain.prompt = review_prompt_template

# Run the chain on the question defined in an earlier cell and print the
# "result" entry wrapped to 60 columns.
response = reviews_vector_chain.invoke(query)
qa_answer = response.get("result")
print(textwrap.fill(qa_answer, 60))

One of the best romantic movies to watch is "The Notebook."
This film, based on the novel by Nicholas Sparks, tells the
story of a young couple, Noah and Allie, who fall in love
one summer but are separated by their social differences.
The movie beautifully captures the enduring power of love,
the challenges of relationships, and the importance of
holding onto memories. The chemistry between the lead
actors, Ryan Gosling and Rachel McAdams, is palpable,
drawing viewers into their emotional journey. The film's
poignant storytelling, heartfelt moments, and beautiful
cinematography make it a classic romantic movie that has
touched the hearts of many viewers.
