In [56]:
# Load Dataset
from datasets import load_dataset
import pandas as pd

In [2]:
# https://huggingface.co/datasets/MongoDB/embedded_movies
# Load the dataset from the Hugging Face Hub.
dataset = load_dataset("MongoDB/embedded_movies")
# Convert the "train" split of the dataset to a pandas DataFrame
dataset_df = pd.DataFrame(dataset["train"])

Downloading readme:   0%|          | 0.00/6.18k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# Preview the first two rows to see which columns are available.
dataset_df.head(2)

Unnamed: 0,type,plot_embedding,plot,cast,num_mflix_comments,writers,metacritic,fullplot,imdb,genres,countries,awards,rated,runtime,languages,poster,directors,title
0,movie,"[0.00072939653, -0.026834568, 0.013515796, -0....",Young Pauline is left a lot of money when her ...,"[Pearl White, Crane Wilbur, Paul Panzer, Edwar...",0,"[Charles W. Goddard (screenplay), Basil Dickey...",,Young Pauline is left a lot of money when her ...,"{'id': 4465, 'rating': 7.6, 'votes': 744}",[Action],[USA],"{'nominations': 0, 'text': '1 win.', 'wins': 1}",,199.0,[English],https://m.media-amazon.com/images/M/MV5BMzgxOD...,"[Louis J. Gasnier, Donald MacKenzie]",The Perils of Pauline
1,movie,"[-0.022837115, -0.022941574, 0.014937485, -0.0...",A penniless young man tries to save an heiress...,"[Harold Lloyd, Mildred Davis, 'Snub' Pollard, ...",0,[H.M. Walker (titles)],,As a penniless man worries about how he will m...,"{'id': 10146, 'rating': 7.0, 'votes': 639}","[Comedy, Short, Action]",[USA],"{'nominations': 1, 'text': '1 nomination.', 'w...",TV-G,22.0,[English],https://m.media-amazon.com/images/M/MV5BNzE1OW...,"[Alfred J. Goulding, Hal Roach]",From Hand to Mouth


In [5]:
# Column dtypes and non-null counts, to spot columns with missing values.
dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 18 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   type                1500 non-null   object 
 1   plot_embedding      1472 non-null   object 
 2   plot                1473 non-null   object 
 3   cast                1499 non-null   object 
 4   num_mflix_comments  1500 non-null   int64  
 5   writers             1487 non-null   object 
 6   metacritic          572 non-null    float64
 7   fullplot            1452 non-null   object 
 8   imdb                1500 non-null   object 
 9   genres              1500 non-null   object 
 10  countries           1500 non-null   object 
 11  awards              1500 non-null   object 
 12  rated               1192 non-null   object 
 13  runtime             1485 non-null   float64
 14  languages           1499 non-null   object 
 15  poster              1411 non-null   object 
 16  direct

In [6]:
# Remove rows whose "fullplot" is missing; that is the column used for embeddings below.
dataset_df = dataset_df.dropna(subset=["fullplot"])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Drop the precomputed plot_embedding column: new embeddings are created with an
# open-source Hugging Face model (thenlper/gte-base is loaded further down).
# errors="ignore" keeps this cell re-runnable once the column is already gone.
dataset_df = dataset_df.drop(columns=["plot_embedding"], errors="ignore")


Number of missing values in each column after removal:
type                    0
plot_embedding          1
plot                    0
cast                    1
num_mflix_comments      0
writers                13
metacritic            893
fullplot                0
imdb                    0
genres                  0
countries               0
awards                  0
rated                 279
runtime                14
languages               1
poster                 78
directors              12
title                   0
dtype: int64


The embedding model used in the RAG system is the General Text Embeddings (GTE) model, which is based on the BERT model. The GTE embedding models come in three variants: gte-small, gte-base, and gte-large. This notebook loads `thenlper/gte-base`.

In [5]:
from sentence_transformers import SentenceTransformer

# https://huggingface.co/thenlper/gte-base
# NOTE: the checkpoint loaded here is gte-base (not gte-large).
embedding_model = SentenceTransformer("thenlper/gte-base")


def get_embedding(text: str) -> list[float]:
    """Embed ``text`` with the module-level ``embedding_model``.

    Args:
        text: The string to embed.

    Returns:
        The embedding as a plain list of floats, or an empty list when
        ``text`` is not a string or contains only whitespace.
    """
    # Missing values (None, or NaN coming from a pandas column) have no
    # .strip(); treat them like empty text instead of raising.
    if not isinstance(text, str) or not text.strip():
        print("Attempted to get embedding for empty text.")
        return []
    # encode() returns an object exposing tolist(); convert it to a plain list.
    return embedding_model.encode(text).tolist()


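# NOTE: this step is commented out, but the next cell reads dataset_df["embedding"];
# uncomment it when running the notebook from a fresh kernel.
# dataset_df["embedding"] = dataset_df["fullplot"].apply(get_embedding)

# dataset_df.info()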

In [16]:
# Embedding dimensionality. Use positional .iloc[0]: rows were removed with
# dropna() earlier, so the index label 0 is not guaranteed to exist and a
# label lookup ([0]) could raise KeyError.
len(dataset_df["embedding"].iloc[0])

768

In [2]:
import pymongo
from dotenv import load_dotenv
import os

# Load variables from ../.env; override=True lets values in the file replace ones already set in the environment.
load_dotenv("../.env", override=True)


def get_mongo_client(mongo_uri):
    """Connect to MongoDB and verify that the server is reachable.

    Args:
        mongo_uri (str): MongoDB connection string.

    Returns:
        pymongo.MongoClient | None: A client that answered a ``ping``, or
        ``None`` when a ``ConnectionFailure`` occurred.
    """
    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # MongoClient connects lazily, so constructing it proves nothing;
        # a ping forces a round trip before success is reported.
        client.admin.command("ping")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
        if client is not None:
            # Release the client's resources; it is not handed to the caller.
            client.close()
        return None
    print("Connection to MongoDB successful")
    return client


# Read the connection string from the environment.
mongo_uri = os.getenv("MONGO_BASE_URL")

if not mongo_uri:
    # Fail fast and name the variable that is actually read: without a URI,
    # MongoClient would fall back to its default host rather than the
    # intended deployment.
    raise ValueError("MONGO_BASE_URL not set in environment variables")

mongo_client = get_mongo_client(mongo_uri)

if mongo_client is None:
    # get_mongo_client() returns None on failure; indexing None below would
    # only produce an unhelpful TypeError.
    raise RuntimeError("Could not connect to MongoDB; check MONGO_BASE_URL")

# Database and collection that hold the movie documents
db = mongo_client["movies"]

collection = db["movie_collection_2"]

Connection to MongoDB successful


In [11]:
# Delete any existing records in the collection
# collection.delete_many({})

DeleteResult({'n': 0, 'electionId': ObjectId('7fffffff0000000000000048'), 'opTime': {'ts': Timestamp(1711544371, 3), 't': 72}, 'ok': 1.0, '$clusterTime': {'clusterTime': Timestamp(1711544371, 3), 'signature': {'hash': b'\xd4\xca\xf3i\x90\n^4\xa0\xa8\xaeX\x10\xae\xf2\xa5c\xf8\\\x0c', 'keyId': 7306607937437302786}}, 'operationTime': Timestamp(1711544371, 3)}, acknowledged=True)

In [17]:
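# NOTE: ingestion is commented out (presumably to avoid re-inserting the same documents on re-runs).
# Uncomment on a first run: the search cells below query `collection` for these documents.
# documents = dataset_df.to_dict("records")
# collection.insert_many(documents)
# print("Data ingestion into MongoDB completed")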

Data ingestion into MongoDB completed


In [3]:
def vector_search(user_query, collection, num_candidates=150, limit=4):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    num_candidates (int): Number of candidate matches to consider (default 150).
    limit (int): Maximum number of matches to return (default 4).

    Returns:
    list: A list of matching documents, each with the fullplot, title, genres
    and score fields; an empty list when no embedding could be generated for
    the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding() reports failure with an empty list, never None, so test
    # for emptiness. Return an empty list (not a message string) so callers
    # that iterate over the documents keep working.
    if not query_embedding:
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return at most this many matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "fullplot": 1,  # Include the full plot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                # Include the search score
                "score": {"$meta": "vectorSearchScore"},
            }
        },
    ]

    # Execute the search and materialize the cursor into a list
    return list(collection.aggregate(pipeline))


def get_search_result(query, collection):
    """Format the vector-search hits for ``query`` as a single text block.

    Each hit contributes one "Title: ..., Plot: ..." line terminated by a
    newline; "N/A" stands in for a missing title or fullplot.
    """
    hits = vector_search(query, collection)
    return "".join(
        f"Title: {hit.get('title', 'N/A')}, Plot: {hit.get('fullplot', 'N/A')}\n"
        for hit in hits
    )

In [6]:
# Conduct query with retrieval of sources
query = """What is the best romantic movie to watch and why?"""

# Retrieve the matching titles and plots as formatted text
source_information = get_search_result(query, collection)

# Build the prompt: the question followed by the retrieved search results
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."

print(combined_information)

Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Shut Up and Kiss Me!, Plot: Ryan and Pete are 27-year old best friends in Miami, born on the same day and each searching for the perfect woman. Ryan is a rookie stockbroker living with his psychic Mom. Pete is a slick surfer dude yet to find commitment. Each meets the women of their dreams on the same day. Ryan knocks heads in an elevator with the gorgeous Jessica, passing out before getting her number. Pete falls for the insatiable Tiara, but Tiara's uncle is mob boss Vincent Bublione, charged with her protection. This high-energy romantic comedy asks to what extent will you go for true love?
Title: Titanic, Plot: The plot focuses on the romances of two couples upon the doomed ship's maiden voyage. Isabella Paradine (Catherine Zeta-Jones) is a wealthy woman mourning the loss of her aunt, who reignites a romance with former flame Wynn Park (Peter Gallagher). Meanwhi

In [7]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; this renders an interactive login widget in the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM


# Load the tokenizer and the causal LM for google/gemma-2b-it.
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
# CPU variant: no device or device_map is requested here 👇🏽
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [7]:
# Tokenize the prompt and generate up to 500 new tokens.
# NOTE: nothing in this cell moves tensors to a GPU.
input_ids = tokenizer(combined_information, return_tensors="pt")
response = model.generate(**input_ids, max_new_tokens=500)
# The whole output sequence is decoded, so the printed text begins with the prompt itself.
print(tokenizer.decode(response[0]))

<bos>Query: What is the best romantic movie to watch and why?
Continue to answer the query by using the Search Results:
Title: Shut Up and Kiss Me!, Plot: Ryan and Pete are 27-year old best friends in Miami, born on the same day and each searching for the perfect woman. Ryan is a rookie stockbroker living with his psychic Mom. Pete is a slick surfer dude yet to find commitment. Each meets the women of their dreams on the same day. Ryan knocks heads in an elevator with the gorgeous Jessica, passing out before getting her number. Pete falls for the insatiable Tiara, but Tiara's uncle is mob boss Vincent Bublione, charged with her protection. This high-energy romantic comedy asks to what extent will you go for true love?
Title: Titanic, Plot: The plot focuses on the romances of two couples upon the doomed ship's maiden voyage. Isabella Paradine (Catherine Zeta-Jones) is a wealthy woman mourning the loss of her aunt, who reignites a romance with former flame Wynn Park (Peter Gallagher). Me