# Building a RAG System With Google's Gemma, Hugging Face and MongoDB

## Step 1: Installing libraries

In [1]:
# !pip install datasets pandas pymongo sentence_transformers python-dotenv
# !pip install -U transformers
# Install below if using GPU
# !pip install accelerate

## Step 2: data sourcing and preparation

In [2]:
# Load the MongoDB/embedded_movies dataset
from datasets import load_dataset
import pandas as pd
# Dataset card: https://huggingface.co/datasets/MongoDB/embedded_movies
dataset = load_dataset("MongoDB/embedded_movies")
# Convert the 'train' split of the dataset to a pandas DataFrame
dataset_df = pd.DataFrame(dataset['train'])

In [3]:
# Remove data points where the 'fullplot' column is missing; 'fullplot' is the
# text that gets embedded later, so rows without it are unusable.
dataset_df = dataset_df.dropna(subset=['fullplot'])
print("\nNumber of missing values in each column after removal:")
print(dataset_df.isnull().sum())

# Remove the plot_embedding from each data point in the dataset as we are going to
# create new embeddings with an open-source embedding model from Hugging Face: gte-large.
# errors='ignore' keeps this cell re-runnable: on a second execution the column
# is already gone and a plain drop() would raise KeyError.
dataset_df = dataset_df.drop(columns=['plot_embedding'], errors='ignore')


Number of missing values in each column after removal:
plot                    0
runtime                14
genres                  0
fullplot                0
directors              12
writers                13
countries               0
poster                 78
languages               1
cast                    1
title                   0
num_mflix_comments      0
rated                 279
imdb                    0
awards                  0
type                    0
metacritic            893
plot_embedding          1
dtype: int64


## Step 3: generating embeddings

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import time
import numpy as np

# Load the sentence-embedding model used for both documents and queries.
print("Initializing Sentence Transformer model...")
try:
    # https://huggingface.co/thenlper/gte-large
    embedding_model = SentenceTransformer("thenlper/gte-large")
except Exception as e:
    print(f"Error initializing Sentence Transformer: {e}")
    print("Please ensure PyTorch, transformers, and sentence-transformers versions are compatible.")
    # Re-raise instead of calling exit(): the cell still stops here, but the
    # original traceback is shown and the running session (with any state
    # already computed) is not terminated.
    raise
else:
    print("Sentence Transformer model initialized successfully.")


def get_embedding(text: str) -> list[float]:
    """Encode ``text`` with the module-level ``embedding_model``.

    Args:
        text: The text to embed.

    Returns:
        The embedding converted to a plain Python list. An empty list is
        returned when ``text`` is not a string, is empty or whitespace-only,
        or when encoding raises an exception (the error is printed).
    """
    # Guard against non-string and blank input before touching the model.
    is_usable_text = isinstance(text, str) and bool(text.strip())
    if not is_usable_text:
        # An empty list is the simple failure marker; returning a zero vector
        # of the model's dimension would be more uniform but more complex.
        return []

    try:
        return embedding_model.encode(text).tolist()
    except Exception as err:
        print(f"Error encoding text: '{text[:50]}...' - Error: {err}")
        return []


# --- Generate embeddings for the 'fullplot' column ---
print("Generating embeddings for 'fullplot' column...")
start_time = time.time()

# Use .apply to create one embedding per row.
# Note: .apply can be slow on a large dataset.
# Consider calling embedding_model.encode on the whole list of texts if it is too slow.
# Optional optimisation example (only needed if .apply is too slow):
# texts_to_encode = dataset_df["fullplot"].tolist()
# batch_size = 32 # Tune the batch size to the available RAM/VRAM
# all_embeddings = embedding_model.encode(texts_to_encode, batch_size=batch_size, show_progress_bar=True)
# dataset_df["embedding"] = all_embeddings.tolist() # Convert the numpy array to a list

# Row-by-row approach (as in the original code):
dataset_df["embedding"] = dataset_df["fullplot"].apply(get_embedding)

end_time = time.time()
print(f"Embedding generation completed in {end_time - start_time:.2f} seconds.")

# Count how many rows failed; get_embedding returns an empty list on failure.
num_empty_embeddings = dataset_df["embedding"].apply(lambda x: len(x) == 0).sum()
if num_empty_embeddings > 0:
    print(f"Warning: {num_empty_embeddings} rows failed to generate embeddings.")

# Show the head to check the new embedding column.
print("\nDataFrame head after generating embeddings:")
print(dataset_df.head())

# Check the dimension of one valid embedding, if there is any. Selecting the
# non-empty rows first avoids an IndexError from .iloc[0] when every row failed.
valid_embeddings = dataset_df.loc[
    dataset_df["embedding"].apply(lambda x: len(x) > 0), "embedding"
]
first_valid_embedding = valid_embeddings.iloc[0] if not valid_embeddings.empty else None
if first_valid_embedding:
    print(f"\nDimension of the first valid embedding: {len(first_valid_embedding)}") # Should be 1024 for gte-large
else:
    print("\nNo valid embeddings were generated; check the model and the input data.")


Initializing Sentence Transformer model...
Sentence Transformer model initialized successfully.
Generating embeddings for 'fullplot' column...


## Step 4: database setup and connection

## Step 5: create vector search index

## Step 6: establish data connection

In [None]:
import pymongo
import os
from dotenv import load_dotenv

load_dotenv()


def get_mongo_client(mongo_uri):
    """Create a MongoDB client and verify the connection with a ping.

    Args:
        mongo_uri: MongoDB connection string. A falsy value is rejected
            without attempting a connection.

    Returns:
        The connected ``pymongo.MongoClient`` on success, or ``None`` when the
        URI is missing or the connection attempt fails (the reason is printed).
    """
    if not mongo_uri:
        print("MongoDB URI is missing.")
        return None

    client = None
    try:
        client = pymongo.MongoClient(mongo_uri)
        # Constructing the client alone does not prove the server is
        # reachable here; the ping is what validates the connection.
        client.admin.command("ping")
        print("Pinged your deployment. You successfully connected to MongoDB!")
        return client
    except pymongo.errors.ConfigurationError as e:
        print(f"Configuration error: {e}")
    except pymongo.errors.ConnectionFailure as e:
        print(f"Connection failed: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during connection: {e}")

    # Failure path: release the client if it was created, since the caller
    # only receives None and could never close it.
    if client is not None:
        client.close()
    return None


# Read the connection string from the environment (populated by load_dotenv()).
mongo_uri = os.getenv("MONGO_URI")
if not mongo_uri:
    # Raise instead of calling exit(): the cell still stops, but the running
    # session and everything computed in earlier cells are left intact.
    raise RuntimeError("MONGO_URI not found. Please ensure it is set in your .env file.")

mongo_client = get_mongo_client(mongo_uri)
if mongo_client is not None:
    db_name = "movies"
    collection_name = "movie_collection_2"
    db = mongo_client[db_name]
    collection = db[collection_name]
    print(f"Selected database '{db_name}' and collection '{collection_name}'.")
else:
    # NOTE: `db` and `collection` are not assigned in this branch, so later
    # cells that use `collection` cannot run until the connection succeeds.
    print(
        "Could not establish MongoDB connection. Please check your MONGO_URI and network settings/IP whitelist."
    )


Pinged your deployment. You successfully connected to MongoDB!
Selected database 'movies' and collection 'movie_collection_2'.


In [None]:
# documents = dataset_df.to_dict('records')
# collection.insert_many(documents)
# print("Data ingestion into MongoDB completed")

Data ingestion into MongoDB completed


## Step 7: Perform Vector Search on User Queries

In [None]:
def vector_search(user_query, collection, limit=4, num_candidates=150):
    """
    Perform a vector search in the MongoDB collection based on the user query.

    Args:
    user_query (str): The user's query string.
    collection (MongoCollection): The MongoDB collection to search.
    limit (int): Maximum number of matches to return. Defaults to 4.
    num_candidates (int): Number of candidate matches to consider. Defaults to 150.

    Returns:
    list: A list of matching documents, each with the ``fullplot``, ``title``,
    ``genres`` and ``score`` fields. An empty list is returned when no
    embedding could be generated for the query.
    """

    # Generate embedding for the user query
    query_embedding = get_embedding(user_query)

    # get_embedding reports failure with an empty list, so test for
    # falsiness rather than None. Return an empty list (not a message string)
    # so callers that iterate over the result keep working.
    if not query_embedding:
        print("Invalid query or embedding generation failed.")
        return []

    # Define the vector search pipeline
    pipeline = [
        {
            "$vectorSearch": {
                "index": "vector_index",
                "queryVector": query_embedding,
                "path": "embedding",
                "numCandidates": num_candidates,  # Number of candidate matches to consider
                "limit": limit,  # Return the top `limit` matches
            }
        },
        {
            "$project": {
                "_id": 0,  # Exclude the _id field
                "fullplot": 1,  # Include the fullplot field
                "title": 1,  # Include the title field
                "genres": 1,  # Include the genres field
                "score": {"$meta": "vectorSearchScore"},  # Include the search score
            }
        },
    ]

    # Execute the search and materialise the cursor into a list
    return list(collection.aggregate(pipeline))


## Step 8: Handling user queries and loading Gemma

In [None]:
def get_search_result(query, collection):
    """Format the vector-search hits for ``query`` as prompt-ready text.

    Each hit contributes one newline-terminated line of the form
    ``Title: <title>, Plot: <fullplot>``; a missing field is rendered as
    ``N/A``. Returns an empty string when there are no hits.
    """
    hits = vector_search(query, collection)
    return "".join(
        f"Title: {hit.get('title', 'N/A')}, Plot: {hit.get('fullplot', 'N/A')}\n"
        for hit in hits
    )


In [None]:
# Conduct query with retrieval of sources
query = "What is the best romantic movie to watch and why?"
source_information = get_search_result(query, collection)
# Combine the query and the retrieved search results into one prompt string
combined_information = f"Query: {query}\nContinue to answer the query by using the Search Results:\n{source_information}."

print(combined_information)

NameError: name 'get_embedding' is not defined

In [None]:
import os
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load variables from the .env file and read the Hugging Face access token.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")

if not hf_token:
    # Raise instead of calling exit(): the cell still stops, but the running
    # session and everything computed in earlier cells are left intact.
    raise RuntimeError("Hugging Face token (HF_TOKEN) not found in .env file. Please add it.")

print("Hugging Face token loaded successfully.")

Hugging Face token loaded successfully.


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model identifier shared by the tokenizer and the model.
GEMMA_MODEL_ID = "google/gemma-2b-it"

tokenizer = AutoTokenizer.from_pretrained(GEMMA_MODEL_ID)
# CPU only: uncomment the line below 👇🏽 (and comment out the GPU line)
# model = AutoModelForCausalLM.from_pretrained(GEMMA_MODEL_ID)
# GPU enabled: use the line below 👇🏽
model = AutoModelForCausalLM.from_pretrained(GEMMA_MODEL_ID, device_map="auto")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  41%|####1     | 2.04G/4.95G [00:00<?, ?B/s]

In [None]:
# Tokenize the prompt and move the tensors to the device the model is on.
# Using model.device instead of a hard-coded "cuda" keeps this cell working
# for both the CPU and the GPU loading options.
input_ids = tokenizer(combined_information, return_tensors="pt").to(model.device)
# Generate up to 500 new tokens and print the decoded sequence.
response = model.generate(**input_ids, max_new_tokens=500)
print(tokenizer.decode(response[0]))