In [1]:
import kagglehub

# Download latest version
# `path` receives the value returned by kagglehub for the downloaded dataset; the
# captured output below shows it is a directory inside the local kagglehub cache.
# NOTE(review): none of the later cells shown in this notebook read `path`;
# DATASET_PATH in the setup cell is a bare filename, so the CSV presumably has to be
# copied next to the notebook (or DATASET_PATH pointed into this directory) — confirm.
path = kagglehub.dataset_download("bitext/bitext-gen-ai-chatbot-customer-support-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/bitext/bitext-gen-ai-chatbot-customer-support-dataset?dataset_version_number=1...


100%|██████████| 2.87M/2.87M [00:01<00:00, 2.05MB/s]

Extracting files...





Path to dataset files: C:\Users\Adonis\.cache\kagglehub\datasets\bitext\bitext-gen-ai-chatbot-customer-support-dataset\versions\1


In [58]:
# Cell 1: Setup - Importing Libraries and Configuration

# --- Core Python Libraries ---
import os  # For interacting with the operating system (e.g., checking file paths)
import pandas as pd  # For data manipulation, especially working with CSV files (like our Bitext dataset)
import numpy as np  # For numerical operations, particularly for handling embeddings as arrays
import faiss  # A library from Facebook AI for efficient similarity search in vector collections
from tqdm import tqdm  # A utility to show progress bars for loops, making long processes more user-friendly

# --- Google Cloud Vertex AI SDK ---
# This SDK allows us to interact with Google Cloud's AI services, including Gemini and Embedding models
import vertexai
from vertexai.language_models import TextEmbeddingModel # For generating text embeddings
from vertexai.generative_models import GenerativeModel, Part # For using Gemini to generate text

# --- Configuration ---
# These are settings you'll need to adjust for your environment.
# Every later cell reads these module-level constants, so edit them here only.

# Replace with your Google Cloud Project ID. You can find this in your GCP console.
GCP_PROJECT_ID = "raggemini-459500"

# Replace with the Google Cloud region where you want to run Vertex AI services.
# 'us-central1' is a common choice, but others are available.
# NOTE(review): the chosen region presumably has to offer both the embedding model
# and the Gemini model used later — confirm for your region.
GCP_REGION = "europe-central2"

# This is the path to the Bitext dataset CSV file you downloaded from Kaggle.
# Make sure this file is in the same directory as your script, or provide the full path.
# (A bare filename like this one is resolved relative to the current working directory.)
DATASET_PATH = "Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv"

# This specifies which embedding model we'll use from Vertex AI.
# "text-embedding-004" is the text embedding model configured here (the run recorded
# in this notebook produced 768-dimensional vectors with it).
# Embeddings turn text into numerical vectors, capturing its meaning.
EMBEDDING_MODEL_NAME = "text-embedding-004"

# --- RAG Parameters ---
# When we retrieve information, how many of the most relevant pieces (chunks) should we get?
TOP_K_RETRIEVAL = 3 # We'll retrieve the top 3 most similar chunks for a given query.

print("Libraries imported and configuration set.")
# NOTE(review): the "(placeholder)" label is left over from the template; the value
# printed is whatever GCP_PROJECT_ID is set to above.
print(f"GCP Project ID (placeholder): {GCP_PROJECT_ID}")
print(f"Dataset Path: {DATASET_PATH}")

Libraries imported and configuration set.
GCP Project ID (placeholder): raggemini-459500
Dataset Path: Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv


In [59]:
# Cell 2: Initialize Vertex AI

# This step connects our script to your Google Cloud project and prepares Vertex AI for use.
try:
    # Bind the Vertex AI SDK to the project/region chosen in the configuration cell.
    vertexai.init(project=GCP_PROJECT_ID, location=GCP_REGION)
except Exception as init_error:
    # Report the failure together with setup hints instead of raising, so the
    # notebook keeps running and the reader sees what to check.
    troubleshooting_lines = (
        f"Error initializing Vertex AI: {init_error}",
        "Please ensure:",
        "  1. You have replaced 'your-gcp-project-id' with your actual GCP Project ID in Cell 1.",
        "  2. The Vertex AI API is enabled in your GCP project: https://console.cloud.google.com/apis/library/aiplatform.googleapis.com",
        "  3. You have authenticated your environment (e.g., by running 'gcloud auth application-default login' in your terminal).",
    )
    for troubleshooting_line in troubleshooting_lines:
        print(troubleshooting_line)
    # exit() # Uncomment to stop execution if initialization fails
else:
    print(f"Vertex AI initialized successfully for project '{GCP_PROJECT_ID}' in region '{GCP_REGION}'.")

Vertex AI initialized successfully for project 'raggemini-459500' in region 'europe-central2'.


In [60]:
# Cell 3: Load and Preprocess the Bitext Dataset

def load_and_preprocess_data(file_path):
    """
    Load the Bitext customer-support CSV and prepare it for the RAG system.

    The Bitext dataset structure:
    - "instruction": The user's query or question. (Used for testing the RAG system.)
    - "response": The chatbot's answer. (The core of the knowledge base for RAG.)
    - "intent", "category": Additional metadata; shown in the preview when present.

    Processing steps:
    1. Read the CSV at `file_path`.
    2. Drop rows where 'response' or 'instruction' is missing.
    3. Add 'response_cleaned' and 'instruction_cleaned' (whitespace-stripped strings).
    4. Drop rows whose cleaned text is empty, so no empty chunk or query survives.
    5. Reset the index to 0..n-1.

    Args:
        file_path: Path to the dataset CSV file.

    Returns:
        The preprocessed pandas DataFrame, or None when the file does not exist
        or the required 'response' / 'instruction' columns are missing.
    """
    print(f"\n--- Step 1: Loading and Preprocessing Data from {file_path} ---")

    # The "response" texts are the knowledge base; the "instruction" texts serve as
    # example queries for testing the RAG system.
    try:
        df = pd.read_csv(file_path)
        print(f"Successfully loaded dataset. Found {len(df)} rows and {len(df.columns)} columns.")
    except FileNotFoundError:
        print(f"ERROR: Dataset file not found at '{file_path}'")
        print("Please ensure the DATASET_PATH in Cell 1 is correct and the file exists.")
        print("You can download it from: https://www.kaggle.com/datasets/bitext/bitext-gen-ai-chatbot-customer-support-dataset")
        return None # Return None if file not found

    # Fail with a readable message (same print-and-return-None style as above)
    # instead of a raw KeyError further down.
    required_columns = ['response', 'instruction']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"ERROR: Dataset is missing required column(s): {missing_columns}")
        return None

    # --- Preprocessing ---
    # 1. Handle missing values: a row without a response (knowledge) or an
    #    instruction (test query) is not useful for RAG.
    print(f"Original number of rows: {len(df)}")
    df = df.dropna(subset=required_columns)
    print(f"Number of rows after dropping those with missing 'response' or 'instruction': {len(df)}")

    # 2. Clean the text: the dataset is relatively clean, so only leading/trailing
    #    whitespace is stripped.
    df = df.assign(
        response_cleaned=df['response'].astype(str).str.strip(),
        instruction_cleaned=df['instruction'].astype(str).str.strip(),
    )

    # 3. Whitespace-only cells are not NaN, so they pass dropna() but become empty
    #    strings once stripped. Remove them so no empty chunk reaches the embedding step.
    has_text = (df['response_cleaned'] != "") & (df['instruction_cleaned'] != "")
    blank_row_count = int((~has_text).sum())
    if blank_row_count:
        print(f"Dropped {blank_row_count} row(s) whose 'response' or 'instruction' was blank after stripping.")

    # Reset row indices after dropping rows so labels are contiguous positions.
    df = df[has_text].reset_index(drop=True)

    # --- Chunking Strategy ---
    # Each 'response' is treated as one knowledge chunk: the answers in this dataset
    # typically address a single point. Very long, multi-topic responses could be
    # split further (e.g., by sentences or paragraphs), which is not done here.
    print("Text cleaning (stripping whitespace) applied.")
    print(f"We will use 'response_cleaned' as our knowledge chunks.")

    # 'intent' and 'category' are kept as metadata for each chunk; only the ones
    # actually present are previewed so the preview itself cannot raise.
    print("Columns in the DataFrame after preprocessing:", df.columns.tolist())
    print("\nSample of the preprocessed data:")
    preview_columns = [
        col for col in ['instruction_cleaned', 'response_cleaned', 'intent', 'category']
        if col in df.columns
    ]
    print(df[preview_columns].head())

    return df

# --- Execute the function ---
# Reset the result first: later cells decide whether to run by testing
# `bitext_df is not None`, so a value left over from an earlier run of this cell
# must not survive when one of the guards below stops the load.
bitext_df = None

# Check if GCP_PROJECT_ID is set
if GCP_PROJECT_ID == "your-gcp-project-id":
    print("STOP: Please set your 'GCP_PROJECT_ID' in Cell 1 before proceeding.")
elif not os.path.exists(DATASET_PATH):
    print(f"STOP: Dataset file '{DATASET_PATH}' not found. Please check the path in Cell 1.")
else:
    bitext_df = load_and_preprocess_data(DATASET_PATH)
    if bitext_df is not None:
        print(f"\nSuccessfully loaded and preprocessed {len(bitext_df)} entries.")


--- Step 1: Loading and Preprocessing Data from Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv ---
Successfully loaded dataset. Found 26872 rows and 5 columns.
Original number of rows: 26872
Number of rows after dropping those with missing 'response' or 'instruction': 26872
Text cleaning (stripping whitespace) applied.
We will use 'response_cleaned' as our knowledge chunks.
Columns in the DataFrame after preprocessing: ['flags', 'instruction', 'category', 'intent', 'response', 'response_cleaned', 'instruction_cleaned']

Sample of the preprocessed data:
                                 instruction_cleaned  \
0   question about cancelling order {{Order Number}}   
1  i have a question about cancelling oorder {{Or...   
2    i need help cancelling puchase {{Order Number}}   
3         I need to cancel purchase {{Order Number}}   
4  I cannot afford this order, cancel purchase {{...   

                                    response_cleaned        intent category  
0 

In [61]:
# Cell 4: Initialize AI Models (Embedding and Generative)
# NOTE(review): the setup cell already imports GenerativeModel from
# vertexai.generative_models; this import rebinds the same name to the class from
# vertexai.preview.generative_models for every cell that runs after this one.
from vertexai.preview.generative_models import GenerativeModel
import vertexai

# Repeats the vertexai.init call from Cell 2 with the same project and region.
vertexai.init(project=GCP_PROJECT_ID, location=GCP_REGION)
def initialize_models(generative_model_name="gemini-2.0-flash-lite-001"):
    """
    Initializes the text embedding model (to convert text to vectors)
    and the generative LLM (to generate answers).

    Args:
        generative_model_name: Name of the Gemini model to instantiate. Defaults to
            "gemini-2.0-flash-lite-001", the model this notebook was run with.

    Returns:
        A tuple (embedding_model, generative_model):
        - (None, None) if the embedding model could not be loaded;
        - (embedding_model, None) if only the generative model failed;
        - both objects otherwise.
        Errors are printed rather than raised.
    """
    print("\n--- Step 2: Initializing AI Models ---")

    # --- 1. Embedding Model ---
    # An embedding model takes text as input and outputs a list of numbers (a
    # "vector" or "embedding") that represents the meaning of the text. Texts with
    # similar meanings get vectors that are close to each other, which is what
    # makes similarity search over the knowledge base possible.
    try:
        embedding_model = TextEmbeddingModel.from_pretrained(EMBEDDING_MODEL_NAME)
        print(f"Successfully loaded embedding model: '{EMBEDDING_MODEL_NAME}'")
    except Exception as e:
        print(f"Error loading embedding model '{EMBEDDING_MODEL_NAME}': {e}")
        print("Ensure the model name is correct and you have permissions.")
        return None, None

    # --- 2. Generative Model (LLM) ---
    # This is the Large Language Model that will generate the final answer.
    # The model name is a parameter so another Gemini model can be tried without
    # editing this function; the success message always reports the name used.
    try:
        generative_model = GenerativeModel(generative_model_name)
        print(f"Successfully loaded generative model: {generative_model_name}")
    except Exception as e:
        print(f"Error loading generative model: {e}")
        print("Ensure the model name is correct and you have permissions.")
        return embedding_model, None # Return embedding model if it loaded

    return embedding_model, generative_model

# --- Execute the function ---
# Reset both handles first so that a skipped run cannot leave model objects from an
# earlier execution behind; later cells test `embedding_model is not None` /
# `generative_model is not None` to decide whether to proceed.
embedding_model = generative_model = None

if 'bitext_df' in locals() and bitext_df is not None: # Check if previous step was successful
    embedding_model, generative_model = initialize_models()
    if embedding_model and generative_model:
        print("\nBoth embedding and generative models initialized.")
    else:
        print("\nModel initialization failed. Please check error messages above.")
else:
    print("Skipping model initialization because data loading failed or was not run.")


--- Step 2: Initializing AI Models ---
Successfully loaded embedding model: 'text-embedding-004'
Successfully loaded generative model: gemini-2.0-flash-lite-001

Both embedding and generative models initialized.


In [62]:
from google.cloud import aiplatform

# Point the aiplatform client at the same project and region as the rest of the notebook.
aiplatform.init(project=GCP_PROJECT_ID, location=GCP_REGION)

# List all models available in the region
# NOTE(review): aiplatform.Model.list() presumably returns the Model resources
# registered in this project/location rather than Google's publisher models
# (Gemini, text-embedding-*) — confirm against the SDK documentation.
models = aiplatform.Model.list()
for model in models:
    # The label says "Model ID", but the value printed is each model's display_name.
    print(f"Model ID: {model.display_name}")


In [34]:
# Cell 5: Create Embeddings and Build the Vector Index (Knowledge Base)

# --- RAG Core Concept: Indexing the Knowledge Base ---
# To quickly find relevant information, we need to:
# 1. Convert all our knowledge "chunks" (the 'response_cleaned' texts) into embeddings (numerical vectors).
# 2. Store these embeddings in a special database called a "Vector Index" or "Vector Database".
#    This database is optimized for finding vectors that are "similar" to a query vector.
#
# We are using FAISS for this example. FAISS is a library for efficient similarity search.
# Other options include Pinecone, Weaviate, Chroma, or Vertex AI Vector Search (for cloud-native).

def create_embeddings_and_index(df_to_index, emb_model, batch_size=50):
    """
    Generates embeddings for all 'response_cleaned' texts (our knowledge chunks)
    and stores them in a FAISS vector index.

    Args:
        df_to_index: DataFrame with the columns 'response_cleaned', 'intent',
            'category' and 'instruction_cleaned'.
        emb_model: Object exposing get_embeddings(list_of_texts), returning one item
            per text, each with a `.values` vector.
        batch_size: Number of texts sent per get_embeddings call (default 50).

    Returns:
        (index, knowledge_base_texts, knowledge_base_metadata, embedding_dimension).
        The three collections are position-aligned: row i of the index belongs to
        knowledge_base_texts[i] and knowledge_base_metadata[i].
        (None, None, None, None) is returned when the DataFrame is None/empty, a
        required column is missing, or no batch could be embedded.

    Raises:
        ValueError: if batch_size is smaller than 1.
    """
    print("\n--- Step 3: Creating Embeddings and FAISS Vector Index ---")
    if df_to_index is None or df_to_index.empty:
        print("DataFrame is empty or None. Cannot create embeddings.")
        return None, None, None, None

    # Check the schema up front. Previously a missing metadata column was only hit
    # inside the batch loop, after that batch's embeddings and texts had already been
    # appended, and the broad except hid it, leaving metadata misaligned with the index.
    required_columns = ['response_cleaned', 'intent', 'category', 'instruction_cleaned']
    missing_columns = [col for col in required_columns if col not in df_to_index.columns]
    if missing_columns:
        print(f"DataFrame is missing required column(s): {missing_columns}. Cannot create embeddings.")
        return None, None, None, None

    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Knowledge chunks and their metadata, both in DataFrame row order. Taking them
    # positionally keeps them aligned regardless of the DataFrame's index labels.
    responses_to_embed = df_to_index['response_cleaned'].tolist()
    row_metadata = [
        {'intent': intent, 'category': category, 'original_instruction': instruction}
        for intent, category, instruction in zip(
            df_to_index['intent'].tolist(),
            df_to_index['category'].tolist(),
            df_to_index['instruction_cleaned'].tolist(),
        )
    ]
    print(f"Preparing to generate embeddings for {len(responses_to_embed)} response chunks...")

    # These lists grow together, one entry per successfully embedded chunk, so that
    # a position returned by the FAISS search maps back to its text and metadata.
    knowledge_base_texts = []
    knowledge_base_metadata = [] # e.g., intent, category, original instruction
    all_embeddings_list = []

    # Process in batches to limit the size of each API request.
    # NOTE(review): the maximum batch size depends on the embedding model — check the
    # Vertex AI documentation for the configured model before raising batch_size.
    for i in tqdm(range(0, len(responses_to_embed), batch_size), desc="Generating Embeddings"):
        batch_texts = responses_to_embed[i:i+batch_size]
        try:
            # get_embeddings() returns one object per text; each object's .values
            # attribute holds the numerical vector.
            embeddings_result = emb_model.get_embeddings(batch_texts)
            batch_embeddings_vectors = [emb.values for emb in embeddings_result]
        except Exception as e:
            print(f"Error embedding batch starting at index {i}: {e}")
            # For simplicity, failed batches are skipped; a real app might retry or log.
            # If errors are frequent, check text lengths, API quotas, or special characters.
            continue # Skip to the next batch

        # A short or long reply would shift every later vector against its text.
        if len(batch_embeddings_vectors) != len(batch_texts):
            print(f"Error embedding batch starting at index {i}: expected {len(batch_texts)} embeddings, got {len(batch_embeddings_vectors)}")
            continue

        # Append to all three collections only once the whole batch is known good.
        all_embeddings_list.extend(batch_embeddings_vectors)
        knowledge_base_texts.extend(batch_texts)
        knowledge_base_metadata.extend(row_metadata[i:i+batch_size])

    if not all_embeddings_list:
        print("No embeddings were generated. Cannot create FAISS index.")
        return None, None, None, None

    # Convert the list of embedding vectors into a NumPy array of type float32, required by FAISS.
    embeddings_np = np.array(all_embeddings_list).astype('float32')

    # The dimension of the embeddings is read from the data rather than assumed.
    embedding_dimension = embeddings_np.shape[1]
    print(f"\nSuccessfully generated {len(embeddings_np)} embeddings, each with dimension {embedding_dimension}.")

    # --- Create FAISS Index ---
    # faiss.IndexFlatL2 performs an exhaustive ("flat") search using L2 distance,
    # with no approximate structure trading accuracy for speed. For unit-length
    # embeddings, ranking by L2 distance matches ranking by cosine similarity.
    # For very large datasets, other FAISS indexes (e.g., IndexIVFFlat) might be used.
    index = faiss.IndexFlatL2(embedding_dimension)
    index.add(embeddings_np) # Add all our generated embeddings to the FAISS index.

    print(f"FAISS index created and populated with {index.ntotal} vectors.")
    print("The knowledge base is now indexed and ready for searching.")

    return index, knowledge_base_texts, knowledge_base_metadata, embedding_dimension

# --- Execute the function ---
# For demonstration, let's use a smaller subset of the data to speed up the embedding process.
# In a real application, you'd likely use your full dataset.

# Reset the outputs first so that a skipped run cannot leave an index from an earlier
# execution behind; the retrieval cell tests `faiss_index is not None` to decide
# whether to run, and a stale index would no longer match the current data/models.
faiss_index = kb_texts = kb_metadata = kb_embedding_dim = None

if 'bitext_df' in locals() and bitext_df is not None and \
   'embedding_model' in locals() and embedding_model is not None:

    sample_size = 2000 # Number of responses to use for this demo. Adjust as needed.
                       # Set to len(bitext_df) to use the whole dataset (will take longer).
    if len(bitext_df) > sample_size:
        print(f"\nUsing a sample of {sample_size} entries from the dataset for faster demonstration.")
        # random_state fixes the sample so re-runs embed the same rows.
        # .copy() avoids SettingWithCopyWarning if the sample is modified later.
        bitext_df_sample = bitext_df.sample(n=sample_size, random_state=42).reset_index(drop=True).copy()
    else:
        print(f"\nUsing the full dataset of {len(bitext_df)} entries.")
        bitext_df_sample = bitext_df.copy()

    faiss_index, kb_texts, kb_metadata, kb_embedding_dim = create_embeddings_and_index(
        bitext_df_sample,
        embedding_model
    )
    # Explicit None check: the function signals failure with None, and the test
    # should not depend on how the index object evaluates in a boolean context.
    if faiss_index is not None:
        print(f"\nFAISS index built successfully. Contains {faiss_index.ntotal} items.")
else:
    print("Skipping embedding creation because previous steps (data loading or model init) failed or were not run.")


Using a sample of 2000 entries from the dataset for faster demonstration.

--- Step 3: Creating Embeddings and FAISS Vector Index ---
Preparing to generate embeddings for 2000 response chunks...


Generating Embeddings: 100%|██████████| 40/40 [00:59<00:00,  1.48s/it]


Successfully generated 2000 embeddings, each with dimension 768.
FAISS index created and populated with 2000 vectors.
The knowledge base is now indexed and ready for searching.

FAISS index built successfully. Contains 2000 items.





In [63]:
# Cell 6: Implement the Retrieval Mechanism

# --- RAG Core Concept: Retrieval ---
# Now that our knowledge base is indexed, we can implement the "Retrieval" step.
# When a user asks a question (query):
# 1. We convert the user's query into an embedding (using the SAME embedding model we used for the knowledge base).
# 2. We use the FAISS index to search for the 'k' embeddings in our knowledge base
#    that are most similar (closest) to the query embedding.
# 3. We then retrieve the original text chunks corresponding to these similar embeddings.
#    These are the pieces of information most likely to help answer the user's query.

def retrieve_relevant_chunks(query_text, emb_model, vector_index,
                             original_texts, original_metadata,
                             num_chunks_to_retrieve):
    """
    1. Embeds the user's query.
    2. Searches the vector index for the most similar chunks from the knowledge base.
    3. Returns the text and metadata of these retrieved chunks.

    Args:
        query_text: The user's question.
        emb_model: Object exposing get_embeddings(list_of_texts); must be the same
            model that produced the vectors stored in `vector_index`.
        vector_index: Index exposing search(query_matrix, k) -> (distances, indices).
        original_texts: Chunk texts, position-aligned with the index rows.
        original_metadata: Chunk metadata, position-aligned with `original_texts`.
        num_chunks_to_retrieve: How many neighbours to request (must be >= 1).

    Returns:
        A list of dicts with keys 'text', 'metadata', 'distance' and
        'similarity_score', sorted by ascending distance. An empty list is returned
        when the index or texts are unavailable, the requested count is below 1,
        or the query could not be embedded.
    """
    if vector_index is None or not original_texts:
        print("Vector index or original texts not available. Cannot retrieve.")
        return []

    # Asking an index for zero or a negative number of neighbours is never
    # meaningful, so stop before spending an embedding call on it.
    if num_chunks_to_retrieve is None or num_chunks_to_retrieve < 1:
        print("num_chunks_to_retrieve must be at least 1. Cannot retrieve.")
        return []

    # 1. Generate embedding for the user's query.
    #    It's CRUCIAL to use the same embedding model that was used to create the knowledge base.
    try:
        query_embedding_obj = emb_model.get_embeddings([query_text]) # Must be a list
        # .reshape(1, -1) makes it a 2D array: the index expects a batch of query vectors.
        query_vector = np.array(query_embedding_obj[0].values).astype('float32').reshape(1, -1)
    except Exception as e:
        print(f"Error embedding query '{query_text}': {e}")
        return []

    # 2. Perform similarity search in the index.
    #    `distances[0][n]` / `indices[0][n]` describe the n-th neighbour of our single
    #    query; each index is a row position in the indexed embedding matrix and
    #    therefore also a position in `original_texts` / `original_metadata`.
    distances, indices = vector_index.search(query_vector, num_chunks_to_retrieve)

    # Only positions present in BOTH lists can be returned safely.
    usable_entries = min(len(original_texts), len(original_metadata))

    retrieved_chunks_info = []
    for dist, raw_idx in zip(distances[0], indices[0]):
        idx = int(raw_idx)

        # FAISS pads with -1 when it cannot find enough neighbours (e.g. k > ntotal).
        # Any negative value is skipped so it can never wrap around to the end of the
        # lists; positions beyond the available texts/metadata are skipped as well.
        if idx < 0 or idx >= usable_entries:
            continue

        # faiss.IndexFlatL2 reports SQUARED Euclidean distances. For unit-length
        # vectors d^2 = 2 - 2*cos(theta), hence 1 - d^2/2 equals the cosine similarity.
        # NOTE(review): this interpretation assumes the embeddings are unit-normalised —
        # confirm for the configured embedding model. Either way it is only a ranking
        # score (higher is better), not a probability.
        similarity_score = 1 - (dist / 2)

        retrieved_chunks_info.append({
            'text': original_texts[idx],
            'metadata': original_metadata[idx],
            'distance': float(dist), # Distance exactly as reported by the index
            'similarity_score': float(similarity_score)
        })

    # The index normally returns neighbours already ordered by distance; sorting
    # makes the ordering guarantee independent of the index implementation.
    retrieved_chunks_info.sort(key=lambda x: x['distance'])

    return retrieved_chunks_info

# --- Example of using the retrieval function (will be used more in the next cell) ---
# Smoke test: run one fixed query through the retriever and print what came back.
# Runs only when the indexing cell left a usable index in the notebook namespace.
if globals().get('faiss_index') is None:
    print("Skipping retrieval test because FAISS index is not available from previous steps.")
else:
    test_query_for_retrieval = "How do I reset my password?"
    print(f"\n--- Testing Retrieval for query: '{test_query_for_retrieval}' ---")
    retrieved_for_test = retrieve_relevant_chunks(
        query_text=test_query_for_retrieval,
        emb_model=embedding_model,
        vector_index=faiss_index,
        original_texts=kb_texts,                 # built in Cell 5
        original_metadata=kb_metadata,           # built in Cell 5
        num_chunks_to_retrieve=TOP_K_RETRIEVAL,  # configured in Cell 1
    )
    if not retrieved_for_test:
        print("No chunks retrieved for the test query, or an error occurred.")
    else:
        print(f"Retrieved {len(retrieved_for_test)} chunks for the test query:")
        for rank, hit in enumerate(retrieved_for_test, start=1):
            # Show the ranking numbers, a 150-character preview, and the intent label.
            print(f"  Chunk {rank} (Distance: {hit['distance']:.4f}, Sim Score: {hit['similarity_score']:.4f}):")
            print(f"    Text: {hit['text'][:150]}...")
            print(f"    Intent: {hit['metadata']['intent']}")


--- Testing Retrieval for query: 'How do I reset my password?' ---
Retrieved 3 chunks for the test query:
  Chunk 1 (Distance: 0.4772, Sim Score: 0.7614):
    Text: I can see that you're unsure about how to reset the password for your user account. Don't worry, I'm here to assist you every step of the way! To init...
    Intent: recover_password
  Chunk 2 (Distance: 0.4820, Sim Score: 0.7590):
    Text: Assuredly! I'm here to assist you in resetting your account password. It's essential to keep your account secure and ensure that only you have access ...
    Intent: recover_password
  Chunk 3 (Distance: 0.4896, Sim Score: 0.7552):
    Text: For sure! I completely understand your situation and the importance of regaining access to your user account. Allow me to guide you through the proces...
    Intent: recover_password


In [64]:
# Cell 7: Integrate with Gemini for Generation (Augmentation & Generation)

# --- RAG Core Concept: Augmentation & Generation ---
# This is where the "Augmented Generation" part of RAG happens.
# 1. Augmentation: We take the original user query AND the relevant chunks we just retrieved.
#    We combine them into a single, detailed "prompt" for the LLM (Gemini).
#    This prompt essentially tells Gemini: "Here's the user's question, and here's some specific
#    information from our knowledge base that should help you answer it. Please use this information."
#
# 2. Generation: We send this "augmented prompt" to the LLM (Gemini).
#    The LLM then generates an answer, hopefully more accurate and context-aware
#    because it has been "grounded" in the retrieved factual information.

def generate_response_with_gemini(user_query, retrieved_chunks_list, gen_model):
    """
    Build a context-grounded prompt and ask the generative model to answer it.

    Args:
        user_query: The user's question, inserted verbatim into the prompt.
        retrieved_chunks_list: Retrieved chunks (dicts with a 'text' key). When it
            is empty or None the prompt states that no relevant context was found.
        gen_model: Object exposing generate_content(prompt) whose result has `.text`.

    Returns:
        The generated text, or a fixed apology string if the call (or reading the
        response text) raises; the error is printed in that case.
    """
    # 1. Augmentation: number each retrieved chunk and separate chunks with a blank
    #    line so the model can tell the sources apart.
    if retrieved_chunks_list:
        context_block = "\n\n".join(
            f"Context Chunk {position} (Source: Customer Support Response):\n{chunk['text']}"
            for position, chunk in enumerate(retrieved_chunks_list, start=1)
        )
    else:
        context_block = "No relevant context was found."

    # 2. Prompt engineering: state the assistant's role, restrict it to the supplied
    #    context, and give it an explicit fallback sentence for when the context
    #    does not answer the question.
    prompt = f"""You are a helpful and concise customer support assistant.
Your goal is to answer the user's query based *only* on the provided context.

Here is the user's query:
"{user_query}"

Here is the context retrieved from our knowledge base:
--- BEGIN CONTEXT ---
{context_block}
--- END CONTEXT ---

Please answer the user's query using *only* the information available in the 'BEGIN CONTEXT' and 'END CONTEXT' sections.
If the provided context does not contain the information needed to answer the query, please state:
"I'm sorry, but the provided information from our knowledge base does not seem to contain a direct answer to your question."
Do not make up information or use external knowledge. Be factual and stick to the provided text.
If the context is relevant, synthesize the information to provide a clear and direct answer to the user's query.

Answer:
"""

    # 3. Generation: the prompt is sent as a single plain string.
    try:
        return gen_model.generate_content(prompt).text
    except Exception as generation_error:
        print(f"Error generating response with Gemini: {generation_error}")
        # Surface whatever extra detail the exception object carries.
        if getattr(generation_error, 'response', None):
            print(f"API Error Details: {generation_error.response}")
        elif hasattr(generation_error, 'message'):
            print(f"Error Message: {generation_error.message}")
        return "I apologize, but I encountered an error while trying to generate a response."

# --- Example of using the generation function (will be used more in the next cell) ---
# Smoke test: feed the chunks retrieved by the previous cell's test into Gemini.
# Needs a loaded generative model and a `retrieved_for_test` variable from that cell.
_generation_inputs_ready = (
    globals().get('generative_model') is not None
    and 'retrieved_for_test' in globals()
)

if not _generation_inputs_ready:
    print("Skipping generation test because generative model or retrieved chunks are not available.")
elif not retrieved_for_test:
    # The retrieval test ran but produced nothing to ground the answer on.
    print("\nSkipping generation test as no chunks were retrieved in the previous test.")
else:
    print(f"\n--- Testing Generation for query: '{test_query_for_retrieval}' with {len(retrieved_for_test)} retrieved chunks ---")
    rag_test_response = generate_response_with_gemini(
        user_query=test_query_for_retrieval,
        retrieved_chunks_list=retrieved_for_test,
        gen_model=generative_model,
    )
    print("\n--- RAG System Generated Response (Test) ---")
    print(rag_test_response)


--- Testing Generation for query: 'How do I reset my password?' with 3 retrieved chunks ---

--- RAG System Generated Response (Test) ---
To reset your password:

1.  Go to the login page.
2.  Click on the "Forgot Password" option.
3.  Enter the email address associated with your account.
4.  Check your inbox (and spam/junk folders) for an email with instructions.
5.  Follow the instructions in the email to create a new password.



In [65]:
# Cell 8: Run the Full RAG Pipeline and Test

# Now we'll put all the pieces together to create a complete RAG pipeline
# and test it with a few sample queries from our Bitext dataset.

def run_rag_query(query_text, emb_model, gen_model, vec_index,
                  knowledge_texts, knowledge_meta, top_k):
    """
    Executes the full RAG pipeline for a single query:
    1. Retrieves relevant chunks via `retrieve_relevant_chunks`.
    2. Generates a response via `generate_response_with_gemini`, passing the
       retrieved chunks along as context.

    Progress messages, a 100-character preview of every retrieved chunk and
    the final response are printed along the way.

    Args:
        query_text: The user query; used for both retrieval and generation.
        emb_model: Forwarded unchanged to `retrieve_relevant_chunks`.
        gen_model: Forwarded unchanged to `generate_response_with_gemini`.
        vec_index: Forwarded unchanged to `retrieve_relevant_chunks`.
        knowledge_texts: Forwarded unchanged to `retrieve_relevant_chunks`.
        knowledge_meta: Forwarded unchanged to `retrieve_relevant_chunks`.
        top_k: Forwarded unchanged to `retrieve_relevant_chunks`
            (number of chunks to retrieve, per the caller's comment).

    Returns:
        Whatever `generate_response_with_gemini` returns for this query.
    """
    separator = "=================================================="
    print(f"\n{separator}")
    print(f"Processing Query: \"{query_text}\"")
    print(separator)

    # 1. Retrieve relevant chunks
    print("Step A: Retrieving relevant chunks...")
    retrieved_chunks = retrieve_relevant_chunks(
        query_text,
        emb_model,
        vec_index,
        knowledge_texts,
        knowledge_meta,
        top_k
    )

    # Defensive: if retrieval signals failure with None (not visible from this
    # cell -- TODO confirm), treat it as "nothing retrieved" so that len() and
    # the preview loop below cannot raise TypeError.
    if retrieved_chunks is None:
        retrieved_chunks = []

    if not retrieved_chunks:
        print("No relevant chunks found for this query by the retrieval system.")
        # Generation is still attempted on purpose: per the original note, the
        # prompt built in `generate_response_with_gemini` covers the
        # "No relevant context was found." case.

    print(f"\nRetrieved {len(retrieved_chunks)} chunks for the query.")
    # Each chunk is indexed by the 'similarity_score' and 'text' keys.
    for i, chunk_info in enumerate(retrieved_chunks):
        print(f"  Retrieved Chunk {i+1} (Sim Score: {chunk_info['similarity_score']:.4f}): {chunk_info['text'][:100]}...")

    # 2. Generate response with Gemini (Augmentation & Generation)
    print("\nStep B: Generating response with Gemini...")
    final_response = generate_response_with_gemini(
        query_text,
        retrieved_chunks,
        gen_model
    )

    print("\n--- Final RAG System Response ---")
    print(final_response)
    print("---------------------------------")
    return final_response

# --- Main Test Execution ---
# Run the end-to-end test only when every component built by the earlier cells
# (data, both models, the FAISS index and the knowledge-base lists) is present.
if 'bitext_df' in locals() and bitext_df is not None and \
   'embedding_model' in locals() and embedding_model is not None and \
   'generative_model' in locals() and generative_model is not None and \
   'faiss_index' in locals() and faiss_index is not None and \
   'kb_texts' in locals() and kb_texts is not None and \
   'kb_metadata' in locals() and kb_metadata is not None:

    print("\n\n--- TESTING THE FULL RAG PIPELINE ---")

    # How many test queries to run. Defined once, up front, so that the sampled
    # path and the fallback path below always agree on the count.
    num_test_queries = 3

    # Select a few "instructions" from our original Bitext dataset to use as test queries.
    # We prefer instructions that were part of the sampled data used for indexing,
    # for a fair test of retrieval.
    # `bitext_df_sample` gets the same None check as every other component
    # before `.empty` is accessed.
    if 'bitext_df_sample' in locals() and bitext_df_sample is not None and not bitext_df_sample.empty:
        if len(bitext_df_sample) >= num_test_queries:
            test_queries_df = bitext_df_sample.sample(n=num_test_queries, random_state=101)
        else:
            print(f"Warning: Sampled data has less than {num_test_queries} entries. Using all available.")
            test_queries_df = bitext_df_sample
    else:  # Fallback to original df if sample not available
        print("Warning: `bitext_df_sample` not found. Using original `bitext_df` for test queries.")
        test_queries_df = bitext_df.sample(n=min(num_test_queries, len(bitext_df)), random_state=101)

    # The row label is not needed; only the cleaned instruction/response columns are read.
    for _row_label, row in test_queries_df.iterrows():
        user_query = row['instruction_cleaned']
        # The 'expected_response' is the original answer from the dataset.
        # We can use this for a rough comparison, though the RAG system might phrase it differently.
        expected_response = row['response_cleaned']

        print("\n\n--- Test Case: Query from Dataset ---")
        print(f"Original Instruction (User Query): {user_query}")
        print(f"Original Response (Expected): {expected_response[:250]}...")  # Show a snippet

        # `run_rag_query` prints the retrieved chunks and the final answer itself,
        # so its return value is not needed here.
        run_rag_query(
            user_query,
            embedding_model,
            generative_model,
            faiss_index,
            kb_texts,  # The list of text chunks from our knowledge base
            kb_metadata,  # Associated metadata
            TOP_K_RETRIEVAL  # How many chunks to retrieve
        )

    print("\n--- RAG Pipeline Test Complete ---")
    print("\nConsiderations for Improvement & Iteration:")
    print("1. Chunking Strategy: If responses are very long, split them into smaller, more focused chunks.")
    print("2. TOP_K_RETRIEVAL: Experiment with the number of chunks retrieved (k).")
    print("3. Prompt Engineering: Refine the prompt sent to Gemini for better control over output.")
    print("4. Evaluation: Systematically evaluate responses (e.g., against 'expected_response' or with human review).")
    print("5. Vector Database: For larger scale, consider Vertex AI Vector Search or other managed vector DBs.")

else:
    print("\nSkipping RAG pipeline execution as one or more necessary components (data, models, index) are missing.")
    print("Please ensure all previous cells have run successfully and all variables are defined.")



--- TESTING THE FULL RAG PIPELINE ---


--- Test Case: Query from Dataset ---
Original Instruction (User Query): list your accepted payment options
Original Response (Expected): I'll make it happen! I'm here to provide you with a rundown of our accepted payment options. Here they are:

- **Credit/Debit Card:** We accept major card brands such as Visa, Mastercard, and American Express.
- **PayPal:** A secure and widely-used o...

Processing Query: "list your accepted payment options"
Step A: Retrieving relevant chunks...

Retrieved 3 chunks for the query.
  Retrieved Chunk 1 (Sim Score: 0.8719): Definitely! I'm here to provide you with the necessary guidance to view our accepted payment options...
  Retrieved Chunk 2 (Sim Score: 0.8562): Certainly! I'm here to assist you with listing our accepted payment methods. To view the available o...
  Retrieved Chunk 3 (Sim Score: 0.8542): No problem at all! I'll gladly assist you with finding the list of our accepted payment methods. To ...

S