In [1]:
!pip install google-genai numpy pandas



In [1]:
from google.colab import userdata
userdata.get('secretName')

'AIzaSyADII4HH1whLoacNZ763DAGkKGPZH47dDU'

In [1]:
import os
from google import genai
from google.colab import userdata
import numpy as np
import pandas as pd

# Global list to store processed chunks and their embeddings
knowledge_vectors = []

def initialize_client():
    """Initializes the Gemini client using the Colab Secret."""
    try:
        # Load the key from Colab Secrets (ensure 'GEMINI_API_KEY' is set in the secrets panel)
        api_key = userdata.get('secretName')
        if not api_key:
             print("Error: 'GEMINI_API_KEY' not found in Colab Secrets.")
             return None
        return genai.Client(api_key=api_key)
    except Exception as e:
        print(f"Error initializing client: {e}")
        return None

client = initialize_client()
if client is None:
    # Stop execution if the client couldn't be initialized (e.g., missing API key)
    # If this prints, go check your GEMINI_API_KEY secret.
    print("Client initialization failed. Please check the error message above.")
    exit()

In [2]:
def load_analyze_and_chunk_data(file_path='electric_vehicles_spec_2025.csv.csv'):
    """Loads CSV, performs analysis, creates knowledge chunks, and generates embeddings."""
    global knowledge_vectors
    knowledge_chunks = []

    print("Loading and processing data from electric_vehicles_spec_2025.csv.csv...")

    try:
        df = pd.read_csv(file_path)

        # Data Cleaning: Fill NaN values and convert types for RAG chunk creation
        df['range_km'] = df['range_km'].fillna(0).astype(int)
        df['top_speed_kmh'] = df['top_speed_kmh'].fillna(0).astype(int)
        df['battery_capacity_kWh'] = df['battery_capacity_kWh'].fillna(0).round(1)
        df['fast_charging_power_kw_dc'] = df['fast_charging_power_kw_dc'].fillna('N/A').astype(str)

        # --- Dataset Summary Statistics ---
        print("\n--- Dataset Summary Statistics ---")
        print(f"Total Unique Vehicles: {len(df)}")
        print(f"Average Range (km): {df['range_km'].mean():.1f} km")
        print(f"Largest Battery Capacity (kWh): {df['battery_capacity_kWh'].max():.1f} kWh")
        print(f"Highest Top Speed (km/h): {df['top_speed_kmh'].max()} km/h")
        print("----------------------------------\n")

        # Create the knowledge base by turning each row into a descriptive text chunk
        for index, row in df.iterrows():
            chunk = (
                f"The {row['brand']} {row['model']} is a {row['car_body_type']} EV. "
                f"It has a rated range of {row['range_km']} km, a battery capacity of {row['battery_capacity_kWh']} kWh, "
                f"a top speed of {row['top_speed_kmh']} km/h, and supports DC fast charging up to {row['fast_charging_power_kw_dc']} kW."
            )
            knowledge_chunks.append(chunk)

        print(f"Successfully created {len(knowledge_chunks)} knowledge chunks from the dataset.")

        # Generate Embeddings
        print("Generating embeddings for the EV knowledge base...")
        for text_chunk in knowledge_chunks:
            response = client.models.generate_embeddings(
                model='text-embedding-004',
                content=text_chunk
            )
            knowledge_vectors.append({
                "text": text_chunk,
                "embedding": np.array(response.embedding)
            })
        print("Embeddings generated and stored.")


    except FileNotFoundError:
        print(f"Error: {file_path} not found. Please ensure the file is uploaded to your Colab environment.")
        # Re-initialize knowledge_vectors to empty if data load fails
        knowledge_vectors.clear()
    except Exception as e:
        print(f"An unexpected error occurred during data loading and chunking: {e}")
        knowledge_vectors.clear()

In [3]:
def retrieve_context(user_query, knowledge_vectors, top_k=3):
    """Finds the top_k most relevant chunks using cosine similarity (dot product)."""

    if not knowledge_vectors:
        return "No data available in the knowledge base."

    try:
        # 1. Embed the user query
        query_response = client.models.generate_embeddings(
            model='text-embedding-004',
            content=user_query
        )
        query_vector = np.array(query_response.embedding)

        # 2. Calculate similarity
        similarities = []
        for item in knowledge_vectors:
            # Cosine similarity for normalized vectors is simply the dot product
            similarity = np.dot(query_vector, item["embedding"])
            similarities.append((similarity, item["text"]))

        # 3. Sort and select top results
        similarities.sort(key=lambda x: x[0], reverse=True)
        top_chunks = [chunk for sim, chunk in similarities[:top_k]]

        return "\n".join(top_chunks)
    except Exception as e:
        print(f"Error during context retrieval: {e}")
        return "An error occurred while retrieving context."

In [4]:
SYSTEM_PROMPT = (
    "You are WattBot, a friendly and extremely knowledgeable Electric Vehicle (EV) expert. "
    "Your primary role is to answer questions using ONLY the provided CONTEXT, which contains specific EV specifications from a dataset. "
    "If the answer is not in the context, state that you do not have enough specific information from the dataset on that topic, but maintain your EV-expert persona."
)

def ask_rag_chatbot(prompt):
    """Retrieves context and sends a grounded message to the model."""

    # Step 1: Retrieve relevant context
    context = retrieve_context(prompt, knowledge_vectors)

    if "No data available" in context or "An error occurred" in context:
        return context

    # Step 2: Construct the RAG prompt
    rag_prompt = f"""
    CONTEXT (Specific EV data from dataset):
    ---
    {context}
    ---

    USER QUESTION: {prompt}

    Answer the USER QUESTION concisely based on the provided CONTEXT.
    """

    # Step 3: Call the model
    try:
        response = client.models.generate_content(
            model='gemini-2.5-flash',
            contents=[rag_prompt],
            config={"system_instruction": SYSTEM_PROMPT}
        )
        return response.text
    except Exception as e:
        return f"An error occurred during generation: {e}"

In [None]:
# 1. Load and embed the data first (this calls the function from Step 3)
load_analyze_and_chunk_data()

# 2. Start the interactive loop
print("\n" + "="*50)
print(" WattBot (RAG Mode) Initialized ")
print("="*50)
print("Hello! I'm WattBot, grounded by your custom EV spec dataset.")
print("Ask me about specific models, range, battery capacity, or charging speed!")
print("Type 'quit', 'exit', or 'bye' to end the chat.")
print("="*50)

while True:
    try:
        user_input = input("You: ")
    except EOFError:
        # Handles Ctrl+D in Colab
        user_input = "bye"

    if user_input.lower() in ["quit", "exit", "bye"]:
        print("WattBot: Goodbye! Drive electric and have a powerful day!")
        break

    if not user_input.strip():
        continue

    bot_response = ask_rag_chatbot(user_input)
    print(f"WattBot: {bot_response}")

Loading and processing data from electric_vehicles_spec_2025.csv.csv...
Error: electric_vehicles_spec_2025.csv.csv not found. Please ensure the file is uploaded to your Colab environment.

 WattBot (RAG Mode) Initialized 
Hello! I'm WattBot, grounded by your custom EV spec dataset.
Ask me about specific models, range, battery capacity, or charging speed!
Type 'quit', 'exit', or 'bye' to end the chat.
You: what is your name
WattBot: No data available in the knowledge base.
You: tell me price
WattBot: No data available in the knowledge base.
You: tell me price
WattBot: No data available in the knowledge base.
