<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/GEMINI_MMLLM_AAI_DEMO.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os
import requests
from PIL import Image
from io import BytesIO
import google.generativeai as genai # Import the generativeai library

# --- 1. Configuration for Agent ---
class AgentConfig:
    """Static settings for the flight-planning agent.

    The values are plain class attributes and are read straight off the
    class (``AgentConfig.LLM_MODEL_NAME``); no instance is needed.
    """

    # Name of the Gemini model the agent talks to.
    LLM_MODEL_NAME: str = "gemini-2.5-flash"

    # Max attempts for the agent to refine its answer.
    MAX_AGENT_RETRIES: int = 2

# --- 2. Google Colab / Gemini API Imports and Configuration ---
# Resolve the Gemini API key: Colab Secrets first, then the 'GEMINI'
# environment variable. GOOGLE_API_KEY stays None/empty when neither has it.
GOOGLE_API_KEY = None
try:
    from google.colab import userdata
    GOOGLE_API_KEY = userdata.get('GEMINI')
except Exception as exc:
    # Broad on purpose: outside Colab this is an ImportError, while inside
    # Colab a missing or non-granted secret is reported through Colab's own
    # exception classes rather than KeyError. Any failure here should fall
    # through to the environment variable instead of aborting the script.
    print(f"Could not read 'GEMINI' from Colab Secrets ({type(exc).__name__}).")

if GOOGLE_API_KEY:
    print("Google Generative AI configured successfully using Colab Secrets.")
else:
    print("Not running in Google Colab or 'GEMINI' secret not found. Attempting to get 'GEMINI' environment variable.")
    GOOGLE_API_KEY = os.getenv('GEMINI')

# Initialize Gemini API
if GOOGLE_API_KEY:
    genai.configure(api_key=GOOGLE_API_KEY)
    print(f"Gemini API configured with model: {AgentConfig.LLM_MODEL_NAME}")
else:
    print("Warning: GOOGLE_API_KEY not found. LLM calls will not work.")
    print("Please set your 'GEMINI' environment variable or Colab secret.")


# --- 3. Data Ingestion & Indexing (Multimodal RAG Setup) ---

# Module-level paths, defined outside the classes so they are global.

# Directory for your documents (text, images, etc.).
KNOWLEDGE_BASE_DIR = 'knowledge_base_documents/'

# e.g., for FAISS local storage (not used in this simplified example).
VECTOR_DB_PATH = 'your_vector_db_path/'


class MultimodalKnowledgeBase:
    """Simplified multimodal knowledge base for the flight-planning demo.

    Documents are read from a directory; images are captioned with a Gemini
    model when an API key is configured. Retrieval is a placeholder: it
    returns canned text chunks and a single known demo image rather than
    querying a real vector store.
    """

    # File extensions (lower-case) recognised by ``load_documents``.
    TEXT_EXTENSIONS = (".txt", ".pdf")
    IMAGE_EXTENSIONS = (".jpg", ".png", ".jpeg")

    def __init__(self, vector_db_path, caption_model_name="gemini-1.5-flash"):
        """Set up the knowledge base.

        Args:
            vector_db_path: Location of the vector store (kept for later use;
                nothing in this simplified class reads it).
            caption_model_name: Gemini model used to caption images. Defaults
                to the model the demo originally hard-coded.
        """
        self.vector_db_path = vector_db_path
        self.text_retriever = None
        self.image_retriever = None
        # Filled by load_documents() so that ingested data is not thrown away.
        self.text_documents = []
        self.image_documents = []
        if GOOGLE_API_KEY:
            self.image_captioning_model = genai.GenerativeModel(caption_model_name)
        else:
            print("Error: GOOGLE_API_KEY not set. Cannot initialize image captioning model.")
            self.image_captioning_model = None

    def load_documents(self, directory):
        """Load text and image documents found directly inside ``directory``.

        Text files are read as UTF-8. Files that cannot be read that way
        (for example a binary PDF, which this demo has no parser for) are
        reported and skipped instead of aborting the whole load. Images get a
        caption when the captioning model is available. Results are stored on
        ``self.text_documents`` and ``self.image_documents``.

        Raises:
            OSError: If ``directory`` itself cannot be listed.
        """
        text_documents = []
        image_documents = []  # Image paths together with their generated captions

        # sorted() makes the ingestion order deterministic across platforms.
        for filename in sorted(os.listdir(directory)):
            filepath = os.path.join(directory, filename)
            lowered = filename.lower()
            if lowered.endswith(self.TEXT_EXTENSIONS):
                try:
                    with open(filepath, 'r', encoding='utf-8') as f:
                        content = f.read()
                except (OSError, UnicodeDecodeError) as exc:
                    print(f"Skipping {filepath}: could not read it as UTF-8 text ({exc}).")
                    continue
                text_documents.append({"content": content, "source": filepath})
            elif lowered.endswith(self.IMAGE_EXTENSIONS):
                # Only try to generate caption if the captioning model is available
                if self.image_captioning_model:
                    caption = self._generate_image_caption(filepath)
                    image_documents.append({"path": filepath, "caption": caption})
                else:
                    print(f"Skipping caption for {filepath} as captioning model is not initialized.")

        self.text_documents = text_documents
        self.image_documents = image_documents

        # Create/Load Vector Stores (placeholders for actual implementation)
        print(f"Loaded {len(text_documents)} text documents and {len(image_documents)} image documents.")

    def _generate_image_caption(self, image_path):
        """Return a caption for the image at ``image_path``.

        Never raises: when the model is missing or the call fails, a
        descriptive fallback string is returned instead.
        """
        if not self.image_captioning_model:
            return f"Captioning model not available for image at {image_path}"
        try:
            # The context manager closes the file handle; convert() returns an
            # independent in-memory copy in a consistent RGB format.
            with Image.open(image_path) as raw_image:
                img = raw_image.convert('RGB')
            response = self.image_captioning_model.generate_content(["Describe this image in detail for a flight planning context.", img])
            return response.text
        except Exception as e:
            print(f"Error generating caption for {image_path}: {e}")
            return f"Could not generate caption for image at {image_path}"

    def retrieve_text(self, query: str, k: int = 3):
        """Return at most ``k`` (placeholder) text chunks for ``query``."""
        print(f"Retrieving text for query: '{query}'")
        chunks = [
            f"Relevant text chunk 1 for '{query}' (from regulations.txt - Pilots must have valid licenses. Weather conditions must be VFR or IFR compliant. Fuel reserves are mandatory. Airspace classifications dictate specific procedures.)",
            f"Relevant text chunk 2 for '{query}' (from schedules.txt - Montreal (YUL) operates 24/7. Toronto (YYZ) has peak hours from 0600-0900 and 1700-2000 local time.)",
        ]
        return chunks[:max(k, 0)]

    def retrieve_images(self, query: str, k: int = 3):
        """Return at most ``k`` image records (``path`` and ``caption``).

        Only the demo image ``yul_airport_map.png`` inside
        ``KNOWLEDGE_BASE_DIR`` is considered, and only if it exists on disk.
        """
        print(f"Retrieving images for query: '{query}'")
        # Ensure the path exists for the dummy image that WE explicitly created.
        dummy_image_path = os.path.join(KNOWLEDGE_BASE_DIR, "yul_airport_map.png")
        retrieved_image_list = []

        if os.path.exists(dummy_image_path):
            # In a real RAG setup the caption would come from the vector store
            # entry associated with the image; this demo uses a fixed one.
            caption = "A map of Montreal Trudeau International Airport (YUL), showing runways, terminals, and taxiways, with a red circular marker and a gray rectangular building."
            retrieved_image_list.append({"path": dummy_image_path, "caption": caption})

        return retrieved_image_list[:max(k, 0)]

# --- 4. Agentic LLM for Multi-Modal Reasoning & Planning ---

class FlightPlanningAgent:
    """Agent that plans, retrieves, answers and self-verifies with an LLM.

    Each attempt asks the LLM for a plan of action, pulls text and images
    from the knowledge base, asks for a grounded answer, and finally asks the
    LLM to verify that answer against the retrieved data.
    """

    def __init__(self, knowledge_base: "MultimodalKnowledgeBase"):
        """Store the knowledge base and create the LLM client if a key is set."""
        self.knowledge_base = knowledge_base
        if GOOGLE_API_KEY:
            self.llm = genai.GenerativeModel(AgentConfig.LLM_MODEL_NAME)
        else:
            print(f"Error: GOOGLE_API_KEY not set. Cannot initialize agent LLM ({AgentConfig.LLM_MODEL_NAME}).")
            self.llm = None

    def _call_llm(self, prompt, image_inputs=None):
        """Send ``prompt`` (plus optional images) to the LLM and return its text.

        Args:
            prompt: The text prompt.
            image_inputs: Optional iterable of dicts with ``path`` and
                ``caption`` keys; unreadable images are skipped with a message.

        Returns:
            The response text, or a string starting with ``"ERROR:"`` when the
            LLM is unavailable or the API call fails. Never raises for API
            errors.
        """
        if not self.llm:  # LLM was never initialized (missing API key)
            print("LLM not initialized due to missing API key. Cannot make API call.")
            return "ERROR: LLM not available."

        contents = [prompt]
        if image_inputs:
            for img_data in image_inputs:
                try:
                    # Close the file handle promptly; convert() yields an
                    # independent in-memory RGB copy that is safe to send.
                    with Image.open(img_data['path']) as raw_image:
                        img = raw_image.convert('RGB')
                    contents.append(img)
                    print(f"Adding image: {img_data['path']} (caption: {img_data['caption'][:50]}...)")
                except FileNotFoundError:
                    print(f"Warning: Image file not found at {img_data['path']}. Skipping.")
                except Exception as e:
                    print(f"Error loading image {img_data['path']}: {e}. Skipping.")

        print("\n--- LLM Call ---")
        print(f"Prompt (first 200 chars): {prompt[:200]}...")

        try:
            response = self.llm.generate_content(contents)
            llm_response_text = response.text
        except Exception as e:
            print(f"LLM API error: {e}")
            llm_response_text = "ERROR: Could not get a response from the LLM."

        print(f"LLM Response (first 200 chars): {llm_response_text[:200]}...")
        return llm_response_text

    def plan_flight(self, user_query: str):
        """
        Main agentic loop for flight planning.
        Incorporates reasoning, RAG, and multi-modal handling.

        Runs up to ``AgentConfig.MAX_AGENT_RETRIES`` attempts. Returns the
        answer as soon as the verifier reports "VERIFICATION STATUS: OK".
        If the last attempt still fails verification, the last generated
        answer is returned with a note carrying the verifier's feedback.
        """
        print(f"\n--- Agent Starting: Processing query '{user_query}' ---")

        # Kept separately so refinement prompts do not nest inside each other
        # and retrieval is not polluted by verifier feedback.
        original_query = user_query

        for attempt in range(AgentConfig.MAX_AGENT_RETRIES):
            print(f"\n--- Attempt {attempt + 1}/{AgentConfig.MAX_AGENT_RETRIES} ---")

            initial_prompt = f"""
            You are an AI flight planning assistant using Gemini 2.5 Flash. Your goal is to provide accurate, detailed, and hallucination-free flight plans.
            The user has provided the following query: "{user_query}"

            First, analyze the query to determine what information is needed and what tools (retrieval from knowledge base, external APIs) might be useful.
            Consider if multi-modal information (e.g., airport maps, weather charts) would be beneficial.
            Formulate a detailed plan to gather necessary information and then synthesize it into a comprehensive flight plan.
            Your response should be a plan of action, not the final flight plan yet.
            """
            agent_plan = self._call_llm(initial_prompt)
            print(f"Agent's Initial Plan: {agent_plan}")

            retrieval_query_text = f"Flight regulations, schedules, and airport information for {original_query}."
            retrieved_text = self.knowledge_base.retrieve_text(retrieval_query_text)
            print(f"Retrieved Text: {retrieved_text}")

            retrieval_query_image = f"Airport maps or relevant weather charts for {original_query}."
            retrieved_images = self.knowledge_base.retrieve_images(retrieval_query_image)
            image_captions = [img['caption'] for img in retrieved_images]
            print(f"Retrieved Images (Captions): {image_captions}")

            # The same formatted evidence feeds both the answer and the check.
            retrieved_text_formatted = '\n- '.join(retrieved_text)
            retrieved_images_formatted = '\n- '.join(image_captions) if retrieved_images else "No relevant images found."

            reasoning_prompt = f"""
            Based on the user query: "{user_query}"
            And the following retrieved text information:
            - {retrieved_text_formatted}
            And the following retrieved visual information (from image captions and the images themselves, if provided):
            - {retrieved_images_formatted}

            Please synthesize all this information.
            Formulate a comprehensive flight plan or a detailed answer to the user's query.
            Crucially, ensure ALL claims are strictly grounded in the provided retrieved data. DO NOT introduce any information not present in the retrieved text or images.
            If you cannot fully answer or if information is contradictory, state that clearly and explain why.
            """
            final_response = self._call_llm(reasoning_prompt, image_inputs=retrieved_images)

            verification_prompt = f"""
            Review the following proposed flight plan/response:
            "{final_response}"

            And compare it strictly against the retrieved information provided earlier:
            Retrieved Text:
            - {retrieved_text_formatted}
            Retrieved Image Descriptions:
            - {retrieved_images_formatted}

            Does this proposed response strictly adhere to the retrieved information?
            Are there any unsupported claims, details that seem fabricated, or inconsistencies?
            Provide a "VERIFICATION STATUS: OK" if it's perfectly grounded, or "VERIFICATION STATUS: NEEDS REVISION" followed by specific points of discrepancy and suggestions for correction.
            Be very critical and explicit about any potential hallucinations or ungrounded statements.
            """
            verification_result = self._call_llm(verification_prompt)
            print(f"\n--- Verification Result: {verification_result} ---")

            if "VERIFICATION STATUS: OK" in verification_result:
                print("\nFlight plan verified. Ready to present.")
                return final_response

            print("\nWARNING: Hallucination detected or potential inaccuracy. Agent is attempting to re-evaluate and refine.")
            if attempt == AgentConfig.MAX_AGENT_RETRIES - 1:
                # Hand back the answer that was actually generated, not a
                # placeholder, so the caller can still inspect it.
                print("\nMax retries reached. Final (potentially unverified) response:")
                return final_response + f"\n\nNote: The agent could not fully verify this response due to: {verification_result}"

            user_query = f"Refine the flight plan for: '{original_query}' based on this feedback: {verification_result}"

        # Only reachable when MAX_AGENT_RETRIES is zero or negative.
        return "Failed to generate a verified flight plan after multiple retries."


# --- Main Execution ---
if __name__ == "__main__":
    # Demo driver: builds a tiny on-disk knowledge base, then runs the agent
    # on one answerable query and one the knowledge base cannot support.

    # Create a dummy knowledge base directory and files for demonstration.
    # Written as UTF-8 explicitly because load_documents() reads them back
    # as UTF-8; relying on the platform default could mismatch.
    os.makedirs(KNOWLEDGE_BASE_DIR, exist_ok=True)
    with open(os.path.join(KNOWLEDGE_BASE_DIR, "flight_regulations.txt"), "w", encoding="utf-8") as f:
        f.write("Standard flight regulations: Pilots must have valid licenses. Weather conditions must be VFR or IFR compliant. Fuel reserves are mandatory. Airspace classifications dictate specific procedures.")
    with open(os.path.join(KNOWLEDGE_BASE_DIR, "airport_schedules.txt"), "w", encoding="utf-8") as f:
        f.write("Montreal (YUL) operates 24/7. Toronto (YYZ) has peak hours from 0600-0900 and 1700-2000 local time. New York (JFK) has special noise abatement procedures.")

    # Explicitly create ONLY the yul_airport_map.png; retrieve_images() looks
    # for exactly this file name.
    yul_map_path = os.path.join(KNOWLEDGE_BASE_DIR, 'yul_airport_map.png')
    try:
        from PIL import Image, ImageDraw
        img = Image.new('RGB', (600, 400), color='lightblue')
        draw = ImageDraw.Draw(img)
        draw.ellipse((100, 100, 200, 200), fill='red', outline='red')  # Mock runway
        draw.rectangle((300, 150, 400, 250), fill='gray', outline='black')  # Mock terminal
        img.save(yul_map_path)
        print(f"Created dummy image: {yul_map_path}")
    except ImportError:
        print("Pillow not installed. Cannot create dummy image. Image retrieval will be less illustrative.")

    kb = MultimodalKnowledgeBase(VECTOR_DB_PATH)
    kb.load_documents(KNOWLEDGE_BASE_DIR)

    agent = FlightPlanningAgent(kb)

    # Query the canned knowledge base is able to ground.
    user_query_1 = "Plan a flight from Montreal to Toronto tomorrow, considering weather and airspace."
    plan_1 = agent.plan_flight(user_query_1)
    print(f"\nFinal Flight Plan 1: {plan_1}")

    # Query with no supporting data, to exercise the verification path.
    user_query_2 = "Tell me about the historical average rainfall in Paris in 1850 for flight planning purposes."
    plan_2 = agent.plan_flight(user_query_2)
    print(f"\nFinal Flight Plan 2: {plan_2}")

Google Generative AI configured successfully using Colab Secrets.
Gemini API configured with model: gemini-2.5-flash
Created dummy image: knowledge_base_documents/yul_airport_map.png
Loaded 2 text documents and 1 image documents.

--- Agent Starting: Processing query 'Plan a flight from Montreal to Toronto tomorrow, considering weather and airspace.' ---

--- Attempt 1/2 ---

--- LLM Call ---
Prompt (first 200 chars): 
            You are an AI flight planning assistant using Gemini 2.5 Flash. Your goal is to provide accurate, detailed, and hallucination-free flight plans.
            The user has provided the foll...
LLM Response (first 200 chars): The user requires a flight plan from Montreal to Toronto for "tomorrow," incorporating weather and airspace considerations. This involves gathering dynamic, real-time aviation data and applying a stru...
Agent's Initial Plan: The user requires a flight plan from Montreal to Toronto for "tomorrow," incorporating weather and airspace consider