In [60]:
import os
from IPython.display import display

from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# OpenRouter API key read from the environment; os.getenv returns None when
# OPENROUTER_API_KEY is not set. Used by artist() for its Authorization header.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")

import gradio as gr

In [None]:
## An agent that takes in technical questions, which may involve generating audio and images, and responds with an explanation where necessary
import os
import base64
from io import BytesIO
from PIL import Image
import requests

# Assuming you already have:
# openrouter_api_key = os.getenv('OPENROUTER_API_KEY')

def artist(city, timeout=120):
    """Generate a pop-art vacation image of ``city`` via OpenRouter and save it as a PNG.

    The image is requested from the ``google/gemini-2.5-flash-image`` model through
    OpenRouter's chat-completions endpoint, decoded from the returned base64 data
    URL, written to ``<cwd>/images/<city with spaces replaced by "_">.png`` and
    returned.

    Args:
        city: Name of the city to illustrate. Must be a non-empty string.
        timeout: Seconds to wait for the HTTP request before giving up. Without a
            timeout ``requests`` would wait indefinitely on a stalled connection.

    Returns:
        The decoded ``PIL.Image.Image``.

    Raises:
        ValueError: If ``city`` is empty/not a string, or the response contains no
            choices, no image, or an image that is not a base64 ``data:image`` URL.
        RuntimeError: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On network failures or timeout.
    """
    # Validate before spending an API call: a missing city would otherwise only
    # fail at the very end, when the file name is built.
    if not isinstance(city, str) or not city.strip():
        raise ValueError("city must be a non-empty string")

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {openrouter_api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "http://localhost",  # Optional: helps with OpenRouter rankings
        "X-Title": "Artist App"
    }

    prompt_text = (
        f"Generate a vibrant pop-art style vacation scene in {city}, "
        f"showing famous tourist spots, landmarks, unique culture, food, "
        f"energy, and atmosphere of the city. Bold colors, high contrast, fun and dynamic composition."
    )

    payload = {
        "model": "google/gemini-2.5-flash-image",
        "modalities": ["text", "image"],
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text}
                ]
            }
        ]
    }

    response = requests.post(url, headers=headers, json=payload, timeout=timeout)

    if response.status_code != 200:
        raise RuntimeError(f"API error {response.status_code}: {response.text}")

    result = response.json()

    choices = result.get("choices", [])
    if not choices:
        raise ValueError("No choices in response")

    message = choices[0].get("message", {})
    images = message.get("images", [])

    if not images:
        # Debug fallback: show what the model returned instead of an image.
        print("No images found. Message content:", message.get("content"))
        print("Full message keys:", list(message.keys()))
        raise ValueError("No image generated — check prompt/model/credits")

    # Only the first image is used. `or` guards against explicit nulls in the JSON.
    image_obj = images[0]
    image_data_url = (image_obj.get("image_url") or {}).get("url") or ""

    if not image_data_url.startswith("data:image"):
        raise ValueError(f"Unexpected image format: {image_data_url[:50]}...")

    # A data URL looks like "data:image/png;base64,<payload>"; keep the payload part.
    _, separator, base64_str = image_data_url.partition(",")
    if not separator:
        raise ValueError(f"Unexpected image format: {image_data_url[:50]}...")

    image_bytes = base64.b64decode(base64_str)
    img = Image.open(BytesIO(image_bytes))

    from pathlib import Path
    img_dir = Path.cwd() / "images"
    img_dir.mkdir(exist_ok=True)
    # NOTE(review): the city name is used verbatim in the file name (only spaces are
    # replaced), so a name containing path separators would not land in images/.
    img_path = img_dir / f"{city.replace(' ', '_')}.png"
    img.save(str(img_path))
    print(f"Image saved to: {img_path}")

    return img


In [73]:
# Tool: Text-to-Speech Converter
# Converts text into speech audio using Google Text-to-Speech (gTTS) library.

from pathlib import Path
from typing import Optional
from gtts import gTTS


def talker(text: str, lang: str = "en", output_path: Optional[str] = None) -> str:
    """Generate speech from text using Google Text-to-Speech (gTTS) and return the file path.

    Args:
        text: The text to speak. Must be a non-empty, non-blank string.
        lang: Language code passed to gTTS (default ``"en"``).
        output_path: Where to write the MP3. When omitted, the audio is written to
            ``<cwd>/audio/response.mp3``, which is overwritten on every call.

    Returns:
        The path of the written audio file as a string (``output_path`` unchanged
        when it was supplied).

    Raises:
        ValueError: If ``text`` is missing, not a string, or only whitespace.
    """
    # isinstance check first: a tool call may deliver None instead of a string,
    # which would otherwise fail with an AttributeError on .strip().
    if not isinstance(text, str) or not text.strip():
        raise ValueError("Text cannot be empty.")

    audio_path = output_path or str(Path.cwd() / "audio" / "response.mp3")

    # Create the directory that will actually receive the file, whether it is the
    # default audio/ folder or the parent of a caller-supplied path.
    Path(audio_path).parent.mkdir(parents=True, exist_ok=True)

    tts = gTTS(text=text, lang=lang)
    tts.save(audio_path)

    return audio_path


In [None]:
# Demo price table keyed by lower-case city name; a real application would query
# a flight-booking service instead.
ticket_prices = {
    "london": "$799",
    "paris": "$899",
    "tokyo": "$1400",
    "new york": "$999",
    "nairobi": "$100",
    "berlin": "$499",
}


def get_ticket_price(destination_city):
    """Return a sentence stating the ticket price for ``destination_city``.

    The lookup ignores case and surrounding whitespace. Cities missing from
    ``ticket_prices`` (and non-string inputs such as ``None``) yield
    "Unknown ticket price" instead of raising.

    Args:
        destination_city: Name of the city the customer wants to travel to.

    Returns:
        A string of the form "The price of a ticket to <city> is <price>".
    """
    print(f"Tool called for city {destination_city}")
    # Tool arguments come from the model and may be missing or padded with spaces.
    if isinstance(destination_city, str):
        city_name = destination_city.strip()
    else:
        city_name = str(destination_city)
    price = ticket_prices.get(city_name.lower(), "Unknown ticket price")
    return f"The price of a ticket to {city_name} is {price}"

### Tools for our multimodal AI

In [42]:
# JSON-schema descriptions of the callable tools, in the format expected by the
# chat-completions `tools` parameter. Every schema marks its single argument as
# required so the model cannot emit a call without it.
price_function = {
    "name": "get_ticket_price",
    "description": "Get the price of a return ticket to the destination city.",
    "parameters": {
        "type": "object",
        "properties": {
            "destination_city": {
                "type": "string",
                "description": "The city that the customer wants to travel to",
            },
        },
        "required": ["destination_city"],
        "additionalProperties": False
    }
}


artist_function = {
    "name": "artist",
    "description": "Generate a vibrant pop-art style vacation scene in a city, showing famous tourist spots, landmarks, unique culture, food, energy, and atmosphere of the city. Bold colors, high contrast, fun and dynamic composition.",
    "parameters": {
        "type": "object",
        "properties": {
            "city": {"type": "string", "description": "The name of the city to generate an image for."}
        },
        "required": ["city"],
        "additionalProperties": False
    }
}

talker_function = {
    "name": "talker",
    "description": "Convert text into speech audio using Google Text-to-Speech (gTTS) library.",
    "parameters": {
        "type": "object",
        "properties": {"text": {"type": "string", "description": "The text to convert to speech."}},
        "required": ["text"],
        "additionalProperties": False
    }
}

# The list passed as `tools=` to the model; each entry wraps one schema above.
tools = [
    {"type": "function", "function": price_function},
    {"type": "function", "function": artist_function},
    {"type": "function", "function": talker_function}
]



In [None]:
import json

def _required_argument(arguments, key):
    """Return ``arguments[key]``; raise ValueError if it is missing or a blank/non-string value."""
    value = arguments.get(key)
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"Missing required argument '{key}'")
    return value


def handle_tool_call(message):
    """Run the first tool call requested in ``message`` and build the tool reply.

    Dispatches on the tool name to ``get_ticket_price``, ``artist`` or ``talker``.
    Malformed arguments, missing arguments and exceptions raised by the tool are
    reported back in the reply's ``content`` instead of aborting the conversation,
    so the model can explain the problem to the user.

    Only ``message.tool_calls[0]`` is handled; callers that receive several tool
    calls in one message must invoke this once per call.

    Args:
        message: Assistant message exposing ``tool_calls``, whose items carry
            ``id``, ``function.name`` and ``function.arguments`` (a JSON string).

    Returns:
        A dict with ``role`` "tool", the textual ``content`` and the
        ``tool_call_id`` of the handled call.
    """
    tool_call = message.tool_calls[0]
    name = tool_call.function.name

    try:
        # Treat an empty/None arguments payload as "no arguments".
        arguments = json.loads(tool_call.function.arguments or "{}")
        if not isinstance(arguments, dict):
            raise ValueError("Tool arguments must be a JSON object")

        if name == "get_ticket_price":
            city = _required_argument(arguments, "destination_city")
            content = get_ticket_price(city)

        elif name == "artist":
            city = _required_argument(arguments, "city")
            artist(city)
            # Mirrors the file name artist() writes inside its images/ directory.
            img_path = f"images/{city.replace(' ', '_')}.png"
            content = f"Image of {city} has been generated and saved to {img_path}"

        elif name == "talker":
            text = _required_argument(arguments, "text")
            audio_path = talker(text)
            content = f"Audio has been saved to {audio_path}"

        else:
            content = f"Unknown tool: {name}"

    except Exception as exc:
        # Boundary between tools and the model: surface the failure in the notebook
        # output and hand it to the model rather than crashing the chat turn.
        print(f"Tool {name} failed: {exc}")
        content = f"Error running tool {name}: {exc}"

    return {
        "role": "tool",
        "content": content,
        "tool_call_id": tool_call.id
    }


In [None]:
from openai import OpenAI


# OpenRouter API base URL; the OpenAI client below is pointed at it.
base_url = "https://openrouter.ai/api/v1"	

# OpenRouter key read from the environment (os.getenv returns None when unset).
api_key = os.getenv("OPENROUTER_API_KEY")

# Client instance used by chat(); note that `openai` here is the client object,
# not the `openai` module.
openai = OpenAI(base_url=base_url, api_key=api_key)

# System prompt that chat() sends as the first message of every request.
# NOTE(review): the prompt contains typos ("waants", "provie") and states the
# file-path instruction twice; it is runtime text, so it is left unchanged here.
system_message = """You are a helpful assistant that can answer questions and help with tasks that require text to speech and image generation.
If user waants an image or audio, you should use the tools provided to you before responding to the user. 
Provide path to the image and audio files in your response.
Always provie path to the image and audio files in your response if generated e.g., Audio has been saved to audio/response.mp3
"""


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content message dicts."""
    converted = []
    for entry in history or []:
        if isinstance(entry, dict):
            # Messages-style history: keep only the keys the API accepts.
            converted.append({"role": entry["role"], "content": entry["content"]})
        else:
            # Legacy pair-style history: (user_text, assistant_text).
            user_text, assistant_text = entry
            if user_text is not None:
                converted.append({"role": "user", "content": user_text})
            if assistant_text is not None:
                converted.append({"role": "assistant", "content": assistant_text})
    return converted


def chat(message, history):
    """Answer a user message, running any tools the model asks for.

    Builds the conversation (system prompt, prior history, new user message),
    queries the model with the available tools and, while the model keeps
    requesting tools, executes every requested call and feeds the results back.

    Args:
        message: The new user message.
        history: Previous turns as supplied by Gradio, either dicts with
            ``role``/``content`` or legacy ``(user, assistant)`` pairs.

    Returns:
        The text content of the model's final reply.
    """
    from types import SimpleNamespace

    model = "gpt-4o-mini"
    max_tool_rounds = 5  # upper bound so a misbehaving model cannot loop forever

    messages = (
        [{"role": "system", "content": system_message}]
        + _history_to_messages(history)
        + [{"role": "user", "content": message}]
    )
    response = openai.chat.completions.create(model=model, messages=messages, tools=tools)

    for round_index in range(max_tool_rounds):
        if response.choices[0].finish_reason != "tool_calls":
            break

        assistant_message = response.choices[0].message
        messages.append(assistant_message)

        # Every tool call in the assistant message needs its own tool reply.
        # handle_tool_call only looks at tool_calls[0], so each call is wrapped in
        # a one-element stand-in message.
        for tool_call in assistant_message.tool_calls or []:
            messages.append(handle_tool_call(SimpleNamespace(tool_calls=[tool_call])))

        # In the last permitted round the tools are withheld so the model has to
        # answer in plain text.
        is_last_round = round_index == max_tool_rounds - 1
        extra = {} if is_last_round else {"tools": tools}
        response = openai.chat.completions.create(model=model, messages=messages, **extra)

    return response.choices[0].message.content

In [None]:
# Build the Gradio chat UI around chat() and start it.
# NOTE(review): the history format handed to `chat` depends on the installed Gradio
# version / its `type` setting — confirm it matches what `chat` expects.
gr.ChatInterface(fn=chat).launch()