<a href="https://colab.research.google.com/github/frank-morales2020/MLxDL/blob/main/LLM_Business_Task_Executor_(Python)_with_Metrics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install xai-sdk -q

In [5]:
import requests
import json
import time
import os # Import os for environment variables
import re # For regular expressions to parse code blocks
import io # For capturing stdout/stderr
import contextlib # For redirecting stdout/stderr


# Import for Google Colab user data (for API key retrieval)
try:
    from google.colab import userdata
    COLAB_ENV = True
except ImportError:
    COLAB_ENV = False
    print("Not in Google Colab environment. API keys will be read from environment variables or require manual setting.")

# Import for OpenAI API
try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    print("OpenAI library not found. OpenAI models will not be available.")

# Import for xAI SDK (Grok)
try:
    from xai_sdk import Client
    from xai_sdk.chat import user, system # Assuming these are used as per your example
    XAI_AVAILABLE = True
except ImportError:
    XAI_AVAILABLE = False
    print("xai_sdk library not found. Grok models will not be available.")


class AgentConfig:
    """Default model identifiers for the three providers compared by this script."""
    # Default model names if not specified
    GEMINI_MODEL_NAME: str = "gemini-2.5-flash"
    # 'gpt-5' is used as per the user's request. The name is passed to the
    # OpenAI API as-is and this script has no fallback model, so if the
    # account cannot use it, replace the value here (e.g. with "gpt-4o-mini").
    GPT_MODEL_NAME: str = "gpt-5"
    # Grok (xAI) model name
    GROK_MODEL_NAME: str = "grok-4-0709"

# Global dictionary to store overall metrics.
# Layout: overall_metrics_report[task_name][model_name][metric_name] -> value
# (written by evaluate_response, read by generate_overall_report).
overall_metrics_report = {}


# (connect, read) timeouts in seconds for the raw Gemini HTTP request, so a
# stalled connection surfaces as a retryable error instead of hanging forever.
_GEMINI_TIMEOUT_SECONDS = (10, 180)


def _load_api_key(colab_secret: str, env_var: str, label: str):
    """
    Looks up the API key for one provider.

    Args:
        colab_secret: Name of the Colab secret read when COLAB_ENV is true.
        env_var: Name of the environment variable read otherwise.
        label: How the key is referred to in the error message.

    Returns:
        A (key, error) tuple. `error` is None unless reading the Colab secret
        raised; in that case `key` is None and `error` is the message to hand
        back to the caller. `key` may still be empty/None when it is not set.
    """
    if not COLAB_ENV:
        return os.getenv(env_var), None  # For local execution
    try:
        return userdata.get(colab_secret), None
    except Exception as e:
        return None, f"Error: Could not retrieve {label} from Colab userdata. Please ensure it's set. {e}"


def _generate_with_gemini(prompt: str, model_name: str) -> str:
    """Sends `prompt` to the Gemini REST endpoint and returns the first text part."""
    google_api_key, key_error = _load_api_key('GEMINI', 'GOOGLE_API_KEY', 'GEMINI API key')
    if key_error:
        return key_error
    if not google_api_key:
        return "Error: GOOGLE_API_KEY is not set for Gemini. Please provide your Google API key (e.g., in Colab Secrets as 'GEMINI' or as an env var)."

    payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
    api_url = f"https://generativelanguage.googleapis.com/v1beta/models/{model_name}:generateContent"

    response = requests.post(
        api_url,
        # The key travels in a header rather than the query string: requests
        # includes the URL in its error text, and that text is printed and
        # returned by generate_content, which would expose the key.
        headers={'Content-Type': 'application/json', 'x-goog-api-key': google_api_key},
        data=json.dumps(payload),
        timeout=_GEMINI_TIMEOUT_SECONDS,
    )
    response.raise_for_status()

    result = response.json()

    # Walk candidates[0].content.parts[0].text, tolerating missing levels.
    candidates = result.get('candidates')
    if candidates:
        candidate = candidates[0]
        if candidate and candidate.get('content'):
            parts = candidate['content'].get('parts')
            if parts:
                return parts[0].get('text', '')

    return f"Error: Unexpected Gemini API response structure or missing content for model {model_name}."


def _generate_with_openai(prompt: str, model_name: str) -> str:
    """Sends `prompt` as a single user message through the OpenAI client."""
    if not OPENAI_AVAILABLE:
        return "Error: OpenAI library not available. Cannot use GPT models."

    openai_api_key, key_error = _load_api_key('OPENAI_API_KEY', 'OPENAI_API_KEY', 'OPENAI_API_KEY')
    if key_error:
        return key_error
    if not openai_api_key:
        return "Error: OPENAI_API_KEY is not set for GPT. Please provide your OpenAI API key (e.g., in Colab Secrets as 'OPENAI_API_KEY' or as an env var)."

    client = OpenAI(api_key=openai_api_key)
    response = client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )

    if response.choices and response.choices[0].message:
        content = response.choices[0].message.content
        # A message without text content is treated as missing content so the
        # caller always receives a string.
        if content is not None:
            return content
    return f"Error: Unexpected OpenAI API response structure or missing content for model {model_name}."


def _generate_with_grok(prompt: str, model_name: str) -> str:
    """Sends `prompt` through an xai_sdk chat session and returns the sampled content."""
    if not XAI_AVAILABLE:
        return "Error: xai_sdk library not available. Cannot use Grok models."

    xai_api_key, key_error = _load_api_key('XAI_KEY', 'XAI_KEY', 'XAI_KEY')
    if key_error:
        return key_error
    if not xai_api_key:
        return "Error: XAI_KEY is not set for Grok. Please provide your xAI API key (e.g., in Colab Secrets as 'XAI_KEY' or as an env var)."

    client = Client(
        api_host="api.x.ai",
        api_key=xai_api_key
    )

    # Create a chat session, add the user message, then sample one response.
    chat_session = client.chat.create(model=model_name, temperature=0)
    chat_session.append(user(prompt))
    grok_response = chat_session.sample()

    if grok_response and hasattr(grok_response, 'content') and grok_response.content is not None:
        return grok_response.content

    # If content extraction fails, print the full response object for debugging
    print(f"DEBUG: Grok response object structure: {grok_response}")
    return f"Error: Unexpected Grok API response structure or missing content for model {model_name}. See console for DEBUG info."


def generate_content(prompt: str, model_name: str) -> str:
    """
    Calls the specified LLM API (Gemini, OpenAI, or xAI Grok) to generate content.
    Includes exponential backoff for retries.

    The provider is chosen from the prefix of `model_name` ("gemini", "gpt" or
    "grok"). Failures are reported through the return value, never raised:
    configuration problems (missing library, missing key, unsupported model)
    return immediately, while exceptions raised by the API call are retried up
    to 5 attempts with a delay that starts at 1 second and doubles each time.

    Args:
        prompt: The text prompt to send to the LLM.
        model_name: The model identifier; its prefix selects the provider.

    Returns:
        The generated text from the LLM, or an error message (starting with
        "Error:") if generation fails.
    """
    MAX_RETRIES = 5
    delay = 1  # 1 second, doubled after every failed attempt

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            if model_name.startswith("gemini"):
                return _generate_with_gemini(prompt, model_name)
            if model_name.startswith("gpt"):
                return _generate_with_openai(prompt, model_name)
            if model_name.startswith("grok"):
                return _generate_with_grok(prompt, model_name)
            return f"Error: Unsupported model name: {model_name}. Please use 'gemini-...', 'gpt-...','grok-...' models."

        # Listed before RequestException: the JSON error raised by current
        # requests releases subclasses both, and a malformed body should be
        # reported here rather than retried.
        except json.JSONDecodeError as e:
            print(f"Error: Failed to parse JSON response for {model_name}. {e}")
            return f"Error: Failed to parse JSON response for {model_name}. {e}"
        except requests.exceptions.RequestException as e:
            if attempt < MAX_RETRIES:
                print(f"Warning: API request failed for {model_name}. Retrying in {delay} seconds... (Attempt {attempt}/{MAX_RETRIES})")
                time.sleep(delay)
                delay *= 2
            else:
                print(f"Error: Max retries reached. Failed to generate content for {model_name}. {e}")
                return f"Error: Failed to get response after multiple retries for {model_name}. {e}"
        except Exception as e:  # Catch broader exceptions for other API errors
            if attempt < MAX_RETRIES:
                print(f"Warning: An unexpected error occurred for {model_name}. Retrying in {delay} seconds... (Attempt {attempt}/{MAX_RETRIES})")
                time.sleep(delay)
                delay *= 2
            else:
                print(f"Error: Max retries reached. An unexpected error occurred for {model_name}. {e}")
                return f"Error: An unexpected error occurred after multiple retries for {model_name}. {e}"


def extract_python_code(text: str) -> str:
    """
    Extracts the first Python code block from a given text.

    Three strategies are tried in order:
      1. A fenced block opened with ```python.
      2. Any fenced block, with or without a language tag.
      3. A heuristic for unfenced code: start at the first line that looks like
         Python and keep collecting lines while they are indented, blank, or
         themselves start with a recognised Python prefix.

    Args:
        text: The raw LLM response.

    Returns:
        The extracted code with surrounding whitespace stripped, or "" when
        nothing that looks like code is found.
    """
    # Regex 1: Find ```python\n[CODE]```
    match = re.search(r'```python\n(.*?)```', text, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Regex 2: Fallback to generic ```[optional_lang]\n[CODE]```
    match = re.search(r'```(?:\w*\s*\n)?(.*?)```', text, re.DOTALL)
    if match:
        return match.group(1).strip()

    # Heuristic fallback for unfenced code.
    # Prefixes that mark a line as Python (checked after stripping whitespace).
    python_prefixes = ("def ", "import ", "class ", "for ", "while ", "if ", "return ", "print(", "#", "try:", "except:", "finally:", "with ")
    indent_chars = (" ", "\t")

    lines = text.splitlines()
    code_lines = []  # non-empty once the start of a block has been found

    for i, line in enumerate(lines):
        stripped_line = line.strip()
        is_python_start = stripped_line.startswith(python_prefixes)
        is_indented = line.startswith(indent_chars)

        if not code_lines:
            # Look for the start of a block: a Python-looking line, or an
            # indented line that follows a blank or indented line.
            after_blank_or_indented = i > 0 and (
                lines[i - 1].strip() == "" or lines[i - 1].startswith(indent_chars)
            )
            if is_python_start or (is_indented and after_blank_or_indented):
                code_lines.append(line)
        elif is_indented or stripped_line == "" or is_python_start:
            # Top-level statements that look like Python (a second `def`, an
            # `import`, a `print(` call) continue the block; otherwise an
            # import followed by a function would be cut after the import.
            code_lines.append(line)
        else:
            # A non-indented line that does not look like Python ends the block.
            break

    if code_lines:
        # Sanity check: require at least one strong Python indicator.
        combined_code = "\n".join(code_lines).strip()
        if re.search(r'\b(def|import|class|for|while|if|return)\b', combined_code):
            return combined_code

    return ""


def execute_python_code(code_string: str) -> tuple[str, str]:
    """
    Executes a Python code string and captures its stdout and stderr.

    The code runs in a fresh namespace of its own (with __name__ set to
    "__main__" so script-style guards run). Using one dict for both globals
    and locals lets functions defined by the snippet call themselves or each
    other, and keeps the snippet from reading or changing this module's names.

    NOTE: this runs model-generated code via exec() with no sandboxing; only
    use it in a disposable environment.

    Args:
        code_string: The Python source to run.

    Returns:
        A tuple of (stdout, stderr). An exception raised by the code is
        reported as an "Execution Error: ..." line appended to stderr. Empty
        input returns ("", "No code to execute.").
    """
    if not code_string:
        return "", "No code to execute."

    captured_stdout = io.StringIO()
    captured_stderr = io.StringIO()
    namespace = {"__name__": "__main__"}

    try:
        # Redirect stdout and stderr for the duration of the run
        with contextlib.redirect_stdout(captured_stdout), contextlib.redirect_stderr(captured_stderr):
            exec(code_string, namespace)
    except SystemExit as e:
        # The snippet called exit(); do not let that stop the caller. A zero
        # or missing exit code is a normal finish, anything else is an error.
        if e.code not in (None, 0):
            captured_stderr.write(f"Execution Error: SystemExit({e.code})\n")
    except Exception as e:
        captured_stderr.write(f"Execution Error: {e}\n")

    return captured_stdout.getvalue(), captured_stderr.getvalue()


def evaluate_response(task_name: str, model_name: str, response_text: str, original_prompt: str):
    """
    Calculates and prints metrics for the LLM response and stores them globally.

    Metrics are written to overall_metrics_report[task_name][model_name]:
      - "word_count": number of whitespace-separated tokens in the response.
      - "response": the response text itself.
    For the "One-shot Vibe Coding" task the response's code is also extracted
    and executed, adding "code_extracted", "code_output", "code_error" and
    "code_executable" (True when code was found and ran with empty stderr).

    Args:
        task_name: Label of the task the response belongs to.
        model_name: Model that produced the response.
        response_text: The model's response; None is treated as an empty response.
        original_prompt: The prompt that was sent (currently not used).
    """
    print(f"\n--- Metrics for {model_name} on '{task_name}' ---")

    # A missing response is scored as empty text instead of failing on .split().
    if response_text is None:
        response_text = ""

    # Create the task and model entries on first use.
    model_metrics = overall_metrics_report.setdefault(task_name, {}).setdefault(model_name, {})

    # Metric 1: Response Length (Word Count)
    word_count = len(response_text.split())
    print(f"Word Count: {word_count}")
    model_metrics["word_count"] = word_count
    model_metrics["response"] = response_text

    # Metric 2: Code Executability (for 'One-shot Vibe Coding')
    if task_name == "One-shot Vibe Coding":
        print("Attempting to execute generated Python code...")
        extracted_code = extract_python_code(response_text)
        stdout, stderr = "", ""
        code_executable = False

        if extracted_code:
            print(f"Extracted Code:\n{extracted_code[:200]}...") # Print first 200 chars
            stdout, stderr = execute_python_code(extracted_code)
            if stdout:
                print(f"Code Output (stdout):\n{stdout}")
            if stderr:
                print(f"Code Errors (stderr):\n{stderr}")
            else:
                # Anything written to stderr counts as a failed run.
                if stdout:
                    print("Code executed with output and no errors.")
                else:
                    print("Code executed with no output and no errors.")
                code_executable = True
        else:
            print("No Python code block found in the response.")

        model_metrics["code_extracted"] = bool(extracted_code)
        model_metrics["code_output"] = stdout
        model_metrics["code_error"] = stderr
        model_metrics["code_executable"] = code_executable

    print("-" * 50)


def _execute_task(task_name: str, prompt: str, model_name: str) -> str:
    """
    Runs one task for one model: announces it, generates the response,
    records its metrics under `task_name`, and returns the response.
    """
    print(f"\n--- Executing {task_name} Task with {model_name} ---")
    print(f"Prompt: {prompt}")
    response = generate_content(prompt, model_name)
    evaluate_response(task_name, model_name, response, prompt)
    return response


def execute_client_emails(prompt: str, model_name: str) -> str:
    """Executes the Client Emails task using a specified model."""
    return _execute_task("Client Emails", prompt, model_name)


def execute_ad_analysis(prompt: str, model_name: str) -> str:
    """Executes the Ad Analysis task using a specified model."""
    return _execute_task("Ad Analysis", prompt, model_name)


def execute_social_writing(prompt: str, model_name: str) -> str:
    """Executes the Social Writing task using a specified model."""
    return _execute_task("Social Writing", prompt, model_name)


def execute_strategic_planning(prompt: str, model_name: str) -> str:
    """Executes the Strategic Planning task using a specified model."""
    return _execute_task("Strategic Planning", prompt, model_name)


def execute_info_retrieval(prompt: str, model_name: str) -> str:
    """Executes the Info Retrieval task using a specified model."""
    return _execute_task("Info Retrieval", prompt, model_name)


def execute_one_shot_vibe_coding(prompt: str, model_name: str) -> str:
    """Executes the One-shot Vibe Coding task using a specified model."""
    return _execute_task("One-shot Vibe Coding", prompt, model_name)

def generate_overall_report():
    """
    Generates and prints an overall report based on collected metrics.

    For every task in overall_metrics_report this prints one line per model
    with its word count, then the least and most verbose models (the first
    model wins ties; "most verbose" is skipped when every count is 0). For
    the "One-shot Vibe Coding" task each line also shows whether the code was
    executable and, for every model whose code was not, the first line of its
    recorded error; the first executable model is named best for executability.
    """
    print("\n\n" + "="*70)
    print("                 ✨ Overall LLM Performance Report ✨")
    print("="*70 + "\n")

    for task, models_data in overall_metrics_report.items():
        print(f"--- Task: {task} ---")
        is_coding_task = task == "One-shot Vibe Coding"

        counted = []  # (model, word_count) for models with an integer count
        best_coding_model = None

        for model, metrics in models_data.items():
            word_count = metrics.get("word_count", "N/A")
            line = f"  - {model}: Word Count = {word_count}"
            if isinstance(word_count, int):
                counted.append((model, word_count))

            if is_coding_task:
                is_executable = metrics.get("code_executable", False)
                line += f", Executable = {is_executable}"
                if is_executable:
                    if best_coding_model is None: # Prioritize the first one that works
                        best_coding_model = model
                else:
                    # Show the error for every failing model, regardless of
                    # whether an earlier model already succeeded.
                    error_msg = (metrics.get("code_error", "No error reported") or "").strip()
                    if error_msg:
                        line += f" (Error: {error_msg.splitlines()[0]})"

            print(line)

        # Print "best" based on metrics
        if counted:
            least_model, least_count = min(counted, key=lambda pair: pair[1])
            print(f"  📝 **Least verbose response:** {least_model} (Word Count: {least_count})")
            most_model, most_count = max(counted, key=lambda pair: pair[1])
            if most_count != 0:
                print(f"  📈 **Most verbose response:** {most_model} (Word Count: {most_count})")

        if is_coding_task:
            if best_coding_model:
                print(f"  ✅ **Best for Code Executability:** {best_coding_model} (Code executed successfully)")
            else:
                print("  ❌ **No model generated executable code for this task.**")
        print("\n" + "-"*60 + "\n") # Separator between tasks


if __name__ == "__main__":
    print("Welcome to the LLM Business Task Executor (Python) with GPT, Gemini, and Grok!")

    # Task label -> prompt sent to every model
    tasks = {
        "Client Emails": "Draft an email to a client named Alex confirming the receipt of their inquiry about our new software and informing them that a specialist will contact them within 24 hours.",
        "Ad Analysis": "Analyze the following ad copy: 'Revolutionize your workflow with FlowPro! Sign up now and get 20% off.' Evaluate its strengths, weaknesses, and target audience appeal.",
        "Social Writing": "Create a LinkedIn post announcing a new whitepaper on 'The Future of AI in Business.' Include a call to action to download it.",
        "Strategic Planning": "Propose three innovative ideas for a small e-commerce business to increase customer engagement in the next six months.",
        "Info Retrieval": "Summarize the key differences between agile and waterfall project management methodologies.",
        "One-shot Vibe Coding": "Write a Python function to calculate the factorial of a number. Include an example usage of the function that prints the factorial of 5. **Please enclose all code within a Markdown code block, like this: ```python\\n[your code here]\\n```**"
    }

    # Provider label -> model identifier to benchmark
    models_to_test = {
        "Gemini": AgentConfig.GEMINI_MODEL_NAME,
        "GPT": AgentConfig.GPT_MODEL_NAME,
        "Grok": AgentConfig.GROK_MODEL_NAME
    }

    # Task label -> function that runs it; tasks without an entry are skipped
    task_runners = {
        "Client Emails": execute_client_emails,
        "Ad Analysis": execute_ad_analysis,
        "Social Writing": execute_social_writing,
        "Strategic Planning": execute_strategic_planning,
        "Info Retrieval": execute_info_retrieval,
        "One-shot Vibe Coding": execute_one_shot_vibe_coding,
    }

    # Run every task against every model, task by task
    for task_name, prompt_text in tasks.items():
        run_task = task_runners.get(task_name)
        if run_task is None:
            continue
        for model_name in models_to_test.values():
            run_task(prompt_text, model_name)

    # Generate the final overall report
    generate_overall_report()

    print("\n--- All tasks and report generation complete ---")


Welcome to the LLM Business Task Executor (Python) with GPT, Gemini, and Grok!

--- Executing Client Emails Task with gemini-2.5-flash ---
Prompt: Draft an email to a client named Alex confirming the receipt of their inquiry about our new software and informing them that a specialist will contact them within 24 hours.

--- Metrics for gemini-2.5-flash on 'Client Emails' ---
Word Count: 104
--------------------------------------------------

--- Executing Client Emails Task with gpt-5 ---
Prompt: Draft an email to a client named Alex confirming the receipt of their inquiry about our new software and informing them that a specialist will contact them within 24 hours.

--- Metrics for gpt-5 on 'Client Emails' ---
Word Count: 71
--------------------------------------------------

--- Executing Client Emails Task with grok-4-0709 ---
Prompt: Draft an email to a client named Alex confirming the receipt of their inquiry about our new software and informing them that a specialist will contact 