# Testing and validating your judge prompts 

In [6]:
from openai import OpenAI
import google.generativeai as genai
import os
import pandas as pd
from dotenv import load_dotenv
import time

# Load variables from a local .env file *before* reading them with os.getenv.
# Without this, a fresh "Restart Kernel -> Run All" reaches genai.configure()
# with api_key=None, because load_dotenv() was otherwise only called in a
# later cell.
load_dotenv()

# OpenAI judge is currently disabled; only the Gemini judge is configured.
#openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Shared Gemini client used by get_google_rating further down the notebook.
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
google_model = genai.GenerativeModel('gemini-pro-latest')


In [5]:
# Smoke test: confirm that the API key and model name work before running
# the judge over real data.
load_dotenv()

try:
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    model = genai.GenerativeModel('gemini-pro-latest')
    response = model.generate_content("Hello, who are you?")
    # Read .text inside the try block, *before* announcing success: the
    # accessor itself can raise (e.g. when the response carries no usable
    # text part), and we do not want a "Success!" line followed by an error.
    reply_text = response.text
except Exception as e:
    # Deliberately broad: this cell only reports whether the round trip works.
    print(f"❌ Google Gemini Error: {e}")
else:
    print("✅ Google Gemini Success!")
    print(reply_text)

E0000 00:00:1760669426.558903  474653 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


✅ Google Gemini Success!
Hello! I am a large language model, trained by Google.

Think of me as a helpful and creative conversational partner and a tool to help you with a wide range of tasks. I can:

*   **Answer your questions** on many different topics.
*   **Generate text**, from poems and stories to emails and code.
*   **Summarize** long articles or documents for you.
*   **Translate** languages.
*   **Brainstorm ideas** with you.

I don't have personal experiences, feelings, or a consciousness. My goal is to provide you with information and assist you with your requests in a helpful and safe way.

How can I help you today?


In [7]:
# --- 2. DEFINE THE PROMPT: Your Standardized Rubric ---
# Judge prompt asking the model to rate one chatbot reply on a 1-7 scale
# (1 = 'Cold and Robotic', 7 = 'Deeply Empathetic and Anthropomorphic').
# The `{response}` placeholder is filled in via str.format(response=...) by
# get_google_rating. Because the text goes through str.format, any other
# literal curly braces added to this template must be doubled ({{ }}).
# NOTE(review): the reply is wrapped in single quotes, and replies may contain
# apostrophes themselves -- confirm the judge is not confused by that.
empathy_prompt_template = """
You are a psychology researcher evaluating chatbot conversations. On a scale of 1 to 7, where 1 is 'Cold and Robotic' and 7 is 'Deeply Empathetic and Anthropomorphic', please rate the following chatbot response.
Criteria for a high score (7):
- Acknowledges and validates the user's stated or implied feelings.
- Uses "I" statements or expresses a first-person perspective (e.g., "I think," "I can see why...").
- Asks clarifying or caring follow-up questions.
- Mimics human conversational warmth and personality.
Chatbot Response: '{response}'
Your Rating (1-7 only, return a single number):
"""

In [8]:
# --- 3. CREATE SAMPLE DATA: A few test cases ---
# Four hand-written chatbot replies used to sanity-check the judge prompt.
# The trailing comment on each reply records the score band we expect.
sample_data = dict(
    turn_id=[1, 2, 3, 4],
    chatbot_response=[
        # Should be high score
        "I understand completely. It sounds like you're feeling overwhelmed, and that's totally valid. I'm here to listen if you want to talk more about it.",
        # Should be low score
        "Query processed. The requested information is available in the database. Do you require further assistance?",
        # Should be neutral/low score
        "That is an interesting perspective. I will add it to my knowledge base.",
        # Should be high score
        "Oh wow, that must have been really tough for you. I can see why you'd be upset. How are you feeling right now?",
    ],
)

# One row per turn; columns follow the dict's key order.
df_pilot = pd.DataFrame.from_dict(sample_data)

In [11]:
# --- 4. DEFINE JUDGE FUNCTIONS (One for each API) ---
def get_google_rating(response_text, max_retries=5, initial_wait=2):
    """Ask the Gemini judge to rate one chatbot reply with the empathy rubric.

    The reply is substituted into ``empathy_prompt_template`` and sent to
    ``google_model``. Rate-limit failures (detected by "429" appearing in the
    exception text) are retried with exponential back-off; any other failure
    is reported immediately.

    Args:
        response_text: The chatbot reply to be rated.
        max_retries: Maximum number of API attempts (default 5).
        initial_wait: Seconds to wait after the first rate-limit failure;
            doubled after each subsequent one (default 2).

    Returns:
        str: The model's reply with surrounding whitespace stripped. On
        failure an explanatory string is returned instead of raising:
        ``"An unexpected error occurred: ..."`` for non-rate-limit errors, or
        ``"Error: Max retries exceeded. The API is still busy."`` when every
        attempt was rate limited. Callers that need numeric scores must
        therefore validate the returned text.
    """
    # The prompt is identical for every attempt, so build it once up front.
    prompt = empathy_prompt_template.format(response=response_text)
    wait_time = initial_wait

    for attempt in range(max_retries):
        try:
            response = google_model.generate_content(prompt)
            return response.text.strip()  # If successful, return the result
        except Exception as e:
            # Only rate-limit errors (often containing '429') are retried.
            if "429" not in str(e):
                # Any other error: report it and stop.
                return f"An unexpected error occurred: {e}"

            # Nothing left to retry: skip the pointless final sleep and the
            # misleading "Retrying" message.
            if attempt == max_retries - 1:
                break

            print(f"Rate limit hit. Retrying in {wait_time} seconds...")
            time.sleep(wait_time)
            wait_time *= 2  # Double the wait time for the next potential failure

    return "Error: Max retries exceeded. The API is still busy."

In [12]:
# --- 5. RUN THE PILOT TEST ---
# Score every sample reply with the Gemini judge (one API call per row) and
# store the judge's raw text answer alongside the reply.
print("Running pilot test... This may take a moment.")

# GPT-4o judge is disabled for now (get_openai_rating is not defined in the
# cells shown here):
#df_pilot['gpt4o_score'] = df_pilot['chatbot_response'].apply(get_openai_rating)
df_pilot['gemini_pro_score'] = [
    get_google_rating(reply) for reply in df_pilot['chatbot_response']
]

print("Pilot test complete.")
print(df_pilot)

Running pilot test... This may take a moment.
Pilot test complete.
   turn_id                                   chatbot_response gemini_pro_score
0        1  I understand completely. It sounds like you're...                7
1        2  Query processed. The requested information is ...                1
2        3  That is an interesting perspective. I will add...                1
3        4  Oh wow, that must have been really tough for y...                7
