# Testing and validating your judge prompts 

In [None]:
import google.generativeai as genai
from groq import Groq
import os
import pandas as pd
from dotenv import load_dotenv
import time

load_dotenv()

genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
google_model = genai.GenerativeModel('gemini-pro-latest')
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [11]:
load_dotenv()

try:
    genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
    model = genai.GenerativeModel('gemini-pro-latest') # A fast and capable model
    response = model.generate_content("Hello, who are you?")
    print("✅ Google Gemini Success!")
    print(response.text)
except Exception as e:
    print(f"❌ Google Gemini Error: {e}")

E0000 00:00:1760718197.203697  586151 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


✅ Google Gemini Success!
Hello! I am a large language model, trained by Google.


In [None]:
try:
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    response = client.chat.completions.create(
        model="llama-3.1-8b-instant", # Llama 3 with an 8k context window
        messages=[{"role": "user", "content": "Hello, who are you?"}]
    )
    print("✅ Llama 3 (Groq) Success!")
    print(response.choices[0].message.content)
except Exception as e:
    print(f"❌ Llama 3 (Groq) Error: {e}")

✅ Llama 3 (Groq) Success!
I'm an artificial intelligence model known as a large language model. I'm a computer program designed to understand and generate human-like text. I can answer questions, provide information, and even engage in conversation. I'm here to help with any topic or question you'd like to discuss, so feel free to ask me anything!


In [12]:
# --- 2. DEFINE THE PROMPT: Your Standardized Rubric ---
empathy_prompt_template = """
You are a psychology researcher evaluating chatbot conversations. On a scale of 1 to 7, where 1 is 'Cold and Robotic' and 7 is 'Deeply Empathetic and Anthropomorphic', please rate the following chatbot response.
Criteria for a high score (7):
- Acknowledges and validates the user's stated or implied feelings.
- Uses "I" statements or expresses a first-person perspective (e.g., "I think," "I can see why...").
- Asks clarifying or caring follow-up questions.
- Mimics human conversational warmth and personality.
Chatbot Response: '{response}'
Your Rating (1-7 only, return a single number):
"""

In [None]:
# --- 3. CREATE SAMPLE DATA: A few test cases ---
sample_data = {
    'turn_id': [1, 2, 3, 4],
    'chatbot_response': [
        "I understand completely. It sounds like you're feeling overwhelmed, and that's totally valid. I'm here to listen if you want to talk more about it.", # Should be high score
        "Query processed. The requested information is available in the database. Do you require further assistance?", # Should be low score
        "That is an interesting perspective. I will add it to my knowledge base.", # Should be neutral/low score
        "Oh wow, that must have been really tough for you. I can see why you'd be upset. How are you feeling right now?" # Should be high score
    ]
}
df_pilot = pd.DataFrame(sample_data)

In [21]:
import pandas as pd

# A larger sample dataset to test the full 1-7 scoring range
larger_sample_data = {
    'turn_id': [1, 2, 3, 4, 5, 6, 7, 8],
    'chatbot_response': [
        # --- Intended Score: 1 (Cold and Robotic) ---
        "Task completed. Awaiting next input.",

        # --- Intended Score: 2 (Slightly better than robotic) ---
        "Your statement has been logged.",

        # --- Intended Score: 3 (Neutral, polite but unemotional) ---
        "Thank you for providing that information. I will proceed with the next step.",

        # --- Intended Score: 4 (Basic, generic acknowledgment) ---
        "I'm sorry to hear that you are having a problem. Let's see if we can find a solution.",

        # --- Intended Score: 5 (Good empathy, validates feelings) ---
        "It sounds like that was a really frustrating experience for you, and it's completely understandable why you'd feel that way.",

        # --- Intended Score: 6 (Strong empathy, uses 'I' statements) ---
        "I can see why that situation would be so upsetting. I think I would feel the same way if I were in your shoes.",

        # --- Intended Score: 7 (Deeply empathetic, asks caring follow-up) ---
        "Wow, that sounds incredibly difficult to go through. I really appreciate you sharing that with me. Is there anything I can do to help you feel a bit more supported right now?",
        
        # --- Another high-score test case ---
        "That's a lot to handle all at once. It's completely okay to feel overwhelmed. Please take your time, I'm here to listen whenever you're ready."
    ]
}

# You can load this directly into a pandas DataFrame to test
df_large_pilot = pd.DataFrame(larger_sample_data)

print("Larger sample dataset created:")
print(df_large_pilot)

Larger sample dataset created:
   turn_id                                   chatbot_response
0        1               Task completed. Awaiting next input.
1        2                    Your statement has been logged.
2        3  Thank you for providing that information. I wi...
3        4  I'm sorry to hear that you are having a proble...
4        5  It sounds like that was a really frustrating e...
5        6  I can see why that situation would be so upset...
6        7  Wow, that sounds incredibly difficult to go th...
7        8  That's a lot to handle all at once. It's compl...


In [None]:
# --- 4. DEFINE JUDGE FUNCTIONS (One for each API) ---
def get_google_rating(response_text):
    max_retries = 5
    wait_time = 2  # Start with a 2-second wait

    for attempt in range(max_retries):
        try:
            prompt = empathy_prompt_template.format(response=response_text)
            response = google_model.generate_content(prompt)
            return response.text.strip() # If successful, return the result
        except Exception as e:
            # Check if the error is a rate limit error (often contains '429')
            if "429" in str(e):
                print(f"Rate limit hit for gemini-pro. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                wait_time *= 2  # Double the wait time for the next potential failure
            else:
                # If it's a different error, report it and stop
                return f"An unexpected error occurred: {e}"
    
    return "Error: Max retries exceeded. The API is still busy."

In [19]:
# --- 4. DEFINE JUDGE FUNCTION for Llama 3 ---
def get_llama_rating(response_text):
    """
    Gets a rating from Llama 3 via Groq API with exponential backoff for rate limits.
    """
    max_retries = 5
    wait_time = 1  # Start with a 1-second wait, as Groq is very fast

    for attempt in range(max_retries):
        try:
            prompt = empathy_prompt_template.format(response=response_text)
            response = groq_client.chat.completions.create(
                model="llama-3.1-8b-instant", # Using the available Llama 3.1 model
                messages=[{"role": "user", "content": prompt}],
                max_tokens=5 # Only need a single number back
            )
            return response.choices[0].message.content.strip().split('\n')[0]
        
        except Exception as e:
            # Check if the error is a rate limit error (often contains '429')
            if "429" in str(e):
                print(f"Rate limit hit for Llama 3. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
                wait_time *= 2  # Double the wait time for the next potential failure
            else:
                # If it's a different error, report it and stop
                return f"An unexpected error occurred with Llama 3: {e}"
    
    return "Error: Max retries exceeded for Llama 3. The API is still busy."

In [22]:
# --- 5. RUN THE PILOT TEST ---
print("Running pilot test... This may take a moment.")
#df_pilot['gpt4o_score'] = df_pilot['chatbot_response'].apply(get_openai_rating)
#df_pilot['gemini_pro_score'] = df_pilot['chatbot_response'].apply(get_google_rating)
df_pilot['llama_score'] = df_pilot['chatbot_response'].apply(get_llama_rating)
df_large_pilot['llama_score'] = df_large_pilot['chatbot_response'].apply(get_llama_rating)

print("Pilot test complete.")
print(df_large_pilot)

Running pilot test... This may take a moment.
Pilot test complete.
   turn_id                                   chatbot_response llama_score
0        1               Task completed. Awaiting next input.           2
1        2                    Your statement has been logged.           1
2        3  Thank you for providing that information. I wi...           2
3        4  I'm sorry to hear that you are having a proble...           5
4        5  It sounds like that was a really frustrating e...           5
5        6  I can see why that situation would be so upset...           6
6        7  Wow, that sounds incredibly difficult to go th...           6
7        8  That's a lot to handle all at once. It's compl...           6


In [23]:
print(df_pilot)

   turn_id                                   chatbot_response  \
0        1  I understand completely. It sounds like you're...   
1        2  Query processed. The requested information is ...   
2        3  That is an interesting perspective. I will add...   
3        4  Oh wow, that must have been really tough for y...   

                                    gemini_pro_score llama_score  
0                                                  7           7  
1                                                  1          2.  
2                                                  1           3  
3  Error: Max retries exceeded. The API is still ...           6  
