In [20]:
# Load env variables and create client
import re
import ast
from dotenv import load_dotenv
from statistics import mean
from anthropic import Anthropic

# Load environment variables (python-dotenv) before the API client is constructed.
load_dotenv()

# Module-level client and model name; chat() reads both as globals.
# NOTE(review): presumably the client takes its API key from the environment
# populated above — confirm against the Anthropic SDK documentation.
client = Anthropic()
model = "claude-3-5-haiku-latest"

In [2]:
# Helper functions
def add_user_message(messages, text):
    """Append a user-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "user", "content": text})


def add_assistant_message(messages, text):
    """Append an assistant-role turn holding `text` to `messages` (mutated in place)."""
    messages.append({"role": "assistant", "content": text})


def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the first content block's text.

    Args:
        messages: List of {"role": ..., "content": ...} dicts forming the conversation.
        system: Optional system prompt; left out of the request when falsy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings that end generation. None (the
            default) is sent as an empty list.

    Returns:
        The `text` attribute of the first item in the response's `content`.
    """
    # Build a fresh list per call: a mutable default ([]) would be one shared
    # object across every call that relies on the default.
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }

    # Only include the system prompt when one was actually supplied.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text

In [38]:
import json


def generate_dataset(num_cases=3):
    """Ask the model to generate an evaluation dataset of AWS-related tasks.

    Args:
        num_cases: Number of test cases to request from the model (default 3).

    Returns:
        The parsed JSON from the model's reply. The prompt asks for a list of
        objects with "task", "format" and "solution_criteria" keys.

    Raises:
        ValueError: If `num_cases` is less than 1.
        json.JSONDecodeError: If the model's reply is not valid JSON.
    """
    if num_cases < 1:
        raise ValueError("num_cases must be at least 1")

    # Literal braces in the JSON example are doubled because this is an f-string.
    prompt = f"""
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {{
        "task": "Description of task",
        "format": "python|json|regex",
        "solution_criteria": "Key criteria to evaluate the solution"
    }},
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

Please generate {num_cases} objects.
"""

    messages = []
    add_user_message(messages, prompt)
    # Prefill the assistant turn with an opening fence and stop at the closing
    # fence, so the reply is just the JSON body.
    add_assistant_message(messages, "```json")
    response = chat(messages, stop_sequences=["```"])
    return json.loads(response)

In [39]:
# Request a fresh set of test cases from the model (this makes an API call).
dataset = generate_dataset()

# Bare last expression so the notebook renders the generated cases.
dataset

[{'task': 'Create a regular expression to validate an AWS IAM username, which must be between 1-64 characters, can only contain alphanumeric characters, underscores, and hyphens',
  'format': 'regex',
  'solution_criteria': 'Regex should match valid IAM usernames and reject names with special characters or outside length range'},
 {'task': "Write a Python function to convert an AWS EC2 instance type (e.g. 't2.micro') into a dictionary with size and family components",
  'format': 'python',
  'solution_criteria': "Function should correctly parse instance type string into dictionary with 'family' and 'size' keys"},
 {'task': 'Create a JSON object representing the minimum configuration for an AWS S3 bucket policy that allows public read access',
  'format': 'json',
  'solution_criteria': 'JSON should define a bucket policy with a statement allowing GetObject action for all principals'}]

In [40]:
# Write the generated test cases to dataset.json in the working directory.
with open("dataset.json", "w") as f:
    json.dump(dataset, f, indent=2)

In [26]:
def run_prompt(test_case):
    """Build the solver prompt for `test_case["task"]` and return the model's raw reply."""
    task = test_case['task']
    task_prompt = f"""
    Please solve the following task:

    {task}

    * Respond only with Python, JSON, or Regex as specified.
    * Do not include any explanations or additional text.
    """

    conversation = []
    add_user_message(conversation, task_prompt)
    # Prefilled fence plus a stop sequence, so only the fenced body is returned.
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])

In [None]:
def grade_by_model(test_case, output):
    """Have the model review `output` against the test case and return its parsed JSON verdict.

    The prompt asks for "strengths", "weaknesses", "reasoning" and "score" fields;
    whatever JSON the model returns is parsed and handed back as-is.
    """
    task = test_case["task"]
    criteria = test_case["solution_criteria"]

    # Doubled braces below are literal braces in the rendered prompt.
    review_prompt = f"""
    You are an expert AWS code reviewer. Your task is to evaluate the following AI-generated solution.

    Original Task:
    <task>
    {task}
    </task>

    Solution to Evaluate:
    <solution>
    {output}
    </solution>

    Criteria for Evaluation:
    <criteria>
    {criteria}
    </criteria>

    Output Format
    Provide your evaluation as a structured JSON object with the following fields, in this specific order:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement
    - "reasoning": A concise explanation of your overall assessment
    - "score": A number between 1-10

    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    """

    conversation = []
    add_user_message(conversation, review_prompt)
    # Prefill an opening JSON fence and stop at the closing one so the reply parses directly.
    add_assistant_message(conversation, "```json")
    raw_verdict = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [33]:
def validate_json(text):
    """Return 10 when `text` (whitespace-trimmed) parses as JSON, otherwise 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10

def validate_python(text):
    """Return 10 when `text` (whitespace-trimmed) parses as Python source, otherwise 0.

    Only syntax is checked via `ast.parse`; the code is never executed.
    """
    try:
        ast.parse(text.strip())
        return 10
    except (SyntaxError, ValueError):
        # Before Python 3.12, source containing null bytes raises ValueError
        # rather than SyntaxError; either way the text is not valid Python.
        return 0

def validate_regex(text):
    """Return 10 when `text` (whitespace-trimmed) compiles as a regex, otherwise 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10
    
def grade_syntax(response, test_case):
    """Score `response` with the validator matching `test_case["format"]`.

    "json" and "python" use their own validators; any other value is checked
    as a regex.
    """
    expected_format = test_case["format"]
    if expected_format == "json":
        return validate_json(response)
    if expected_format == "python":
        return validate_python(response)
    return validate_regex(response)

In [34]:
def run_test_case(test_case):
    """Generate a solution for `test_case`, grade it, and return a result record.

    The score is the plain average of the model-assigned score and the
    syntax-check score.
    """
    output = run_prompt(test_case)

    # Model-based review: numeric score plus a short justification.
    verdict = grade_by_model(test_case, output)
    model_score = verdict["score"]
    reasoning = verdict["reasoning"]

    # Mechanical check that the output parses in the expected format.
    syntax_score = grade_syntax(output, test_case)

    return dict(
        output=output,
        test_case=test_case,
        score=(model_score + syntax_score) / 2,
        reasoning=reasoning,
    )


In [35]:
def run_eval(dataset):
    """Run every test case in `dataset`, print the average score, and return the results.

    Args:
        dataset: Iterable of test-case dicts; each one is passed to run_test_case().

    Returns:
        List of result dicts from run_test_case(), in dataset order. An empty
        dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so report and bail
    # out instead of crashing when there is nothing to average.
    if not results:
        print("Average Score: n/a (no test cases)")
        return results

    average_score = mean(result["score"] for result in results)
    print(f"Average Score: {average_score}")

    return results

In [36]:
# Load the test cases from dataset.json rather than using whatever `dataset`
# currently holds in the kernel.
with open("dataset.json", "r") as f:
    dataset = json.load(f)

# Runs every test case (model calls happen inside run_test_case) and prints the average score.
results = run_eval(dataset)

Average Score: 8.333333333333334


In [37]:
# Bare expression so the notebook renders the full per-test-case results.
results

[{'output': '\n^[a-zA-Z][a-zA-Z0-9+=,.@_]{0,63}$\n',
  'test_case': {'task': 'Create a regex to validate an AWS IAM username that must start with a letter, can include letters, numbers, and these special characters: +, =, ,, ., @, _, and must be between 1-64 characters long',
   'format': 'regex'},
  'score': 9.0,
  'reasoning': "The regex mostly meets AWS IAM username requirements, capturing the core validation rules. However, it has a subtle issue allowing a zero-length username after the first character, which deviates from the '1-64 characters' specification. A minor adjustment would improve precision."},
 {'output': "\ndef get_ec2_instance_state(state_code):\n    ec2_states = {\n        0: 'pending',\n        16: 'running',\n        32: 'shutting-down',\n        48: 'terminated',\n        64: 'stopping',\n        80: 'stopped'\n    }\n    return ec2_states.get(state_code, 'unknown')\n",
  'test_case': {'task': 'Write a Python function that converts an AWS EC2 instance state code (