In [None]:
from typing import Optional

from pydantic import BaseModel

from untils import gpt_call, gemini_call

# Response format requested from the generator model: generate() passes this
# class to gpt_call() as `response_format` and reads both fields back.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - pydantic may copy a model docstring into the schema it emits,
# which would change what the LLM sees; confirm before converting.

class GeneratorResponse(BaseModel):
    # The model's reasoning about the task and any feedback (the <thoughts>
    # section described in the generator prompt).
    thoughts: list[str]
    # The produced solution; the generator prompt asks for a code implementation.
    response: str



def generate(task: str, context: str = "") -> GeneratorResponse:
    """Ask the generator model for a solution to `task`.

    Args:
        task: Text sent as the user prompt.
        context: Optional notes from earlier rounds (previous attempts and
            feedback). When non-empty it is appended to the system prompt
            inside <context> tags; when empty the system prompt is sent as is.

    Returns:
        Whatever gpt_call() returns for `response_format=GeneratorResponse`;
        its `thoughts` and `response` attributes are printed before returning.
    """
    system_prompt = """
    Your goal is to complete the task based on <user input>. If there are feedbacks provided in the <context> from your previous generations, 
    you should reflect on them to improve your solution.

    Output your answer concisely in the following format:

    <thoughts>
    [Your understanding of the task and feedback and how you plan to improve]
    </thoughts>

    <response>
    [Your code implementation here]
    </response>
    """

    # Earlier attempts and feedback travel in the system prompt, not in the
    # user message.
    if context:
        context_section = f"""
        <context>
        {context}
        </context>
        """
        system_prompt = system_prompt + context_section

    draft = gpt_call(
        prompt=task,
        system_prompt=system_prompt,
        response_format=GeneratorResponse,
    )

    # One print call with a newline separator emits exactly the same text as
    # five consecutive print() calls would.
    print(
        "\n=== GENERATION START ===",
        f"Thoughts:\n{draft.thoughts}\n",
        f"Generated:\n{draft.response}\n",
        "=== GENERATION END ===\n",
        draft,
        sep="\n",
    )
    return draft

In [None]:
# Sample task handed to both generate() and evaluate(). The requirement text is
# wrapped in <user input> tags because the generator's system prompt refers to
# the task by that tag name.
task = """
<user input>
Implement a Stack with:
1. push(x)
2. pop()
3. getMin()
All operations should be O(1).
</user input>
"""

# One-off generator run, currently disabled (uncomment to inspect a single
# generation on its own).
# generator_result = generate(task)

In [None]:
# Structured response from evaluator: evaluate() passes this class to
# gemini_call() as `response_format` and reads both fields back.
# NOTE(review): documented with comments rather than a class docstring on
# purpose - pydantic may copy a model docstring into the schema it emits,
# which would change what the LLM sees; confirm before converting.

class EvaluatorResponse(BaseModel):
    # True only when the evaluator judges that every criterion in its prompt is
    # met and it has no further suggestions.
    is_accepted: bool
    # What needs improvement and why; loop() feeds this back to the generator.
    feedback: str

def evaluate(task: str, response_from_generator: str) -> EvaluatorResponse:
    """Ask the evaluator model to judge a generated solution.

    Args:
        task: The original task text, shown to the evaluator for reference.
        response_from_generator: The candidate solution to be reviewed.

    Returns:
        Whatever gemini_call() returns for `response_format=EvaluatorResponse`;
        its `is_accepted` and `feedback` attributes are printed before
        returning.
    """
    # User message: the task followed by the candidate under review.
    user_message = f"""
    Original task:\n
    {task}\n
    Content to evaluate:\n
    {response_from_generator}
    """

    # System message: review criteria and the expected answer layout.
    review_instructions = """
    Evaluate this following code implementation for:
    1. code correctness
    2. time complexity
    3. style and best practices

    You should be evaluating only and not attemping to solve the task.
    Only output "is_accepted = true" if all criteria are met and you have no further suggestions for improvements.
    Output your evaluation concisely in the following format.

    <is_accepted>true or false</is_accepted>
    <feedback>
    What needs improvement and why.
    </feedback>
    """

    verdict = gemini_call(
        prompt=user_message,
        system_prompt=review_instructions,
        response_format=EvaluatorResponse,
    )

    # One print call with a newline separator emits exactly the same text as
    # five consecutive print() calls would.
    print(
        "=== EVALUATION START ===",
        f"Is Accepted:\n {verdict.is_accepted}\n",
        f"Feedback:\n {verdict.feedback}\n",
        "=== EVALUATION END ===\n",
        verdict,
        sep="\n",
    )

    return verdict
    

In [None]:
# Produce one attempt inside this cell before evaluating it. Without this,
# `generator_result` is only defined if some earlier, separately executed
# statement happened to assign it, so a fresh kernel run raises NameError here.
generator_result = generate(task)
evaluate(task, generator_result.response)

In [None]:
def loop(task: str, max_iterations: Optional[int] = None) -> tuple[str, list[dict]]:
    """Generate, evaluate and regenerate until the evaluator accepts a solution.

    Each round calls generate() and then evaluate(). On rejection, every
    attempt so far plus the latest feedback is packed into a context string
    and handed to the next generate() call.

    Args:
        task: Task description passed unchanged to generate() and evaluate().
        max_iterations: Upper bound on generate/evaluate rounds. ``None`` (the
            default) retries without limit, which is the historical behaviour;
            pass a number to cap the model calls made when the evaluator never
            accepts.

    Returns:
        A tuple ``(accepted_response, chain_of_thought)``. ``chain_of_thought``
        holds one dict per attempt, in order, with keys ``"thoughts"`` and
        ``"result"``.

    Raises:
        ValueError: If ``max_iterations`` is given and is smaller than 1.
        RuntimeError: If no attempt is accepted within ``max_iterations``
            rounds.
    """
    if max_iterations is not None and max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    memory = []
    chain_of_thought = []
    # Empty context on the first round is the same as calling generate(task).
    context = ""
    rounds = 0

    while max_iterations is None or rounds < max_iterations:
        rounds += 1

        generator_result = generate(task, context)
        memory.append(generator_result.response)
        chain_of_thought.append(
            {"thoughts": generator_result.thoughts, "result": generator_result.response}
        )

        evaluator_result = evaluate(task, generator_result.response)
        if evaluator_result.is_accepted:
            print("===Passed evaluation===")
            return generator_result.response, chain_of_thought

        print("===Failed evaluation===")
        # The next round sees all previous attempts but only the most recent
        # feedback.
        context = "\n".join([
            "Previous attempts:",
            *[f"- {m}" for m in memory],
            f"\nFeedback: {evaluator_result.feedback}",
        ])

    raise RuntimeError(
        f"No solution was accepted after {max_iterations} iteration(s)"
    )

In [None]:
# Run the generate -> evaluate -> regenerate cycle on the sample task; as the
# last expression of the cell, the returned tuple is displayed by the notebook.
loop(task)