In [None]:
# We break down a complex task into a sequence of smaller, focused sub-tasks. Each sub-task is handled by its own dedicated prompt, creating a "pipeline" where the output of one step becomes the input for the next

In [1]:
# Identify bugs in a piece of code.

import litellm
import inspect
import json
from IPython.display import display, Markdown
from textwrap import dedent
from dotenv import load_dotenv

load_dotenv()

MODEL_NAME = "openai/gpt-4o-mini"
MAX_TOKENS_DEFAULT = 500

def get_completion(
    prompt,
    model=MODEL_NAME,
    max_tokens=MAX_TOKENS_DEFAULT,
    **kwargs
):
    if "gpt-5" in model:
        kwargs["max_completion_tokens"] = max_tokens
    else:
        kwargs["max_tokens"] = max_tokens
        
    parsed_messages = []

    if type(prompt) is str:
        parsed_messages = [
            {
                "role": "user",
                "content": prompt
            }
        ]
    else:
        parsed_messages = prompt

    response = litellm.completion(
        model=model,
        messages=parsed_messages,
        **kwargs
    )

    return response.choices[0].message.content

def get_average_score(student_records):
    total_score = 0
    for record in student_records:
        total_score += record['score']

    return total_score / len(student_records)

CODE_TO_PROCESS = inspect.getsource(get_average_score)

print("Setup complete. Helper functions and code context are ready.")


Setup complete. Helper functions and code context are ready.


In [2]:
# step 1: prompt to identity bugs in code (in QA can be to identify bugs in req, in TD, etc..)
prompt_bug_identification = [
    {
        "role": "system",
        "content": dedent("""
        You are a master code auditor.
        Your sole task is to analyze Python code and identify potential bugs, errors, and edge-cases.

        Do NOT suggest fixes.

        Output your findings as a JSON object with the following schema:
        
        {
            "bugs": list(str)
        }
        """)
    },
    {
        "role": "user",
        "content": dedent(f"""
        Please analyze the following Python code:

        ```python
        {CODE_TO_PROCESS}
        ```
        """)
    }
]

print("--- Step 1: Identifying bugs ---")
identified_bugs_json = get_completion(
    prompt_bug_identification,
    response_format={"type": "json_object"}
)

if identified_bugs_json:
    print("Identified issues:")
    print(identified_bugs_json)
    identified_bugs = json.loads(identified_bugs_json).get("bugs", [])
else:
    identified_bugs = []

--- Step 1: Identifying bugs ---
Identified issues:
{
    "bugs": [
        "The code does not handle the case where student_records is an empty list, which would lead to a ZeroDivisionError when dividing by len(student_records).",
        "There is no check to ensure that each record in student_records contains the key 'score', which could lead to a KeyError if a record is missing this key.",
        "The indentation is not consistent, making the function definition and the for loop improperly formatted, which will result in an IndentationError when executing the code."
    ]
}


In [3]:
# step 2. Having bugs identified, we want to fix the code. 
prompt_refactoring = [
    {
        "role": "system",
        "content": dedent("""
        You are a senior Python developer.
        Your task is to refactor code to fix identified issues and improve its quality and robustness.
        """)
    },
    {
        "role": "user",
        "content": dedent(f"""
        Please refactor the following Python code to fix the idenfied issues.
        Add a docstring and type hints.

        ## Original code
        ```python
        {CODE_TO_PROCESS}
        ```

        ## Identified issues
        {"* ".join(identified_bugs)}

        ## Response instructions
        * Your response ahould be only the complete, refactored Python code.
        * Do not output any triple backticks nor code blocks, just directly the code.
        * Output only valid Python code.
        """)
    }
]

print("--- Step 2: Refactoring the code ---")
refactored_code = get_completion(prompt_refactoring)

if refactored_code:
    print("Refactored code:")
    print(refactored_code)

--- Step 2: Refactoring the code ---
Refactored code:
def get_average_score(student_records: list[dict[str, float]]) -> float:
    """
    Calculate the average score from a list of student records.

    Each student record must be a dictionary containing a 'score' key. 
    An empty list or missing 'score' keys will raise appropriate exceptions.

    :param student_records: A list of dictionaries containing student scores
    :return: The average score of the students
    :raises ValueError: If student_records is empty or if any record is missing the 'score' key
    """
    if not student_records:
        raise ValueError("The student_records list is empty.")

    total_score = 0
    for record in student_records:
        if 'score' not in record:
            raise KeyError("Each record must contain a 'score' key.")
        total_score += record['score']

    return total_score / len(student_records)


In [4]:
# step 3: create tests for fixed code. 
prompt_generate_unit_tests = [
    {
        "role": "system",
        "content": dedent("""
        You are an expert QA engineer specializing in Python.
        Your task is to write a comprehensive suite of unit tests for a given function.
        You always use the pytest framework.
        """)
    },
    {
        "role": "user",
        "content": dedent(f"""
        Please write unit tests for the following refactored Python function.

        The tests should cover the main functionality and edge-cases.
        
        Output the content in XML strictly following the structure in the Output section.
        Do not output any additional content outside of the XML tags.

        ## Refactored code
        {refactored_code}

        ## Output
        <preamble>
            Add your preamble here.
        </preamble>
        <reasoning>
            Add your reasoning here.
        </reasoning>
        <edge_cases>
            Add all identified edge-cases here.
        </edge_cases>
        <python_code>
            Add the test code here.
        </python_code>
        <conclusion>
            Add the conclusion here
        </conclusion>
        """)
    }
]

print("--- Step 3: Generating unit tests ---")
unit_tests = get_completion(prompt_generate_unit_tests, max_tokens=2000)

if unit_tests:
    print("Generated tests:")
    print(unit_tests)

--- Step 3: Generating unit tests ---
Generated tests:
```xml
<preamble>
    Unit tests for the get_average_score function that computes the average score from student records.
</preamble>
<reasoning>
    The tests are designed to verify the main functionality of the function, ensuring that it correctly calculates the average score, handles empty lists, and raises exceptions for missing 'score' keys. These tests will ensure robustness against typical user errors and edge cases.
</reasoning>
<edge_cases>
    1. An empty list of student records, which should raise a ValueError.
    2. A list containing one or more records that do not include the 'score' key, which should raise a KeyError.
    3. A list with only one student record, ensuring it calculates the average correctly as the score of that single student.
    4. A list with valid scores, including both positive and negative values, to confirm correct averaging.
</edge_cases>
<python_code>
    import pytest

    from your_module im