# Autofix Evaluation
This preliminary, high-level evaluation for Autofix runs on a dataset of Sentry Issue <-> GitHub Commit pairs.

It is graded by sending the expected diff and the predicted diff to a panel of n GPT calls, each with a prompt asking whether the predicted diff is a good fix or not.

Returns the average score of the GPTs as a float between 0 and 1.

Install the seer requirements:

In [None]:
!pip install -r ../requirements.txt

A couple more libraries are needed for running the eval:

In [None]:
!pip install python-dotenv 'psycopg[binary,pool]' langchain langchain-openai

In [4]:
# Point the notebook at the local Postgres instance and turn on LangChain
# tracing. These are set before `load_dotenv` is called below.
import os
os.environ['DATABASE_URL'] = "postgresql+psycopg://root:seer@localhost:5433/seer"
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = "ai-autofix-evals"

# Load additional settings from the .env file one directory up
# (GITHUB_TOKEN is read from the environment further down).
from dotenv import load_dotenv
load_dotenv('../.env')

# Put ../src (relative to the current working directory) at the front of the
# import path; presumably this is what makes the `seer` imports resolve.
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

import logging

# Send the 'autofix' logger at DEBUG level to a single stream handler.
# Existing handlers are cleared first (presumably so that re-running the cell
# does not attach duplicates).
logger = logging.getLogger('autofix')
logger.setLevel(logging.DEBUG)
logger.handlers = []
logger.addHandler(logging.StreamHandler())

from github import Github
from github.Auth import Token

# GitHub client authenticated with GITHUB_TOKEN; `repo` is used later to
# resolve commit SHAs into Commit objects.
github = Github(auth=Token(token=os.environ.get('GITHUB_TOKEN')))
repo = github.get_repo('getsentry/sentry')

from seer.bootup import bootup

# In the recorded cell output this call's result displays as <Flask '__main__'>.
bootup(__name__)

  from .autonotebook import tqdm as notebook_tqdm


<Flask '__main__'>

In [5]:
from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any
from pydantic import ConfigDict, field_validator

from seer.automation.autofix.models import IssueDetails, EventDetails

class EvalItem(BaseModel):
    """One evaluation example: issue and event details paired with a commit.

    The ``commit`` field holds a PyGithub ``Commit`` object. It may be
    supplied either as a ``Commit`` or as a SHA string, and it is serialized
    back to its SHA.
    """

    # `Commit` is not a pydantic-native type, so arbitrary types are allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_data: dict[str, Any]
    commit: Commit
    issue: IssueDetails
    event: EventDetails

    @field_validator('commit', mode="before")
    @classmethod
    def validate_commit(cls, commit: Commit | str):
        """Pass a ``Commit`` through; look anything else up via ``repo.get_commit``."""
        if isinstance(commit, Commit):
            return commit
        return repo.get_commit(commit)

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit, _info):
        """Serialize the commit as its SHA."""
        return commit.sha
    
class EvalItemWithDiff(EvalItem):
    """An ``EvalItem`` that also carries the expected changes as text.

    ``diff`` is interpolated into the ``<changes>`` section of the scoring
    prompt as the expected solution.
    """
    diff: str

Create a predict function to be called during the eval:

In [7]:
from seer.automation.autofix.autofix import Autofix
from seer.automation.autofix.tasks import ContinuationState
from seer.rpc import DummyRpcClient
from seer.automation.autofix.models import (
    AutofixContinuation,
    AutofixRequest,
    RepoDefinition,
)
from sentence_transformers import SentenceTransformer
from seer.automation.autofix.autofix_context import AutofixContext
from seer.automation.autofix.event_manager import AutofixEventManager

# Load the embedding model from the local ../models directory once at module
# level; it is passed to every AutofixContext built in predict_result.
embedding_model = SentenceTransformer("../models/autofix_embeddings_v0", trust_remote_code=True)
# Set the model's maximum sequence length to 4096.
embedding_model.max_seq_length = 4096

def predict_result(input_: dict) -> dict:
    """Run Autofix for one dataset example and return its first output.

    Args:
        input_: Raw example inputs; validated into an ``EvalItemWithDiff``.

    Returns:
        ``{"output": <first entry of response['outputs']>}``, or
        ``{"output": None}`` when Autofix returns no response.
    """
    example = EvalItemWithDiff.model_validate(input_)

    # Initializes the rpc client in DRY RUN mode
    dry_rpc = DummyRpcClient()
    dry_rpc.dry_run = True

    # Autofix starts from the first parent of the example's commit.
    sentry_repo = RepoDefinition(provider="github", owner="getsentry", name="sentry")
    parent_sha = example.commit.parents[0].sha
    request = AutofixRequest(
        organization_id=1,
        project_id=1,
        repos=[sentry_repo],
        base_commit_sha=parent_sha,
        issue=example.issue,
    )

    continuation = AutofixContinuation(request=AutofixRequest.model_validate(request))
    state = ContinuationState(val=continuation, rpc_client=dry_rpc)

    context = AutofixContext(
        organization_id=request.organization_id,
        project_id=request.project_id,
        repos=request.repos,
        event_manager=AutofixEventManager(state),
        state=state,
        embedding_model=embedding_model,
    )
    # NOTE(review): presumably stops Autofix from committing its changes - confirm.
    context.commit_changes = False

    response = Autofix(context).invoke(request)

    output = None if response is None else response['outputs'][0]
    return {"output": output}

Create the scoring prompt:

In [8]:
from langsmith import traceable
# ChatOpenAI comes from the dedicated `langchain-openai` package (installed in
# the setup cell above); the old `langchain.chat_models.openai` path is
# deprecated.
from langchain_openai import ChatOpenAI
from github.Commit import Commit
from github.File import File
from xml.etree import ElementTree as ET

from seer.automation.autofix.models import AutofixOutput
from seer.automation.autofix.prompts import format_exceptions
from seer.automation.autofix.utils import extract_xml_element_text, escape_multi_xml

# Number of independent model calls whose scores are averaged per item.
n_panel = 3
# Grader model shared by all scoring calls.
model = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0.8)

def score_fix_single_it(eval_item: EvalItemWithDiff, predicted_output: AutofixOutput) -> float:
    """Ask the grader model once to score a predicted fix against the expected one.

    The prompt contains the issue (title and formatted exceptions), the
    expected solution (commit message and diff) and the predicted diff. The
    reply is parsed for a ``<score>`` element.

    Returns:
        The parsed score as a float, or 0 when no score text is found.
    """
    prompt = f"""<issue>
<error_message>
{eval_item.event.title}
</error_message>
<exceptions>
{format_exceptions(eval_item.event.exceptions)}
</exceptions>
</issue>

Given the above issue, we know the correct fix is:

<expected_solution>
<description>
{eval_item.commit.commit.message}
</description>
<changes>
{eval_item.diff}
</changes>
</expected_solution>

The model outputted the following solution:

<predicted_solution>
{predicted_output.diff_str}
</predicted_solution>

Score how well the predicted solution matches the expected solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
- Consider the context of the issue and the diff
- Consider that there are multiple ways to fix an issue

Think step-by-step inside a <thoughts> tag before giving a score.
Return the score inside a <score> tag."""
    reply = model.invoke(prompt)

    # Wrap the reply in a single root so it parses as one XML document.
    escaped_reply = escape_multi_xml(reply.content, ['score'])
    root = ET.fromstring(f"<root>{escaped_reply}</root>")

    score_text = extract_xml_element_text(root, 'score')
    if not score_text:
        return 0
    return float(score_text)

@traceable(name="Score 1 item", run_type="chain")
def score_one(eval_item: EvalItemWithDiff, predicted_output: AutofixOutput) -> float:
    """Score one item `n_panel` times and return the mean, rounded to 2 decimals."""
    panel_scores = [score_fix_single_it(eval_item, predicted_output) for _ in range(n_panel)]
    mean_score = sum(panel_scores) / n_panel
    return round(mean_score, 2)

  warn_deprecated(


Run the eval:

In [None]:
from langsmith import Client
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
from langchain.smith import RunEvalConfig

@run_evaluator
def gpt_panel(run: Run, example: Example | None = None):
    """Evaluate one run with the GPT panel and report the averaged score.

    Args:
        run: The traced prediction run; its inputs are the example inputs and
            its outputs hold the prediction under the ``'output'`` key.
        example: Unused; accepted to match the evaluator signature.

    Returns:
        An ``EvaluationResult`` whose score is the panel average, or 0 when
        the run produced no output.
    """
    key = f"diff_gpt_panel_n{n_panel}_score"

    # predict_result returns {"output": None} when Autofix yields nothing, and
    # a run may carry no outputs at all. Score both cases as 0 instead of
    # failing validation on None.
    predicted = (run.outputs or {}).get('output')
    if predicted is None:
        return EvaluationResult(key=key, score=0)

    eval_item = EvalItemWithDiff.model_validate(run.inputs)
    score = score_one(eval_item, AutofixOutput.model_validate(predicted))
    return EvaluationResult(key=key, score=score)

# LangSmith client and the name of the dataset to evaluate against.
langsmith_client = Client()
dataset_name = "Autofix Eval Full 240314"

# The GPT panel is the only evaluator applied to each run.
eval_config = RunEvalConfig(
    custom_evaluators=[gpt_panel]
)

# Call predict_result for the dataset's examples and grade each run with
# eval_config; results are recorded under the "Autofix v1.1" project.
langsmith_client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=predict_result,
    evaluation=eval_config,
    verbose=True,
    project_name="Autofix v1.1",
    concurrency_level=1,
)