# Autofix Evaluation
This is an initial, high-level evaluation of Autofix. It runs on a dataset that pairs Sentry issues with the GitHub commits that resolved them.

It is graded by sending the expected diff and the predicted diff to a panel of n GPT calls, each with a prompt asking it to evaluate whether the predicted diff is a good fix.

Returns the average score of the GPTs as a float between 0 and 1.

# Notebook Setup

Install the seer requirements:

In [None]:
# %pip install -r ../requirements.txt

A couple more libraries are needed for running the eval:

In [None]:
# %pip install python-dotenv 'psycopg[binary,pool]' langchain langchain-openai

In [None]:
%pip install pysqlite3-binary

# Import pysqlite3 and re-register it in sys.modules under the name 'sqlite3',
# so that any later `import sqlite3` resolves to pysqlite3.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
# %load_ext autoreload
# %autoreload 2

import os
# Database connection string: Postgres on localhost:5433 (credentials are hard-coded here).
os.environ['DATABASE_URL'] = "postgresql+psycopg://root:seer@localhost:5433/seer"
# LangChain / LangSmith configuration (tracing flag, endpoint, project name).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = "ai-autofix-evals"

# Codebase storage configuration: filesystem-backed, with paths relative to the working directory.
os.environ['CODEBASE_STORAGE_TYPE'] = 'filesystem'
os.environ['CODEBASE_WORKSPACE_DIR'] = '../data/chroma/workspaces'
os.environ['CODEBASE_STORAGE_DIR'] = '../data/chroma/storage'

# Load further environment variables from the .env file one directory up.
from dotenv import load_dotenv
load_dotenv('../.env')

# Put ../src (resolved against the current working directory) first on the import path
# so the `seer` imports below can be found.
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

import logging

# Reset each listed logger: INFO level, drop existing handlers, attach one StreamHandler.
for logger_name in ['autofix']:
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    logger.handlers = []
    logger.addHandler(logging.StreamHandler())

from github import Github
from github.Auth import Token

# GitHub client built from the GITHUB_TOKEN environment variable
# (os.environ.get returns None when it is unset).
github = Github(auth=Token(token=os.environ.get('GITHUB_TOKEN')))
# repo = github.get_repo('getsentry/sentry')

from seer.bootup import bootup

# NOTE(review): presumably initialises the seer application; see seer.bootup for details.
bootup(__name__)

# LangSmith client used by later cells to list dataset examples, run the eval and fetch results.
from langsmith import Client
langsmith_client = Client()

from seer.automation.autofix.pipelines import AutofixRootCause, AutofixExecution
from seer.automation.autofix.tasks import ContinuationState
from seer.rpc import DummyRpcClient
from seer.automation.autofix.models import (
    AutofixContinuation,
    AutofixRequest,
    ChangesStep,
    RepoDefinition,
    RootCauseStep,
    SuggestedFixRootCauseSelection,
)
from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any, Optional
from pydantic import ConfigDict, field_validator

from seer.automation.models import IssueDetails, EventDetails

from sentence_transformers import SentenceTransformer
from seer.automation.autofix.autofix_context import AutofixContext, AutofixCodebaseStateManager
from seer.automation.autofix.event_manager import AutofixEventManager
import numpy as np
from seer.automation.codebase.codebase_index import CodebaseIndex
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


dataset_name = "Autofix Eval 100 240423"

In [None]:
class EvalItem(BaseModel):
    """One dataset example: a Sentry issue and event together with a GitHub commit.

    ``commit`` is accepted either as a ``Commit`` object or as a SHA string; a
    string is resolved to a ``Commit`` through the module-level ``github``
    client while the model is being validated.
    """

    # ``Commit`` is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_data: dict[str, Any]
    organization_id: int
    project_id: int
    repo_name: Optional[str] = None
    commit_hash: Optional[str] = None
    # Keep ``commit`` below ``repo_name`` and ``commit_hash``: its validator reads
    # and writes those two entries, so they have to be processed first.
    commit: Commit | str
    issue: IssueDetails
    event: EventDetails

    @field_validator('commit', mode="after")
    @classmethod
    def validate_commit(cls, commit: Commit | str, values, **kwargs):
        """Turn a commit SHA into a ``Commit``; pass ``Commit`` objects through.

        When a SHA string is given, the repository is taken from the already
        validated ``repo_name`` (falling back to ``getsentry/sentry``), and both
        ``repo_name`` and ``commit_hash`` are written back into the validation data.
        """
        if isinstance(commit, Commit):
            return commit

        repo_name = values.data.get('repo_name')
        if repo_name is None:
            repo_name = 'getsentry/sentry'
            values.data['repo_name'] = repo_name

        repo = github.get_repo(repo_name)
        values.data['commit_hash'] = commit
        return repo.get_commit(commit)

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit, _info):
        """Serialize the commit as its SHA string."""
        return commit.sha

        
class EvalItemWithDiff(EvalItem):
    """An ``EvalItem`` that also carries ``diff``, the reference diff used as the expected solution when scoring."""
    diff: str

# Sanity Check Embeddings Data

In [None]:
def check_embeddings():
    """Try to load the codebase index for every dataset example and tabulate the outcome.

    For each example the parent of the fixing commit is used as the base SHA,
    and ``CodebaseIndex.from_repo_definition`` is called without an embedding
    model. Any exception raised by that call is recorded instead of propagated,
    so a single bad example does not stop the check.

    Returns:
        A DataFrame with one row per example and the columns ``repo_owner``,
        ``repo_name``, ``repo_full_name``, ``organization_id``, ``project_id``,
        ``sha`` and ``error``. ``error`` is ``'success'`` or the exception text.
    """
    examples = langsmith_client.list_examples(dataset_name=dataset_name)
    summary = []

    for example in examples:
        run_item = EvalItem.model_validate(example.inputs)
        repo_owner, repo_name = run_item.repo_name.split('/')
        base_sha = run_item.commit.parents[0].sha

        # The request and the index lookup must describe the same repository.
        # The full "owner/name" is used as external_id for both, matching
        # predict_result (the request previously used the short name).
        repo_kwargs = dict(
            provider="github",
            owner=repo_owner,
            name=repo_name,
            external_id=run_item.repo_name,
        )
        request = AutofixRequest(
            organization_id=run_item.organization_id,
            project_id=run_item.project_id,
            repos=[RepoDefinition(**repo_kwargs)],
            base_commit_sha=base_sha,
            issue=run_item.issue,
        )

        state = ContinuationState.new(
            AutofixContinuation(request=AutofixRequest.model_validate(request)),
            group_id=run_item.issue.id,
        )
        try:
            CodebaseIndex.from_repo_definition(
                run_item.organization_id,
                run_item.project_id,
                RepoDefinition(**repo_kwargs),
                base_sha,
                None,
                state=state,
                state_manager_class=AutofixCodebaseStateManager,
                embedding_model=None)
        except Exception as e:
            # Deliberately broad: the point of this check is to collect every failure reason.
            outcome = str(e)
        else:
            outcome = 'success'

        summary.append([repo_owner, repo_name, run_item.repo_name, run_item.organization_id,
                        run_item.project_id, base_sha, outcome])

    return pd.DataFrame(summary, columns=['repo_owner', 'repo_name', 'repo_full_name', 'organization_id', 'project_id',
                                          'sha', 'error'])


df = check_embeddings()
df

In [None]:
# Alias of the sanity-check summary; no filtering is applied here.
df_queried = df
# Frequency of each outcome string, then of each repository.
print(df_queried.error.value_counts())
print(df_queried.repo_full_name.value_counts())

# Repositories whose outcome string is exactly 'No repo' (shown as the cell result).
df_queried[df_queried.error=='No repo'].repo_full_name.value_counts()


In [None]:
# For every distinct "No such file or directory" outcome, print a `cp -r` command
# that copies the missing path from its "/old-data/" counterpart.
errno_prefix = "[Errno 2] No such file or directory: '"
for message in df_queried.error.value_counts().index:
    if 'No such file or directory:' not in message:
        continue
    target = message.replace(errno_prefix, "").replace("'", "")
    source = target.replace("/data/", "/old-data/")
    print(f'cp -r {source} {target}')

# Run Evaluation On Dataset

Create a predict function to be called during the eval:

In [None]:
import torch
def get_device():
    """Return the preferred torch device name: "cuda" if available, else "mps", else "cpu"."""
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"

# Load the embedding model from the local ../models directory.
# trust_remote_code=True lets the model directory supply its own code.
embedding_model = SentenceTransformer("../models/autofix_embeddings_v0", trust_remote_code=True)
embedding_model.max_seq_length = 4096
# Move the model to the device chosen by get_device().
embedding_model.to(device = torch.device(get_device()))

def predict_result(input_: dict) -> dict:
    """Run Autofix on one dataset example and return the predicted diff.

    The root-cause pipeline runs first. The first suggested fix of the first
    cause is then selected and the execution pipeline is run against it.

    Args:
        input_: Raw example inputs; validated into an ``EvalItem``.

    Returns:
        ``{"output": {"diff_str": <diff of the first change>}}``, or
        ``{"output": None}`` when no root cause, suggested fix or change was produced.
    """
    run_item = EvalItem.model_validate(input_)

    # Initializes the rpc client in DRY RUN mode
    rpc_client = DummyRpcClient()
    rpc_client.dry_run = True
    [repo_owner, repo_name] = run_item.repo_name.split('/')
    request = AutofixRequest(
        organization_id=run_item.organization_id,
        project_id=run_item.project_id,
        repos=[RepoDefinition(provider="github", owner=repo_owner, name=repo_name, external_id=run_item.repo_name)],
        # Evaluate against the state of the repo just before the fixing commit.
        base_commit_sha=run_item.commit.parents[0].sha,
        issue=run_item.issue,
    )

    state = ContinuationState.new(AutofixContinuation(request=AutofixRequest.model_validate(request)), group_id=run_item.issue.id)

    event_manager = AutofixEventManager(state)
    context = AutofixContext(
        state=state,
        sentry_client=rpc_client,
        event_manager=event_manager,
        embedding_model=embedding_model,
    )

    AutofixRootCause(context).invoke()

    root_cause_step = state.get().find_step(id='root_cause_analysis')
    if not root_cause_step or not isinstance(root_cause_step, RootCauseStep):
        return {"output": None}

    # An empty cause list means "no fix found"; report it as such rather than
    # failing with an IndexError on causes[0].
    if not root_cause_step.causes:
        return {"output": None}

    cause = root_cause_step.causes[0]
    if not cause.suggested_fixes:
        return {"output": None}

    event_manager.set_selected_root_cause(SuggestedFixRootCauseSelection(
        cause_id=cause.id,
        fix_id=cause.suggested_fixes[0].id
    ))

    AutofixExecution(context).invoke()

    changes_step = state.get().find_step(id='changes')
    if not changes_step or not isinstance(changes_step, ChangesStep):
        return {"output": None}
    # Check the attribute itself. A membership test on the step object
    # (`'changes' in changes_step`) compares against its iterated items, not its
    # field names, and so never matched.
    if not changes_step.changes:
        return {"output": None}

    return {"output": {
        "diff_str": changes_step.changes[0].diff_str
    }}

Create the scoring prompt:

In [None]:
from langsmith import traceable
from langchain.chat_models.openai import ChatOpenAI
from xml.etree import ElementTree as ET

from seer.automation.autofix.prompts import format_exceptions
from seer.automation.autofix.utils import extract_xml_element_text, escape_multi_xml

# Number of grader calls whose scores are averaged per example (see score_one).
n_panel = 3
# Grader model. NOTE(review): temperature=0.8 presumably exists so the repeated panel calls differ — confirm.
model = ChatOpenAI(model_name="gpt-4-0125-preview", temperature=0.8)

def score_fix_single_it(eval_item: EvalItemWithDiff, predicted_diff_str: str) -> float:
    """Ask the grader model once how well the predicted diff matches the expected fix.

    The prompt contains the issue (event title and formatted exceptions), the
    expected solution (commit message and reference diff) and the predicted diff.

    Returns:
        The number found in the reply's ``<score>`` tag, or 0 when the tag is
        missing or empty.
    """
    prompt = f"""<issue>
<error_message>
{eval_item.event.title}
</error_message>
<exceptions>
{format_exceptions(eval_item.event.exceptions)}
</exceptions>
</issue>

Given the above issue, we know the correct fix is:

<expected_solution>
<description>
{eval_item.commit.commit.message}
</description>
<changes>
{eval_item.diff}
</changes>
</expected_solution>

The model outputted the following solution:

<predicted_solution>
{predicted_diff_str}
</predicted_solution>

Score how well the predicted solution matches the expected solution with a float score from 0 to 1, where 1 means the solution fully fixes the issue and 0 means the solution does not fix the issue at all.
- Consider the context of the issue and the diff
- Consider that there are multiple ways to fix an issue

Think step-by-step inside a <thoughts> tag before giving a score.
Return the score inside a <score> tag."""
    completion = model.invoke(prompt)

    # Wrap the reply in a single root element so it parses as one XML document;
    # escape_multi_xml is told that 'score' is the tag to keep.
    wrapped_reply = f"<root>{escape_multi_xml(completion.content, ['score'])}</root>"
    score_text = extract_xml_element_text(ET.fromstring(wrapped_reply), 'score')
    if score_text:
        return float(score_text)
    return 0

@traceable(name="Score 1 item", run_type="chain")
def score_one(eval_item: EvalItemWithDiff, predicted_diff_str: str) -> float:
    """Grade one prediction ``n_panel`` times and return the mean score rounded to two decimals."""
    panel_scores = [score_fix_single_it(eval_item, predicted_diff_str) for _ in range(n_panel)]
    mean_score = sum(panel_scores) / n_panel
    return round(mean_score, 2)

Run the eval:

In [None]:
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
from langchain.smith import RunEvalConfig

@run_evaluator
def gpt_panel(run: Run, example: Example | None = None):
    """Score a run's predicted diff against the example's reference diff with the GPT panel.

    Args:
        run: The run to evaluate; ``run.outputs['output']['diff_str']`` is the prediction.
        example: The dataset example; ``example.outputs['diff']`` is the reference diff.

    Returns:
        An ``EvaluationResult`` under the key ``diff_gpt_panel_n3_score``. The
        score is ``None`` when the prediction or the reference diff is missing.
    """
    result_key = "diff_gpt_panel_n3_score"

    # `run.outputs` may be None, and predict_result returns {"output": None}
    # when it has no fix. `.get('output', {})` would hand back that None,
    # so normalise both levels to a dict.
    output = (run.outputs or {}).get('output') or {}
    diff_str = output.get('diff_str')

    expected_diff = None
    if example is not None and example.outputs:
        expected_diff = example.outputs.get('diff')

    # Exit before validating EvalItem: validation may resolve the commit through
    # the GitHub client, which is wasted work when there is nothing to grade.
    if not diff_str or expected_diff is None:
        return EvaluationResult(key=result_key, score=None)

    eval_item = EvalItem.model_validate(run.inputs)
    with_diff = EvalItemWithDiff.model_validate(dict(**dict(eval_item), diff=expected_diff))

    score = score_one(with_diff, diff_str)
    return EvaluationResult(key=result_key, score=score)

# Use gpt_panel as the only custom evaluator.
eval_config = RunEvalConfig(
    custom_evaluators=[gpt_panel]
)

# NOTE(review): `ds` is not referenced again in the cells visible here.
ds = langsmith_client.read_dataset(dataset_name=dataset_name)

# Run predict_result over the dataset and grade each run with eval_config.
# The analysis cells read the results back under the same project name.
langsmith_client.run_on_dataset(
    dataset_name=dataset_name,
    llm_or_chain_factory=predict_result,
    evaluation=eval_config,
    verbose=True,
    project_name="Autofix v2 rev:10",
    concurrency_level=6
)

Below is a test to just run one example:

In [None]:
examples = langsmith_client.list_examples(dataset_name=dataset_name)

# Smoke test: run the prediction for the first example only, then stop.
# The return value is discarded.
for example in examples:
    predict_result(example.inputs)
    break

# Analysis

## Analyze Dataset

In [None]:
def ds_plots():
    """Plot how many dataset examples each GitHub repository contributes.

    Draws a horizontal bar chart (one bar per repository, labelled with its
    count), saves it to ``issues_repos_barplot.png`` and shows it.
    """
    examples = langsmith_client.list_examples(dataset_name=dataset_name)
    # Examples without a repo_name get the same default repository that
    # EvalItem.validate_commit falls back to, instead of raising KeyError.
    repo_names = [example.inputs.get('repo_name') or 'getsentry/sentry' for example in examples]
    df_repo_names = pd.DataFrame(repo_names, columns=['repo_name'])

    df_repos_hist = df_repo_names.repo_name.value_counts().sort_values(ascending=False)
    sns.set_context("paper", rc={"font.size":8,"axes.titlesize":14,"axes.labelsize":12})
    plt.figure(figsize=(8, 6))
    ax = sns.barplot(y=df_repos_hist.index, x=df_repos_hist.values, order=df_repos_hist.index, palette="flare")
    sns.despine(top=True, right=True)
    # Print each bar's count next to it.
    ax.bar_label(ax.containers[0])
    plt.title('Resolved Issues With Github Commit')
    plt.xlabel('Count')
    plt.ylabel('Github Repo')
    plt.grid(False)
    plt.tight_layout()
    plt.savefig('issues_repos_barplot.png', dpi=150)
    plt.show()


ds_plots()

## Analyze Validation Run

In [None]:
# Fetch the results of the evaluation project; the following cells treat the result as a DataFrame.
df_orig = langsmith_client.get_test_results(project_name='Autofix v2 rev:10')

# ['input.repo_name', 'execution_time', 'error']

In [None]:
df_runs = df_orig
# Columns kept for the analysis below: reference and predicted diff, run
# metadata, the panel score and a few issue/event fields.
cols_of_interest = ['reference.diff',
                    'outputs.output.diff_str',
                    'input.repo_name',
                    'execution_time',
                    'error',
                    'id',
                    'feedback.diff_gpt_panel_n3_score',
                    'input.event.title',
                    'input.event.exceptions',
                    'input.issue.id',
                    'input.issue.title',
                    'input.issue.events',
                    'input.raw_data.metadata.severity',
                    'input.raw_data.metadata.initial_priority',
                    'input.raw_data.platform']
# Take an explicit copy: later cells add and overwrite columns on df_proj, and
# writing to a selection of df_runs triggers pandas' SettingWithCopyWarning.
df_proj = df_runs[cols_of_interest].copy()
df_proj.to_csv('last_run.csv', index=False)


### Errors

In [None]:
# Count runs by the first line of their error text; missing errors (NaN) are kept as their own row.
df_proj.error.str.split('\n').str.get(0).value_counts(dropna=False).to_frame().style


In [None]:
from pandas.plotting import table 
def errors_table():
    """Render the per-error run counts as a matplotlib table and return them.

    Only the first line of each error text is used. Runs with no error (NaN or
    an empty string) are labelled ``'Successful'``. The table is saved to
    ``validation_errors.png`` and shown.

    Returns:
        A DataFrame with the columns ``error`` and ``count``.
    """
    ax = plt.subplot(111, frame_on=False)  # no visible frame
    ax.xaxis.set_visible(False)  # hide the x axis
    ax.yaxis.set_visible(False)  # hide the y axis

    # Build the labels without chained in-place mutation. Missing errors are
    # filled first so they are labelled too, and so `.str` always sees strings.
    first_lines = df_proj.error.fillna('').str.split('\n').str.get(0)
    labels = first_lines.replace('', 'Successful')
    # Name both columns explicitly; the default names of value_counts() output
    # differ between pandas versions.
    df_errors = labels.value_counts(dropna=False).rename_axis('error').reset_index(name='count')

    # figure.figsize is read when a figure is created, so this only affects
    # figures created after this point.
    params = {'figure.figsize': (12,6),}
    plt.rcParams.update(params)
    tabla = table(ax, df_errors[['error', 'count']], loc='upper right', colWidths=[.99, 0.1])
    tabla.auto_set_font_size(False)  # set the font size manually
    tabla.set_fontsize(12)  # a larger font needs larger colWidths
    tabla.scale(1.2, 1.2)  # enlarge the table

    plt.tight_layout()
    plt.savefig('validation_errors.png', dpi=150)
    plt.show()
    return df_errors


errors_table()

In [None]:
# Hacky fix for runs that error out when we get no fix from the model. In these cases we set the score to 0.
score_col = 'feedback.diff_gpt_panel_n3_score'
no_fix_error = "IndexError('list index out of range')"

# Work on an independent copy so the column writes below never target a
# selection of the raw results frame.
df_proj = df_proj.copy()
df_proj['error'] = df_proj['error'].fillna('')

# A run counts as failed unless it finished without an error or hit the known
# "no fix produced" IndexError.
tolerated = (df_proj['error'].str.len() == 0) | df_proj['error'].str.startswith(no_fix_error)
df_proj['failed'] = ~tolerated

# Non-failed runs that were never graded get a score of 0.
df_proj.loc[~df_proj['failed'] & df_proj[score_col].isna(), score_col] = 0

In [None]:
# Keep only the rows whose 'failed' flag equals False.
df_successful = df_proj[df_proj.failed == False]

### Distribution of Execution Times

In [None]:
# Three stacked histograms of execution time: all successful runs, runs on
# getsentry/* repositories, and runs on every other repository.
fig, ax = plt.subplots(3, 1, sharey=True, sharex=False)

from_sentry = df_successful['input.repo_name'].str.startswith('getsentry/')
subsets = (
    ('Execution Times (All)', df_successful),
    ('Execution Times (Sentry)', df_successful[from_sentry]),
    ('Execution Times (Open Source)', df_successful[~from_sentry]),
)
for axis, (title, subset) in zip(ax, subsets):
    subset.execution_time.plot.hist(title=title, ax=axis, ylabel='')

plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=.75, wspace=0.4)
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":14,"axes.labelsize":12})
params = {
          'figure.figsize': (8,6),
          'axes.labelsize': 12,
          'axes.titlesize': 14,
}
plt.rcParams.update(params)

# Shared axis labels for the whole figure.
fig.supxlabel('Execution Time (Seconds)')
fig.supylabel('Count')
plt.tight_layout()
plt.savefig('execution_times.png', dpi=150)

plt.show()


### Distribution Of Scores

In [None]:
# Three stacked histograms of the panel score: all successful runs, runs on
# getsentry/* repositories, and runs on every other repository.
fig, ax = plt.subplots(3, 1, sharey=True, sharex=False)

from_sentry = df_successful['input.repo_name'].str.startswith('getsentry/')
subsets = (
    ('GPT Panel Scores (All)', df_successful),
    ('GPT Panel Scores (Sentry)', df_successful[from_sentry]),
    ('GPT Panel Scores (Open Source)', df_successful[~from_sentry]),
)
for axis, (title, subset) in zip(ax, subsets):
    subset['feedback.diff_gpt_panel_n3_score'].plot.hist(title=title, ax=axis, ylabel='')

plt.subplots_adjust(top = 0.99, bottom=0.01, hspace=.75, wspace=0.4)
sns.set_context("paper", rc={"font.size":8,"axes.titlesize":14,"axes.labelsize":12})
params = {
          'figure.figsize': (8,6),
          'axes.labelsize': 12,
          'axes.titlesize': 14,
}
plt.rcParams.update(params)

# Shared axis labels for the whole figure.
fig.supxlabel('Score')
fig.supylabel('Count')
plt.tight_layout()
plt.savefig('scores.png', dpi=150)
plt.show()


In [None]:
# Mean panel score over successful runs with a strictly positive score.
# NOTE(review): zero scores (including runs defaulted to 0 earlier) are excluded,
# so this is not the overall average — confirm that is intended.
df_successful[df_successful['feedback.diff_gpt_panel_n3_score'] > 0.0]['feedback.diff_gpt_panel_n3_score'].mean()