In [2]:
import os
from github import Github
from github.Auth import Token
from tqdm import tqdm

import sys
# Put the `../src` directory (resolved against the current working directory) at the
# front of the import path so that the `seer` package imported below can be found.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

from dotenv import load_dotenv
# Load environment variables from the `.env` file one directory up.
# NOTE(review): presumably this is where GITHUB_TOKEN comes from — confirm.
load_dotenv('../.env')

# GitHub API client authenticated with the token from the GITHUB_TOKEN environment
# variable. `os.environ.get` yields None when the variable is unset.
github = Github(auth=Token(token=os.environ.get('GITHUB_TOKEN')))
# Handle to the getsentry/sentry repository; `EvalItem.validate_commit` uses it to
# turn commit SHA strings into `Commit` objects.
repo = github.get_repo('getsentry/sentry')

import logging

# Configure the 'autofix' logger: DEBUG level, with exactly one stream handler.
# Assigning a fresh handler list (rather than appending) discards any handlers left
# over from a previous run of this cell, so messages are not printed twice.
logger = logging.getLogger('autofix')
logger.handlers = [logging.StreamHandler()]
logger.setLevel(logging.DEBUG)

import os
# Environment overrides for this notebook session, applied in one call.
# NOTE(review): the database URL embeds a username/password and points at localhost;
# presumably a local development database — confirm before sharing this notebook.
os.environ.update({
    'DATABASE_URL': "postgresql+psycopg://root:seer@localhost:5433/seer",
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': "https://api.smith.langchain.com",
    'LANGCHAIN_PROJECT': "ai-autofix-evals",
})

from seer.bootup import bootup

# Call seer's `bootup` with this module's name. As the last expression in the cell its
# return value is displayed; the recorded output shows a Flask app object.
# NOTE(review): the environment variables are set before this call, presumably because
# bootup reads them (e.g. DATABASE_URL) — confirm.
bootup(__name__)

  from .autonotebook import tqdm as notebook_tqdm


<Flask '__main__'>

In [3]:
from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any
from pydantic import ConfigDict, field_validator

from seer.automation.autofix.models import IssueDetails, EventDetails

class EvalItem(BaseModel):
    """A single evaluation example: raw payload, commit, issue details and event details.

    The ``commit`` field holds a PyGithub ``Commit`` object in memory but is written
    out as its SHA string, and a SHA string is accepted on input and resolved back to
    a ``Commit`` through the notebook-level ``repo`` handle.
    """

    # Presumably required because the PyGithub `Commit` class is not a type pydantic
    # can validate natively — confirm.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_data: dict[str, Any]
    commit: Commit
    issue: IssueDetails
    event: EventDetails

    @field_validator('commit', mode="before")
    @classmethod
    def validate_commit(cls, commit: Commit | str):
        """Pass ``Commit`` instances through; look anything else up via ``repo.get_commit``."""
        if isinstance(commit, Commit):
            return commit
        # Relies on the module-level `repo` defined earlier in the notebook.
        return repo.get_commit(commit)

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit, _info):
        """Represent the commit by its SHA when the model is serialized."""
        return commit.sha
    
class EvalItemWithDiff(EvalItem):
    """An `EvalItem` that additionally carries a `diff` string."""
    diff: str

In [4]:
from langsmith import Client
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
from langchain.smith import RunEvalConfig


# LangSmith client; configuration is taken from the environment.
langsmith_client = Client()
dataset_name = "Autofix Eval Full 240314"

# `Client.list_examples` returns a lazy, single-use iterator. Materialize it into a list
# so that any cell looping over `examples` can be re-run (or several cells can consume
# it) without silently iterating over an already-exhausted iterator.
examples = list(langsmith_client.list_examples(dataset_name=dataset_name))
# Dataset metadata; `dataset.example_count` is used as the progress-bar total below.
dataset = langsmith_client.read_dataset(dataset_name=dataset_name)


Create a codebase index at the first parent commit of each commit in the evaluation set.

In [None]:
from seer.automation.autofix.models import RepoDefinition
from seer.automation.codebase.codebase_index import CodebaseIndex
import uuid
from sentence_transformers import SentenceTransformer

# Repository whose commits are indexed.
repo_definition = RepoDefinition(provider="github", owner="getsentry", name="sentry")

# Embedding model loaded from a local directory, with the maximum sequence length raised to 4096.
# NOTE(review): trust_remote_code=True allows the model directory to run its own code —
# make sure the model files come from a trusted source.
embedding_model = SentenceTransformer("../models/autofix_embeddings_v0", trust_remote_code=True)
embedding_model.max_seq_length = 4096

with tqdm(total=dataset.example_count) as progress_bar:
    for example in examples:
        eval_item = EvalItem.model_validate(example.inputs)
        try:
            # Index the repository at the first parent of the example's commit.
            parent_sha = eval_item.commit.parents[0].sha
            # NOTE(review): the two leading 1s are presumably organization / project ids —
            # confirm against CodebaseIndex.create.
            codebase = CodebaseIndex.create(
                1,
                1,
                repo_definition,
                uuid.uuid4(),
                embedding_model=embedding_model,
                sha=parent_sha,
            )
            codebase.cleanup()
        except Exception as exc:
            # Best effort: report the failure and continue with the next example.
            print(f"Failed to create codebase for {eval_item.commit.sha}: {exc}")

        progress_bar.update()

Dump the repository-info and codebase-namespace rows from the database into JSON files.

In [6]:

import json
from seer.automation.codebase.models import CodebaseNamespace, RepositoryInfo
from seer.db import DbCodebaseNamespace, DbRepositoryInfo, Session

def get_namespace_dumps():
    """Serialize every repository-info and codebase-namespace row in the database.

    Returns:
        A ``(repo_infos, namespaces)`` tuple. Each element is a list holding the
        ``model_dump_json()`` output for one row, converted through
        ``RepositoryInfo.from_db`` / ``CodebaseNamespace.from_db`` respectively.
    """
    repo_info_dumps = []
    namespace_dumps = []

    with Session() as db:
        # Fetch both tables first, then convert while the session is still open.
        repo_rows = db.query(DbRepositoryInfo).all()
        namespace_rows = db.query(DbCodebaseNamespace).all()

        for row in repo_rows:
            repo_info_dumps.append(RepositoryInfo.from_db(row).model_dump_json())

        for row in namespace_rows:
            namespace_dumps.append(CodebaseNamespace.from_db(row).model_dump_json())

    return repo_info_dumps, namespace_dumps

repo_infos, namespaces = get_namespace_dumps()

# Write each list as a JSON array to its own file: repository infos first, then namespaces.
for dump_path, payload in (
    ('../data/repo_infos.json', repo_infos),
    ('../data/namespaces.json', namespaces),
):
    with open(dump_path, 'w') as f:
        json.dump(payload, f)
    

Load the repos and codebase namespaces into your Postgres database. This assumes the database is clean; otherwise the ids will probably conflict with existing rows.

In [11]:
import json
from seer.automation.codebase.models import RepositoryInfo, CodebaseNamespace
from seer.db import Session

def load_json_into_db(repo_info_file='../data/repo_infos.json', namespace_file='../data/namespaces.json'):
    """Load previously dumped repository infos and codebase namespaces into the database.

    Args:
        repo_info_file: Path to a JSON file containing a list of serialized
            ``RepositoryInfo`` JSON strings.
        namespace_file: Path to a JSON file containing a list of serialized
            ``CodebaseNamespace`` JSON strings.

    Every entry is parsed, converted with ``to_db_model()`` and merged into one
    session, which is committed once at the end.
    """
    with open(repo_info_file, 'r') as repo_fp:
        serialized_repo_infos = json.load(repo_fp)

    with open(namespace_file, 'r') as namespace_fp:
        serialized_namespaces = json.load(namespace_fp)

    with Session() as db:
        for payload in serialized_repo_infos:
            db.merge(RepositoryInfo.model_validate_json(payload).to_db_model())

        # Flush the repositories before the namespaces are merged.
        # NOTE(review): presumably because namespaces reference repository rows — confirm.
        db.flush()

        for payload in serialized_namespaces:
            db.merge(CodebaseNamespace.model_validate_json(payload).to_db_model())

        db.commit()

# Run the import with the default dump paths (`../data/repo_infos.json` and `../data/namespaces.json`).
load_json_into_db()
