In [1]:
import os
from github import Github
from github.Auth import Token
from tqdm import tqdm

import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

from dotenv import load_dotenv
load_dotenv('../.env')

# Build the authenticated GitHub client used by the rest of the notebook.
# Check for the token up front: a missing GITHUB_TOKEN would otherwise reach
# Token() as None and fail inside PyGithub without naming the absent setting.
# The value is read straight from the environment so the secret is never bound
# to a notebook-level variable that could be displayed by accident.
if not os.environ.get('GITHUB_TOKEN'):
    raise RuntimeError(
        "GITHUB_TOKEN is not set; define it in ../.env or export it before running this notebook"
    )

github = Github(auth=Token(token=os.environ['GITHUB_TOKEN']))
# Repository handle; EvalItem's commit validator resolves SHAs through it.
repo = github.get_repo('getsentry/sentry')

import logging

# Configure the 'autofix' logger: exactly one StreamHandler, DEBUG verbosity.
logger = logging.getLogger('autofix')
# Reset the handler list before attaching ours; presumably this keeps a re-run
# of the cell from stacking duplicate handlers (and duplicate log lines).
logger.handlers = []
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

import os
# Process-wide settings, applied before seer.bootup is imported and run.
# These replace any values already present in the environment (including ones
# loaded from ../.env earlier in this cell).
# NOTE(review): the database URL embeds credentials for what looks like a local
# development database on localhost:5433 — confirm it never targets a shared instance.
os.environ.update({
    'DATABASE_URL': "postgresql+psycopg://root:seer@localhost:5433/seer",
    'LANGCHAIN_TRACING_V2': 'true',
    'LANGCHAIN_ENDPOINT': "https://api.smith.langchain.com",
    'LANGCHAIN_PROJECT': "ai-autofix-evals",
})

from seer.bootup import bootup

# Last expression of the cell, so its return value is what the cell displays.
bootup(__name__)

  from .autonotebook import tqdm as notebook_tqdm


<Flask '__main__'>

In [2]:
from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any
from pydantic import ConfigDict, field_validator

from seer.automation.autofix.models import IssueDetails, EventDetails

class EvalItem(BaseModel):
    """One evaluation example: raw payload, a GitHub commit, issue and event details.

    ``commit`` is stored as a PyGithub ``Commit`` object. On input it may be
    given either as a ``Commit`` or as a SHA string, which is resolved through
    the module-level ``repo``. On serialization it is written back as its SHA.
    """

    # Commit is not a pydantic-native type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_data: dict[str, Any]
    commit: Commit
    issue: IssueDetails
    event: EventDetails

    @field_validator('commit', mode="before")
    @classmethod
    def validate_commit(cls, commit: Commit | str):
        """Pass ``Commit`` objects through; look anything else up via ``repo.get_commit``."""
        if isinstance(commit, Commit):
            return commit
        return repo.get_commit(commit)

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit, _info):
        """Serialize the commit as its SHA string."""
        return commit.sha
    
class EvalItemWithDiff(EvalItem):
    """An ``EvalItem`` that additionally carries a ``diff`` string.

    NOTE(review): presumably the textual diff associated with ``commit`` —
    confirm against whatever produces this data.
    """
    diff: str

In [3]:
from langsmith import Client
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
from langchain.smith import RunEvalConfig


# LangSmith client, constructed with no explicit arguments.
langsmith_client = Client()
dataset_name = "Autofix Eval Full 240314"

# list_examples() returns a lazy, single-use iterator. Materialize it so that
# any cell looping over `examples` can be re-run without silently iterating
# over an already-exhausted iterator.
examples = list(langsmith_client.list_examples(dataset_name=dataset_name))
# Dataset metadata; its `example_count` sizes the progress bar in the indexing cell.
dataset = langsmith_client.read_dataset(dataset_name=dataset_name)


In [4]:
from seer.automation.autofix.models import RepoDefinition
from seer.automation.codebase.codebase_index import CodebaseIndex
import uuid
from sentence_transformers import SentenceTransformer

# Repository that is indexed for every evaluation example.
repo_definition = RepoDefinition(provider="github", owner="getsentry", name="sentry")

# Embedding model loaded once from a local path and shared by every index build.
# trust_remote_code=True allows code shipped with the model to execute, so the
# model directory must come from a trusted source.
embedding_model = SentenceTransformer("../models/autofix_embeddings_v0", trust_remote_code=True)
embedding_model.max_seq_length = 4096

# For each example, build a codebase index at the first parent of the example's
# commit and then clean it up. Each build is slow (about 20 minutes in the
# recorded run), so the loop is best-effort: a failure for one example is
# reported and the run continues. Validation sits inside the try as well,
# because it resolves the commit over the GitHub API and a single malformed
# example or transient API error should not abort the whole run.
for example in tqdm(examples, total=dataset.example_count):
    # Label used in the failure message until the commit SHA is known.
    failed_label = f"example {example.id}"
    try:
        eval_item = EvalItem.model_validate(example.inputs)
        failed_label = eval_item.commit.sha
        # NOTE(review): the two leading positional arguments are literal 1s;
        # their meaning is not visible from this notebook.
        codebase = CodebaseIndex.create(
            1,
            1,
            repo_definition,
            uuid.uuid4(),
            embedding_model=embedding_model,
            sha=eval_item.commit.parents[0].sha,
        )
        codebase.cleanup()
    except Exception as e:
        print(f"Failed to create codebase for {failed_label}: {e}")

  return self.fget.__get__(instance, owner)()
  0%|          | 0/36 [00:00<?, ?it/s]Loading repository to /var/tmp/getsentry-sentry_a398ecd268682765d800de79b5b58ba46645cbfdlgouxn79/repo
Loaded repository to /var/tmp/getsentry-sentry_a398ecd268682765d800de79b5b58ba46645cbfdlgouxn79/repo
Read 11078 documents:
  typescript: 166
  markdown: 20
  json: 645
  yaml: 53
  bash: 15
  python: 5473
  javascript: 48
  tsx: 4473
  rst: 9
  lua: 7
  css: 2
  html: 163
  embedded_template: 3
  toml: 1
100%|██████████| 11078/11078 [09:27<00:00, 19.50it/s]
Embedding 115802 chunks...
116736it [08:35, 226.45it/s]
Embedded 115802 chunks
Processed 115802 chunks
Saved workspace for namespace 1
Create Step: Inserted 115802 chunks into the database
Loaded codebase index for getsentry/sentry, with existing data
Cleaned up workspace for namespace 1
  3%|▎         | 1/36 [20:09<11:45:20, 1209.17s/it]Loading repository to /var/tmp/getsentry-sentry_86bc0ad93482ff72499211a4ea877a5d9dd1c399tiso2q9b/repo
Loaded repos

In [5]:

import json
from seer.automation.codebase.models import CodebaseNamespace, RepositoryInfo
from seer.db import DbCodebaseNamespace, DbRepositoryInfo, Session


def get_namespace_dumps():
    """Serialize every stored repository-info and codebase-namespace row to JSON.

    Both tables are read in full within a single session. Each row is converted
    to its model with ``from_db`` and dumped with ``model_dump_json``.

    Returns:
        A ``(repo_infos, namespaces)`` tuple of lists. Each list element is the
        ``model_dump_json()`` result for one row.
    """
    with Session() as session:
        repo_rows = session.query(DbRepositoryInfo).all()
        namespace_rows = session.query(DbCodebaseNamespace).all()

        # Conversion happens inside the session block, where the rows were loaded.
        repo_dumps = []
        for repo_row in repo_rows:
            repo_dumps.append(RepositoryInfo.from_db(repo_row).model_dump_json())

        namespace_dumps = []
        for namespace_row in namespace_rows:
            namespace_dumps.append(CodebaseNamespace.from_db(namespace_row).model_dump_json())

    return repo_dumps, namespace_dumps

# Snapshot the current repository-info and namespace rows to JSON files in the
# working directory.
repo_infos, namespaces = get_namespace_dumps()

# Each file holds one JSON array whose elements are the values returned by
# get_namespace_dumps(), written in the same order as before.
for dump_path, dump_rows in (('repo_infos.json', repo_infos), ('namespaces.json', namespaces)):
    with open(dump_path, 'w') as f:
        f.write(json.dumps(dump_rows))
    