In [None]:
%load_ext autoreload
%autoreload 2

import os
from github import Github
from github.Auth import Token
from tqdm.notebook import tqdm

import sys
# Put ../src (resolved against the current working directory) at the front of
# sys.path so that packages living there can be imported below.
# NOTE(review): presumably this is where the `seer` package lives — confirm.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))

from dotenv import load_dotenv
# Read environment variables from the .env file one directory up.
load_dotenv('../.env')

# GitHub API client authenticated with the GITHUB_TOKEN environment variable.
# os.environ.get returns None when the variable is unset.
github = Github(auth=Token(token=os.environ.get('GITHUB_TOKEN')))

import logging

# Configure the 'autofix' logger: DEBUG level, drop any handlers that are
# already attached, then attach a single StreamHandler.
logger = logging.getLogger('autofix')
logger.setLevel(logging.DEBUG)
logger.handlers = []
logger.addHandler(logging.StreamHandler())

import os
# Environment configuration, set before bootup() is called below.
# These assignments overwrite any value already present in the environment
# (including values loaded from ../.env above).
os.environ['DATABASE_URL'] = "postgresql+psycopg://root:seer@localhost:5433/seer"
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = "https://api.smith.langchain.com"
os.environ['LANGCHAIN_PROJECT'] = "ai-autofix-evals"
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['CODEBASE_STORAGE_TYPE'] = 'filesystem'
os.environ['CODEBASE_WORKSPACE_DIR'] = '../data/chroma/workspaces'
os.environ['CODEBASE_STORAGE_DIR'] = '../data/chroma/storage'

from seer.bootup import bootup

# Initialise seer for this notebook session.
# NOTE(review): what bootup() does is not visible here — presumably it reads
# the environment variables set above; confirm against seer.bootup.
bootup(__name__)

In [None]:
# Import pysqlite3, then re-register it in sys.modules under the name
# 'sqlite3' (removing the 'pysqlite3' entry), so that any later
# `import sqlite3` resolves to pysqlite3 instead of the stdlib module.
# NOTE(review): presumably needed because a dependency requires a newer
# SQLite than the stdlib build provides — confirm.
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [None]:
# !pip install ipywidgets

In [None]:
from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any, Optional
from pydantic import ConfigDict, field_validator

from seer.automation.models import IssueDetails, EventDetails

class EvalItem(BaseModel):
    """A single evaluation example, validated from a dataset example's inputs.

    ``commit`` may be supplied either as a PyGithub ``Commit`` object or as a
    commit SHA string. A string is resolved to a ``Commit`` through the
    module-level ``github`` client during validation (this performs a GitHub
    API request). On serialization the commit is written back out as its SHA.
    """

    raw_data: dict[str, Any]
    organization_id: int
    project_id: int
    repo_name: Optional[str] = None
    commit_hash: Optional[str] = None
    # Field order matters: the `commit` validator reads `repo_name` from the
    # already-validated fields and writes `repo_name` / `commit_hash` back, so
    # `commit` must be declared after both of them.
    commit: Commit | str
    issue: IssueDetails
    event: EventDetails

    model_config = ConfigDict(
        arbitrary_types_allowed=True
    )

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit | str, _info):
        """Serialize ``commit`` as its SHA string.

        The field is declared as ``Commit | str``; a value that is still a
        plain string (for example on an instance built without validation) is
        returned unchanged instead of failing on ``.sha``.
        """
        if isinstance(commit, Commit):
            return commit.sha
        return commit

    @field_validator('commit', mode="after")
    @classmethod
    def validate_commit(cls, commit: Commit | str, values, **kwargs):
        """Resolve a commit SHA string into a PyGithub ``Commit``.

        Args:
            commit: An existing ``Commit`` (returned as-is) or a SHA string.
            values: Validation info; ``values.data`` holds the fields that
                were validated before ``commit``.

        Returns:
            The ``Commit`` fetched from the repository named by ``repo_name``,
            or from ``'getsentry/sentry'`` when ``repo_name`` is missing or
            ``None``.

        Side effects:
            When ``commit`` is a string, writes the SHA to
            ``values.data['commit_hash']`` and, if the default repository was
            used, writes it to ``values.data['repo_name']``.
            NOTE(review): this relies on writes to ``values.data`` being
            reflected on the resulting model — confirm for the pydantic
            version in use.
        """
        if isinstance(commit, Commit):
            return commit

        repo_name = values.data.get('repo_name')
        if repo_name is None:
            # No repository given: fall back to the default and record it.
            repo_name = 'getsentry/sentry'
            values.data['repo_name'] = repo_name

        repo = github.get_repo(repo_name)
        values.data['commit_hash'] = commit
        return repo.get_commit(commit)
        
# An EvalItem that additionally carries a required `diff` string.
# NOTE(review): presumably the reference patch for the example — confirm.
class EvalItemWithDiff(EvalItem):
    diff: str

In [None]:
from langsmith import Client
from langsmith.evaluation import EvaluationResult, run_evaluator
from langsmith.schemas import Example, Run
from langchain.smith import RunEvalConfig


Create codebase indexes for each sha in the evaluation set.

In [None]:
import torch
def get_device():
    """Return the name of the torch device to use.

    Preference order: "cuda" when CUDA is available, otherwise "mps" when the
    MPS backend is available, otherwise "cpu".
    """
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


In [None]:
from seer.automation.autofix.models import RepoDefinition
from seer.automation.codebase.codebase_index import CodebaseIndex
import uuid
from sentence_transformers import SentenceTransformer

# Build a codebase index for the parent commit of every example in the
# LangSmith dataset, tracking which repos were indexed, skipped, or failed.
langsmith_client = Client()
dataset_name = "Autofix Eval 100 240423"

examples = langsmith_client.list_examples(dataset_name=dataset_name)
dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
embedding_model = SentenceTransformer("../models/autofix_embeddings_v0", trust_remote_code=True)
embedding_model.max_seq_length = 4096
embedding_model.to(device = torch.device(get_device()))

# Outcome buckets. `errored` entries always have the shape
# {'repo': <repo name>, 'error': <reason or exception>}; the next cell reads 'repo'.
successful = []
skipped = []
errored = []

with (tqdm(examples, total=dataset.example_count, position=0, leave=True) as pbar,
    tqdm(desc="Skipped (pre-existing)", position=1, leave=True) as ctr1, tqdm(desc="Skipped (error)", position=2, leave=True) as ctr2):
    for example in pbar:
        eval_item = EvalItem.model_validate(example.inputs)

        # This repo is deliberately skipped and recorded as an error.
        if eval_item.repo_name == 'inikoo/aurora':
            print('Skipping inikoo/aurora as it takes too long')
            errored.append({'repo': eval_item.repo_name, 'error': 'chunking gets stuck at 17%'})
            ctr2.update(1)
            continue

        [repo_owner, repo_name] = eval_item.repo_name.split('/')
        pbar.set_description(eval_item.repo_name)

        repo_definition = RepoDefinition(provider="github", owner=repo_owner, name=repo_name, external_id=eval_item.repo_name)
        try:
            # The index is built at the first parent of the example's commit.
            # Looked up inside the try so a failure here is recorded, not raised.
            parent_sha = eval_item.commit.parents[0].sha

            if CodebaseIndex.has_repo_been_indexed(
                organization=eval_item.organization_id,
                project=eval_item.project_id,
                repo=repo_definition,
                sha=parent_sha):
                ctr1.update(1)
                skipped.append(eval_item.repo_name)
            else:
                codebase = CodebaseIndex.create(
                    organization=eval_item.organization_id, project=eval_item.project_id,
                    repo=repo_definition,
                    embedding_model=embedding_model,
                    sha=parent_sha)
                successful.append(eval_item.repo_name)
                codebase.cleanup()
        except Exception as e:
            # Best effort: record the failure under the same 'error' key as the
            # manual skip above and move on to the next example.
            errored.append({'repo': eval_item.repo_name, 'error': e})
            ctr2.update(1)

if errored:
    print('----------------Errors-------------')
    for err in errored:
        print(err, '-----------------------')
        


Remove from the LangSmith dataset every example whose repository failed to index. Note that this deletes the examples from the remote dataset.

In [None]:
# Delete every dataset example whose repository is listed in `errored`.

# `repo_name` is optional in the example inputs; EvalItem.validate_commit falls
# back to this repository when it is missing or None, and that resolved name is
# what `errored` recorded. Apply the same fallback here so the names line up
# (and so a missing key does not raise KeyError).
DEFAULT_REPO_NAME = 'getsentry/sentry'

errored_repos = {err['repo'] for err in errored}
examples = langsmith_client.list_examples(dataset_name=dataset_name)
to_delete = []
for example in examples:
    example_repo = example.inputs.get('repo_name')
    if example_repo is None:
        example_repo = DEFAULT_REPO_NAME
    if example_repo in errored_repos:
        to_delete.append(example.id)
print(to_delete)

# Deletion is performed one example at a time against the remote dataset.
for cur_del in to_delete:
    langsmith_client.delete_example(example_id=cur_del)

Dump the repository and codebase-namespace rows from the database into JSON files.

In [None]:
import json
from seer.automation.codebase.models import CodebaseNamespace, RepositoryInfo
from seer.db import DbCodebaseNamespace, DbRepositoryInfo, Session

def get_namespace_dumps():
    """Serialize every repository-info and codebase-namespace row to JSON.

    Returns:
        A ``(repo_infos, namespaces)`` tuple of lists. Each list element is
        the JSON string produced by ``model_dump_json()`` for one row, after
        converting the DB row with the matching ``from_db`` constructor.
    """
    with Session() as db:
        repo_rows = db.query(DbRepositoryInfo).all()
        namespace_rows = db.query(DbCodebaseNamespace).all()

        # Convert while the session is still open.
        repo_infos = []
        for repo_row in repo_rows:
            repo_infos.append(RepositoryInfo.from_db(repo_row).model_dump_json())

        namespaces = []
        for namespace_row in namespace_rows:
            namespaces.append(CodebaseNamespace.from_db(namespace_row).model_dump_json())

    return repo_infos, namespaces

repo_infos, namespaces = get_namespace_dumps()

# Write under ../data, matching the other relative paths used in this notebook
# and the default locations that load_json_into_db() reads from.
# Each file holds a JSON array whose elements are themselves JSON strings
# (one per row), which is the format load_json_into_db() expects.
with open('../data/repo_infos.json', 'w') as f:
    json.dump(repo_infos, f)

with open('../data/namespaces.json', 'w') as f:
    json.dump(namespaces, f)
    

In [None]:
import json
from seer.automation.codebase.models import CodebaseNamespace, RepositoryInfo
from seer.db import DbCodebaseNamespace, DbRepositoryInfo, Session

# Print every repository-info row, then every codebase-namespace row, as JSON
# (one row per line) for manual inspection.
with Session() as session:
    repository_info = session.query(DbRepositoryInfo).all()
    print('---------- Repos -------------')
    for repo_row in repository_info:
        repo_json = RepositoryInfo.from_db(repo_row).model_dump_json()
        print(repo_json)

    print('---------- Namespaces -------------')
    namespace_info = session.query(DbCodebaseNamespace).all()
    for namespace_row in namespace_info:
        namespace_json = CodebaseNamespace.from_db(namespace_row).model_dump_json()
        print(namespace_json)

    

Load the repositories and codebase namespaces into your Postgres database. This assumes the database is empty; otherwise the ids will probably conflict with existing rows.

In [None]:
import json
from seer.automation.codebase.models import RepositoryInfo, CodebaseNamespace
from seer.db import Session

def load_json_into_db(repo_info_file='../data/repo_infos.json', namespace_file='../data/namespaces.json'):
    """Load dumped repository infos and codebase namespaces into the database.

    Args:
        repo_info_file: Path to a JSON file containing a list of
            ``RepositoryInfo`` JSON strings.
        namespace_file: Path to a JSON file containing a list of
            ``CodebaseNamespace`` JSON strings.

    Each entry is parsed with ``model_validate_json``, converted with
    ``to_db_model()`` and merged into the session. Everything is committed in
    a single transaction at the end.
    """
    with open(repo_info_file) as repo_fh:
        serialized_repos = json.load(repo_fh)

    with open(namespace_file) as namespace_fh:
        serialized_namespaces = json.load(namespace_fh)

    with Session() as db:
        for raw_repo in serialized_repos:
            repo_model = RepositoryInfo.model_validate_json(raw_repo)
            db.merge(repo_model.to_db_model())

        # Repositories are flushed before any namespace is merged.
        # NOTE(review): presumably because namespaces reference repositories — confirm.
        db.flush()

        for raw_namespace in serialized_namespaces:
            namespace_model = CodebaseNamespace.model_validate_json(raw_namespace)
            db.merge(namespace_model.to_db_model())

        db.commit()

# Run the import with the default paths
# (../data/repo_infos.json and ../data/namespaces.json).
load_json_into_db()
