This notebook builds an evaluation dataset by pairing Sentry issues with the GitHub commits that reference them, and saves it to LangSmith.

Set up the GitHub client that is used to look up repositories and commits.

In [2]:
import os
import sys
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '../src')))
from github import Github
from github.Auth import Token
from tqdm.auto import tqdm
import requests
import pandas as pd

from dotenv import load_dotenv
load_dotenv('../.env')

from pydantic import (
    AliasChoices,
    AliasGenerator,
    BaseModel,
    ConfigDict,
    Field,
    ValidationError,
    field_validator,
    field_serializer
)

# from pydantic import field_serializer, BaseModel
from github.Commit import Commit
from typing import Any, Optional
from pydantic import ConfigDict, field_validator

from seer.automation.autofix.models import IssueDetails
from seer.automation.models import EventDetails

from datetime import timedelta
import datetime
import re

# Authenticated PyGithub client shared by the cells below; the token is read from the GITHUB_TOKEN environment variable.
github = Github(auth=Token(token=os.environ.get('GITHUB_TOKEN')))

In [3]:

def get_resolved_issues(organization_slug="sentry", project_slug="sentry", cursor=None):
    """Fetch one page of resolved ``TypeError`` issues for a Sentry project.

    Args:
        organization_slug: Slug of the Sentry organization.
        project_slug: Slug of the project inside that organization.
        cursor: Optional pagination cursor. When given it is sent as the
            ``cursor`` query parameter so a following page can be requested.

    Returns:
        A ``(result, next_link)`` tuple. ``result`` is the decoded JSON body and
        ``next_link`` is the ``next`` entry of the parsed ``Link`` response
        header, or ``None`` when the response has no such entry.

    Raises:
        Exception: If the decoded body contains ``detail``; the detail text is
            used as the exception message.
    """
    url = f"https://sentry.io/api/0/projects/{organization_slug}/{project_slug}/issues/?query=is:resolved error.type:TypeError"

    headers = {"Authorization": f"Bearer {os.environ.get('SENTRY_AUTH_TOKEN')}"}
    # requests appends ``params`` to the query string that is already part of ``url``.
    params = {"cursor": cursor} if cursor else None

    response = requests.get(url, headers=headers, params=params)

    result = response.json()

    if "detail" in result:
        raise Exception(result["detail"])

    # .get(): the last page has no "next" link, which must not raise KeyError.
    return result, response.links.get("next")

def auth_headers(auth_token=None, auth_cookie=None):
    """Build the HTTP headers for a Sentry API request.

    Falls back to the ``SENTRY_AUTH_TOKEN`` environment variable when no
    (non-empty) ``auth_token`` is passed. A ``Cookie`` header is only added
    when ``auth_cookie`` is non-empty.
    """
    token = auth_token or os.environ.get('SENTRY_AUTH_TOKEN')
    result = {"Authorization": f"Bearer {token}"}
    if not auth_cookie:
        return result
    result["Cookie"] = auth_cookie
    return result

def get_issue_by_id(issue_id, organization_slug="sentry", auth_token=None, auth_cookie=None):
    """Load a single issue from the Sentry organization issues endpoint.

    Returns the decoded JSON body as-is (including error bodies other than
    "not found"); raises ``Exception`` when the API reports that the resource
    does not exist.
    """
    endpoint = f"https://sentry.io/api/0/organizations/{organization_slug}/issues/{issue_id}/"
    request_headers = auth_headers(auth_token, auth_cookie)
    payload = requests.get(endpoint, headers=request_headers).json()

    # Anything other than the explicit "not found" answer is handed back to the caller.
    if "detail" not in payload or payload["detail"] != "The requested resource does not exist":
        return payload

    raise Exception(f"Could not find issue with id {issue_id}")


def get_issue_id_from_short_id(short_id, organization_slug="sentry", auth_token=None, auth_cookie=None):
    """Resolve a Sentry short id to the ``groupId`` reported by the shortids endpoint.

    Raises ``Exception`` when the API answers that the resource does not exist.
    Any other body is expected to carry a ``groupId`` key.
    """
    lookup_url = f"https://sentry.io/api/0/organizations/{organization_slug}/shortids/{short_id}/"
    request_headers = auth_headers(auth_token, auth_cookie)
    payload = requests.get(lookup_url, headers=request_headers).json()

    not_found = "detail" in payload and payload["detail"] == "The requested resource does not exist"
    if not_found:
        raise Exception(f"Could not find issue with short id {short_id}")

    return payload["groupId"]


def get_details_for_issue(issue_id=None, short_id=None, organization_slug="sentry", auth_token=None, auth_cookie=None):
    """Load an issue together with its first event from the Sentry API.

    Args:
        issue_id: Numeric issue id. Ignored when ``short_id`` is given.
        short_id: Short id that is resolved to an issue id first.
        organization_slug: Slug of the organization owning the issue.
        auth_token: Bearer token; ``auth_headers`` falls back to the environment.
        auth_cookie: Optional cookie sent with every request.

    Returns:
        ``(auth_cookie, details)``. ``auth_cookie`` is the cookie that was
        finally used (it changes when the user was prompted for a new one) and
        ``details`` is the issue payload plus an ``events`` key holding at most
        the first event.

    Raises:
        Exception: If neither id is given, if the issue cannot be loaded (the
            error payload is the exception argument), or if the events
            endpoint does not return a list.
    """
    if issue_id is None and short_id is None:
        raise Exception("Either issue_id or short_id must be provided")

    def _fetch_issue(cookie):
        # Resolve the short id (when given) and load the issue using ``cookie``.
        resolved_id = issue_id
        if short_id:
            resolved_id = get_issue_id_from_short_id(short_id, organization_slug, auth_token, cookie)
        return get_issue_by_id(resolved_id, organization_slug, auth_token, cookie)

    issue = _fetch_issue(auth_cookie)

    if 'id' not in issue:
        # .get() so that an error payload without a "detail" key is reported
        # as-is instead of being masked by a KeyError.
        detail = issue.get('detail') if isinstance(issue, dict) else None
        if detail != 'You do not have permission to perform this action.':
            raise Exception(issue)
        # It's possible that the credentials have expired. Prompt for a cookie and retry once.
        auth_cookie = input('Sentry sudo cookie')
        issue = _fetch_issue(auth_cookie)
        if 'id' not in issue:
            raise Exception(issue)

    url = f"https://sentry.io/api/0/organizations/{organization_slug}/issues/{issue['id']}/events/?full=true"
    headers = auth_headers(auth_token, auth_cookie)
    response = requests.get(url, headers=headers)
    events = response.json()
    if not isinstance(events, list):
        # A non-list body (e.g. an error object) cannot be sliced below; surface it instead.
        raise Exception(events)
    return auth_cookie, dict(
        **issue,
        events=events[:1],
    )

In [None]:
# Smoke test: fetch one issue by id and print the returned (auth_cookie, details) tuple.
# print(get_details_for_issue(issue_id=5206388570, organization_slug='seria-ati'))
print(get_details_for_issue(issue_id=5177147602, organization_slug='sentry'))

In [5]:
class EvalItem(BaseModel):
    """One evaluation sample: a Sentry issue, its event, and the commit linked to it.

    ``commit`` accepts either a PyGithub ``Commit`` or a commit SHA string; a
    SHA is resolved to a ``Commit`` during validation and serialized back to
    its SHA.
    """

    # Commit is not a pydantic type, so arbitrary types have to be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    raw_data: dict[str, Any]
    organization_id: int
    project_id: int
    repo_name: Optional[str] = None
    commit_hash: Optional[str] = None
    # Keep ``commit`` below ``repo_name`` and ``commit_hash``: its validator
    # reads and updates those two entries, so they must be handled first.
    commit: Commit | str
    issue: IssueDetails
    event: EventDetails

    @field_serializer('commit')
    def serialize_commit(self, commit: Commit, _info):
        # Only the SHA is written out; validate_commit can rebuild the object from it.
        return commit.sha

    @field_validator('commit', mode="after")
    @classmethod
    def validate_commit(cls, commit: Commit | str, values, **kwargs):
        # Already a Commit object: nothing to resolve.
        if isinstance(commit, Commit):
            return commit

        # A SHA string: look it up in the item's repository, defaulting to getsentry/sentry.
        repo_name = values.data.get('repo_name')
        if repo_name is None:
            repo_name = 'getsentry/sentry'
            values.data['repo_name'] = repo_name

        repo = github.get_repo(repo_name)
        values.data['commit_hash'] = commit
        return repo.get_commit(commit)
        
class EvalItemWithDiff(EvalItem):
    """An ``EvalItem`` that additionally carries a diff as text (filled in by ``add_diff``)."""
    diff: str

In [6]:
def get_commits(repo, since):
    """
        Get all the commits from repo for a timeframe.

        Args:
            repo: Repository object exposing ``get_commits(since=...)``.
            since: Number of days to look back from now.

        Returns:
            Whatever ``repo.get_commits`` returns; it must expose ``totalCount``.
    """
    # Use a timezone-aware UTC timestamp: the cutoff is an absolute point in
    # time, and a naive local "now" would be off by the local UTC offset.
    cutoff = datetime.datetime.now(datetime.timezone.utc) - timedelta(days=since)
    print('Querying for commits')
    all_commits = repo.get_commits(since=cutoff)
    print('Total commits in this timeframe: ', all_commits.totalCount)
    return all_commits

In [7]:
def commits_with_sentry_issue(all_commits):
    """
        Filter it down to only commits with sentry issues.

        Args:
            all_commits: Iterable of commits exposing ``totalCount``; each commit
                needs ``sha`` and ``commit.message``.

        Returns:
            A list of tuples
            ``(1, 1, 'getsentry/sentry', sha, commit, short_id, issue_id, 'sentry')``
            for every commit whose message references an issue by short id
            (``SENTRY-<ID>``) and/or by issue URL. ``short_id`` / ``issue_id`` are
            ``None`` when that kind of reference is absent.
    """
    # Short ids vary in length, so match the whole upper-case alphanumeric run
    # rather than a fixed number of arbitrary characters.
    short_id_pattern = re.compile(r'\bSENTRY-[0-9A-Z]+\b')
    issue_url_pattern = re.compile(r'https://sentry\.sentry\.io/issues/(\d+)')

    # Gets the commits with an id or url to a sentry issue
    with_id_or_url = []

    with tqdm(all_commits, total=all_commits.totalCount, desc='Find Commits That Fix Issues', unit='Commit') as pbar:
        with tqdm(desc='Positive', unit='Commit') as ctr1:
            with tqdm(desc='Negative', unit='Commit') as ctr2:
                for commit in pbar:
                    message = commit.commit.message
                    short_match = short_id_pattern.search(message)
                    url_match = issue_url_pattern.search(message)
                    issue_short_id = short_match.group(0) if short_match else None
                    issue_id = url_match.group(1) if url_match else None

                    if issue_short_id or issue_id:
                        with_id_or_url.append((1, 1, 'getsentry/sentry', commit.sha, commit, issue_short_id, issue_id, 'sentry'))
                        ctr1.update(1)
                    else:
                        ctr2.update(1)

    return with_id_or_url

In [8]:
def eval_items_for_sentry(with_id_or_url, auth_token=None, auth_cookie=None):
    """
        Populate into eval items.

        Each input tuple is
        (org_id, project_id, repo_name, commit_hash, commit, short_id, issue_id, org_slug).
        Items whose event has no exceptions are counted as skipped and not returned.
        Failures are collected and printed at the end instead of aborting the run,
        unless the user answers 'yes' to the abort prompt after a permission error.
    """
    permission_msg = 'You do not have permission to perform this action.'
    kept: list[EvalItem] = []
    skipped: list[EvalItem] = []
    failures = []
    last_cookie = auth_cookie

    with tqdm(total=len(with_id_or_url), desc='Get issue details') as pbar:
        for org_id, project_id, repo_name, commit_hash, commit, short_id, issue_id, org_slug in with_id_or_url:
            try:
                # The returned cookie replaces ours so a freshly entered one is reused for later items.
                auth_cookie, issue = get_details_for_issue(
                    issue_id=issue_id,
                    short_id=short_id,
                    organization_slug=org_slug,
                    auth_token=auth_token,
                    auth_cookie=auth_cookie,
                )
                if auth_cookie != last_cookie:
                    print("Cookie changed")
                    last_cookie = auth_cookie

                issue_details = IssueDetails.model_validate(issue)
                event_details = EventDetails.from_event(issue_details.events[0])

                eval_item = EvalItem(
                    organization_id=org_id,
                    project_id=project_id,
                    repo_name=repo_name,
                    commit_hash=commit_hash,
                    commit=commit,
                    raw_data=issue,
                    issue=issue_details,
                    event=event_details,
                )

                bucket = kept if len(event_details.exceptions) != 0 else skipped
                bucket.append(eval_item)
            except Exception as e:
                denied = permission_msg in repr(e)
                # Any answer other than 'yes' (including 'retry') records the error and moves on.
                if denied and input("Auth token is not working. Abort (yes/no/retry)?").lower() == 'yes':
                    break
                if not denied:
                    print(repr(e))
                failures.append({type(e): e})
            finally:
                pbar.update(1)

    print('Total eval items:', len(kept))
    print('Total skipped items (no exceptions in event details):', len(skipped))
    print('Total errors:', len(failures))
    if failures:
        print('Errors:')
        print('------------------------------')
        for failure in failures:
            print(failure)
            print('------------------------------')
    return kept

In [None]:
from langchain.chat_models.openai import ChatOpenAI
from github.Commit import Commit
from github.File import File

# Chat model shared by the explanation and actionability prompts below.
model = ChatOpenAI(model_name="gpt-4-0125-preview")

# Helpers that prompt the model so the dataset can be filtered down to issues
# that are "actionable": given a Sentry issue, it should be evident what the
# developer should do to fix it.

def file_patch_to_str(file: File):
    """Render one changed file as ``[filename]`` followed by its patch text.

    When ``file.patch`` is ``None`` a short placeholder is emitted instead of
    the literal string ``None``, so the prompt does not contain a misleading
    "None" patch body.
    """
    patch = file.patch if file.patch is not None else "(no patch available)"
    return f"[{file.filename}]\n{patch}"


def explain_changes(error_msg, stack_str, commit_message, files_str):
    """Ask the chat model to summarise how a commit fixed an error and what its root cause was.

    The prompt contains the error message, the stacktrace, the commit message
    and the file changes; the text content of the model response is returned.
    """
    prompt = f"""<error_message>
{error_msg}
</error_message>
<stacktrace>
{stack_str}
</stacktrace>

A software engineer then created the following changes in a commit to fix the above issue:
<commit_message>
{commit_message}
</commit_message>
<changes>
{files_str}
</changes>

How would you describe the solution to the error in a short summary. Also describe what the root cause of the problem ended up being."""

    answer = model.invoke(prompt)
    return answer.content


def determine_actionability(error_msg, stack_str, solution):
    """Ask the chat model whether the expected solution can be inferred from the issue.

    Args:
        error_msg: Error message of the issue.
        stack_str: Stacktrace rendered as text.
        solution: Description of the expected solution.

    Returns:
        ``True`` when the model's ``<response>`` tag contains "yes"
        (case-insensitive); ``False`` otherwise, including when the tag is
        missing from the answer.
    """
    response = model.invoke(
        f"""Given the provided information:
<information>
<error_message>
{error_msg}
</error_message>
<stacktrace>
{stack_str}
</stacktrace>
</information>

<expected_solution>
{solution}
</expected_solution>

Based on the error message and stacktrace, can the solution be inferred from the information given and access to reading the codebase? Why or why not?
Answer in the format:<response>yes/no</response><reason>reason for the response</reason>"""
    )
    # search() + DOTALL: the tag is found even when the answer starts with
    # whitespace or a preamble, or when the tag content spans several lines.
    verdict = re.search(r"<response>(.*?)</response>", response.content, re.DOTALL | re.IGNORECASE)
    return bool(verdict and "yes" in verdict.group(1).lower())


def get_fixable_items(eval_items):
    """Keep the eval items that a panel of model calls judges to be fixable.

    For each item the linked commit is explained once, then up to ``panel_n``
    actionability votes are collected; voting stops as soon as either side
    holds a strict majority. Items with a majority of positive votes are
    returned.
    """
    panel_n = 5
    majority = panel_n / 2
    fixable_items: list[EvalItem] = []

    with (
        tqdm(eval_items, desc='Check fixability', total=len(eval_items)) as pbar,
        tqdm(desc='Fixable') as fixable_ctr,
        tqdm(desc='Not Fixable') as unfixable_ctr,
    ):
        for eval_item in pbar:
            title = eval_item.issue.title
            # Only the first exception of the event is considered.
            stacktrace_str = eval_item.event.exceptions[0].stacktrace.to_str(max_frames=64)
            commit = eval_item.commit
            files_str = "\n".join(file_patch_to_str(changed) for changed in commit.files)

            explain_result = explain_changes(title, stacktrace_str, commit.commit.message, files_str)

            yes_votes = 0
            no_votes = 0
            is_fixable = False
            for _ in range(panel_n):
                if determine_actionability(title, stacktrace_str, explain_result):
                    yes_votes += 1
                else:
                    no_votes += 1
                if yes_votes > majority:
                    is_fixable = True
                    break
                if no_votes > majority:
                    break

            if not is_fixable:
                unfixable_ctr.update(1)
                continue
            fixable_items.append(eval_item)
            fixable_ctr.update(1)

    print('Total fixable items:', len(fixable_items))
    print('Total non-fixable items:', len(eval_items) - len(fixable_items))
    return fixable_items

In [10]:
def add_diff(fixable_items):
    """
        Populate the eval items that are fixable with their expected diffs

        For every item the commit is compared against its first parent and the
        text behind the comparison's ``diff_url`` is stored in ``diff``.
        Returns a new list of ``EvalItemWithDiff``.
    """
    def _request_headers(requester):
        # Reuses the credentials and user agent held by PyGithub's requester
        # (private, name-mangled attributes).
        auth = requester._Requester__auth
        return {
            "Authorization": f"{auth.token_type} {auth.token}",
            "User-Agent": requester._Requester__userAgent,
        }

    with_diffs: list[EvalItemWithDiff] = []
    for item in tqdm(fixable_items, desc='Loading Diff Info'):
        repo = github.get_repo(item.repo_name)
        parent_sha = item.commit.commit.parents[0].sha
        comparison = repo.compare(parent_sha, item.commit.sha)

        headers = _request_headers(repo._requester)
        raw_diff = requests.get(comparison.diff_url, headers=headers).content
        diff_text = raw_diff.decode('utf-8')

        fields = dict(item)
        with_diffs.append(EvalItemWithDiff.model_validate(dict(**fields, diff=diff_text)))
    return with_diffs


In [11]:
import json

def dump_items(items: list[EvalItem], filename: str):
    """Serialize ``items`` with ``model_dump(mode='json')`` and write them to ``filename`` as a JSON list."""
    serialized_items = []
    for entry in items:
        serialized_items.append(entry.model_dump(mode='json'))

    # All items are serialized before the file is opened for writing.
    with open(filename, 'w') as out_file:
        json.dump(serialized_items, out_file)

# Get Fixable Items From Sentry

In [12]:
def get_fixable_issues_from_sentry(since=90, op_file='../data/eval_items.json'):
    """Run the full pipeline for getsentry/sentry and write the result to ``op_file``.

    Steps: load the commits of the last ``since`` days, keep those referencing a
    Sentry issue, build eval items, keep the fixable ones, attach diffs, dump.
    """
    sentry_repo = github.get_repo('getsentry/sentry')
    linked_commits = commits_with_sentry_issue(get_commits(sentry_repo, since))
    candidates = eval_items_for_sentry(linked_commits)
    items_with_diff = add_diff(get_fixable_items(candidates))
    print('Total final eval items:', len(items_with_diff))
    dump_items(items_with_diff, op_file)

In [13]:
from langsmith import Client
def delete_if_exists(client, dataset_name):
    """Delete every dataset named ``dataset_name``; do nothing if the client reports none.

    Raises ``Exception`` when the client reports the dataset as existing but
    it cannot be found among the listed datasets.
    """
    if not client.has_dataset(dataset_name=dataset_name):
        return

    print(f'Dataset {dataset_name} exists already. Clearing it first.')
    removed = 0
    for candidate in client.list_datasets():
        if candidate.name != dataset_name:
            continue
        client.delete_dataset(dataset_id=str(candidate.id))
        removed += 1

    if removed == 0:
        raise Exception('Failed to find the dataset to delete')

def create_langsmith_dataset(items, num_entries, dataset_name, description, overwrite=False):
    """Create a LangSmith dataset and upload up to ``num_entries`` examples to it.

    Args:
        items: Raw item dicts; each is validated as ``EvalItemWithDiff``.
        num_entries: Number of examples to upload. ``None`` means "all items".
            Items whose upload fails do not count, so later items are tried
            in their place.
        dataset_name: Name of the dataset to create.
        description: Description stored with the dataset.
        overwrite: When true, an existing dataset of that name is deleted first.

    Upload errors are collected and printed at the end instead of aborting.
    """
    client = Client()
    if overwrite:
        delete_if_exists(client, dataset_name)

    dataset = client.create_dataset(
        dataset_name=dataset_name,
        description=description)

    # None used to crash on the `uploaded < num_entries` comparison; treat it as "upload everything".
    target = len(items) if num_entries is None else num_entries
    errors = []
    uploaded = 0
    with tqdm(desc='Uploading Example', total=target) as pbar:
        with tqdm(desc='Errors') as error_ctr:
            for raw_item in items:
                if uploaded >= target:
                    break
                item = EvalItemWithDiff.model_validate(raw_item)
                # The example input is the full serialized item; the expected output is its diff.
                example_inputs = item.model_dump(mode='json')
                example_outputs = {"diff": item.diff}
                try:
                    client.create_example(
                        inputs=example_inputs,
                        outputs=example_outputs,
                        dataset_id=dataset.id)
                except Exception as e:
                    error_ctr.update(1)
                    errors.append({type(e): e})
                else:
                    pbar.update(1)
                    uploaded += 1

    print(f'Uploaded {uploaded} samples to dataset')
    if len(errors) > 0:
        print('-------------Errors-------------')
        for e in errors:
            print(e, '----------------------')
        
    

In [14]:
# create_langsmith_dataset(
#     final_eval_items, 
#     "Autofix Eval Full 240314", 
#     "Autofix full eval made from mapping sentry <-> github commits for sentry project")

# Get Fixable Items From Issues Related To Open Source Repos

In [None]:
!gcloud auth application-default login

In [None]:
from google.cloud import bigquery
# BigQuery client used by os_commits_with_sentry_issue below.
# NOTE(review): presumably picks up the application-default credentials from the gcloud login cell - confirm.
bigquery_client = bigquery.Client()

In [16]:
def os_commits_with_sentry_issue(sentry_org=False, limit=None):
    """Query BigQuery for commits linked to issues and load each commit from GitHub.

    Args:
        sentry_org: When true, restrict to ``organization_id = 1 AND project_id = 1``;
            otherwise to every organization except id 1.
        limit: Optional row limit appended to the query.

    Returns:
        A list of tuples
        ``(organization_id, project_id, repo_name, commit_hash, commit, None, group_id, organization_slug)``
        for every commit that could be loaded from GitHub. Commits that could
        not be loaded are reported on stdout and left out.
    """
    org_predicate = 'organization_id = 1 AND project_id = 1' if sentry_org else 'organization_id <> 1'
    limit_clause = f'LIMIT {limit}' if limit else ''

    bq_sql = f"""
        WITH status_info AS (
          SELECT * FROM getsentry.sentry_grouphistory
          WHERE status IN (12, 13)
            AND {org_predicate}
            -- AND organization_id <> 1
            -- AND organization_id = 1 AND project_id = 1
        ),
        commit_ids AS (
          SELECT group_id, linked_id, project_id
          FROM getsentry.sentry_grouplink
          WHERE linked_type = 1 AND relationship = 1
            AND group_id IN (SELECT distinct group_id FROM status_info)
        )
        SELECT commits.organization_id,
          org.name as organization_name, org.slug as organization_slug,
          commit_ids.project_id, commit_ids.group_id, repos.name,
          commits.author_id, commits.date_added,
          commits.key, commits.message,
          commits.repository_id
        FROM getsentry.sentry_commit AS commits
        JOIN getsentry.sentry_repository AS repos
          ON commits.organization_id = repos.organization_id AND commits.repository_id = repos.id
        JOIN `tmp_ram.github_open_source_repos` AS oss_repos
          ON repos.name = oss_repos.name
        JOIN commit_ids
          ON  commits.id = commit_ids.linked_id
        JOIN `getsentry.sentry_organization` AS org
          ON commits.organization_id = org.id
        WHERE commits.id IN (SELECT distinct linked_id FROM commit_ids) ORDER BY date_added DESC
        -- LIMIT 5
        {limit_clause}
    """
    results = bigquery_client.query(bq_sql).to_dataframe()
    print(f'Retrieved {results.shape[0]} commits associated with resolved issues')
    with_id_or_url = []
    failed = []
    # Many rows share a repository; look each one up on GitHub only once.
    repo_cache = {}

    with (
        tqdm(results.iterrows(), total=results.shape[0]) as pbar,
        tqdm(desc='Successful') as ctr1,
        tqdm(desc='Failed') as ctr2):
        for _, row in pbar:
            cur_hash = row['key']
            repo_name = row['name']
            pbar.set_description(f'Commit {cur_hash[0:8]} from {repo_name}')
            try:
                repo = repo_cache.get(repo_name)
                if repo is None:
                    repo = github.get_repo(repo_name)
                    repo_cache[repo_name] = repo
                commit = repo.get_commit(cur_hash)
                with_id_or_url.append((row['organization_id'], row['project_id'], repo_name, cur_hash, commit, None, row['group_id'], row['organization_slug']))
                ctr1.update(1)
            except Exception as e:
                failed.append([repo_name, cur_hash, e])
                ctr2.update(1)

    if len(failed) > 0:
        print('Errors:')
        print('-----------------------------')
        for repo_name, cur_hash, e in failed:
            print(f'Error getting commit details for {cur_hash} from repo {repo_name}: {e}')
            print('-----------------------------')

    return with_id_or_url

In [17]:
def load_data_from(filenames, items_have_diff=True, add_diff_if_missing=True):
    """Load and validate eval items from one or more JSON files.

    Items are validated as ``EvalItemWithDiff`` when ``items_have_diff`` is
    true, otherwise as ``EvalItem``; in the latter case diffs are attached via
    ``add_diff`` if ``add_diff_if_missing`` is true. Returns one combined list.
    """
    item_model = EvalItemWithDiff if items_have_diff else EvalItem
    needs_diff = add_diff_if_missing and not items_have_diff

    loaded = []
    for filename in filenames:
        with open(filename) as f:
            records = json.load(f)
            parsed = [item_model.model_validate(record) for record in tqdm(records, desc='Validating Data')]
            if needs_diff:
                parsed = add_diff(parsed)
            loaded.extend(parsed)
    return loaded


In [18]:
def get_os_fixable_issues(op_file='../data/eval_items.json', eval_items_file=None, needs_su=False, sentry_org=False, limit=None, load_cached=False, skip_fixable_check=False):
    """Build eval items from BigQuery-linked commits (or a cached file) and optionally filter them.

    With ``load_cached`` the items are read from ``eval_items_file``; otherwise
    they are built from scratch and, if ``eval_items_file`` is set, saved there
    as an intermediate result. Unless ``skip_fixable_check`` is true the items
    are then filtered for fixability, enriched with diffs and written to
    ``op_file``.
    """
    if load_cached:
        # Load from eval_items_file.
        eval_items = load_data_from([eval_items_file], items_have_diff=False, add_diff_if_missing=False)
    else:
        with_id_or_url = os_commits_with_sentry_issue(sentry_org, limit)
        # With needs_su a placeholder cookie value is passed along instead of None.
        cookie = 'dummy_cookie' if needs_su else None
        eval_items = eval_items_for_sentry(with_id_or_url, auth_token=None, auth_cookie=cookie)
        if eval_items_file:
            print(f'Saving intermediate results (before running ChatGPT based filtering) to {eval_items_file}.')
            dump_items(eval_items, eval_items_file)

    if skip_fixable_check:
        return

    final_eval_items = add_diff(get_fixable_items(eval_items))
    print('Total final eval items:', len(final_eval_items))
    dump_items(final_eval_items, op_file)

### Extract Issues And Commits From Sentry

In [28]:
# Step 1: load data from sentry, filter and save to intermediate file.
# get_os_fixable_issues(op_file='../data/eval_sentry_items_from_db_apr_23_90_days.json', 
#                       eval_items_file='../data/inter_eval_sentry_items_from_db_apr_23_90_days.json',
#                       needs_su=False, sentry_org=True, limit=None, 
#                       load_cached=False, skip_fixable_check=True)

# Step 2: Load from the intermediate file, check if fixable using ChatGPT, and save
# get_os_fixable_issues(op_file='../data/eval_sentry_items_from_db_apr_23_90_days.json', 
#                       eval_items_file='../data/inter_eval_sentry_items_from_db_apr_23_90_days.json',
#                       needs_su=False, sentry_org=True, limit=None, load_cached=True, skip_fixable_check=False)


Validating Data:   0%|          | 0/127 [00:00<?, ?it/s]

Check fixability:   0%|          | 0/127 [00:00<?, ?it/s]

Fixable: 0it [00:00, ?it/s]

Not Fixable: 0it [00:00, ?it/s]

Total fixable items: 88
Total non-fixable items: 39


Loading Diff Info:   0%|          | 0/88 [00:00<?, ?it/s]

Total final eval items: 88


### Extract Issues And Commits From Open Source Repos

In [27]:
# # Step 1: load data from sentry, filter and save to intermediate file.
# get_os_fixable_issues(op_file='../data/eval_os_items_from_db_apr_23_90_days.json', 
#                       eval_items_file='../data/inter_eval_os_items_from_db_apr_23_90_days.json',
#                       needs_su=True, sentry_org=False, limit=None,
#                       load_cached=False, skip_fixable_check=True)

# Step 2: Load from the intermediate file, check if fixable using ChatGPT, and save
# get_os_fixable_issues(op_file='../data/eval_os_items_from_db_apr_23_90_days.json', 
#                       eval_items_file='../data/inter_eval_os_items_from_db_apr_23_90_days.json',
#                       needs_su=True, sentry_org=False, limit=None,
#                       load_cached=True, skip_fixable_check=False)


Validating Data:   0%|          | 0/189 [00:00<?, ?it/s]

Check fixability:   0%|          | 0/189 [00:00<?, ?it/s]

Fixable: 0it [00:00, ?it/s]

Not Fixable: 0it [00:00, ?it/s]

Total fixable items: 99
Total non-fixable items: 90


Loading Diff Info:   0%|          | 0/99 [00:00<?, ?it/s]

Total final eval items: 99


# Loading Presaved JSON Data To LangSmith

In [102]:
import random
def random_sample(all_items, num_entries):
    """Return up to ``num_entries`` items of ``all_items`` in random order, without repeats.

    ``all_items`` itself is left untouched; only a list of indices is shuffled.
    """
    order = list(range(len(all_items)))
    random.shuffle(order)
    return [all_items[position] for position in order[:num_entries]]
    
def save_langsmith(ds_name, filenames, num_entries=None, shuffle=True, overwrite=False):
    """Load saved eval items from JSON files and upload them as a LangSmith dataset.

    Args:
        ds_name: Name of the dataset to create.
        filenames: JSON files, each containing a list of serialized items.
        num_entries: Number of examples to upload; ``None`` uploads all loaded items.
        shuffle: Shuffle the combined items before uploading.
        overwrite: Passed through to ``create_langsmith_dataset``.
    """
    all_items = []
    for filename in filenames:
        with open(filename) as f:
            all_items.extend(json.load(f))

    print(f'Loaded {len(all_items)} items')
    if shuffle:
        random.shuffle(all_items)

    # Resolve the default here so the description states a real count instead of "None".
    if num_entries is None:
        num_entries = len(all_items)

    create_langsmith_dataset(
        all_items,
        num_entries,
        ds_name,
        f"{num_entries} issues with related github commits for autofix validation",
        overwrite=overwrite)
    

### Create The Full Dataset

In [108]:
# Upload 100 shuffled items from both result files; overwrite=True replaces an existing dataset of the same name.
save_langsmith(
    ds_name="Autofix Eval 100 240423", 
    filenames=['../data/eval_os_items_from_db_apr_23_90_days.json', '../data/eval_sentry_items_from_db_apr_23_90_days.json'], 
    num_entries=100,
    shuffle=True,
    overwrite=True
)

Loaded 187 items
Dataset Autofix Eval 100 240423 exists already. Clearing it first.


Uploading Example:   0%|          | 0/100 [00:00<?, ?it/s]

Errors: 0it [00:00, ?it/s]

Uploaded 100 samples to dataset
-------------Errors-------------
{<class 'requests.exceptions.HTTPError'>: HTTPError('500 Server Error: Internal Server Error for url: https://api.smith.langchain.com/examples', '{"detail":"Internal server error"}')} ----------------------


### Create A Smaller Test Dataset

In [110]:
# Upload a 3-item sample of the same files as a small test dataset; overwrite=True replaces an existing dataset of the same name.
save_langsmith(
    ds_name="Autofix Eval 3 240423", 
    # filenames=['../data/deleteme_three.json']
    # filenames=['../data/eval_os_items_from_db_apr_23_90_days.json'],
    # filenames=['../data/oss_one.json'],
    filenames=['../data/eval_os_items_from_db_apr_23_90_days.json', '../data/eval_sentry_items_from_db_apr_23_90_days.json'], 
    num_entries=3,
    shuffle=True,
    overwrite=True
)

Loaded 187 items


Uploading Example:   0%|          | 0/3 [00:00<?, ?it/s]

Errors: 0it [00:00, ?it/s]

Uploaded 3 samples to dataset


In [20]:
# save_langsmith(ds_name='small-sentry', filenames=['../data/deleteme_three.json'])

Validating Data:   0%|          | 0/10 [00:00<?, ?it/s]

10


In [21]:
def get_repos(filenames):
    """Collect the distinct ``(commit_hash, repo_name)`` pairs stored in the given JSON files.

    Args:
        filenames: Iterable of paths to JSON files holding lists of serialized
            ``EvalItemWithDiff`` items.

    Returns:
        A set of ``(commit_hash, repo_name)`` tuples.
    """
    items = []
    for filename in filenames:
        with open(filename) as f:
            data = json.load(f)
            # Iterate the records directly: the previous `enum(...)` was an undefined name.
            for cur in tqdm(data, desc='Validating Data'):
                item = EvalItemWithDiff.model_validate(cur)
                items.append((item.commit_hash, item.repo_name))
    return set(items)
    

# Show the distinct (commit_hash, repo_name) pairs across both result files.
get_repos(['../data/eval_os_items_from_db_apr_23_90_days.json', '../data/eval_sentry_items_from_db_apr_23_90_days.json'])

Validating Data:   0%|          | 0/99 [00:00<?, ?it/s]

Validating Data:   0%|          | 0/88 [00:00<?, ?it/s]

{('06abdc1cbde8429a72c24e1aa1d38c29213dc3e0', 'getsentry/sentry'),
 ('09e8a788e756b0267fbd9b9615dfa3b8d3bfa3e4', 'seriaati/hoyo-buddy'),
 ('0b7934a41977af9dc83b4449cbdc644ea9914eab', 'FireDiscordBot/bot'),
 ('0ca66611fbc27cdb10ba0b5fc5b0b3549f86042b', 'getsentry/sentry'),
 ('0cd5bf636e6033076a2782bbf67db890a9084a8a', 'getsentry/sentry'),
 ('0e1b234d6876fb16835268407adb418166b7d502', 'renalreg/ukrdc-nuxt-3'),
 ('0fc53e9a2bd17c2aca3108ef1ded78e41b4fb3f6', 'internetstandards/Internet.nl'),
 ('14c3d20a481950f5eed83e5c832eda93ea36133c', 'freelawproject/courtlistener'),
 ('16104c07186d2d9fc10c0418e829597b529e8d94', 'getsentry/sentry'),
 ('167faedb0522fdb550a0c61c42a1ab5b2c8ecadd', 'getsentry/sentry'),
 ('16845a9456e4c4fe309e8fe48d71198fa275ba72', 'seriaati/hoyo-buddy'),
 ('181e6bea052b4e0efc0bc1bea8ab3f26e1882b3e', 'bfkeinberg/route-forecast'),
 ('185c973b9720bab11864e3917c8faff8f94b0207', 'getsentry/sentry'),
 ('196615a59a0eb5d7498c39208e0b786667243ce9', 'YodaBotOS/YodaBot'),
 ('19c9b4e61be

In [None]:
# get_repos iterates over its argument as a collection of filenames, so a single path must be wrapped in a list.
get_repos(['../data/eval_os_items_from_db_apr_23_90_days.json'])