diff --git a/src/sentry/integrations/source_code_management/repo_trees.py b/src/sentry/integrations/source_code_management/repo_trees.py index 8ff5a333bccfd8..a14d7b412150e6 100644 --- a/src/sentry/integrations/source_code_management/repo_trees.py +++ b/src/sentry/integrations/source_code_management/repo_trees.py @@ -190,6 +190,8 @@ def get_cached_repo_files( repo_full_name: e.g. getsentry/sentry tree_sha: A branch or a commit sha + shifted_seconds: Staggers cache expiration times across repositories + so cache misses and API refreshes are spread out over time. only_source_code_files: Include all files or just the source code files only_use_cache: Do not hit the network but use the value from the cache if any. This is useful if the remaining API requests are low @@ -199,6 +201,7 @@ def get_cached_repo_files( use_api = not cache_hit and not only_use_cache repo_files: list[str] = cache.get(key, []) if use_api: + tree = None # Cache miss – fetch from API try: tree = self.get_client().get_tree(repo_full_name, tree_sha) @@ -209,8 +212,14 @@ def get_cached_repo_files( "Caching empty files result for repo", extra={"repo": repo_full_name}, ) - cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) - tree = None + except ApiError as error: + if _is_not_found_error(error): + logger.info( + "Caching empty files result for missing repo or ref", + extra={"repo": repo_full_name}, + ) + else: + raise if tree: # Keep files; discard directories repo_files = [node["path"] for node in tree if node["type"] == "blob"] @@ -225,6 +234,8 @@ def get_cached_repo_files( # repositories is a single API network request, thus, # being acceptable to sometimes not having everything cached cache.set(key, repo_files, self.CACHE_SECONDS + shifted_seconds) + else: + cache.set(key, [], self.CACHE_SECONDS + shifted_seconds) metrics.incr( f"{METRICS_KEY_PREFIX}.get_tree", @@ -296,3 +307,11 @@ def should_include(file_path: str) -> bool: if any(file_path.startswith(path) for path in EXCLUDED_PATHS): return False return True + + +def _is_not_found_error(error: ApiError) -> bool: + if error.code == 404: + return True + + error_message = error.json.get("message") if error.json else error.text + return error_message in ("Not Found", "Not Found.") diff --git a/tests/sentry/integrations/github/test_client.py b/tests/sentry/integrations/github/test_client.py index 41180ba78c5dd7..b82398bde6c660 100644 --- a/tests/sentry/integrations/github/test_client.py +++ b/tests/sentry/integrations/github/test_client.py @@ -379,6 +379,60 @@ def test_get_cached_repo_files_with_all_files(self) -> None: files = self.install.get_cached_repo_files(self.repo.name, "master", 0) assert files == ["src/foo.py"] + @responses.activate + def test_get_cached_repo_files_caches_not_found(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=404, + json={"message": "Not Found"}, + ) + repo_key = f"github:repo:{self.repo.name}:source-code" + assert cache.get(repo_key) is None + + files = self.install.get_cached_repo_files(self.repo.name, "master", 0) + assert files == [] + assert cache.get(repo_key) == [] + + # Negative-cache hit should avoid an additional API request. + files = self.install.get_cached_repo_files(self.repo.name, "master", 0) + assert files == [] + assert len(responses.calls) == 1 + + @responses.activate + def test_get_cached_repo_files_not_found_cache_ttl_is_staggered(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=404, + json={"message": "Not Found"}, + ) + + shifted_seconds = 3600 + repo_key = f"github:repo:{self.repo.name}:source-code" + with mock.patch( + "sentry.integrations.source_code_management.repo_trees.cache.set" + ) as cache_set: + self.install.get_cached_repo_files(self.repo.name, "master", shifted_seconds) + + cache_set.assert_called_once_with( + repo_key, + [], + self.install.CACHE_SECONDS + shifted_seconds, + ) + + @responses.activate + def test_get_cached_repo_files_raises_non_not_found_api_error(self) -> None: + responses.add( + method=responses.GET, + url=f"https://api.github.com/repos/{self.repo.name}/git/trees/master?recursive=1", + status=500, + json={"message": "Server Error"}, + ) + + with pytest.raises(ApiError): + self.install.get_cached_repo_files(self.repo.name, "master", 0) + @mock.patch("sentry.integrations.github.client.get_jwt", return_value="jwt_token_1") @responses.activate def test_update_comment(self, get_jwt) -> None: diff --git a/tests/sentry/integrations/github/test_integration.py b/tests/sentry/integrations/github/test_integration.py index 3a32685c5fa8de..e219bc68dcbe16 100644 --- a/tests/sentry/integrations/github/test_integration.py +++ b/tests/sentry/integrations/github/test_integration.py @@ -13,7 +13,6 @@ from django.http import HttpResponse from django.urls import reverse -import sentry from fixtures.github import INSTALLATION_EVENT_EXAMPLE from sentry.constants import ObjectStatus from sentry.integrations.github import client @@ -21,6 +20,7 @@ from sentry.integrations.github.client import ( MINIMUM_REQUESTS, GitHubApiClient, + GitHubBaseClient, GithubSetupApiClient, ) from sentry.integrations.github.integration import ( @@ -750,7 +750,7 @@ def test_get_repositories_all_and_pagination(self) -> None: GitHubIntegration, integration, self.organization.id ) - with patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1): + with patch.object(GitHubBaseClient, "page_size", 1): result = installation.get_repositories() assert result == [ { @@ -785,10 +785,8 @@ def test_get_repositories_only_first_page(self) -> None: ) with ( - patch.object( - sentry.integrations.github.client.GitHubBaseClient, "page_number_limit", 1 - ), - patch.object(sentry.integrations.github.client.GitHubBaseClient, "page_size", 1), + patch.object(GitHubBaseClient, "page_number_limit", 1), + patch.object(GitHubBaseClient, "page_size", 1), ): result = installation.get_repositories() assert result == [ @@ -1007,12 +1005,13 @@ def get_installation_helper(self) -> GitHubIntegration: def _expected_trees(self, repo_info_list=None): result = {} - # bar (409 empty repo) returns an empty RepoTree since we cache the result - # baz (404) still fails and is excluded + # bar (409 empty repo) and baz (404 missing repo/ref) both return empty + # RepoTrees since those responses are negative-cached. list = repo_info_list or [ ("xyz", "master", ["src/xyz.py"]), ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] for repo, branch, files in list: result[f"{self.gh_org}/{repo}"] = RepoTree( @@ -1094,6 +1093,7 @@ def test_get_trees_for_org_prevent_exhaustion_some_repos(self) -> None: # Now that the rate limit is reset we should get files for foo ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] ) @@ -1118,7 +1118,8 @@ def test_get_trees_for_org_rate_limit_401(self) -> None: ) # This time the rate limit will not fail, thus, it will fetch the trees - # bar (409 empty repo) now returns an empty RepoTree since we cache the empty result + # bar (409) and baz (404) now return empty RepoTrees because those + # responses are cached as empty results. self.set_rate_limit() trees = installation.get_trees_for_org() assert trees == self._expected_trees( @@ -1126,6 +1127,7 @@ def test_get_trees_for_org_rate_limit_401(self) -> None: ("xyz", "master", ["src/xyz.py"]), ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), ("bar", "main", []), + ("baz", "master", []), ] ) @@ -1172,9 +1174,10 @@ def test_get_trees_for_org_makes_API_requests_before_MAX_CONNECTION_ERRORS_is_hi # xyz is missing because its request errors # foo has data because its API request is made in spite of xyz's error ("foo", "master", ["src/sentry/api/endpoints/auth_login.py"]), - # bar (409 empty repo) is present with empty files since we cache the result - # baz (404) is missing because its API request throws an error + # bar (409) and baz (404) are present with empty files + # because both outcomes are cached as empty results. ("bar", "main", []), + ("baz", "master", []), ] )