diff --git a/README.md b/README.md index 7af965cb3..eb8dae654 100644 --- a/README.md +++ b/README.md @@ -594,6 +594,8 @@ with the `mod_auth_gssapi` module. * `include-git-dir` - when used, `.git` file objects are not removed from the source bundle created by Cachito. This is useful when the git history is important to the build process. + If this flag is absent, Cachito will use a shallow clone (depth=1) of the repository in order to save + resources. * `cgo-disable` - use this flag to make Cachito set `CGO_ENABLED=0` while processing gomod packages. This environment variable will only be used internally by Cachito, it will *not* be set in the diff --git a/cachito/web/api_v1.py b/cachito/web/api_v1.py index abe564e34..bbf906895 100644 --- a/cachito/web/api_v1.py +++ b/cachito/web/api_v1.py @@ -346,7 +346,11 @@ def create_request(): error_callback = tasks.failed_request_callback.s(request.id) chain_tasks = [ tasks.fetch_app_source.s( - request.repo, request.ref, request.id, "git-submodule" in pkg_manager_names + request.repo, + request.ref, + request.id, + "git-submodule" in pkg_manager_names, + "include-git-dir" in [flag.name for flag in request.flags], ).on_error(error_callback) ] diff --git a/cachito/workers/scm.py b/cachito/workers/scm.py index 0df31ac2e..760401018 100644 --- a/cachito/workers/scm.py +++ b/cachito/workers/scm.py @@ -143,24 +143,33 @@ def _create_archive(self, from_dir): os.unlink(self.sources_dir.archive_path) raise - def clone_and_archive(self, gitsubmodule=False): + def clone_and_archive(self, gitsubmodule=False, shallow=False): """ Clone the git repository and create the compressed source archive. :param bool gitsubmodule: a bool to determine whether git submodules need to be processed. + :param bool shallow: determines if a shallow clone should be made (depth=1). :raises CachitoError: if cloning the repository fails or if the archive can't be created """ with tempfile.TemporaryDirectory(prefix="cachito-") as temp_dir: log.debug("Cloning the Git repository from %s", self.url) clone_path = os.path.join(temp_dir, "repo") + + kwargs = { + "no_checkout": True, + # Don't allow git to prompt for a username if we don't have access + "env": {"GIT_TERMINAL_PROMPT": "0"}, + } + + if shallow: + kwargs["depth"] = 1 + try: - repo = git.repo.Repo.clone_from( - self.url, - clone_path, - no_checkout=True, - # Don't allow git to prompt for a username if we don't have access - env={"GIT_TERMINAL_PROMPT": "0"}, - ) + repo = git.repo.Repo.clone_from(self.url, clone_path, **kwargs) + + if shallow: + # Fetch the exact commit we need + repo.remote().fetch(refspec=self.ref, depth=1) except: # noqa E722 log.exception("Cloning the Git repository from %s failed", self.url) raise CachitoError("Cloning the Git repository failed") @@ -173,12 +182,13 @@ def clone_and_archive(self, gitsubmodule=False): repo.git.gc("--prune=now") self._create_archive(repo.working_dir) - def update_and_archive(self, previous_archive, gitsubmodule=False): + def update_and_archive(self, previous_archive, gitsubmodule=False, shallow=False): """ Update the existing Git history and create a source archive. :param str previous_archive: path to an archive file created before. :param bool gitsubmodule: a bool to determine whether git submodules need to be processed. + :param bool shallow: determines if only a single reference should be fetched. :raises CachitoError: if pulling the Git history from the remote repo or the checkout of the target Git ref fails. """ @@ -187,10 +197,13 @@ def update_and_archive(self, previous_archive, gitsubmodule=False): tar.extractall(temp_dir) repo = git.Repo(os.path.join(temp_dir, "app")) + + kwargs = {"depth": 1} if shallow else {} + try: # The reference must be specified to handle commits which are not part # of a branch. - repo.remote().fetch(refspec=self.ref) + repo.remote().fetch(refspec=self.ref, **kwargs) except: # noqa E722 log.exception("Failed to fetch from remote %s", self.url) raise CachitoError("Failed to fetch from the remote Git repository") @@ -202,10 +215,11 @@ def update_and_archive(self, previous_archive, gitsubmodule=False): repo.git.gc("--prune=now") self._create_archive(repo.working_dir) - def fetch_source(self, gitsubmodule=False): + def fetch_source(self, gitsubmodule=False, shallow=False): """Fetch the repo, create a compressed tar file, and put it in long-term storage. :param bool gitsubmodule: a bool to determine whether git submodules need to be processed. + :param bool shallow: determines if a shallow clone should be made (depth=1). """ if gitsubmodule: self.sources_dir = SourcesDir(self.repo_name, f"{self.ref}-with-submodules") @@ -235,7 +249,9 @@ def fetch_source(self, gitsubmodule=False): if "-with-submodules" in str(previous_archive): continue try: - self.update_and_archive(previous_archive, gitsubmodule=gitsubmodule) + self.update_and_archive( + previous_archive, gitsubmodule=gitsubmodule, shallow=shallow, + ) return except ( git.exc.InvalidGitRepositoryError, @@ -253,7 +269,7 @@ def fetch_source(self, gitsubmodule=False): previous_archive, ) - self.clone_and_archive(gitsubmodule=gitsubmodule) + self.clone_and_archive(gitsubmodule=gitsubmodule, shallow=shallow) def update_git_submodules(self, repo): """Update git submodules. diff --git a/cachito/workers/tasks/general.py b/cachito/workers/tasks/general.py index 0a36ff80f..713bc02cc 100644 --- a/cachito/workers/tasks/general.py +++ b/cachito/workers/tasks/general.py @@ -37,7 +37,7 @@ @app.task(priority=0) @runs_if_request_in_progress -def fetch_app_source(url, ref, request_id, gitsubmodule=False): +def fetch_app_source(url, ref, request_id, gitsubmodule=False, include_git_dir=False): """ Fetch the application source code that was requested and put it in long-term storage. @@ -51,7 +51,9 @@ def fetch_app_source(url, ref, request_id, gitsubmodule=False): try: # Default to Git for now scm = Git(url, ref) - scm.fetch_source(gitsubmodule=gitsubmodule) + # In case don't need to keep the Git history, use a shallow clone to save resources + shallow = not include_git_dir + scm.fetch_source(gitsubmodule=gitsubmodule, shallow=shallow) except requests.Timeout: raise CachitoError("The connection timed out while downloading the source") except CachitoError: diff --git a/tests/test_api_v1.py b/tests/test_api_v1.py index b19caaa66..c73dabe66 100644 --- a/tests/test_api_v1.py +++ b/tests/test_api_v1.py @@ -110,6 +110,7 @@ def test_get_status_short(mock_status, error, client): ([], ["npm"], None, ["npm"], None,), ([], ["pip"], None, ["pip"], None,), ([], ["yarn"], None, ["yarn"], None,), + ([], [], None, [], ["include-git-dir"],), ), ) @mock.patch("cachito.web.api_v1.chain") @@ -162,6 +163,7 @@ def test_create_and_fetch_request( "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, "git-submodule" in expected_pkg_managers, + "include-git-dir" in flags if flags is not None else False, ).on_error(error_callback) ] if "gomod" in expected_pkg_managers: @@ -220,6 +222,7 @@ def test_create_request_with_gomod_package_configs( "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, False, + False, ).on_error(error_callback), fetch_gomod_source.si(1, [], package_value["gomod"]).on_error(error_callback), process_fetched_sources.si(1).on_error(error_callback), @@ -250,6 +253,7 @@ def test_create_request_with_npm_package_configs( "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, False, + False, ).on_error(error_callback), fetch_npm_source.si(1, package_value["npm"]).on_error(error_callback), process_fetched_sources.si(1).on_error(error_callback), @@ -290,6 +294,7 @@ def test_create_request_with_pip_package_configs(mock_chain, app, auth_env, clie "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, False, + False, ).on_error(error_callback), fetch_pip_source.si(1, package_value["pip"]).on_error(error_callback), process_fetched_sources.si(1).on_error(error_callback), @@ -320,6 +325,7 @@ def test_create_request_with_yarn_package_configs( "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, False, + False, ).on_error(error_callback), fetch_yarn_source.si(1, package_value["yarn"]).on_error(error_callback), process_fetched_sources.si(1).on_error(error_callback), @@ -372,6 +378,7 @@ def test_create_and_fetch_request_with_flag(mock_chain, app, auth_env, client, d "c50b93a32df1c9d700e3e80996845bc2e13be848", 1, False, + False, ).on_error(error_callback), fetch_gomod_source.si(1, [], []).on_error(error_callback), process_fetched_sources.si(1).on_error(error_callback), diff --git a/tests/test_workers/test_scm.py b/tests/test_workers/test_scm.py index 18be5d01a..cda97897c 100644 --- a/tests/test_workers/test_scm.py +++ b/tests/test_workers/test_scm.py @@ -30,7 +30,9 @@ def test_repo_name(): assert git_obj.repo_name == "release-engineering/retrodep" -@pytest.mark.parametrize("gitsubmodule", [True, False]) +@pytest.mark.parametrize( + "gitsubmodule, shallow", [(True, False), (False, False), (True, True), (False, True)] +) @mock.patch("tarfile.open") @mock.patch("tempfile.TemporaryDirectory") @mock.patch("tempfile.NamedTemporaryFile") @@ -51,6 +53,7 @@ def test_clone_and_archive( mock_temp_dir, mock_tarfile_open, gitsubmodule, + shallow, ): # Mock the archive being created mock_exists.return_value = True @@ -69,14 +72,16 @@ def test_clone_and_archive( git_obj = scm.Git(url, ref) with mock.patch.object(git_obj.sources_dir, "archive_path", new=archive_path): - git_obj.clone_and_archive(gitsubmodule) + git_obj.clone_and_archive(gitsubmodule, shallow) + + kwargs = {"depth": 1} if shallow else {} # Verify the tempfile.TemporaryDirectory context manager was used twice: # once for _clone_and_archive and once for _verify_archive assert mock_temp_dir.return_value.__enter__.call_count == 2 # Verify the repo was cloned and checked out properly mock_clone.assert_called_once_with( - url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"} + url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"}, **kwargs ) assert mock_clone.return_value.head.reference == mock_commit mock_clone.return_value.head.reset.assert_called_once_with(index=True, working_tree=True) @@ -89,6 +94,9 @@ def test_clone_and_archive( mock_ugs.assert_called_once_with(mock_clone.return_value) else: mock_ugs.assert_not_called() + # In case a shallow clone was made, we also need to fetch the exact commit needed + if shallow: + mock_clone.return_value.remote().fetch.assert_called_once_with(refspec=ref, **kwargs) mock_clone.return_value.git.gc.assert_called_once_with("--prune=now") @@ -155,7 +163,7 @@ def test_fetch_source_clone_if_no_archive_yet(mock_clone_and_archive, gitsubmodu with po(scm_git.sources_dir.package_dir, "glob", return_value=[]): scm_git.fetch_source(gitsubmodule) - mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule) + mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule, shallow=False) @pytest.mark.parametrize("gitsubmodule", [True, False]) @@ -194,7 +202,9 @@ def test_fetch_source_by_pull(mock_update_and_archive, mock_getctime, gitsubmodu return_value=["29eh2a.tar.gz", "a8c2d2.tar.gz", "a8c2d2-with-submodules.tar.gz"], ): scm_git.fetch_source(gitsubmodule) - mock_update_and_archive.assert_called_once_with("a8c2d2.tar.gz", gitsubmodule=gitsubmodule) + mock_update_and_archive.assert_called_once_with( + "a8c2d2.tar.gz", gitsubmodule=gitsubmodule, shallow=False + ) @pytest.mark.parametrize( @@ -242,17 +252,19 @@ def test_fetch_source_by_pull_corrupt_archive( assert mock_update_and_archive.call_count == 2 calls = [ - mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule), - mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule), + mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule, shallow=False), + mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule, shallow=False), ] mock_update_and_archive.assert_has_calls(calls) if all_corrupt: - mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule) + mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule, shallow=False) else: mock_clone_and_archive.assert_not_called() -@pytest.mark.parametrize("gitsubmodule", [True, False]) +@pytest.mark.parametrize( + "gitsubmodule, shallow", [(True, False), (False, False), (True, True), (False, True)] +) @mock.patch("tarfile.open") @mock.patch("tempfile.TemporaryDirectory") @mock.patch("git.Repo") @@ -260,7 +272,14 @@ def test_fetch_source_by_pull_corrupt_archive( @mock.patch("os.path.exists") @mock.patch("cachito.workers.scm.Git.update_git_submodules") def test_update_and_archive( - mock_ugs, mock_exists, mock_fsck, mock_repo, mock_temp_dir, mock_tarfile_open, gitsubmodule + mock_ugs, + mock_exists, + mock_fsck, + mock_repo, + mock_temp_dir, + mock_tarfile_open, + gitsubmodule, + shallow, ): # Mock the archive being created mock_exists.return_value = True @@ -270,8 +289,10 @@ def test_update_and_archive( # Mock the tempfile.TemporaryDirectory context manager mock_temp_dir.return_value.__enter__.return_value = "/tmp/cachito-temp" + kwargs = {"depth": 1} if shallow else {} + # Test does not really extract this archive file. The filename could be arbitrary. - scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule) + scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule, shallow) # Verify the tempfile.TemporaryDirectory context manager was used twice: # once for _update_and_archive and once for _verify_archive @@ -279,7 +300,7 @@ def test_update_and_archive( repo = mock_repo.return_value # Verify the changes are pulled. - repo.remote.return_value.fetch.assert_called_once_with(refspec=ref) + repo.remote.return_value.fetch.assert_called_once_with(refspec=ref, **kwargs) # Verify the repo is reset to specific ref repo.commit.assert_called_once_with(ref) assert repo.commit.return_value == repo.head.reference