Skip to content

Commit

Permalink
Uses shallow clones as default for Git operations
Browse files Browse the repository at this point in the history
This is to avoid exhausting the resources while cloning very large
repos (such as github.com/openshift/origin, which consumes ~3GB
memory).

In case the "include-git-dir" flag is set, a deep clone will still
be used.

Signed-off-by: Bruno Pimentel <bpimente@redhat.com>
  • Loading branch information
brunoapimentel committed Dec 29, 2021
1 parent 3600456 commit 72bd76b
Show file tree
Hide file tree
Showing 6 changed files with 78 additions and 28 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -594,6 +594,8 @@ with the `mod_auth_gssapi` module.

* `include-git-dir` - when used, `.git` file objects are not removed from the source bundle created
by Cachito. This is useful when the git history is important to the build process.
If this flag is absent, Cachito will use a shallow clone (depth=1) of the repository in order to save
resources.

* `cgo-disable` - use this flag to make Cachito set `CGO_ENABLED=0` while processing gomod packages.
This environment variable will only be used internally by Cachito, it will *not* be set in the
Expand Down
6 changes: 5 additions & 1 deletion cachito/web/api_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,7 +346,11 @@ def create_request():
error_callback = tasks.failed_request_callback.s(request.id)
chain_tasks = [
tasks.fetch_app_source.s(
request.repo, request.ref, request.id, "git-submodule" in pkg_manager_names
request.repo,
request.ref,
request.id,
"git-submodule" in pkg_manager_names,
"include-git-dir" in request.flags,
).on_error(error_callback)
]

Expand Down
41 changes: 28 additions & 13 deletions cachito/workers/scm.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ def _create_archive(self, from_dir):
os.unlink(self.sources_dir.archive_path)
raise

def clone_and_archive(self, gitsubmodule=False):
def clone_and_archive(self, gitsubmodule=False, include_git_dir=False):
"""
Clone the git repository and create the compressed source archive.
Expand All @@ -153,14 +153,24 @@ def clone_and_archive(self, gitsubmodule=False):
with tempfile.TemporaryDirectory(prefix="cachito-") as temp_dir:
log.debug("Cloning the Git repository from %s", self.url)
clone_path = os.path.join(temp_dir, "repo")

kwargs = {
"no_checkout": True,
# Don't allow git to prompt for a username if we don't have access
"env": {"GIT_TERMINAL_PROMPT": "0"},
}

if include_git_dir:
# In case we don't need to keep the Git history, clone with depth 1 to save
# resources on large repos
kwargs["depth"] = 1

try:
repo = git.repo.Repo.clone_from(
self.url,
clone_path,
no_checkout=True,
# Don't allow git to prompt for a username if we don't have access
env={"GIT_TERMINAL_PROMPT": "0"},
)
repo = git.repo.Repo.clone_from(self.url, clone_path, **kwargs)

if include_git_dir:
# Fetch the exact commit we need
repo.remote().fetch(refspec=self.ref, depth=1)
except: # noqa E722
log.exception("Cloning the Git repository from %s failed", self.url)
raise CachitoError("Cloning the Git repository failed")
Expand All @@ -173,7 +183,7 @@ def clone_and_archive(self, gitsubmodule=False):
repo.git.gc("--prune=now")
self._create_archive(repo.working_dir)

def update_and_archive(self, previous_archive, gitsubmodule=False):
def update_and_archive(self, previous_archive, gitsubmodule=False, include_git_dir=False):
"""
Update the existing Git history and create a source archive.
Expand All @@ -187,10 +197,13 @@ def update_and_archive(self, previous_archive, gitsubmodule=False):
tar.extractall(temp_dir)

repo = git.Repo(os.path.join(temp_dir, "app"))

kwargs = {"depth": 1} if include_git_dir else {}

try:
# The reference must be specified to handle commits which are not part
# of a branch.
repo.remote().fetch(refspec=self.ref)
repo.remote().fetch(refspec=self.ref, **kwargs)
except: # noqa E722
log.exception("Failed to fetch from remote %s", self.url)
raise CachitoError("Failed to fetch from the remote Git repository")
Expand All @@ -202,7 +215,7 @@ def update_and_archive(self, previous_archive, gitsubmodule=False):
repo.git.gc("--prune=now")
self._create_archive(repo.working_dir)

def fetch_source(self, gitsubmodule=False):
def fetch_source(self, gitsubmodule=False, include_git_dir=False):
"""Fetch the repo, create a compressed tar file, and put it in long-term storage.
:param bool gitsubmodule: a bool to determine whether git submodules need to be processed.
Expand Down Expand Up @@ -235,7 +248,9 @@ def fetch_source(self, gitsubmodule=False):
if "-with-submodules" in str(previous_archive):
continue
try:
self.update_and_archive(previous_archive, gitsubmodule=gitsubmodule)
self.update_and_archive(
previous_archive, gitsubmodule=gitsubmodule, include_git_dir=include_git_dir,
)
return
except (
git.exc.InvalidGitRepositoryError,
Expand All @@ -253,7 +268,7 @@ def fetch_source(self, gitsubmodule=False):
previous_archive,
)

self.clone_and_archive(gitsubmodule=gitsubmodule)
self.clone_and_archive(gitsubmodule=gitsubmodule, include_git_dir=include_git_dir)

def update_git_submodules(self, repo):
"""Update git submodules.
Expand Down
4 changes: 2 additions & 2 deletions cachito/workers/tasks/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@

@app.task(priority=0)
@runs_if_request_in_progress
def fetch_app_source(url, ref, request_id, gitsubmodule=False):
def fetch_app_source(url, ref, request_id, gitsubmodule=False, include_git_dir=False):
"""
Fetch the application source code that was requested and put it in long-term storage.
Expand All @@ -51,7 +51,7 @@ def fetch_app_source(url, ref, request_id, gitsubmodule=False):
try:
# Default to Git for now
scm = Git(url, ref)
scm.fetch_source(gitsubmodule=gitsubmodule)
scm.fetch_source(gitsubmodule=gitsubmodule, include_git_dir=include_git_dir)
except requests.Timeout:
raise CachitoError("The connection timed out while downloading the source")
except CachitoError:
Expand Down
6 changes: 6 additions & 0 deletions tests/test_api_v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ def test_create_and_fetch_request(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
"git-submodule" in expected_pkg_managers,
"include-git-dir" in flags if flags is not None else False,
).on_error(error_callback)
]
if "gomod" in expected_pkg_managers:
Expand Down Expand Up @@ -220,6 +221,7 @@ def test_create_request_with_gomod_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_gomod_source.si(1, [], package_value["gomod"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -250,6 +252,7 @@ def test_create_request_with_npm_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_npm_source.si(1, package_value["npm"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -290,6 +293,7 @@ def test_create_request_with_pip_package_configs(mock_chain, app, auth_env, clie
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_pip_source.si(1, package_value["pip"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -320,6 +324,7 @@ def test_create_request_with_yarn_package_configs(
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_yarn_source.si(1, package_value["yarn"]).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down Expand Up @@ -372,6 +377,7 @@ def test_create_and_fetch_request_with_flag(mock_chain, app, auth_env, client, d
"c50b93a32df1c9d700e3e80996845bc2e13be848",
1,
False,
False,
).on_error(error_callback),
fetch_gomod_source.si(1, [], []).on_error(error_callback),
process_fetched_sources.si(1).on_error(error_callback),
Expand Down
47 changes: 35 additions & 12 deletions tests/test_workers/test_scm.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ def test_repo_name():
assert git_obj.repo_name == "release-engineering/retrodep"


@pytest.mark.parametrize("gitsubmodule", [True, False])
@pytest.mark.parametrize(
"gitsubmodule, include_git_dir", [(True, False), (False, False), (True, True), (False, True)]
)
@mock.patch("tarfile.open")
@mock.patch("tempfile.TemporaryDirectory")
@mock.patch("tempfile.NamedTemporaryFile")
Expand All @@ -51,6 +53,7 @@ def test_clone_and_archive(
mock_temp_dir,
mock_tarfile_open,
gitsubmodule,
include_git_dir,
):
# Mock the archive being created
mock_exists.return_value = True
Expand All @@ -69,14 +72,16 @@ def test_clone_and_archive(
git_obj = scm.Git(url, ref)

with mock.patch.object(git_obj.sources_dir, "archive_path", new=archive_path):
git_obj.clone_and_archive(gitsubmodule)
git_obj.clone_and_archive(gitsubmodule, include_git_dir)

kwargs = {"depth": 1} if include_git_dir else {}

# Verify the tempfile.TemporaryDirectory context manager was used twice:
# once for _clone_and_archive and once for _verify_archive
assert mock_temp_dir.return_value.__enter__.call_count == 2
# Verify the repo was cloned and checked out properly
mock_clone.assert_called_once_with(
url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"}
url, "/tmp/cachito-temp/repo", no_checkout=True, env={"GIT_TERMINAL_PROMPT": "0"}, **kwargs
)
assert mock_clone.return_value.head.reference == mock_commit
mock_clone.return_value.head.reset.assert_called_once_with(index=True, working_tree=True)
Expand All @@ -89,6 +94,9 @@ def test_clone_and_archive(
mock_ugs.assert_called_once_with(mock_clone.return_value)
else:
mock_ugs.assert_not_called()
#
if include_git_dir:
mock_clone.return_value.remote().fetch.assert_called_once_with(refspec=ref, **kwargs)

mock_clone.return_value.git.gc.assert_called_once_with("--prune=now")

Expand Down Expand Up @@ -155,7 +163,7 @@ def test_fetch_source_clone_if_no_archive_yet(mock_clone_and_archive, gitsubmodu
with po(scm_git.sources_dir.package_dir, "glob", return_value=[]):
scm_git.fetch_source(gitsubmodule)

mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule)
mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule, include_git_dir=False)


@pytest.mark.parametrize("gitsubmodule", [True, False])
Expand Down Expand Up @@ -194,7 +202,9 @@ def test_fetch_source_by_pull(mock_update_and_archive, mock_getctime, gitsubmodu
return_value=["29eh2a.tar.gz", "a8c2d2.tar.gz", "a8c2d2-with-submodules.tar.gz"],
):
scm_git.fetch_source(gitsubmodule)
mock_update_and_archive.assert_called_once_with("a8c2d2.tar.gz", gitsubmodule=gitsubmodule)
mock_update_and_archive.assert_called_once_with(
"a8c2d2.tar.gz", gitsubmodule=gitsubmodule, include_git_dir=False
)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -242,25 +252,36 @@ def test_fetch_source_by_pull_corrupt_archive(

assert mock_update_and_archive.call_count == 2
calls = [
mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule),
mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule),
mock.call("a8c2d2.tar.gz", gitsubmodule=gitsubmodule, include_git_dir=False),
mock.call("29eh2a.tar.gz", gitsubmodule=gitsubmodule, include_git_dir=False),
]
mock_update_and_archive.assert_has_calls(calls)
if all_corrupt:
mock_clone_and_archive.assert_called_once_with(gitsubmodule=gitsubmodule)
mock_clone_and_archive.assert_called_once_with(
gitsubmodule=gitsubmodule, include_git_dir=False
)
else:
mock_clone_and_archive.assert_not_called()


@pytest.mark.parametrize("gitsubmodule", [True, False])
@pytest.mark.parametrize(
"gitsubmodule, include_git_dir", [(True, False), (False, False), (True, True), (False, True)]
)
@mock.patch("tarfile.open")
@mock.patch("tempfile.TemporaryDirectory")
@mock.patch("git.Repo")
@mock.patch("cachito.workers.scm.run_cmd")
@mock.patch("os.path.exists")
@mock.patch("cachito.workers.scm.Git.update_git_submodules")
def test_update_and_archive(
mock_ugs, mock_exists, mock_fsck, mock_repo, mock_temp_dir, mock_tarfile_open, gitsubmodule
mock_ugs,
mock_exists,
mock_fsck,
mock_repo,
mock_temp_dir,
mock_tarfile_open,
gitsubmodule,
include_git_dir,
):
# Mock the archive being created
mock_exists.return_value = True
Expand All @@ -270,16 +291,18 @@ def test_update_and_archive(
# Mock the tempfile.TemporaryDirectory context manager
mock_temp_dir.return_value.__enter__.return_value = "/tmp/cachito-temp"

kwargs = {"depth": 1} if include_git_dir else {}

# Test does not really extract this archive file. The filename could be arbitrary.
scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule)
scm.Git(url, ref).update_and_archive("/tmp/1234567.tar.gz", gitsubmodule, include_git_dir)

# Verify the tempfile.TemporaryDirectory context manager was used twice:
# once for _update_and_archive and once for _verify_archive
assert mock_temp_dir.return_value.__enter__.call_count == 2

repo = mock_repo.return_value
# Verify the changes are pulled.
repo.remote.return_value.fetch.assert_called_once_with(refspec=ref)
repo.remote.return_value.fetch.assert_called_once_with(refspec=ref, **kwargs)
# Verify the repo is reset to specific ref
repo.commit.assert_called_once_with(ref)
assert repo.commit.return_value == repo.head.reference
Expand Down

0 comments on commit 72bd76b

Please sign in to comment.