Skip to content

Commit

Permalink
archive-tar: add internal gzip implementation
Browse files Browse the repository at this point in the history
Git uses zlib for its own object store, but calls gzip when creating tgz
archives.  Add an option to perform the gzip compression for the latter
using zlib, without depending on the external gzip binary.

Plug it in by making write_block a function pointer and switching to a
compressing variant if the filter command has the magic value "git
archive gzip".  Does that indirection slow down tar creation?  Not
really, at least not in this test:

$ hyperfine -w3 -L rev HEAD,origin/main -p 'git checkout {rev} && make' \
'./git -C ../linux archive --format=tar HEAD # {rev}'
Benchmark #1: ./git -C ../linux archive --format=tar HEAD # HEAD
  Time (mean ± σ):      4.044 s ±  0.007 s    [User: 3.901 s, System: 0.137 s]
  Range (min … max):    4.038 s …  4.059 s    10 runs

Benchmark #2: ./git -C ../linux archive --format=tar HEAD # origin/main
  Time (mean ± σ):      4.047 s ±  0.009 s    [User: 3.903 s, System: 0.138 s]
  Range (min … max):    4.038 s …  4.066 s    10 runs

How does tgz creation perform?

$ hyperfine -w3 -L command 'gzip -cn','git archive gzip' \
'./git -c tar.tgz.command="{command}" -C ../linux archive --format=tgz HEAD'
Benchmark #1: ./git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD
  Time (mean ± σ):     20.404 s ±  0.006 s    [User: 23.943 s, System: 0.401 s]
  Range (min … max):   20.395 s … 20.414 s    10 runs

Benchmark #2: ./git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD
  Time (mean ± σ):     23.807 s ±  0.023 s    [User: 23.655 s, System: 0.145 s]
  Range (min … max):   23.782 s … 23.857 s    10 runs

Summary
  './git -c tar.tgz.command="gzip -cn" -C ../linux archive --format=tgz HEAD' ran
    1.17 ± 0.00 times faster than './git -c tar.tgz.command="git archive gzip" -C ../linux archive --format=tgz HEAD'

So the internal implementation takes 17% longer on the Linux repo, but
uses 2% less CPU time.  That's because the external gzip can run in
parallel on its own processor, while the internal one works sequentially
and avoids the inter-process communication overhead.

What are the benefits?  Only an internal sequential implementation can
offer this eco mode, and it allows avoiding the gzip(1) requirement.

This implementation uses the helper functions from our zlib.c instead of
the convenient gz* functions from zlib, because the latter don't give us
the control over the generated gzip header that the next patch requires.

Original-patch-by: Rohit Ashiwal <rohit.ashiwal265@gmail.com>
Signed-off-by: René Scharfe <l.s.r@web.de>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
rscharfe authored and gitster committed Jun 15, 2022
1 parent dfce118 commit 76d7602
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 2 deletions.
3 changes: 2 additions & 1 deletion Documentation/git-archive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,8 @@ tar.<format>.command::
to the command (e.g., `-9`).
+
The `tar.gz` and `tgz` formats are defined automatically and use the
command `gzip -cn` by default.
command `gzip -cn` by default. An internal gzip implementation can be
used by specifying the value `git archive gzip`.

tar.<format>.remote::
If true, enable the format for use by remote clients via
Expand Down
45 changes: 44 additions & 1 deletion archive-tar.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,13 @@ static int write_tar_filter_archive(const struct archiver *ar,
#define USTAR_MAX_MTIME 077777777777ULL
#endif

static void write_block(const void *buf)
/*
 * Default block writer for plain tar output: passes exactly BLOCKSIZE
 * bytes starting at buf to write_or_die() on file descriptor 1.
 */
static void tar_write_block(const void *buf)
{
write_or_die(1, buf, BLOCKSIZE);
}

/*
 * Currently active block writer. It starts out as tar_write_block and is
 * switched to tgz_write_block by write_tar_filter_archive() when the
 * internal gzip implementation is selected.
 */
static void (*write_block)(const void *) = tar_write_block;

/* writes out the whole block, but only if it is full */
static void write_if_needed(void)
{
Expand Down Expand Up @@ -430,6 +432,34 @@ static int write_tar_archive(const struct archiver *ar,
return err;
}

/* Deflate stream state shared by tgz_deflate() and tgz_write_block(). */
static git_zstream gzstream;
/* Staging buffer for compressed output; tgz_deflate() writes it to fd 1. */
static unsigned char outbuf[16384];

/*
 * Feed the input currently attached to gzstream through git_deflate().
 *
 * With flush == Z_FINISH the loop keeps going until the deflater reports
 * Z_STREAM_END; with any other flush value it stops as soon as all pending
 * input has been consumed.  Compressed data is staged in outbuf and written
 * to file descriptor 1 whenever the buffer fills up or the stream ends,
 * after which the output window is reset to the start of outbuf.
 *
 * Any status other than Z_OK, Z_BUF_ERROR or Z_STREAM_END is fatal.
 */
static void tgz_deflate(int flush)
{
	for (;;) {
		int status;
		int stream_done;

		/* Nothing left to compress and no final flush requested. */
		if (!gzstream.avail_in && flush != Z_FINISH)
			return;

		status = git_deflate(&gzstream, flush);
		stream_done = (status == Z_STREAM_END);

		/* Drain outbuf when it is full or holds the final bytes. */
		if (stream_done || !gzstream.avail_out) {
			write_or_die(1, outbuf, gzstream.next_out - outbuf);
			gzstream.next_out = outbuf;
			gzstream.avail_out = sizeof(outbuf);
		}

		if (stream_done)
			return;

		/* Checked after draining, so staged output is written first. */
		if (status != Z_OK && status != Z_BUF_ERROR)
			die(_("deflate error (%d)"), status);
	}
}

/*
 * Block writer for the internal gzip mode: attaches one BLOCKSIZE-byte
 * block at data as the deflate stream's input and compresses it without
 * forcing a flush.
 */
static void tgz_write_block(const void *data)
{
	gzstream.avail_in = BLOCKSIZE;
	/* The cast drops const so the pointer can be stored in the stream. */
	gzstream.next_in = (void *)data;
	tgz_deflate(Z_NO_FLUSH);
}

/*
 * Magic filter command value: when a tar filter's command equals this
 * string, write_tar_filter_archive() compresses in-process instead of
 * running an external command.
 */
static const char internal_gzip_command[] = "git archive gzip";

static int write_tar_filter_archive(const struct archiver *ar,
struct archiver_args *args)
{
Expand All @@ -440,6 +470,19 @@ static int write_tar_filter_archive(const struct archiver *ar,
if (!ar->filter_command)
BUG("tar-filter archiver called with no filter defined");

if (!strcmp(ar->filter_command, internal_gzip_command)) {
write_block = tgz_write_block;
git_deflate_init_gzip(&gzstream, args->compression_level);
gzstream.next_out = outbuf;
gzstream.avail_out = sizeof(outbuf);

r = write_tar_archive(ar, args);

tgz_deflate(Z_FINISH);
git_deflate_end(&gzstream);
return r;
}

strbuf_addstr(&cmd, ar->filter_command);
if (args->compression_level >= 0)
strbuf_addf(&cmd, " -%d", args->compression_level);
Expand Down
16 changes: 16 additions & 0 deletions t/t5000-tar-tree.sh
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,22 @@ test_expect_success GZIP 'remote tar.gz can be disabled' '
>remote.tar.gz
'

# Create a tgz archive of HEAD with tar.tgz.command set to the magic value
# that selects the internal gzip implementation.
test_expect_success 'git archive --format=tgz (internal gzip)' '
test_config tar.tgz.command "git archive gzip" &&
git archive --format=tgz HEAD >internal_gzip.tgz
'

# Same for the tar.gz format; the result must be byte-identical to the
# internal_gzip.tgz file written by the previous test.
test_expect_success 'git archive --format=tar.gz (internal gzip)' '
test_config tar.tar.gz.command "git archive gzip" &&
git archive --format=tar.gz HEAD >internal_gzip.tar.gz &&
test_cmp_bin internal_gzip.tgz internal_gzip.tar.gz
'

# Needs the GZIP prerequisite: decompress the internally compressed archive
# with the external gzip and compare it against b.tar.
# NOTE(review): b.tar is created outside the lines shown here, presumably
# the plain tar archive of HEAD; confirm against the earlier tests.
test_expect_success GZIP 'extract tgz file (internal gzip)' '
gzip -d -c <internal_gzip.tgz >internal_gzip.tar &&
test_cmp_bin b.tar internal_gzip.tar
'

test_expect_success 'archive and :(glob)' '
git archive -v HEAD -- ":(glob)**/sh" >/dev/null 2>actual &&
cat >expect <<EOF &&
Expand Down

0 comments on commit 76d7602

Please sign in to comment.