Skip to content

Commit

Permalink
builtin/repack.c: add '--geometric' option
Browse files Browse the repository at this point in the history
Often it is useful to both:

  - have relatively few packfiles in a repository, and

  - avoid having so few packfiles in a repository that we repack its
    entire contents regularly

This patch implements a '--geometric=<n>' option in 'git repack'. This
allows the caller to specify that they would like each pack to be at
least a factor times as large as the previous largest pack (by object
count).

Concretely, say that a repository has 'n' packfiles, labeled P1, P2,
..., up to Pn. Each packfile has an object count equal to 'objects(Pn)'.
With a geometric factor of 'r', it should be that:

  objects(Pi) > r*objects(P(i-1))

for all i in [1, n], where the packs are sorted by

  objects(P1) <= objects(P2) <= ... <= objects(Pn).

Since finding a true optimal repacking is NP-hard, we approximate it
along two directions:

  1. We assume that there is a cutoff of packs _before starting the
     repack_ where everything to the right of that cut-off already forms
     a geometric progression (or no cutoff exists and everything must be
     repacked).

  2. We assume that everything smaller than the cutoff count must be
     repacked. This forms our base assumption, but it can also cause
     even the "heavy" packs to get repacked, for e.g., if we have 6
     packs containing the following number of objects:

       1, 1, 1, 2, 4, 32

     then we would place the cutoff between '1, 1' and '1, 2, 4, 32',
     rolling up the first two packs into a pack with 2 objects. That
     breaks our progression and leaves us:

       2, 1, 2, 4, 32
         ^

     (where the '^' indicates the position of our split). To restore a
     progression, we move the split forward (towards larger packs)
     joining each pack into our new pack until a geometric progression
     is restored. Here, that looks like:

       2, 1, 2, 4, 32  ~>  3, 2, 4, 32  ~>  5, 4, 32  ~> ... ~> 9, 32
         ^                   ^                ^                   ^

This has the advantage of not repacking the heavy-side of packs too
often while also only creating one new pack at a time. Another wrinkle
is that we assume that loose, indexed, and reflog'd objects are
insignificant, and lump them into any new pack that we create. This can
lead to non-idempotent results.

Suggested-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Reviewed-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information
ttaylorr authored and gitster committed Feb 23, 2021
1 parent 20b031f commit 0fabafd
Show file tree
Hide file tree
Showing 3 changed files with 343 additions and 4 deletions.
23 changes: 23 additions & 0 deletions Documentation/git-repack.txt
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,29 @@ depth is 4095.
Pass the `--delta-islands` option to `git-pack-objects`, see
linkgit:git-pack-objects[1].

-g=<factor>::
--geometric=<factor>::
Arrange resulting pack structure so that each successive pack
contains at least `<factor>` times the number of objects as the
next-largest pack.
+
`git repack` ensures this by determining a "cut" of packfiles that need
to be repacked into one in order to ensure a geometric progression. It
picks the smallest set of packfiles such that as many of the larger
packfiles (by count of objects contained in that pack) may be left
intact.
+
Unlike other repack modes, the set of objects to pack is determined
uniquely by the set of packs being "rolled-up"; in other words, the
packs determined to need to be combined in order to restore a geometric
progression.
+
When `--unpacked` is specified, loose objects are implicitly included in
this "roll-up", without respect to their reachability. This is subject
to change in the future. This option (implying a drastically different
repack mode) is not guaranteed to work with all other combinations of
option to `git repack`).

Configuration
-------------

Expand Down
187 changes: 183 additions & 4 deletions builtin/repack.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,132 @@ static void repack_promisor_objects(const struct pack_objects_args *args,
#define ALL_INTO_ONE 1
#define LOOSEN_UNREACHABLE 2

struct pack_geometry {
struct packed_git **pack;
uint32_t pack_nr, pack_alloc;
uint32_t split;
};

static uint32_t geometry_pack_weight(struct packed_git *p)
{
if (open_pack_index(p))
die(_("cannot open index for %s"), p->pack_name);
return p->num_objects;
}

static int geometry_cmp(const void *va, const void *vb)
{
uint32_t aw = geometry_pack_weight(*(struct packed_git **)va),
bw = geometry_pack_weight(*(struct packed_git **)vb);

if (aw < bw)
return -1;
if (aw > bw)
return 1;
return 0;
}

static void init_pack_geometry(struct pack_geometry **geometry_p)
{
struct packed_git *p;
struct pack_geometry *geometry;

*geometry_p = xcalloc(1, sizeof(struct pack_geometry));
geometry = *geometry_p;

for (p = get_all_packs(the_repository); p; p = p->next) {
if (!pack_kept_objects && p->pack_keep)
continue;

ALLOC_GROW(geometry->pack,
geometry->pack_nr + 1,
geometry->pack_alloc);

geometry->pack[geometry->pack_nr] = p;
geometry->pack_nr++;
}

QSORT(geometry->pack, geometry->pack_nr, geometry_cmp);
}

static void split_pack_geometry(struct pack_geometry *geometry, int factor)
{
uint32_t i;
uint32_t split;
off_t total_size = 0;

if (geometry->pack_nr <= 1) {
geometry->split = geometry->pack_nr;
return;
}

split = geometry->pack_nr - 1;

/*
* First, count the number of packs (in descending order of size) which
* already form a geometric progression.
*/
for (i = geometry->pack_nr - 1; i > 0; i--) {
struct packed_git *ours = geometry->pack[i];
struct packed_git *prev = geometry->pack[i - 1];
if (geometry_pack_weight(ours) >= factor * geometry_pack_weight(prev))
split--;
else
break;
}

if (split) {
/*
* Move the split one to the right, since the top element in the
* last-compared pair can't be in the progression. Only do this
* when we split in the middle of the array (otherwise if we got
* to the end, then the split is in the right place).
*/
split++;
}

/*
* Then, anything to the left of 'split' must be in a new pack. But,
* creating that new pack may cause packs in the heavy half to no longer
* form a geometric progression.
*
* Compute an expected size of the new pack, and then determine how many
* packs in the heavy half need to be joined into it (if any) to restore
* the geometric progression.
*/
for (i = 0; i < split; i++)
total_size += geometry_pack_weight(geometry->pack[i]);
for (i = split; i < geometry->pack_nr; i++) {
struct packed_git *ours = geometry->pack[i];
if (geometry_pack_weight(ours) < factor * total_size) {
split++;
total_size += geometry_pack_weight(ours);
} else
break;
}

geometry->split = split;
}

static void clear_pack_geometry(struct pack_geometry *geometry)
{
if (!geometry)
return;

free(geometry->pack);
geometry->pack_nr = 0;
geometry->pack_alloc = 0;
geometry->split = 0;
}

int cmd_repack(int argc, const char **argv, const char *prefix)
{
struct child_process cmd = CHILD_PROCESS_INIT;
struct string_list_item *item;
struct string_list names = STRING_LIST_INIT_DUP;
struct string_list rollback = STRING_LIST_INIT_NODUP;
struct string_list existing_packs = STRING_LIST_INIT_DUP;
struct pack_geometry *geometry = NULL;
struct strbuf line = STRBUF_INIT;
int i, ext, ret;
FILE *out;
Expand All @@ -316,6 +435,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
struct string_list keep_pack_list = STRING_LIST_INIT_NODUP;
int no_update_server_info = 0;
struct pack_objects_args po_args = {NULL};
int geometric_factor = 0;

struct option builtin_repack_options[] = {
OPT_BIT('a', NULL, &pack_everything,
Expand Down Expand Up @@ -356,6 +476,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
N_("repack objects in packs marked with .keep")),
OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"),
N_("do not repack this pack")),
OPT_INTEGER('g', "geometric", &geometric_factor,
N_("find a geometric progression with factor <N>")),
OPT_END()
};

Expand All @@ -382,6 +504,13 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
if (write_bitmaps && !(pack_everything & ALL_INTO_ONE))
die(_(incremental_bitmap_conflict_error));

if (geometric_factor) {
if (pack_everything)
die(_("--geometric is incompatible with -A, -a"));
init_pack_geometry(&geometry);
split_pack_geometry(geometry, geometric_factor);
}

packdir = mkpathdup("%s/pack", get_object_directory());
packtmp = mkpathdup("%s/.tmp-%d-pack", packdir, (int)getpid());

Expand All @@ -396,9 +525,19 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
strvec_pushf(&cmd.args, "--keep-pack=%s",
keep_pack_list.items[i].string);
strvec_push(&cmd.args, "--non-empty");
strvec_push(&cmd.args, "--all");
strvec_push(&cmd.args, "--reflog");
strvec_push(&cmd.args, "--indexed-objects");
if (!geometry) {
/*
* 'git pack-objects' will up all objects loose or packed
* (either rolling them up or leaving them alone), so don't pass
* these options.
*
* The implementation of 'git pack-objects --stdin-packs'
* makes them redundant (and the two are incompatible).
*/
strvec_push(&cmd.args, "--all");
strvec_push(&cmd.args, "--reflog");
strvec_push(&cmd.args, "--indexed-objects");
}
if (has_promisor_remote())
strvec_push(&cmd.args, "--exclude-promisor-objects");
if (write_bitmaps > 0)
Expand Down Expand Up @@ -429,17 +568,37 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
strvec_push(&cmd.env_array, "GIT_REF_PARANOIA=1");
}
}
} else if (geometry) {
strvec_push(&cmd.args, "--stdin-packs");
strvec_push(&cmd.args, "--unpacked");
} else {
strvec_push(&cmd.args, "--unpacked");
strvec_push(&cmd.args, "--incremental");
}

cmd.no_stdin = 1;
if (geometry)
cmd.in = -1;
else
cmd.no_stdin = 1;

ret = start_command(&cmd);
if (ret)
return ret;

if (geometry) {
FILE *in = xfdopen(cmd.in, "w");
/*
* The resulting pack should contain all objects in packs that
* are going to be rolled up, but exclude objects in packs which
* are being left alone.
*/
for (i = 0; i < geometry->split; i++)
fprintf(in, "%s\n", pack_basename(geometry->pack[i]));
for (i = geometry->split; i < geometry->pack_nr; i++)
fprintf(in, "^%s\n", pack_basename(geometry->pack[i]));
fclose(in);
}

out = xfdopen(cmd.out, "r");
while (strbuf_getline_lf(&line, out) != EOF) {
if (line.len != the_hash_algo->hexsz)
Expand Down Expand Up @@ -507,6 +666,25 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
if (!string_list_has_string(&names, sha1))
remove_redundant_pack(packdir, item->string);
}

if (geometry) {
struct strbuf buf = STRBUF_INIT;

uint32_t i;
for (i = 0; i < geometry->split; i++) {
struct packed_git *p = geometry->pack[i];
if (string_list_has_string(&names,
hash_to_hex(p->hash)))
continue;

strbuf_reset(&buf);
strbuf_addstr(&buf, pack_basename(p));
strbuf_strip_suffix(&buf, ".pack");

remove_redundant_pack(packdir, buf.buf);
}
strbuf_release(&buf);
}
if (!po_args.quiet && isatty(2))
opts |= PRUNE_PACKED_VERBOSE;
prune_packed_objects(opts);
Expand All @@ -528,6 +706,7 @@ int cmd_repack(int argc, const char **argv, const char *prefix)
string_list_clear(&names, 0);
string_list_clear(&rollback, 0);
string_list_clear(&existing_packs, 0);
clear_pack_geometry(geometry);
strbuf_release(&line);

return 0;
Expand Down

0 comments on commit 0fabafd

Please sign in to comment.