From 78cfdd0cf51f64ffe3cfe3173cbe40b871515681 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 15 Jun 2021 22:41:42 +0000 Subject: [PATCH 1/5] promisor-remote: output trace2 statistics for number of objects fetched Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- promisor-remote.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/promisor-remote.c b/promisor-remote.c index da3f2ca2615e08..d465377d7d32a1 100644 --- a/promisor-remote.c +++ b/promisor-remote.c @@ -12,7 +12,8 @@ void set_repository_format_partial_clone(char *partial_clone) repository_format_partial_clone = xstrdup_or_null(partial_clone); } -static int fetch_objects(const char *remote_name, +static int fetch_objects(struct repository *repo, + const char *remote_name, const struct object_id *oids, int oid_nr) { @@ -30,6 +31,8 @@ static int fetch_objects(const char *remote_name, die(_("promisor-remote: unable to fork off fetch subprocess")); child_in = xfdopen(child.in, "w"); + trace2_data_intmax("promisor", repo, "fetch_count", oid_nr); + for (i = 0; i < oid_nr; i++) { if (fputs(oid_to_hex(&oids[i]), child_in) < 0) die_errno(_("promisor-remote: could not write to fetch subprocess")); @@ -238,7 +241,7 @@ int promisor_remote_get_direct(struct repository *repo, promisor_remote_init(); for (r = promisors; r; r = r->next) { - if (fetch_objects(r->name, remaining_oids, remaining_nr) < 0) { + if (fetch_objects(repo, r->name, remaining_oids, remaining_nr) < 0) { if (remaining_nr == 1) continue; remaining_nr = remove_fetched_oids(repo, &remaining_oids, From c75c42395267f3b484f47404ae746d323586a929 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 22 Jun 2021 08:04:38 +0000 Subject: [PATCH 2/5] t6421: add tests checking for excessive object downloads during merge Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- t/t6421-merge-partial-clone.sh | 440 +++++++++++++++++++++++++++++++++ 1 file changed, 440 insertions(+) create mode 100755 t/t6421-merge-partial-clone.sh diff --git a/t/t6421-merge-partial-clone.sh b/t/t6421-merge-partial-clone.sh new file mode 100755 index 00000000000000..3dcffc15f801e9 --- /dev/null +++ b/t/t6421-merge-partial-clone.sh @@ -0,0 +1,440 @@ +#!/bin/sh + +test_description="limiting blob downloads when merging with partial clones" +# Uses a methodology similar to +# t6042: corner cases with renames but not criss-cross merges +# t6036: corner cases with both renames and criss-cross merges +# t6423: directory rename detection +# +# The setup for all of them, pictorially, is: +# +# A +# o +# / \ +# O o ? +# \ / +# o +# B +# +# To help make it easier to follow the flow of tests, they have been +# divided into sections and each test will start with a quick explanation +# of what commits O, A, and B contain. +# +# Notation: +# z/{b,c} means files z/b and z/c both exist +# x/d_1 means file x/d exists with content d1. (Purpose of the +# underscore notation is to differentiate different +# files that might be renamed into each other's paths.) + +. ./test-lib.sh +. "$TEST_DIRECTORY"/lib-merge.sh + +test_setup_repo () { + test -d server && return + test_create_repo server && + ( + cd server && + + git config uploadpack.allowfilter 1 && + git config uploadpack.allowanysha1inwant 1 && + + mkdir -p general && + test_seq 2 9 >general/leap1 && + cp general/leap1 general/leap2 && + echo leap2 >>general/leap2 && + + mkdir -p basename && + cp general/leap1 basename/numbers && + cp general/leap1 basename/sequence && + cp general/leap1 basename/values && + echo numbers >>basename/numbers && + echo sequence >>basename/sequence && + echo values >>basename/values && + + mkdir -p dir/unchanged && + mkdir -p dir/subdir/tweaked && + echo a >dir/subdir/a && + echo b >dir/subdir/b && + echo c >dir/subdir/c && + echo d >dir/subdir/d && + echo e >dir/subdir/e && + cp general/leap1 dir/subdir/Makefile && + echo toplevel makefile >>dir/subdir/Makefile && + echo f >dir/subdir/tweaked/f && + echo g >dir/subdir/tweaked/g && + echo h >dir/subdir/tweaked/h && + echo subdirectory makefile >dir/subdir/tweaked/Makefile && + for i in $(test_seq 1 88) + do + echo content $i >dir/unchanged/file_$i + done && + git add . && + git commit -m "O" && + + git branch O && + git branch A && + git branch B-single && + git branch B-dir && + git branch B-many && + + git switch A && + + git rm general/leap* && + mkdir general/ && + test_seq 1 9 >general/jump1 && + cp general/jump1 general/jump2 && + echo leap2 >>general/jump2 && + + rm basename/numbers basename/sequence basename/values && + mkdir -p basename/subdir/ + cp general/jump1 basename/subdir/numbers && + cp general/jump1 basename/subdir/sequence && + cp general/jump1 basename/subdir/values && + echo numbers >>basename/subdir/numbers && + echo sequence >>basename/subdir/sequence && + echo values >>basename/subdir/values && + + git rm dir/subdir/tweaked/f && + echo more >>dir/subdir/e && + echo more >>dir/subdir/Makefile && + echo more >>dir/subdir/tweaked/Makefile && + mkdir dir/subdir/newsubdir && + echo rust code >dir/subdir/newsubdir/newfile.rs && + git mv dir/subdir/e dir/subdir/newsubdir/ && + git mv dir folder && + git add . && + git commit -m "A" && + + git switch B-single && + echo new first line >dir/subdir/Makefile && + cat general/leap1 >>dir/subdir/Makefile && + echo toplevel makefile >>dir/subdir/Makefile && + echo perl code >general/newfile.pl && + git add . && + git commit -m "B-single" && + + git switch B-dir && + echo java code >dir/subdir/newfile.java && + echo scala code >dir/subdir/newfile.scala && + echo groovy code >dir/subdir/newfile.groovy && + git add . && + git commit -m "B-dir" && + + git switch B-many && + test_seq 2 10 >general/leap1 && + rm general/leap2 && + cp general/leap1 general/leap2 && + echo leap2 >>general/leap2 && + + rm basename/numbers basename/sequence basename/values && + mkdir -p basename/subdir/ + cp general/leap1 basename/subdir/numbers && + cp general/leap1 basename/subdir/sequence && + cp general/leap1 basename/subdir/values && + echo numbers >>basename/subdir/numbers && + echo sequence >>basename/subdir/sequence && + echo values >>basename/subdir/values && + + mkdir dir/subdir/newsubdir/ && + echo c code >dir/subdir/newfile.c && + echo python code >dir/subdir/newsubdir/newfile.py && + git add . && + git commit -m "B-many" && + + git switch A + ) +} + +# Testcase: Objects downloaded for single relevant rename +# Commit O: +# general/{leap1_O, leap2_O} +# basename/{numbers_O, sequence_O, values_O} +# dir/subdir/{a,b,c,d,e_O,Makefile_TOP_O} +# dir/subdir/tweaked/{f,g,h,Makefile_SUB_O} +# dir/unchanged/ +# Commit A: +# (Rename leap->jump, rename basename/ -> basename/subdir/, rename dir/ +# -> folder/, move e into newsubdir, add newfile.rs, remove f, modify +# both both Makefiles and jumps) +# general/{jump1_A, jump2_A} +# basename/subdir/{numbers_A, sequence_A, values_A} +# folder/subdir/{a,b,c,d,Makefile_TOP_A} +# folder/subdir/newsubdir/{e_A,newfile.rs} +# folder/subdir/tweaked/{g,h,Makefile_SUB_A} +# folder/unchanged/ +# Commit B(-single): +# (add newfile.pl, tweak Makefile_TOP) +# general/{leap1_O, leap2_O,newfile.pl} +# basename/{numbers_O, sequence_O, values_O} +# dir/{a,b,c,d,e_O,Makefile_TOP_B} +# dir/tweaked/{f,g,h,Makefile_SUB_O} +# dir/unchanged/ +# Expected: +# general/{jump1_A, jump2_A,newfile.pl} +# basename/subdir/{numbers_A, sequence_A, values_A} +# folder/subdir/{a,b,c,d,Makefile_TOP_Merged} +# folder/subdir/newsubdir/{e_A,newfile.rs} +# folder/subdir/tweaked/{g,h,Makefile_SUB_A} +# folder/unchanged/ +# +# Objects that need to be fetched: +# Rename detection: +# Side1 (O->A): +# Basename-matches rename detection only needs to fetch these objects: +# Makefile_TOP_O, Makefile_TOP_A +# (Despite many renames, all others are content irrelevant. They +# are also location irrelevant because newfile.rs was added on +# the side doing the directory rename, and newfile.pl was added to +# a directory that was not renamed on either side.) +# General rename detection only needs to fetch these objects: +# +# (Even though newfile.rs, jump[12], basename/subdir/*, and e +# could all be used as destinations in rename detection, the +# basename detection for Makefile matches up all relevant +# sources, so these other files never end up needing to be +# used) +# Side2 (O->B): +# Basename-matches rename detection only needs to fetch these objects: +# +# (there are no deleted files, so no possible sources) +# General rename detection only needs to fetch these objects: +# +# (there are no deleted files, so no possible sources) +# Merge: +# 3-way content merge needs to grab these objects: +# Makefile_TOP_B +# Nothing else needs to fetch objects +# +# Summary: 2 fetches (1 for 2 objects, 1 for 1 object) +# +test_expect_merge_algorithm failure failure 'Objects downloaded for single relevant rename' ' + test_setup_repo && + git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-single && + ( + cd objects-single && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-before && + + git checkout -q origin/A && + + GIT_TRACE2_PERF="$(pwd)/trace.output" git \ + -c merge.directoryRenames=true merge --no-stat \ + --no-progress origin/B-single && + + # Check the number of objects we reported we would fetch + cat >expect <<-EOF && + fetch_count:2 + fetch_count:1 + EOF + grep fetch_count trace.output | cut -d "|" -f 9 | tr -d " ." >actual && + test_cmp expect actual && + + # Check the number of fetch commands exec-ed + grep d0.*fetch.negotiationAlgorithm trace.output >fetches && + test_line_count = 2 fetches && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-after && + comm -2 -3 missing-objects-before missing-objects-after >old && + comm -1 -3 missing-objects-before missing-objects-after >new && + # No new missing objects + test_must_be_empty new && + # Fetched 2 + 1 = 3 objects + test_line_count = 3 old + ) +' + +# Testcase: Objects downloaded for directory rename +# Commit O: +# general/{leap1_O, leap2_O} +# basename/{numbers_O, sequence_O, values_O} +# dir/subdir/{a,b,c,d,e_O,Makefile_TOP_O} +# dir/subdir/tweaked/{f,g,h,Makefile_SUB_O} +# dir/unchanged/ +# Commit A: +# (Rename leap->jump, rename basename/ -> basename/subdir/, rename dir/ -> +# folder/, move e into newsubdir, add newfile.rs, remove f, modify +# both Makefiles and jumps) +# general/{jump1_A, jump2_A} +# basename/subdir/{numbers_A, sequence_A, values_A} +# folder/subdir/{a,b,c,d,Makefile_TOP_A} +# folder/subdir/newsubdir/{e_A,newfile.rs} +# folder/subdir/tweaked/{g,h,Makefile_SUB_A} +# folder/unchanged/ +# Commit B(-dir): +# (add dir/subdir/newfile.{java,scala,groovy} +# general/{leap1_O, leap2_O} +# basename/{numbers_O, sequence_O, values_O} +# dir/subdir/{a,b,c,d,e_O,Makefile_TOP_O, +# newfile.java,newfile.scala,newfile.groovy} +# dir/subdir/tweaked/{f,g,h,Makefile_SUB_O} +# dir/unchanged/ +# Expected: +# general/{jump1_A, jump2_A} +# basename/subdir/{numbers_A, sequence_A, values_A} +# folder/subdir/{a,b,c,d,Makefile_TOP_A, +# newfile.java,newfile.scala,newfile.groovy} +# folder/subdir/newsubdir/{e_A,newfile.rs} +# folder/subdir/tweaked/{g,h,Makefile_SUB_A} +# folder/unchanged/ +# +# Objects that need to be fetched: +# Makefile_TOP_O, Makefile_TOP_A +# Makefile_SUB_O, Makefile_SUB_A +# e_O, e_A +# * Despite A's rename of jump->leap, those renames are irrelevant. +# * Despite A's rename of basename/ -> basename/subdir/, those renames are +# irrelevant. +# * Because of A's rename of dir/ -> folder/ and B-dir's addition of +# newfile.* into dir/subdir/, we need to determine directory renames. +# (Technically, there are enough exact renames to determine directory +# rename detection, but the current implementation always does +# basename searching before directory rename detection. Running it +# also before basename searching would mean doing directory rename +# detection twice, but it's a bit expensive to do that and cases like +# this are not all that common.) +# Summary: 1 fetches for 6 objects +# +test_expect_merge_algorithm failure failure 'Objects downloaded when a directory rename triggered' ' + test_setup_repo && + git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-dir && + ( + cd objects-dir && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-before && + + git checkout -q origin/A && + + GIT_TRACE2_PERF="$(pwd)/trace.output" git \ + -c merge.directoryRenames=true merge --no-stat \ + --no-progress origin/B-dir && + + # Check the number of objects we reported we would fetch + cat >expect <<-EOF && + fetch_count:6 + EOF + grep fetch_count trace.output | cut -d "|" -f 9 | tr -d " ." >actual && + test_cmp expect actual && + + # Check the number of fetch commands exec-ed + grep d0.*fetch.negotiationAlgorithm trace.output >fetches && + test_line_count = 1 fetches && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-after && + comm -2 -3 missing-objects-before missing-objects-after >old && + comm -1 -3 missing-objects-before missing-objects-after >new && + # No new missing objects + test_must_be_empty new && + # Fetched 6 objects + test_line_count = 6 old + ) +' + +# Testcase: Objects downloaded with lots of renames and modifications +# Commit O: +# general/{leap1_O, leap2_O} +# basename/{numbers_O, sequence_O, values_O} +# dir/subdir/{a,b,c,d,e_O,Makefile_TOP_O} +# dir/subdir/tweaked/{f,g,h,Makefile_SUB_O} +# dir/unchanged/ +# Commit A: +# (Rename leap->jump, rename basename/ -> basename/subdir/, rename dir/ +# -> folder/, move e into newsubdir, add newfile.rs, remove f, modify +# both both Makefiles and jumps) +# general/{jump1_A, jump2_A} +# basename/subdir/{numbers_A, sequence_A, values_A} +# folder/subdir/{a,b,c,d,Makefile_TOP_A} +# folder/subdir/newsubdir/{e_A,newfile.rs} +# folder/subdir/tweaked/{g,h,Makefile_SUB_A} +# folder/unchanged/ +# Commit B(-minimal): +# (modify both leaps, rename basename/ -> basename/subdir/, add +# newfile.{c,py}) +# general/{leap1_B, leap2_B} +# basename/subdir/{numbers_B, sequence_B, values_B} +# dir/{a,b,c,d,e_O,Makefile_TOP_O,newfile.c} +# dir/tweaked/{f,g,h,Makefile_SUB_O,newfile.py} +# dir/unchanged/ +# Expected: +# general/{jump1_Merged, jump2_Merged} +# basename/subdir/{numbers_Merged, sequence_Merged, values_Merged} +# folder/subdir/{a,b,c,d,Makefile_TOP_A,newfile.c} +# folder/subdir/newsubdir/e_A +# folder/subdir/tweaked/{g,h,Makefile_SUB_A,newfile.py} +# folder/unchanged/ +# +# Objects that need to be fetched: +# Rename detection: +# Side1 (O->A): +# Basename-matches rename detection only needs to fetch these objects: +# numbers_O, numbers_A +# sequence_O, sequence_A +# values_O, values_A +# Makefile_TOP_O, Makefile_TOP_A +# Makefile_SUB_O, Makefile_SUB_A +# e_O, e_A +# General rename detection only needs to fetch these objects: +# leap1_O, leap2_O +# jump1_A, jump2_A, newfile.rs +# (only need remaining relevant sources, but any relevant sources need +# to be matched against all possible unpaired destinations) +# Side2 (O->B): +# Basename-matches rename detection only needs to fetch these objects: +# numbers_B +# sequence_B +# values_B +# (because numbers_O, sequence_O, and values_O already fetched above) +# General rename detection only needs to fetch these objects: +# +# Merge: +# 3-way content merge needs to grab these objects: +# leap1_B +# leap2_B +# Nothing else needs to fetch objects +# +# Summary: 4 fetches (1 for 6 objects, 1 for 8, 1 for 3, 1 for 2) +# +test_expect_merge_algorithm failure failure 'Objects downloaded with lots of renames and modifications' ' + test_setup_repo && + git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-many && + ( + cd objects-many && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-before && + + git checkout -q origin/A && + + GIT_TRACE2_PERF="$(pwd)/trace.output" git \ + -c merge.directoryRenames=true merge --no-stat \ + --no-progress origin/B-many && + + # Check the number of objects we reported we would fetch + cat >expect <<-EOF && + fetch_count:12 + fetch_count:5 + fetch_count:3 + fetch_count:2 + EOF + grep fetch_count trace.output | cut -d "|" -f 9 | tr -d " ." >actual && + test_cmp expect actual && + + # Check the number of fetch commands exec-ed + grep d0.*fetch.negotiationAlgorithm trace.output >fetches && + test_line_count = 4 fetches && + + git rev-list --objects --all --missing=print | + grep "^?" | sort >missing-objects-after && + comm -2 -3 missing-objects-before missing-objects-after >old && + comm -1 -3 missing-objects-before missing-objects-after >new && + # No new missing objects + test_must_be_empty new && + # Fetched 12 + 5 + 3 + 2 = 22 objects + test_line_count = 22 old + ) +' + +test_done From d331dd3b0c829fe9019f0113a095ed95bc06f227 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 22 Jun 2021 08:04:39 +0000 Subject: [PATCH 3/5] diffcore-rename: allow different missing_object_cb functions estimate_similarity() was setting up a diff_populate_filespec_options every time it was called, requiring the caller of estimate_similarity() to pass in some data needed to set up this option. Currently the needed data consisted of a single variable (skip_unmodified), but we want to also have the different estimate_similarity() callsites start using different missing_object_cb functions as well. Rather than also passing that data in, just have the caller pass in the whole diff_populate_filespec_options, and reduce the number of times we need to set it up. As a side note, this also drops the number of calls to has_promisor_remote() dramatically. If L is the number of basename paths to compare, M is the number of inexact sources, and N is the number of inexact destinations, then the number of calls to has_promisor_remote() drops from L+M*N down to at most 2 -- one for each of the sites that calls estimate_similarity(). has_promisor_remote() is a very fast function so this almost certainly has no measurable performance impact, but it seems cleaner to avoid calling that function so many times. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 58 +++++++++++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 3375e24659ea1a..8affa6130e2da2 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -126,7 +126,7 @@ static int estimate_similarity(struct repository *r, struct diff_filespec *src, struct diff_filespec *dst, int minimum_score, - int skip_unmodified) + struct diff_populate_filespec_options *dpf_opt) { /* src points at a file that existed in the original tree (or * optionally a file in the destination tree) and dst points @@ -143,15 +143,6 @@ static int estimate_similarity(struct repository *r, */ unsigned long max_size, delta_size, base_size, src_copied, literal_added; int score; - struct diff_populate_filespec_options dpf_options = { - .check_size_only = 1 - }; - struct prefetch_options prefetch_options = {r, skip_unmodified}; - - if (r == the_repository && has_promisor_remote()) { - dpf_options.missing_object_cb = prefetch; - dpf_options.missing_object_data = &prefetch_options; - } /* We deal only with regular files. Symlink renames are handled * only when they are exact matches --- in other words, no edits @@ -169,11 +160,13 @@ static int estimate_similarity(struct repository *r, * is a possible size - we really should have a flag to * say whether the size is valid or not!) */ + dpf_opt->check_size_only = 1; + if (!src->cnt_data && - diff_populate_filespec(r, src, &dpf_options)) + diff_populate_filespec(r, src, dpf_opt)) return 0; if (!dst->cnt_data && - diff_populate_filespec(r, dst, &dpf_options)) + diff_populate_filespec(r, dst, dpf_opt)) return 0; max_size = ((src->size > dst->size) ? src->size : dst->size); @@ -191,11 +184,11 @@ static int estimate_similarity(struct repository *r, if (max_size * (MAX_SCORE-minimum_score) < delta_size * MAX_SCORE) return 0; - dpf_options.check_size_only = 0; + dpf_opt->check_size_only = 0; - if (!src->cnt_data && diff_populate_filespec(r, src, &dpf_options)) + if (!src->cnt_data && diff_populate_filespec(r, src, dpf_opt)) return 0; - if (!dst->cnt_data && diff_populate_filespec(r, dst, &dpf_options)) + if (!dst->cnt_data && diff_populate_filespec(r, dst, dpf_opt)) return 0; if (diffcore_count_changes(r, src, dst, @@ -862,7 +855,11 @@ static int find_basename_matches(struct diff_options *options, int i, renames = 0; struct strintmap sources; struct strintmap dests; - + struct diff_populate_filespec_options dpf_options = { + .check_binary = 0, + .missing_object_cb = NULL, + .missing_object_data = NULL + }; /* * The prefeteching stuff wants to know if it can skip prefetching * blobs that are unmodified...and will then do a little extra work @@ -873,7 +870,10 @@ static int find_basename_matches(struct diff_options *options, * the extra work necessary to check if rename_src entries are * unmodified would be a small waste. */ - int skip_unmodified = 0; + struct prefetch_options prefetch_options = { + .repo = options->repo, + .skip_unmodified = 0 + }; /* * Create maps of basename -> fullname(s) for remaining sources and @@ -910,6 +910,11 @@ static int find_basename_matches(struct diff_options *options, strintmap_set(&dests, base, i); } + if (options->repo == the_repository && has_promisor_remote()) { + dpf_options.missing_object_cb = prefetch; + dpf_options.missing_object_data = &prefetch_options; + } + /* Now look for basename matchups and do similarity estimation */ for (i = 0; i < rename_src_nr; ++i) { char *filename = rename_src[i].p->one->path; @@ -953,7 +958,7 @@ static int find_basename_matches(struct diff_options *options, one = rename_src[src_index].p->one; two = rename_dst[dst_index].p->two; score = estimate_similarity(options->repo, one, two, - minimum_score, skip_unmodified); + minimum_score, &dpf_options); /* If sufficiently similar, record as rename pair */ if (score < minimum_score) @@ -1272,6 +1277,14 @@ void diffcore_rename_extended(struct diff_options *options, int num_sources, want_copies; struct progress *progress = NULL; struct dir_rename_info info; + struct diff_populate_filespec_options dpf_options = { + .check_binary = 0, + .missing_object_cb = NULL, + .missing_object_data = NULL + }; + struct prefetch_options prefetch_options = { + .repo = options->repo + }; trace2_region_enter("diff", "setup", options->repo); info.setup = 0; @@ -1433,6 +1446,13 @@ void diffcore_rename_extended(struct diff_options *options, (uint64_t)num_destinations * (uint64_t)num_sources); } + /* Finish setting up dpf_options */ + prefetch_options.skip_unmodified = skip_unmodified; + if (options->repo == the_repository && has_promisor_remote()) { + dpf_options.missing_object_cb = prefetch; + dpf_options.missing_object_data = &prefetch_options; + } + CALLOC_ARRAY(mx, st_mult(NUM_CANDIDATE_PER_DST, num_destinations)); for (dst_cnt = i = 0; i < rename_dst_nr; i++) { struct diff_filespec *two = rename_dst[i].p->two; @@ -1458,7 +1478,7 @@ void diffcore_rename_extended(struct diff_options *options, this_src.score = estimate_similarity(options->repo, one, two, minimum_score, - skip_unmodified); + &dpf_options); this_src.name_score = basename_same(one, two); this_src.dst = i; this_src.src = j; From 1aedd03afb1592be9e4fbacdc614f45a99a865ea Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 22 Jun 2021 08:04:40 +0000 Subject: [PATCH 4/5] diffcore-rename: use a different prefetch for basename comparisons merge-ort was designed to minimize the amount of data needed and used, and several changes were made to diffcore-rename to take advantage of extra metadata to enable this data minimization (particularly the relevant_sources variable for skipping "irrelevant" renames). This effort obviously succeeded in drastically reducing computation times, but should also theoretically allow partial clones to download much less information. Previously, though, the "prefetch" command used in diffcore-rename had never been modified and downloaded many blobs that were unnecessary for merge-ort. This commit corrects that. When doing basename comparisons, we want to fetch only the objects that will be used for basename comparisons. If after basename fetching this leaves us with no more relevant sources (or no more destinations), then we won't need to do the full inexact rename detection and can skip downloading additional source and destination files. Even if we have to do that later full inexact rename detection, irrelevant sources are culled after basename matching and before the full inexact rename detection, so we can still avoid downloading the blobs for irrelevant sources. Rename prefetch() to inexact_prefetch(), and introduce a new basename_prefetch() to take advantage of this. If we modify the testcase from commit 557ac0350d ("merge-ort: begin performance work; instrument with trace2_region_* calls", 2021-01-23) to pass --sparse --filter=blob:none to the clone command, and use the new trace2 "fetch_count" output from a few commits ago to track both the number of fetch subcommands invoked and the number of objects fetched across all those fetches, then for the mega-renames testcase we observe the following: BEFORE this commit, rebasing 35 patches: strategy # of fetches total # of objects fetched --------- ------------ -------------------------- recursive 62 11423 ort 30 11391 AFTER this commit, rebasing the same 35 patches: ort 32 63 This means that the new code only needs to download less than 2 blobs per patch being rebased. That is especially interesting given that the repository at the start only had approximately half a dozen TOTAL blobs downloaded to start with (because the default sparse-checkout of just the toplevel directory was in use). So, for this particular linux kernel testcase that involved ~26,000 renames on the upstream side (drivers/ -> pilots/) across which 35 patches were being rebased, this change reduces the number of blobs that need to be downloaded by a factor of ~180. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- diffcore-rename.c | 101 +++++++++++++++++++++++++++------ t/t6421-merge-partial-clone.sh | 4 +- 2 files changed, 85 insertions(+), 20 deletions(-) diff --git a/diffcore-rename.c b/diffcore-rename.c index 8affa6130e2da2..5ab513e78c763b 100644 --- a/diffcore-rename.c +++ b/diffcore-rename.c @@ -87,13 +87,13 @@ struct diff_score { short name_score; }; -struct prefetch_options { +struct inexact_prefetch_options { struct repository *repo; int skip_unmodified; }; -static void prefetch(void *prefetch_options) +static void inexact_prefetch(void *prefetch_options) { - struct prefetch_options *options = prefetch_options; + struct inexact_prefetch_options *options = prefetch_options; int i; struct oid_array to_fetch = OID_ARRAY_INIT; @@ -816,6 +816,78 @@ static int idx_possible_rename(char *filename, struct dir_rename_info *info) return idx; } +struct basename_prefetch_options { + struct repository *repo; + struct strintmap *relevant_sources; + struct strintmap *sources; + struct strintmap *dests; + struct dir_rename_info *info; +}; +static void basename_prefetch(void *prefetch_options) +{ + struct basename_prefetch_options *options = prefetch_options; + struct strintmap *relevant_sources = options->relevant_sources; + struct strintmap *sources = options->sources; + struct strintmap *dests = options->dests; + struct dir_rename_info *info = options->info; + int i; + struct oid_array to_fetch = OID_ARRAY_INIT; + + /* + * TODO: The following loops mirror the code/logic from + * find_basename_matches(), though not quite exactly. Maybe + * abstract the iteration logic out somehow? + */ + for (i = 0; i < rename_src_nr; ++i) { + char *filename = rename_src[i].p->one->path; + const char *base = NULL; + intptr_t src_index; + intptr_t dst_index; + + /* Skip irrelevant sources */ + if (relevant_sources && + !strintmap_contains(relevant_sources, filename)) + continue; + + /* + * If the basename is unique among remaining sources, then + * src_index will equal 'i' and we can attempt to match it + * to a unique basename in the destinations. Otherwise, + * use directory rename heuristics, if possible. + */ + base = get_basename(filename); + src_index = strintmap_get(sources, base); + assert(src_index == -1 || src_index == i); + + if (strintmap_contains(dests, base)) { + struct diff_filespec *one, *two; + + /* Find a matching destination, if possible */ + dst_index = strintmap_get(dests, base); + if (src_index == -1 || dst_index == -1) { + src_index = i; + dst_index = idx_possible_rename(filename, info); + } + if (dst_index == -1) + continue; + + /* Ignore this dest if already used in a rename */ + if (rename_dst[dst_index].is_rename) + continue; /* already used previously */ + + one = rename_src[src_index].p->one; + two = rename_dst[dst_index].p->two; + + /* Add the pairs */ + diff_add_if_missing(options->repo, &to_fetch, two); + diff_add_if_missing(options->repo, &to_fetch, one); + } + } + + promisor_remote_get_direct(options->repo, to_fetch.oid, to_fetch.nr); + oid_array_clear(&to_fetch); +} + static int find_basename_matches(struct diff_options *options, int minimum_score, struct dir_rename_info *info, @@ -860,19 +932,12 @@ static int find_basename_matches(struct diff_options *options, .missing_object_cb = NULL, .missing_object_data = NULL }; - /* - * The prefeteching stuff wants to know if it can skip prefetching - * blobs that are unmodified...and will then do a little extra work - * to verify that the oids are indeed different before prefetching. - * Unmodified blobs are only relevant when doing copy detection; - * when limiting to rename detection, diffcore_rename[_extended]() - * will never be called with unmodified source paths fed to us, so - * the extra work necessary to check if rename_src entries are - * unmodified would be a small waste. - */ - struct prefetch_options prefetch_options = { + struct basename_prefetch_options prefetch_options = { .repo = options->repo, - .skip_unmodified = 0 + .relevant_sources = relevant_sources, + .sources = &sources, + .dests = &dests, + .info = info }; /* @@ -911,7 +976,7 @@ static int find_basename_matches(struct diff_options *options, } if (options->repo == the_repository && has_promisor_remote()) { - dpf_options.missing_object_cb = prefetch; + dpf_options.missing_object_cb = basename_prefetch; dpf_options.missing_object_data = &prefetch_options; } @@ -1282,7 +1347,7 @@ void diffcore_rename_extended(struct diff_options *options, .missing_object_cb = NULL, .missing_object_data = NULL }; - struct prefetch_options prefetch_options = { + struct inexact_prefetch_options prefetch_options = { .repo = options->repo }; @@ -1449,7 +1514,7 @@ void diffcore_rename_extended(struct diff_options *options, /* Finish setting up dpf_options */ prefetch_options.skip_unmodified = skip_unmodified; if (options->repo == the_repository && has_promisor_remote()) { - dpf_options.missing_object_cb = prefetch; + dpf_options.missing_object_cb = inexact_prefetch; dpf_options.missing_object_data = &prefetch_options; } diff --git a/t/t6421-merge-partial-clone.sh b/t/t6421-merge-partial-clone.sh index 3dcffc15f801e9..aad975d24ddb0b 100755 --- a/t/t6421-merge-partial-clone.sh +++ b/t/t6421-merge-partial-clone.sh @@ -207,7 +207,7 @@ test_setup_repo () { # # Summary: 2 fetches (1 for 2 objects, 1 for 1 object) # -test_expect_merge_algorithm failure failure 'Objects downloaded for single relevant rename' ' +test_expect_merge_algorithm failure success 'Objects downloaded for single relevant rename' ' test_setup_repo && git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-single && ( @@ -296,7 +296,7 @@ test_expect_merge_algorithm failure failure 'Objects downloaded for single relev # this are not all that common.) # Summary: 1 fetches for 6 objects # -test_expect_merge_algorithm failure failure 'Objects downloaded when a directory rename triggered' ' +test_expect_merge_algorithm failure success 'Objects downloaded when a directory rename triggered' ' test_setup_repo && git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-dir && ( From 2bff554b23e8d5c17f4e6827358bb51811fc7137 Mon Sep 17 00:00:00 2001 From: Elijah Newren Date: Tue, 22 Jun 2021 08:04:41 +0000 Subject: [PATCH 5/5] merge-ort: add prefetching for content merges Commit 7fbbcb21b1 ("diff: batch fetching of missing blobs", 2019-04-05) introduced batching of fetching missing blobs, so that the diff machinery would have one fetch subprocess grab N blobs instead of N processes each grabbing 1. However, the diff machinery is not the only thing in a merge that needs to work on blobs. The 3-way content merges need them as well. Rather than download all the blobs 1 at a time, prefetch all the blobs needed for regular content merges. This does not cover all possible paths in merge-ort that might need to download blobs. Others include: - The blob_unchanged() calls to avoid modify/delete conflicts (when blob renormalization results in an "unchanged" file) - Preliminary content merges needed for rename/add and rename/rename(2to1) style conflicts. (Both of these types of conflicts can result in nested conflict markers from the need to do two levels of content merging; the first happens before our new prefetch_for_content_merges() function.) The first of these wouldn't be an extreme amount of work to support, and even the second could be theoretically supported in batching, but all of these cases seem unusual to me, and this is a minor performance optimization anyway; in the worst case we only get some of the fetches batched and have a few additional one-off fetches. So for now, just handle the regular 3-way content merges in our prefetching. For the testcase from the previous commit, the number of downloaded objects remains at 63, but this drops the number of fetches needed from 32 down to 20, a sizeable reduction. Signed-off-by: Elijah Newren Signed-off-by: Junio C Hamano --- merge-ort.c | 50 ++++++++++++++++++++++++++++++++++ t/t6421-merge-partial-clone.sh | 2 +- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/merge-ort.c b/merge-ort.c index b954f7184a5f56..ccf85b738c36ee 100644 --- a/merge-ort.c +++ b/merge-ort.c @@ -29,6 +29,7 @@ #include "entry.h" #include "ll-merge.h" #include "object-store.h" +#include "promisor-remote.h" #include "revision.h" #include "strmap.h" #include "submodule.h" @@ -3460,6 +3461,54 @@ static void process_entry(struct merge_options *opt, record_entry_for_tree(dir_metadata, path, &ci->merged); } +static void prefetch_for_content_merges(struct merge_options *opt, + struct string_list *plist) +{ + struct string_list_item *e; + struct oid_array to_fetch = OID_ARRAY_INIT; + + if (opt->repo != the_repository || !has_promisor_remote()) + return; + + for (e = &plist->items[plist->nr-1]; e >= plist->items; --e) { + /* char *path = e->string; */ + struct conflict_info *ci = e->util; + int i; + + /* Ignore clean entries */ + if (ci->merged.clean) + continue; + + /* Ignore entries that don't need a content merge */ + if (ci->match_mask || ci->filemask < 6 || + !S_ISREG(ci->stages[1].mode) || + !S_ISREG(ci->stages[2].mode) || + oideq(&ci->stages[1].oid, &ci->stages[2].oid)) + continue; + + /* Also don't need content merge if base matches either side */ + if (ci->filemask == 7 && + S_ISREG(ci->stages[0].mode) && + (oideq(&ci->stages[0].oid, &ci->stages[1].oid) || + oideq(&ci->stages[0].oid, &ci->stages[2].oid))) + continue; + + for (i = 0; i < 3; i++) { + unsigned side_mask = (1 << i); + struct version_info *vi = &ci->stages[i]; + + if ((ci->filemask & side_mask) && + S_ISREG(vi->mode) && + oid_object_info_extended(opt->repo, &vi->oid, NULL, + OBJECT_INFO_FOR_PREFETCH)) + oid_array_append(&to_fetch, &vi->oid); + } + } + + promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr); + oid_array_clear(&to_fetch); +} + static void process_entries(struct merge_options *opt, struct object_id *result_oid) { @@ -3506,6 +3555,7 @@ static void process_entries(struct merge_options *opt, * the way when it is time to process the file at the same path). */ trace2_region_enter("merge", "processing", opt->repo); + prefetch_for_content_merges(opt, &plist); for (entry = &plist.items[plist.nr-1]; entry >= plist.items; --entry) { char *path = entry->string; /* diff --git a/t/t6421-merge-partial-clone.sh b/t/t6421-merge-partial-clone.sh index aad975d24ddb0b..36bcd7c3280152 100755 --- a/t/t6421-merge-partial-clone.sh +++ b/t/t6421-merge-partial-clone.sh @@ -397,7 +397,7 @@ test_expect_merge_algorithm failure success 'Objects downloaded when a directory # # Summary: 4 fetches (1 for 6 objects, 1 for 8, 1 for 3, 1 for 2) # -test_expect_merge_algorithm failure failure 'Objects downloaded with lots of renames and modifications' ' +test_expect_merge_algorithm failure success 'Objects downloaded with lots of renames and modifications' ' test_setup_repo && git clone --sparse --filter=blob:none "file://$(pwd)/server" objects-many && (