diff --git a/Documentation/config/diff.adoc b/Documentation/config/diff.adoc index 1135a62a0ad3de..475736c6edacd5 100644 --- a/Documentation/config/diff.adoc +++ b/Documentation/config/diff.adoc @@ -218,6 +218,16 @@ endif::git-diff[] Set this option to `true` to make the diff driver cache the text conversion outputs. See linkgit:gitattributes[5] for details. +`diff.<driver>.process`:: + The command to run as a long-running diff process. + The tool communicates via the pkt-line protocol and returns + hunks that are fed into Git's diff and blame pipelines. + If the tool returns zero hunks, the file is treated as + unchanged for both diff output and blame attribution. + Git provides `git diff-process-normalize` as a built-in + tool that detects whitespace-only changes. + See linkgit:gitattributes[5] for details. + `diff.indentHeuristic`:: Set this option to `false` to disable the default heuristics that shift diff hunk boundaries to make patches easier to read. diff --git a/Documentation/gitattributes.adoc b/Documentation/gitattributes.adoc index f20041a323d174..3f1d7affd8fbe1 100644 --- a/Documentation/gitattributes.adoc +++ b/Documentation/gitattributes.adoc @@ -821,6 +821,64 @@ NOTE: If `diff.<name>.command` is defined for path with the (see above), and adding `diff.<name>.algorithm` has no effect, as the algorithm is not passed to the external diff driver. +Using an external diff process +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +An external tool can provide content-aware line matching by +setting `diff.<name>.process` to the command that runs +the tool. The tool is a long-running process that communicates via +the pkt-line protocol (see +linkgit:gitprotocol-long-running-process[5]). 
+ +------------------------ +*.c diff=cdiff +------------------------ + +---------------------------------------------------------------- +[diff "cdiff"] + process = /path/to/diff-process-tool +---------------------------------------------------------------- + +The tool receives file pairs and returns hunk descriptors indicating +which lines changed. Git feeds these hunks into its standard diff +pipeline, so all output features (word diff, function context, +color) work normally. + +If the tool fails or returns an error, Git silently falls back to +the builtin diff algorithm. If the tool returns invalid hunks +(out of bounds, overlapping), Git also falls back silently. + +The handshake negotiates `version=1` and `capability=hunks`. +Per-file requests send `command=hunks` and `pathname=<path>`, +followed by the old and new file content as packetized data. +The tool responds with lines of the form +`hunk <old_start> <old_count> <new_start> <new_count>` +(1-based line numbers), a flush packet, and `status=success`. + +If the tool returns zero hunks with `status=success`, Git treats +the file as having no changes and produces no diff output. +`git blame` also consults the diff process and skips commits +where it reports zero hunks, attributing lines to earlier commits +instead. + +Git ships with a built-in diff process, `git diff-process-normalize`, +that detects whitespace-only changes. Files whose only differences +are whitespace produce zero hunks; files with non-whitespace changes +fall back to the builtin diff algorithm. To use it: + +---------------------------------------------------------------- +[diff "cdiff"] + process = git diff-process-normalize +---------------------------------------------------------------- + +This is useful after running a code formatter: `git diff` shows +no output for files that only had whitespace changes, and +`git blame` skips whitespace-only commits automatically without +requiring a `.git-blame-ignore-revs` file. 
+ +Tools should ignore unknown keys in the per-file request to +remain forward-compatible. + Defining a custom hunk-header ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/Makefile b/Makefile index cedc234173e377..01acfaf7b80764 100644 --- a/Makefile +++ b/Makefile @@ -1142,6 +1142,7 @@ LIB_OBJS += diff-delta.o LIB_OBJS += diff-merges.o LIB_OBJS += diff-lib.o LIB_OBJS += diff-no-index.o +LIB_OBJS += diff-process.o LIB_OBJS += diff.o LIB_OBJS += diffcore-break.o LIB_OBJS += diffcore-delta.o @@ -1408,6 +1409,7 @@ BUILTIN_OBJS += builtin/diagnose.o BUILTIN_OBJS += builtin/diff-files.o BUILTIN_OBJS += builtin/diff-index.o BUILTIN_OBJS += builtin/diff-pairs.o +BUILTIN_OBJS += builtin/diff-process-normalize.o BUILTIN_OBJS += builtin/diff-tree.o BUILTIN_OBJS += builtin/diff.o BUILTIN_OBJS += builtin/difftool.o diff --git a/blame.c b/blame.c index a3c49d132e4ae1..8a5f14db7a0f87 100644 --- a/blame.c +++ b/blame.c @@ -19,6 +19,8 @@ #include "tag.h" #include "trace2.h" #include "blame.h" +#include "diff-process.h" +#include "userdiff.h" #include "alloc.h" #include "commit-slab.h" #include "bloom.h" @@ -315,16 +317,47 @@ static struct commit *fake_working_tree_commit(struct repository *r, static int diff_hunks(mmfile_t *file_a, mmfile_t *file_b, - xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, int xdl_opts) + xdl_emit_hunk_consume_func_t hunk_func, void *cb_data, + int xdl_opts, struct index_state *istate, + const char *path) { xpparam_t xpp = {0}; xdemitconf_t xecfg = {0}; xdemitcb_t ecb = {NULL}; + struct xdl_hunk *ext_hunks = NULL; + int ret; xpp.flags = xdl_opts; xecfg.hunk_func = hunk_func; ecb.priv = cb_data; - return xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + + if (path && istate) { + struct userdiff_driver *drv; + drv = userdiff_find_by_path(istate, path); + if (drv && drv->process) { + size_t nr = 0; + if (!diff_process_get_hunks(drv, path, + file_a->ptr, file_a->size, + file_b->ptr, file_b->size, + &ext_hunks, &nr)) { + if (!nr) { + /* + * Zero hunks: the diff 
process + * considers these files equivalent. + * Skip so blame looks past this + * commit. + */ + return 0; + } + xpp.external_hunks = ext_hunks; + xpp.external_hunks_nr = nr; + } + } + } + + ret = xdi_diff(file_a, file_b, &xpp, &xecfg, &ecb); + free(ext_hunks); + return ret; } static const char *get_next_line(const char *start, const char *end) @@ -1961,7 +1994,8 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb, &sb->num_read_blob, ignore_diffs); sb->num_get_patch++; - if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts)) + if (diff_hunks(&file_p, &file_o, blame_chunk_cb, &d, sb->xdl_opts, + sb->revs->diffopt.repo->index, target->path)) die("unable to generate diff (%s -> %s)", oid_to_hex(&parent->commit->object.oid), oid_to_hex(&target->commit->object.oid)); @@ -2114,7 +2148,8 @@ static void find_copy_in_blob(struct blame_scoreboard *sb, * file_p partially may match that image. */ memset(split, 0, sizeof(struct blame_entry [3])); - if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts)) + if (diff_hunks(file_p, &file_o, handle_split_cb, &d, sb->xdl_opts, + NULL, NULL)) die("unable to generate diff (%s)", oid_to_hex(&parent->commit->object.oid)); /* remainder, if any, all match the preimage */ diff --git a/builtin.h b/builtin.h index 235c51f30e5380..c713a0417f615f 100644 --- a/builtin.h +++ b/builtin.h @@ -178,6 +178,7 @@ int cmd_diff_files(int argc, const char **argv, const char *prefix, struct repos int cmd_diff_index(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff_pairs(int argc, const char **argv, const char *prefix, struct repository *repo); +int cmd_diff_process_normalize(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_diff_tree(int argc, const char **argv, const char *prefix, struct repository *repo); int cmd_difftool(int argc, const char **argv, const 
char *prefix, struct repository *repo); int cmd_env__helper(int argc, const char **argv, const char *prefix, struct repository *repo); diff --git a/builtin/diff-process-normalize.c b/builtin/diff-process-normalize.c new file mode 100644 index 00000000000000..1580f6b7d958a1 --- /dev/null +++ b/builtin/diff-process-normalize.c @@ -0,0 +1,143 @@ +/* + * Built-in diff process that returns zero hunks for files whose + * only differences are whitespace, and status=error otherwise. + * See diff-process.c for the protocol and gitattributes(5) for usage. + * + * Uses xdiff_compare_lines() with XDF_IGNORE_WHITESPACE to compare + * lines, giving the same whitespace handling as "git diff -w". + */ + +#include "builtin.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "xdiff-interface.h" + +/* + * Read a single pkt-line. Returns 1 for data, 0 for flush, -1 for EOF. + */ +static int read_pkt(int fd, struct strbuf *line) +{ + int len; + char *data; + + if (packet_read_line_gently(fd, &len, &data) < 0) + return -1; + if (!data || !len) + return 0; /* flush */ + strbuf_reset(line); + strbuf_add(line, data, len); + strbuf_rtrim(line); + return 1; +} + +/* + * Read packetized content until a flush packet. + */ +static int read_content(int fd, struct strbuf *out) +{ + strbuf_reset(out); + if (read_packetized_to_strbuf(fd, out, PACKET_READ_GENTLE_ON_EOF) < 0) + return -1; + return 0; +} + +/* + * Compare two buffers line by line using xdiff_compare_lines() with + * XDF_IGNORE_WHITESPACE (same logic as "git diff -w"). + * Returns 1 if all lines match, 0 otherwise. + */ +static int whitespace_equivalent(const char *a, long size_a, + const char *b, long size_b) +{ + const char *ea = a + size_a; + const char *eb = b + size_b; + + while (a < ea && b < eb) { + const char *eol_a = memchr(a, '\n', ea - a); + const char *eol_b = memchr(b, '\n', eb - b); + long len_a = (eol_a ? eol_a : ea) - a; + long len_b = (eol_b ? 
eol_b : eb) - b; + + if (!xdiff_compare_lines(a, len_a, b, len_b, + XDF_IGNORE_WHITESPACE)) + return 0; + + a += len_a + (eol_a ? 1 : 0); + b += len_b + (eol_b ? 1 : 0); + } + + /* Both sides must be exhausted */ + return a >= ea && b >= eb; +} + +int cmd_diff_process_normalize(int argc UNUSED, const char **argv UNUSED, + const char *prefix UNUSED, + struct repository *repo UNUSED) +{ + struct strbuf line = STRBUF_INIT; + struct strbuf old_content = STRBUF_INIT; + struct strbuf new_content = STRBUF_INIT; + int ret; + + /* Handshake: read client greeting */ + ret = read_pkt(0, &line); + if (ret <= 0 || strcmp(line.buf, "git-diff-client")) + return 1; + ret = read_pkt(0, &line); + if (ret <= 0 || strcmp(line.buf, "version=1")) + return 1; + read_pkt(0, &line); /* flush */ + + /* Send server greeting */ + packet_write_fmt(1, "git-diff-server\n"); + packet_write_fmt(1, "version=1\n"); + packet_flush(1); + + /* Read client capabilities until flush */ + while ((ret = read_pkt(0, &line)) > 0) + ; /* consume */ + + /* Send our capabilities */ + packet_write_fmt(1, "capability=hunks\n"); + packet_flush(1); + + /* Main loop: process file pairs */ + for (;;) { + int have_command = 0; + + /* Read request headers until flush */ + while ((ret = read_pkt(0, &line)) > 0) { + if (starts_with(line.buf, "command=")) + have_command = 1; + } + if (ret < 0) + break; /* EOF: client closed connection */ + if (!have_command) + break; + + /* Read old file content */ + if (read_content(0, &old_content) < 0) + break; + /* Read new file content */ + if (read_content(0, &new_content) < 0) + break; + + if (whitespace_equivalent(old_content.buf, old_content.len, + new_content.buf, new_content.len)) { + /* Whitespace-only differences */ + packet_flush(1); /* zero hunks */ + packet_write_fmt(1, "status=success\n"); + packet_flush(1); + } else { + /* Non-whitespace differences: fall back */ + packet_flush(1); + packet_write_fmt(1, "status=error\n"); + packet_flush(1); + } + } + + 
strbuf_release(&line); + strbuf_release(&old_content); + strbuf_release(&new_content); + return 0; +} diff --git a/diff-process.c b/diff-process.c new file mode 100644 index 00000000000000..7b0f0e1f7e2084 --- /dev/null +++ b/diff-process.c @@ -0,0 +1,203 @@ +/* + * Diff process backend: communicates with a long-running external + * tool via the pkt-line protocol to obtain content-aware hunks. + * + * Protocol: pkt-line over stdin/stdout, following the pattern of + * the long-running filter process protocol (see convert.c). + * + * Handshake: + * git> git-diff-client / version=1 / flush + * tool< git-diff-server / version=1 / flush + * git> capability=hunks / flush + * tool< capability=hunks / flush + * + * Per-file: + * git> command=hunks / pathname=<path> / flush + * git> <old content> / flush + * git> <new content> / flush + * tool< hunk <old_start> <old_count> <new_start> <new_count> + * tool< ... / flush + * tool< status=success / flush + * + * Zero hunks with status=success means the tool considers the + * files equivalent. Git will skip the diff for that file. + */ + +#include "git-compat-util.h" +#include "diff-process.h" +#include "userdiff.h" +#include "sub-process.h" +#include "pkt-line.h" +#include "strbuf.h" +#include "xdiff/xdiff.h" + +#define CAP_HUNKS (1u << 0) + +struct diff_subprocess { + struct subprocess_entry subprocess; + unsigned int supported_capabilities; +}; + +static int subprocess_map_initialized; +static struct hashmap subprocess_map; + +static int start_diff_process_fn(struct subprocess_entry *subprocess) +{ + static int versions[] = { 1, 0 }; + static struct subprocess_capability capabilities[] = { + { "hunks", CAP_HUNKS }, + { NULL, 0 } + }; + struct diff_subprocess *entry = + (struct diff_subprocess *)subprocess; + + /* Uses dying pkt-line variant, same as convert.c filters. 
*/ + return subprocess_handshake(subprocess, "git-diff", + versions, NULL, + capabilities, + &entry->supported_capabilities); +} + +static struct diff_subprocess *find_or_start_process(const char *cmd) +{ + struct diff_subprocess *entry; + + if (!subprocess_map_initialized) { + subprocess_map_initialized = 1; + hashmap_init(&subprocess_map, cmd2process_cmp, NULL, 0); + } + + entry = (struct diff_subprocess *) + subprocess_find_entry(&subprocess_map, cmd); + if (entry) + return entry; + + entry = xcalloc(1, sizeof(*entry)); + if (subprocess_start(&subprocess_map, &entry->subprocess, + cmd, start_diff_process_fn)) { + free(entry); + return NULL; + } + + return entry; +} + +static int send_file_content(int fd, const char *buf, long size) +{ + int ret; + + if (size > 0) + ret = write_packetized_from_buf_no_flush(buf, size, fd); + else + ret = 0; + if (ret) + return ret; + return packet_flush_gently(fd); +} + +static int parse_hunk_line(const char *line, struct xdl_hunk *hunk) +{ + char *end; + + /* Format: "hunk <old_start> <old_count> <new_start> <new_count>" */ + if (!skip_prefix(line, "hunk ", &line)) + return -1; + + hunk->old_start = strtol(line, &end, 10); + if (end == line || *end != ' ') + return -1; + line = end; + + hunk->old_count = strtol(line, &end, 10); + if (end == line || *end != ' ') + return -1; + line = end; + + hunk->new_start = strtol(line, &end, 10); + if (end == line || *end != ' ') + return -1; + line = end; + + hunk->new_count = strtol(line, &end, 10); + if (end == line || *end != '\0') + return -1; + + return 0; +} + +int diff_process_get_hunks(struct userdiff_driver *drv, + const char *path, + const char *old_buf, long old_size, + const char *new_buf, long new_size, + struct xdl_hunk **hunks_out, + size_t *nr_hunks_out) +{ + struct diff_subprocess *backend; + struct child_process *process; + int fd_in, fd_out; + struct strbuf status = STRBUF_INIT; + struct xdl_hunk *hunks = NULL; + struct xdl_hunk hunk; + size_t nr_hunks = 0, alloc_hunks = 0; + int len; + char *line; + + if (!drv || 
!drv->process) + return -1; + + backend = find_or_start_process(drv->process); + if (!backend) + return -1; + + if (!(backend->supported_capabilities & CAP_HUNKS)) + return -1; + + process = subprocess_get_child_process(&backend->subprocess); + fd_in = process->in; + fd_out = process->out; + + /* Send request */ + if (packet_write_fmt_gently(fd_in, "command=hunks\n") || + packet_write_fmt_gently(fd_in, "pathname=%s\n", path) || + packet_flush_gently(fd_in)) + goto error; + + /* Send old file content */ + if (send_file_content(fd_in, old_buf, old_size)) + goto error; + + /* Send new file content */ + if (send_file_content(fd_in, new_buf, new_size)) + goto error; + + /* Read hunks until flush packet */ + while ((len = packet_read_line_gently(fd_out, NULL, &line)) >= 0 && + line) { + if (parse_hunk_line(line, &hunk) < 0) + goto error; + ALLOC_GROW(hunks, nr_hunks + 1, alloc_hunks); + hunks[nr_hunks++] = hunk; + } + if (len < 0) + goto error; + + /* Read status */ + if (subprocess_read_status(fd_out, &status)) + goto error; + + if (strcmp(status.buf, "success")) { + if (!strcmp(status.buf, "abort")) + backend->supported_capabilities &= ~CAP_HUNKS; + goto error; + } + + *hunks_out = hunks; + *nr_hunks_out = nr_hunks; + strbuf_release(&status); + return 0; + +error: + free(hunks); + strbuf_release(&status); + return -1; +} diff --git a/diff-process.h b/diff-process.h new file mode 100644 index 00000000000000..4c84951e0273fa --- /dev/null +++ b/diff-process.h @@ -0,0 +1,28 @@ +#ifndef DIFF_PROCESS_H +#define DIFF_PROCESS_H + +struct userdiff_driver; +struct xdl_hunk; + +/* + * Query a diff process for hunks describing the changes + * between old_buf and new_buf. + * + * The backend is a long-running subprocess configured via + * diff.<name>.process. It receives file content via + * pkt-line and returns hunks with 1-based line numbers. + * + * On success, sets *hunks_out and *nr_hunks_out to a newly allocated + * array (caller must free) and returns 0. 
+ * + * On failure, returns -1. The caller should fall back to the + * builtin diff algorithm. + */ +int diff_process_get_hunks(struct userdiff_driver *drv, + const char *path, + const char *old_buf, long old_size, + const char *new_buf, long new_size, + struct xdl_hunk **hunks_out, + size_t *nr_hunks_out); + +#endif /* DIFF_PROCESS_H */ diff --git a/diff.c b/diff.c index 397e38b41cc6fa..c5e7c329b25609 100644 --- a/diff.c +++ b/diff.c @@ -25,7 +25,9 @@ #include "utf8.h" #include "odb.h" #include "userdiff.h" +#include "diff-process.h" #include "submodule.h" +#include "trace2.h" #include "hashmap.h" #include "mem-pool.h" #include "merge-ll.h" @@ -3991,6 +3993,7 @@ static void builtin_diff(const char *name_a, xpparam_t xpp; xdemitconf_t xecfg; struct emit_callback ecbdata; + struct xdl_hunk *ext_hunks = NULL; unsigned ws_rule; const struct userdiff_funcname *pe; @@ -4031,6 +4034,27 @@ static void builtin_diff(const char *name_a, xpp.ignore_regex_nr = o->ignore_regex_nr; xpp.anchors = o->anchors; xpp.anchors_nr = o->anchors_nr; + + if (!o->ignore_driver_algorithm && + one->driver && one->driver->process) { + size_t ext_hunks_nr = 0; + if (!diff_process_get_hunks( + one->driver, name_a, + mf1.ptr, mf1.size, + mf2.ptr, mf2.size, + &ext_hunks, &ext_hunks_nr)) { + if (!ext_hunks_nr) + goto free_ab_and_return; + xpp.external_hunks = ext_hunks; + xpp.external_hunks_nr = ext_hunks_nr; + } else { + trace2_data_string("diff", + o->repo, + "diff-process-fallback", + name_a); + } + } + xecfg.ctxlen = o->context; xecfg.interhunkctxlen = o->interhunkcontext; xecfg.flags = XDL_EMIT_FUNCNAMES; @@ -4111,6 +4135,7 @@ static void builtin_diff(const char *name_a, } else if (xdi_diff_outf(&mf1, &mf2, NULL, fn_out_consume, &ecbdata, &xpp, &xecfg)) die("unable to generate diff for %s", one->path); + free(ext_hunks); if (o->word_diff) free_diff_words_data(&ecbdata); if (textconv_one) diff --git a/git.c b/git.c index 5a40eab8a26a66..6239240b021f1d 100644 --- a/git.c +++ b/git.c @@ -568,6 
+568,7 @@ static struct cmd_struct commands[] = { { "diff-files", cmd_diff_files, RUN_SETUP | NEED_WORK_TREE | NO_PARSEOPT }, { "diff-index", cmd_diff_index, RUN_SETUP | NO_PARSEOPT }, { "diff-pairs", cmd_diff_pairs, RUN_SETUP | NO_PARSEOPT }, + { "diff-process-normalize", cmd_diff_process_normalize, NO_PARSEOPT }, { "diff-tree", cmd_diff_tree, RUN_SETUP | NO_PARSEOPT }, { "difftool", cmd_difftool, RUN_SETUP_GENTLY }, { "fast-export", cmd_fast_export, RUN_SETUP }, diff --git a/t/t4080-diff-process.sh b/t/t4080-diff-process.sh new file mode 100755 index 00000000000000..a6fa1df456d7f2 --- /dev/null +++ b/t/t4080-diff-process.sh @@ -0,0 +1,430 @@ +#!/bin/sh + +test_description='diff process via long-running process' + +. ./test-lib.sh + +if test_have_prereq PYTHON +then + PYTHON_PATH=$(command -v python3) || PYTHON_PATH=$(command -v python) +fi + +# +# A single parametric diff process. +# Usage: diff-process-backend --mode=<mode> [--log=<file>] +# +# Modes: +# whole-file - report all lines as changed (default) +# fixed-hunk - always report hunk 5 2 5 2 +# bad-hunk - report out-of-bounds hunk 999 1 999 1 +# zero-hunk - return zero hunks (files considered equivalent) +# error - return status=error for every request +# abort - return status=abort for every request +# crash - read one request then exit without responding +# +setup_backend () { + cat >"$TRASH_DIRECTORY/diff-process-backend.py" <<-\PYEOF + import sys, os + + def read_pkt(): + hdr = sys.stdin.buffer.read(4) + if len(hdr) < 4: return None + length = int(hdr, 16) + if length == 0: return "" + data = sys.stdin.buffer.read(length - 4) + return data.decode().rstrip("\n") + + def write_pkt(line): + data = (line + "\n").encode() + sys.stdout.buffer.write(f"{len(data)+4:04x}".encode() + data) + sys.stdout.buffer.flush() + + def write_flush(): + sys.stdout.buffer.write(b"0000") + sys.stdout.buffer.flush() + + def read_content(): + chunks = [] + while True: + hdr = sys.stdin.buffer.read(4) + if len(hdr) < 4: break + length = 
int(hdr, 16) + if length == 0: break + chunks.append(sys.stdin.buffer.read(length - 4)) + return b"".join(chunks) + + mode = "whole-file" + logfile = None + for arg in sys.argv[1:]: + if arg.startswith("--mode="): + mode = arg[7:] + elif arg.startswith("--log="): + logfile = open(arg[6:], "a") + + def log(msg): + if logfile: + logfile.write(msg + "\n") + logfile.flush() + + # Handshake + assert read_pkt() == "git-diff-client" + assert read_pkt() == "version=1" + read_pkt() + write_pkt("git-diff-server") + write_pkt("version=1") + write_flush() + while True: + p = read_pkt() + if p == "": break + write_pkt("capability=hunks") + write_flush() + + log("ready") + + while True: + cmd = None + pathname = None + while True: + p = read_pkt() + if p is None: sys.exit(0) + if p == "": break + if p.startswith("command="): cmd = p.split("=",1)[1] + if p.startswith("pathname="): pathname = p.split("=",1)[1] + if cmd is None: sys.exit(0) + old = read_content() + new = read_content() + log(f"command={cmd} pathname={pathname}") + + if mode == "error": + write_flush() + write_pkt("status=error") + write_flush() + continue + + if mode == "abort": + write_flush() + write_pkt("status=abort") + write_flush() + continue + + if mode == "crash": + sys.exit(1) + + if cmd == "hunks": + if mode == "fixed-hunk": + write_pkt("hunk 5 2 5 2") + elif mode == "bad-hunk": + write_pkt("hunk 999 1 999 1") + elif mode == "zero-hunk": + pass + else: + ol = len(old.split(b"\n")) + nl = len(new.split(b"\n")) + write_pkt(f"hunk 1 {ol} 1 {nl}") + write_flush() + write_pkt("status=success") + write_flush() + else: + write_flush() + write_pkt("status=error") + write_flush() + PYEOF + write_script diff-process-backend <<-SHEOF + exec "$PYTHON_PATH" "$TRASH_DIRECTORY/diff-process-backend.py" "\$@" + SHEOF +} + +BACKEND="./diff-process-backend" + +test_expect_success PYTHON 'setup' ' + setup_backend && + echo "*.c diff=cdiff" >.gitattributes && + git add .gitattributes && + git commit -m "initial" +' + 
+test_expect_success PYTHON 'diff process hunk boundaries affect output' ' + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + OLD5 + OLD6 + line7 + line8 + OLD9 + OLD10 + EOF + git add boundary.c && + git commit -m "add boundary.c" && + + cat >boundary.c <<-\EOF && + line1 + line2 + line3 + line4 + NEW5 + NEW6 + line7 + line8 + NEW9 + NEW10 + EOF + + # The file has changes at lines 5-6 and 9-10, but fixed-hunk + # only reports lines 5-6 as changed. Lines 9-10 should not + # appear as changed in the output. + git -c diff.cdiff.process="$BACKEND --mode=fixed-hunk" \ + diff boundary.c >actual && + grep "^-OLD5" actual && + grep "^-OLD6" actual && + grep "^+NEW5" actual && + grep "^+NEW6" actual && + ! grep "^-OLD9" actual && + ! grep "^-OLD10" actual && + ! grep "^+NEW9" actual && + ! grep "^+NEW10" actual +' + +test_expect_success PYTHON 'diff process fallback on tool error status' ' + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --mode=error --log=backend.log" \ + diff boundary.c >actual && + # Fallback produces the full builtin diff (both change regions). + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual && + # Tool was contacted (it replied with error, not crash). 
+ grep "command=hunks pathname=boundary.c" backend.log +' + +test_expect_success PYTHON 'diff process fallback on bad hunks' ' + git -c diff.cdiff.process="$BACKEND --mode=bad-hunk" \ + diff boundary.c >actual && + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual +' + +test_expect_success PYTHON 'diff process fallback on tool crash' ' + git -c diff.cdiff.process="$BACKEND --mode=crash" \ + diff boundary.c >actual && + grep "^-OLD5" actual && + grep "^+NEW5" actual && + grep "^-OLD9" actual && + grep "^+NEW9" actual +' + +test_expect_success PYTHON 'diff process abort disables for session' ' + cat >abort1.c <<-\EOF && + int first(void) { return 1; } + EOF + cat >abort2.c <<-\EOF && + int second(void) { return 2; } + EOF + git add abort1.c abort2.c && + git commit -m "add abort files" && + + cat >abort1.c <<-\EOF && + int first(void) { return 10; } + EOF + cat >abort2.c <<-\EOF && + int second(void) { return 20; } + EOF + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --mode=abort --log=backend.log" \ + diff -- abort1.c abort2.c >actual && + # Both files should still produce diff output via fallback. + grep "return 10" actual && + grep "return 20" actual && + # The tool aborts on the first file and git clears its + # capability. The second file never contacts the tool, + # so the log should have exactly one entry, not two. 
+ grep "command=hunks" backend.log >matches && + test_line_count = 1 matches +' + +test_expect_success PYTHON 'diff process handles multiple files' ' + cat >multi1.c <<-\EOF && + int one(void) { return 1; } + EOF + cat >multi2.c <<-\EOF && + int two(void) { return 2; } + EOF + git add multi1.c multi2.c && + git commit -m "add multi files" && + + cat >multi1.c <<-\EOF && + int one(void) { return 10; } + EOF + cat >multi2.c <<-\EOF && + int two(void) { return 20; } + EOF + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff -- multi1.c multi2.c >actual && + grep "return 10" actual && + grep "return 20" actual && + grep "pathname=multi1.c" backend.log && + grep "pathname=multi2.c" backend.log +' + +test_expect_success PYTHON 'diff process with --word-diff' ' + cat >worddiff.c <<-\EOF && + int value(void) { return 1; } + EOF + git add worddiff.c && + git commit -m "add worddiff.c" && + + cat >worddiff.c <<-\EOF && + int value(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND" \ + diff --word-diff worddiff.c >actual && + grep "\[-1;-\]" actual && + grep "{+999;+}" actual +' + +test_expect_success PYTHON 'diff process bypassed by --diff-algorithm' ' + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + diff --diff-algorithm=patience worddiff.c >actual && + grep "return 999" actual && + test_path_is_missing backend.log +' + +test_expect_success PYTHON 'diff process works with git log -p' ' + cat >logtest.c <<-\EOF && + int logfunc(void) { return 1; } + EOF + git add logtest.c && + git commit -m "add logtest.c" && + + cat >logtest.c <<-\EOF && + int logfunc(void) { return 2; } + EOF + git add logtest.c && + git commit -m "change logtest.c" && + + rm -f backend.log && + git -c diff.cdiff.process="$BACKEND --log=backend.log" \ + log -1 -p -- logtest.c >actual && + grep "return 2" actual && + grep "command=hunks pathname=logtest.c" backend.log +' + +test_expect_success PYTHON 'diff process 
zero hunks suppresses diff output' ' + cat >zerohunk.c <<-\EOF && + int zero(void) { return 0; } + EOF + git add zerohunk.c && + git commit -m "add zerohunk.c" && + + cat >zerohunk.c <<-\EOF && + int zero(void) { return 999; } + EOF + + git -c diff.cdiff.process="$BACKEND --mode=zero-hunk" \ + diff zerohunk.c >actual && + test_must_be_empty actual +' + +test_expect_success PYTHON 'blame skips commits with zero hunks from diff process' ' + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "add blame.c" && + + cat >blame.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add blame.c && + git commit -m "reformat blame.c" && + BLAME_COMMIT=$(git rev-parse --short HEAD) && + + # Without zero-hunk mode, blame attributes the change. + git blame blame.c >without && + grep "$BLAME_COMMIT" without && + + # With zero-hunk mode, the process considers the files equivalent + # and blame skips the reformat commit. + git -c diff.cdiff.process="$BACKEND --mode=zero-hunk" \ + blame blame.c >with && + ! 
grep "$BLAME_COMMIT" with +' + +NORMALIZE="git diff-process-normalize" + +test_expect_success 'diff-process-normalize setup' ' + echo "*.c diff=cdiff" >.gitattributes && + git add .gitattributes && + test_commit normalize-base +' + +test_expect_success 'diff-process-normalize suppresses whitespace-only changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + git add ws.c && + git commit -m "add ws.c" && + + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + test_must_be_empty actual +' + +test_expect_success 'diff-process-normalize falls back on non-whitespace changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 0; + } + + int added_function(void) + { + return 99; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + grep "added_function" actual +' + +test_expect_success 'diff-process-normalize falls back on mixed whitespace and real changes' ' + cat >ws.c <<-\EOF && + int main(void) + { + return 42; + } + EOF + + git -c diff.cdiff.process="$NORMALIZE" \ + diff ws.c >actual && + grep "return 42" actual +' + +test_done diff --git a/userdiff.c b/userdiff.c index fe710a68bfdfa6..81c0bebcce65e0 100644 --- a/userdiff.c +++ b/userdiff.c @@ -499,6 +499,13 @@ int userdiff_config(const char *k, const char *v) drv->algorithm = drv->algorithm_owned; return ret; } + if (!strcmp(type, "process")) { + int ret; + FREE_AND_NULL(drv->process_owned); + ret = git_config_string(&drv->process_owned, k, v); + drv->process = drv->process_owned; + return ret; + } return 0; } diff --git a/userdiff.h b/userdiff.h index 827361b0bc9569..51c26e0d4190e5 100644 --- a/userdiff.h +++ b/userdiff.h @@ -31,6 +31,8 @@ struct userdiff_driver { char *textconv_owned; struct notes_cache *textconv_cache; int textconv_want_cache; + const char *process; + char *process_owned; }; enum userdiff_driver_type { USERDIFF_DRIVER_TYPE_BUILTIN = 1<<0, diff --git 
a/xdiff-interface.c b/xdiff-interface.c index f043330f2a12a0..9542c0bcc20f37 100644 --- a/xdiff-interface.c +++ b/xdiff-interface.c @@ -124,7 +124,12 @@ int xdi_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t co if (mf1->size > MAX_XDIFF_SIZE || mf2->size > MAX_XDIFF_SIZE) return -1; - if (!xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) + /* + * External hunks reference line numbers in the original content; + * trimming the tail would change line counts and invalidate them. + */ + if (!xpp->external_hunks && + !xecfg->ctxlen && !(xecfg->flags & XDL_EMIT_FUNCCONTEXT)) trim_common_tail(&a, &b); return xdl_diff(&a, &b, xpp, xecfg, xecb); diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h index dc370712e92860..2ee6f1aae3635e 100644 --- a/xdiff/xdiff.h +++ b/xdiff/xdiff.h @@ -78,6 +78,15 @@ typedef struct s_mmbuffer { long size; } mmbuffer_t; +/* + * Hunk descriptor for externally computed diffs. + * Line numbers are 1-based, matching unified diff convention. + */ +struct xdl_hunk { + long old_start, old_count; + long new_start, new_count; +}; + typedef struct s_xpparam { unsigned long flags; @@ -88,6 +97,10 @@ typedef struct s_xpparam { /* See Documentation/diff-options.adoc. */ char **anchors; size_t anchors_nr; + + /* Externally computed hunks: bypass the diff algorithm. */ + const struct xdl_hunk *external_hunks; + size_t external_hunks_nr; } xpparam_t; typedef struct s_xdemitcb { diff --git a/xdiff/xdiffi.c b/xdiff/xdiffi.c index 5455b4690d38ff..7eca4ab4a14071 100644 --- a/xdiff/xdiffi.c +++ b/xdiff/xdiffi.c @@ -1085,16 +1085,108 @@ static void xdl_mark_ignorable_regex(xdchange_t *xscr, const xdfenv_t *xe, } } +/* + * Populate the changed[] arrays from externally supplied hunks, + * bypassing the diff algorithm. Validates that hunks are in order, + * non-overlapping, and within bounds. + * + * Returns 0 on success, -1 on validation failure. 
+ */ +static int xdl_populate_hunks_from_external(xdfenv_t *xe, + const struct xdl_hunk *hunks, + size_t nr_hunks) +{ + size_t i; + long j, prev_old_end = 0, prev_new_end = 0; + long total_old = 0, total_new = 0; + + /* + * Clear changed[] arrays. xdl_prepare_env() may have dirtied + * them via xdl_cleanup_records(). The allocation is nrec + 2 + * elements; changed points one past the start (see xprepare.c). + */ + memset(xe->xdf1.changed - 1, 0, + (xe->xdf1.nrec + 2) * sizeof(bool)); + memset(xe->xdf2.changed - 1, 0, + (xe->xdf2.nrec + 2) * sizeof(bool)); + + for (i = 0; i < nr_hunks; i++) { + const struct xdl_hunk *h = &hunks[i]; + + if (h->old_count < 0 || h->new_count < 0) + return -1; + + /* Bounds check (1-based line numbers) */ + if (h->old_count > 0 && + (h->old_start < 1 || + h->old_start + h->old_count - 1 > xe->xdf1.nrec)) + return -1; + if (h->new_count > 0 && + (h->new_start < 1 || + h->new_start + h->new_count - 1 > xe->xdf2.nrec)) + return -1; + + /* Zero-count hunks: start must still be in [1, nrec+1] */ + if (h->old_count == 0 && + (h->old_start < 1 || h->old_start > xe->xdf1.nrec + 1)) + return -1; + if (h->new_count == 0 && + (h->new_start < 1 || h->new_start > xe->xdf2.nrec + 1)) + return -1; + + /* Ordering: no overlap with previous hunk */ + if (h->old_start < prev_old_end || + h->new_start < prev_new_end) + return -1; + + for (j = 0; j < h->old_count; j++) + xe->xdf1.changed[h->old_start - 1 + j] = true; + for (j = 0; j < h->new_count; j++) + xe->xdf2.changed[h->new_start - 1 + j] = true; + + prev_old_end = h->old_start + h->old_count; + prev_new_end = h->new_start + h->new_count; + total_old += h->old_count; + total_new += h->new_count; + } + + /* + * Synchronization invariant: unchanged line counts must match. + * Otherwise xdl_build_script() would walk off one array. 
+ */ + if ((long)xe->xdf1.nrec - total_old != + (long)xe->xdf2.nrec - total_new) + return -1; + + return 0; +} + int xdl_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp, xdemitconf_t const *xecfg, xdemitcb_t *ecb) { xdchange_t *xscr; xdfenv_t xe; emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff; - if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) { - - return -1; + if (xpp->external_hunks) { + if (xdl_prepare_env(mf1, mf2, xpp, &xe) < 0) + return -1; + if (xdl_populate_hunks_from_external(&xe, + xpp->external_hunks, + xpp->external_hunks_nr) < 0) { + /* + * Invalid external hunks; fall back to the + * builtin diff algorithm. Re-runs + * xdl_prepare_env() via xdl_do_diff(). + */ + xdl_free_env(&xe); + if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) + return -1; + } + } else { + if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) + return -1; } + if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 || xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 || xdl_build_script(&xe, &xscr) < 0) {