Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

teach --histogram to diff

Port JGit's HistogramDiff algorithm over to C. Rough numbers (TODO) show
that it is faster than its --patience cousin, as well as the default
Myers algorithm.

The implementation has been reworked to use structs and pointers,
instead of bitmasks, thus doing away with JGit's 2^28 line limit.

We also use xdiff's default hash table implementation (xdl_hashbits()
with XDL_HASHLONG()) for convenience.

Signed-off-by: Tay Ray Chuan <rctay89@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
  • Loading branch information...
commit 8c912eea94a2138e8bc608f7c390eb0b313effb0 1 parent 46c8f29
Tay Ray Chuan rctay authored gitster committed
2  Makefile
@@ -1838,7 +1838,7 @@ ifndef NO_CURL
1838 1838 GIT_OBJS += http.o http-walker.o remote-curl.o
1839 1839 endif
1840 1840 XDIFF_OBJS = xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
1841   - xdiff/xmerge.o xdiff/xpatience.o
  1841 + xdiff/xmerge.o xdiff/xpatience.o xdiff/xhistogram.o
1842 1842 VCSSVN_OBJS = vcs-svn/string_pool.o vcs-svn/line_buffer.o \
1843 1843 vcs-svn/repo_tree.o vcs-svn/fast_export.o vcs-svn/svndump.o
1844 1844 VCSSVN_TEST_OBJS = test-obj-pool.o test-string-pool.o \
2  diff.c
@@ -3369,6 +3369,8 @@ int diff_opt_parse(struct diff_options *options, const char **av, int ac)
3369 3369 DIFF_XDL_SET(options, IGNORE_WHITESPACE_AT_EOL);
3370 3370 else if (!strcmp(arg, "--patience"))
3371 3371 DIFF_XDL_SET(options, PATIENCE_DIFF);
  3372 + else if (!strcmp(arg, "--histogram"))
  3373 + DIFF_XDL_SET(options, HISTOGRAM_DIFF);
3372 3374
3373 3375 /* flags options */
3374 3376 else if (!strcmp(arg, "--binary")) {
2  merge-recursive.c
@@ -1759,6 +1759,8 @@ int parse_merge_opt(struct merge_options *o, const char *s)
1759 1759 o->subtree_shift = s + strlen("subtree=");
1760 1760 else if (!strcmp(s, "patience"))
1761 1761 o->xdl_opts |= XDF_PATIENCE_DIFF;
  1762 + else if (!strcmp(s, "histogram"))
  1763 + o->xdl_opts |= XDF_HISTOGRAM_DIFF;
1762 1764 else if (!strcmp(s, "ignore-space-change"))
1763 1765 o->xdl_opts |= XDF_IGNORE_WHITESPACE_CHANGE;
1764 1766 else if (!strcmp(s, "ignore-all-space"))
12 t/t4049-diff-histogram.sh
... ... @@ -0,0 +1,12 @@
#!/bin/sh
#
# Runs the shared "alternative diff algorithm" checks against --histogram.

test_description='histogram diff algorithm'

. ./test-lib.sh
. "$TEST_DIRECTORY/lib-diff-alternative.sh"

# NOTE(review): both helpers are presumably provided by
# lib-diff-alternative.sh; they take the algorithm name as sole argument.
test_diff_frobnitz histogram
test_diff_unique histogram

test_done
1  xdiff/xdiff.h
@@ -33,6 +33,7 @@ extern "C" {
33 33 #define XDF_IGNORE_WHITESPACE_CHANGE (1 << 3)
34 34 #define XDF_IGNORE_WHITESPACE_AT_EOL (1 << 4)
35 35 #define XDF_PATIENCE_DIFF (1 << 5)
  36 +#define XDF_HISTOGRAM_DIFF (1 << 6)
36 37 #define XDF_WHITESPACE_FLAGS (XDF_IGNORE_WHITESPACE | XDF_IGNORE_WHITESPACE_CHANGE | XDF_IGNORE_WHITESPACE_AT_EOL)
37 38
38 39 #define XDL_PATCH_NORMAL '-'
3  xdiff/xdiffi.c
@@ -331,6 +331,9 @@ int xdl_do_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
331 331 if (xpp->flags & XDF_PATIENCE_DIFF)
332 332 return xdl_do_patience_diff(mf1, mf2, xpp, xe);
333 333
  334 + if (xpp->flags & XDF_HISTOGRAM_DIFF)
  335 + return xdl_do_histogram_diff(mf1, mf2, xpp, xe);
  336 +
334 337 if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
335 338
336 339 return -1;
2  xdiff/xdiffi.h
@@ -57,5 +57,7 @@ int xdl_emit_diff(xdfenv_t *xe, xdchange_t *xscr, xdemitcb_t *ecb,
57 57 xdemitconf_t const *xecfg);
58 58 int xdl_do_patience_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
59 59 xdfenv_t *env);
  60 +int xdl_do_histogram_diff(mmfile_t *mf1, mmfile_t *mf2, xpparam_t const *xpp,
  61 + xdfenv_t *env);
60 62
61 63 #endif /* #if !defined(XDIFFI_H) */
384 xdiff/xhistogram.c
... ... @@ -0,0 +1,384 @@
  1 +/*
  2 + * Copyright (C) 2010, Google Inc.
  3 + * and other copyright owners as documented in JGit's IP log.
  4 + *
  5 + * This program and the accompanying materials are made available
  6 + * under the terms of the Eclipse Distribution License v1.0 which
  7 + * accompanies this distribution, is reproduced below, and is
  8 + * available at http://www.eclipse.org/org/documents/edl-v10.php
  9 + *
  10 + * All rights reserved.
  11 + *
  12 + * Redistribution and use in source and binary forms, with or
  13 + * without modification, are permitted provided that the following
  14 + * conditions are met:
  15 + *
  16 + * - Redistributions of source code must retain the above copyright
  17 + * notice, this list of conditions and the following disclaimer.
  18 + *
  19 + * - Redistributions in binary form must reproduce the above
  20 + * copyright notice, this list of conditions and the following
  21 + * disclaimer in the documentation and/or other materials provided
  22 + * with the distribution.
  23 + *
  24 + * - Neither the name of the Eclipse Foundation, Inc. nor the
  25 + * names of its contributors may be used to endorse or promote
  26 + * products derived from this software without specific prior
  27 + * written permission.
  28 + *
  29 + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  30 + * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  31 + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  32 + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  33 + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  34 + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  35 + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  36 + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  37 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  38 + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  39 + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  40 + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  41 + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  42 + */
  43 +
  44 +#include "xinclude.h"
  45 +#include "xtypes.h"
  46 +#include "xdiff.h"
  47 +
/*
 * Upper bounds for line pointers and occurrence counters; both kinds of
 * value are kept in `unsigned int` variables throughout this file.
 */
#define MAX_PTR UINT_MAX
#define MAX_CNT UINT_MAX

/*
 * Last line of range <n>.  Expands to an expression over the variables
 * `line<n>` and `count<n>`, which must be in scope at the point of use.
 */
#define LINE_END(n) (line##n + count##n - 1)

/* Same as LINE_END(), for callers holding `line<n>`/`count<n>` as pointers. */
#define LINE_END_PTR(n) (*line##n + *count##n - 1)
  53 +
  54 +struct histindex {
  55 + struct record {
  56 + unsigned int ptr, cnt;
  57 + struct record *next;
  58 + } **records, /* an ocurrence */
  59 + **line_map; /* map of line to record chain */
  60 + chastore_t rcha;
  61 + unsigned int *next_ptrs;
  62 + unsigned int table_bits,
  63 + records_size,
  64 + line_map_size;
  65 +
  66 + unsigned int max_chain_length,
  67 + key_shift,
  68 + ptr_shift;
  69 +
  70 + unsigned int cnt,
  71 + has_common;
  72 +
  73 + xdfenv_t *env;
  74 + xpparam_t const *xpp;
  75 +};
  76 +
/*
 * A matching stretch of lines: [begin1, end1] in sequence A corresponds
 * to [begin2, end2] in sequence B.  Both ends are inclusive line numbers;
 * an all-zero region means "nothing found".
 */
struct region {
	unsigned int begin1;
	unsigned int end1;
	unsigned int begin2;
	unsigned int end2;
};
  81 +
/*
 * Accessors for the per-line tables of a struct histindex.  Line numbers
 * are biased by ptr_shift before being used as array indices.  Every macro
 * parameter is parenthesized so that compound arguments (`&idx`, `a ? b : c`,
 * `x + 1`) expand as the caller intended.
 */

/* record that line `a` belongs to */
#define LINE_MAP(i, a) ((i)->line_map[(a) - (i)->ptr_shift])

/* next line identical to `ptr` (0 terminates the chain) */
#define NEXT_PTR(index, ptr) \
	((index)->next_ptrs[(ptr) - (index)->ptr_shift])

/* occurrence count of the record that line `ptr` belongs to */
#define CNT(index, ptr) \
	((LINE_MAP(index, ptr))->cnt)

/* record for 1-based line `l` of side `s` (1 or 2) in an xdfenv_t */
#define REC(env, s, l) \
	((env)->xdf##s.recs[(l) - 1])
  92 +
  93 +static int cmp_recs(xpparam_t const *xpp,
  94 + xrecord_t *r1, xrecord_t *r2)
  95 +{
  96 + return r1->ha == r2->ha &&
  97 + xdl_recmatch(r1->ptr, r1->size, r2->ptr, r2->size,
  98 + xpp->flags);
  99 +}
  100 +
/*
 * Line comparison and hashing shorthands.  `s1`/`s2`/`side` select the
 * sequence (1 or 2) and `l1`/`l2`/`line` are 1-based line numbers.
 * Pointer parameters are parenthesized so that non-trivial expressions
 * can be passed safely.
 */

/* non-zero when line l1 of side s1 matches line l2 of side s2 */
#define CMP_ENV(xpp, env, s1, l1, s2, l2) \
	(cmp_recs(xpp, REC(env, s1, l1), REC(env, s2, l2)))

/* CMP_ENV() taking its xpp and env from a struct histindex pointer */
#define CMP(i, s1, l1, s2, l2) \
	(CMP_ENV((i)->xpp, (i)->env, s1, l1, s2, l2))

/* bucket number in index->records for the given line */
#define TABLE_HASH(index, side, line) \
	XDL_HASHLONG((REC((index)->env, side, line))->ha, (index)->table_bits)
  109 +
  110 +static int scanA(struct histindex *index, int line1, int count1)
  111 +{
  112 + unsigned int ptr, tbl_idx;
  113 + unsigned int chain_len;
  114 + struct record **rec_chain, *rec;
  115 +
  116 + for (ptr = LINE_END(1); line1 <= ptr; ptr--) {
  117 + tbl_idx = TABLE_HASH(index, 1, ptr);
  118 + rec_chain = index->records + tbl_idx;
  119 + rec = *rec_chain;
  120 +
  121 + chain_len = 0;
  122 + while (rec) {
  123 + if (CMP(index, 1, rec->ptr, 1, ptr)) {
  124 + /*
  125 + * ptr is identical to another element. Insert
  126 + * it onto the front of the existing element
  127 + * chain.
  128 + */
  129 + NEXT_PTR(index, ptr) = rec->ptr;
  130 + rec->ptr = ptr;
  131 + /* cap rec->cnt at MAX_CNT */
  132 + rec->cnt = XDL_MIN(MAX_CNT, rec->cnt + 1);
  133 + LINE_MAP(index, ptr) = rec;
  134 + goto continue_scan;
  135 + }
  136 +
  137 + rec = rec->next;
  138 + chain_len++;
  139 + }
  140 +
  141 + if (chain_len == index->max_chain_length)
  142 + return -1;
  143 +
  144 + /*
  145 + * This is the first time we have ever seen this particular
  146 + * element in the sequence. Construct a new chain for it.
  147 + */
  148 + if (!(rec = xdl_cha_alloc(&index->rcha)))
  149 + return -1;
  150 + rec->ptr = ptr;
  151 + rec->cnt = 1;
  152 + rec->next = *rec_chain;
  153 + *rec_chain = rec;
  154 + LINE_MAP(index, ptr) = rec;
  155 +
  156 +continue_scan:
  157 + ; /* no op */
  158 + }
  159 +
  160 + return 0;
  161 +}
  162 +
  163 +static int try_lcs(struct histindex *index, struct region *lcs, int b_ptr,
  164 + int line1, int count1, int line2, int count2)
  165 +{
  166 + unsigned int b_next = b_ptr + 1;
  167 + struct record *rec = index->records[TABLE_HASH(index, 2, b_ptr)];
  168 + unsigned int as, ae, bs, be, np, rc;
  169 + int should_break;
  170 +
  171 + for (; rec; rec = rec->next) {
  172 + if (rec->cnt > index->cnt) {
  173 + if (!index->has_common)
  174 + index->has_common = CMP(index, 1, rec->ptr, 2, b_ptr);
  175 + continue;
  176 + }
  177 +
  178 + as = rec->ptr;
  179 + if (!CMP(index, 1, as, 2, b_ptr))
  180 + continue;
  181 +
  182 + index->has_common = 1;
  183 + for (;;) {
  184 + should_break = 0;
  185 + np = NEXT_PTR(index, as);
  186 + bs = b_ptr;
  187 + ae = as;
  188 + be = bs;
  189 + rc = rec->cnt;
  190 +
  191 + while (line1 < as && line2 < bs
  192 + && CMP(index, 1, as - 1, 2, bs - 1)) {
  193 + as--;
  194 + bs--;
  195 + if (1 < rc)
  196 + rc = XDL_MIN(rc, CNT(index, as));
  197 + }
  198 + while (ae < LINE_END(1) && be < LINE_END(2)
  199 + && CMP(index, 1, ae + 1, 2, be + 1)) {
  200 + ae++;
  201 + be++;
  202 + if (1 < rc)
  203 + rc = XDL_MIN(rc, CNT(index, ae));
  204 + }
  205 +
  206 + if (b_next <= be)
  207 + b_next = be + 1;
  208 + if (lcs->end1 - lcs->begin1 < ae - as || rc < index->cnt) {
  209 + lcs->begin1 = as;
  210 + lcs->begin2 = bs;
  211 + lcs->end1 = ae;
  212 + lcs->end2 = be;
  213 + index->cnt = rc;
  214 + }
  215 +
  216 + if (np == 0)
  217 + break;
  218 +
  219 + while (np <= ae) {
  220 + np = NEXT_PTR(index, np);
  221 + if (np == 0) {
  222 + should_break = 1;
  223 + break;
  224 + }
  225 + }
  226 +
  227 + if (should_break)
  228 + break;
  229 +
  230 + as = np;
  231 + }
  232 + }
  233 + return b_next;
  234 +}
  235 +
  236 +static int find_lcs(struct histindex *index, struct region *lcs,
  237 + int line1, int count1, int line2, int count2) {
  238 + int b_ptr;
  239 +
  240 + if (scanA(index, line1, count1))
  241 + return -1;
  242 +
  243 + index->cnt = index->max_chain_length + 1;
  244 +
  245 + for (b_ptr = line2; b_ptr <= LINE_END(2); )
  246 + b_ptr = try_lcs(index, lcs, b_ptr, line1, count1, line2, count2);
  247 +
  248 + return index->has_common && index->max_chain_length < index->cnt;
  249 +}
  250 +
  251 +static void reduce_common_start_end(xpparam_t const *xpp, xdfenv_t *env,
  252 + int *line1, int *count1, int *line2, int *count2)
  253 +{
  254 + if (*count1 <= 1 || *count2 <= 1)
  255 + return;
  256 + while (*count1 > 1 && *count2 > 1 && CMP_ENV(xpp, env, 1, *line1, 2, *line2)) {
  257 + (*line1)++;
  258 + (*count1)--;
  259 + (*line2)++;
  260 + (*count2)--;
  261 + }
  262 + while (*count1 > 1 && *count2 > 1 && CMP_ENV(xpp, env, 1, LINE_END_PTR(1), 2, LINE_END_PTR(2))) {
  263 + (*count1)--;
  264 + (*count2)--;
  265 + }
  266 +}
  267 +
  268 +static int fall_back_to_classic_diff(struct histindex *index,
  269 + int line1, int count1, int line2, int count2)
  270 +{
  271 + xpparam_t xpp;
  272 + xpp.flags = index->xpp->flags & ~XDF_HISTOGRAM_DIFF;
  273 +
  274 + return xdl_fall_back_diff(index->env, &xpp,
  275 + line1, count1, line2, count2);
  276 +}
  277 +
  278 +static int histogram_diff(xpparam_t const *xpp, xdfenv_t *env,
  279 + int line1, int count1, int line2, int count2)
  280 +{
  281 + struct histindex index;
  282 + struct region lcs;
  283 + int sz;
  284 + int result = -1;
  285 +
  286 + if (count1 <= 0 && count2 <= 0)
  287 + return 0;
  288 +
  289 + if (LINE_END(1) >= MAX_PTR)
  290 + return -1;
  291 +
  292 + if (!count1) {
  293 + while(count2--)
  294 + env->xdf2.rchg[line2++ - 1] = 1;
  295 + return 0;
  296 + } else if (!count2) {
  297 + while(count1--)
  298 + env->xdf1.rchg[line1++ - 1] = 1;
  299 + return 0;
  300 + }
  301 +
  302 + memset(&index, 0, sizeof(index));
  303 +
  304 + index.env = env;
  305 + index.xpp = xpp;
  306 +
  307 + index.records = NULL;
  308 + index.line_map = NULL;
  309 + /* in case of early xdl_cha_free() */
  310 + index.rcha.head = NULL;
  311 +
  312 + index.table_bits = xdl_hashbits(count1);
  313 + sz = index.records_size = 1 << index.table_bits;
  314 + sz *= sizeof(struct record *);
  315 + if (!(index.records = (struct record **) xdl_malloc(sz)))
  316 + goto cleanup;
  317 + memset(index.records, 0, sz);
  318 +
  319 + sz = index.line_map_size = count1;
  320 + sz *= sizeof(struct record *);
  321 + if (!(index.line_map = (struct record **) xdl_malloc(sz)))
  322 + goto cleanup;
  323 + memset(index.line_map, 0, sz);
  324 +
  325 + sz = index.line_map_size;
  326 + sz *= sizeof(unsigned int);
  327 + if (!(index.next_ptrs = (unsigned int *) xdl_malloc(sz)))
  328 + goto cleanup;
  329 + memset(index.next_ptrs, 0, sz);
  330 +
  331 + /* lines / 4 + 1 comes from xprepare.c:xdl_prepare_ctx() */
  332 + if (xdl_cha_init(&index.rcha, sizeof(struct record), count1 / 4 + 1) < 0)
  333 + goto cleanup;
  334 +
  335 + index.ptr_shift = line1;
  336 + index.max_chain_length = 64;
  337 +
  338 + memset(&lcs, 0, sizeof(lcs));
  339 + if (find_lcs(&index, &lcs, line1, count1, line2, count2))
  340 + result = fall_back_to_classic_diff(&index, line1, count1, line2, count2);
  341 + else {
  342 + result = 0;
  343 + if (lcs.begin1 == 0 && lcs.begin2 == 0) {
  344 + int ptr;
  345 + for (ptr = 0; ptr < count1; ptr++)
  346 + env->xdf1.rchg[line1 + ptr - 1] = 1;
  347 + for (ptr = 0; ptr < count2; ptr++)
  348 + env->xdf2.rchg[line2 + ptr - 1] = 1;
  349 + } else {
  350 + result = histogram_diff(xpp, env,
  351 + line1, lcs.begin1 - line1,
  352 + line2, lcs.begin2 - line2);
  353 + result = histogram_diff(xpp, env,
  354 + lcs.end1 + 1, LINE_END(1) - lcs.end1,
  355 + lcs.end2 + 1, LINE_END(2) - lcs.end2);
  356 + result *= -1;
  357 + }
  358 + }
  359 +
  360 +cleanup:
  361 + xdl_free(index.records);
  362 + xdl_free(index.line_map);
  363 + xdl_free(index.next_ptrs);
  364 + xdl_cha_free(&index.rcha);
  365 +
  366 + return result;
  367 +}
  368 +
  369 +int xdl_do_histogram_diff(mmfile_t *file1, mmfile_t *file2,
  370 + xpparam_t const *xpp, xdfenv_t *env)
  371 +{
  372 + int line1, line2, count1, count2;
  373 +
  374 + if (xdl_prepare_env(file1, file2, xpp, env) < 0)
  375 + return -1;
  376 +
  377 + line1 = line2 = 1;
  378 + count1 = env->xdf1.nrec;
  379 + count2 = env->xdf2.nrec;
  380 +
  381 + reduce_common_start_end(xpp, env, &line1, &count1, &line2, &count2);
  382 +
  383 + return histogram_diff(xpp, env, line1, count1, line2, count2);
  384 +}

0 comments on commit 8c912ee

Please sign in to comment.
Something went wrong with that request. Please try again.