Skip to content

Commit

Permalink
Merge branch 'rs/userdiff-multibyte-regex'
Browse files Browse the repository at this point in the history
The userdiff regexp patterns for various filetypes that are built
into the system have been updated to avoid triggering regexp errors
from UTF-8 aware regex engines.

* rs/userdiff-multibyte-regex:
  userdiff: support regexec(3) with multi-byte support
  • Loading branch information
gitster committed Apr 20, 2023
2 parents 667fcf4 + be39144 commit cbfe844
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 2 deletions.
4 changes: 4 additions & 0 deletions t/t4034-diff-words.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ test_language_driver () {
echo "* diff='"$lang"'" >.gitattributes &&
word_diff --color-words
'
test_expect_success "diff driver '$lang' in Islandic" '
LANG=is_IS.UTF-8 LANGUAGE=is LC_ALL="$is_IS_locale" \
word_diff --color-words
'
}

test_expect_success setup '
Expand Down
31 changes: 29 additions & 2 deletions userdiff.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ static int drivers_alloc;
.cflags = REG_EXTENDED, \
}, \
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
.word_regex_multi_byte = wrx "|[^[:space:]]", \
}
#define IPATTERN(lang, rx, wrx) { \
.name = lang, \
Expand All @@ -26,6 +27,7 @@ static int drivers_alloc;
.cflags = REG_EXTENDED | REG_ICASE, \
}, \
.word_regex = wrx "|[^[:space:]]|[\xc0-\xff][\x80-\xbf]+", \
.word_regex_multi_byte = wrx "|[^[:space:]]", \
}

/*
Expand Down Expand Up @@ -294,7 +296,7 @@ PATTERNS("scheme",
/* All other words should be delimited by spaces or parentheses */
"|([^][)(}{[ \t])+"),
PATTERNS("tex", "^(\\\\((sub)*section|chapter|part)\\*{0,1}\\{.*)$",
"\\\\[a-zA-Z@]+|\\\\.|[a-zA-Z0-9\x80-\xff]+"),
"\\\\[a-zA-Z@]+|\\\\.|([a-zA-Z0-9]|[^\x01-\x7f])+"),
{ "default", NULL, NULL, -1, { NULL, 0 } },
};
#undef PATTERNS
Expand Down Expand Up @@ -330,6 +332,25 @@ static int userdiff_find_by_namelen_cb(struct userdiff_driver *driver,
return 0;
}

/*
 * Probe whether regexec(3) matches a two-byte UTF-8 sequence as one
 * character: compile "[^[:space:]]", run it against "\xc2\xa3" and check
 * that the single match spans the whole sequence.
 *
 * The probe runs at most once; the outcome is cached in a function-local
 * static and returned directly on later calls.
 *
 * Returns 1 if the whole sequence was matched as one character, 0 otherwise.
 */
static int regexec_supports_multi_byte_chars(void)
{
	static const char pattern[] = "[^[:space:]]";
	static const char sample[] = "\xc2\xa3";
	static int cached = -1;

	if (cached == -1) {
		regex_t preg;
		regmatch_t m;

		/* The pattern is a constant, so failing to compile is a bug. */
		if (regcomp(&preg, pattern, REG_EXTENDED))
			BUG("invalid regular expression: %s", pattern);

		if (regexec(&preg, sample, 1, &m, 0))
			cached = 0;	/* no match at all */
		else
			/* must cover both bytes, not just the first one */
			cached = m.rm_so == 0 && m.rm_eo == strlen(sample);

		regfree(&preg);
	}
	return cached;
}

static struct userdiff_driver *userdiff_find_by_namelen(const char *name, size_t len)
{
struct find_by_namelen_data udcbdata = {
Expand Down Expand Up @@ -405,7 +426,13 @@ int userdiff_config(const char *k, const char *v)
/*
 * Look up the userdiff driver called "name".
 *
 * If the driver carries an alternative word regex for multi-byte aware
 * regex engines (word_regex_multi_byte), decide here which one to use:
 * when regexec(3) handles multi-byte characters itself, word_regex is
 * replaced by the multi-byte variant. The alternative is then cleared so
 * the decision is made only once per driver.
 *
 * Returns the driver, or NULL if the lookup found none.
 */
struct userdiff_driver *userdiff_find_by_name(const char *name)
{
	size_t len = strlen(name);
	struct userdiff_driver *driver = userdiff_find_by_namelen(name, len);

	if (driver && driver->word_regex_multi_byte) {
		if (regexec_supports_multi_byte_chars())
			driver->word_regex = driver->word_regex_multi_byte;
		/* Selection done; never reconsider for this driver. */
		driver->word_regex_multi_byte = NULL;
	}
	return driver;
}

struct userdiff_driver *userdiff_find_by_path(struct index_state *istate,
Expand Down
1 change: 1 addition & 0 deletions userdiff.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ struct userdiff_driver {
int binary;
struct userdiff_funcname funcname;
const char *word_regex;
const char *word_regex_multi_byte;
const char *textconv;
struct notes_cache *textconv_cache;
int textconv_want_cache;
Expand Down

0 comments on commit cbfe844

Please sign in to comment.