From dc4fa8c8c47dce999b9fdbd841f16b503b7d8632 Mon Sep 17 00:00:00 2001 From: Mark Nudelman Date: Sat, 24 Sep 2022 20:19:29 -0700 Subject: [PATCH] Add LESSUTFCHARDEF. --- NEWS | 4 ++ charset.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++--- less.h | 1 + less.nro.VER | 40 +++++++++++++++- output.c | 1 + version.c | 3 +- 6 files changed, 173 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index fa7b9eb3..176bbf91 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,10 @@ Major changes between "less" versions 608 and 609 +* Add LESSUTFCHARDEF environment variable (github #275). + +* Fix incorrect handling of some Private Use Unicode characters. + * Fix ANSI color bug when overstriking with colored chars (github #276). * Fix compiler const warning (github #279). diff --git a/charset.c b/charset.c index 5e9a2d64..93e9d532 100644 --- a/charset.c +++ b/charset.c @@ -21,6 +21,7 @@ #endif #include "charset.h" +#include "xbuf.h" #if MSDOS_COMPILER==WIN32C #define WIN32_LEAN_AND_MEAN @@ -123,6 +124,118 @@ static char *binfmt = NULL; static char *utfbinfmt = NULL; public int binattr = AT_STANDOUT|AT_COLOR_BIN; +static struct xbuffer user_wide_array; +static struct xbuffer user_ubin_array; +static struct xbuffer user_compose_array; +static struct xbuffer user_prt_array; +static struct wchar_range_table user_wide_table; +static struct wchar_range_table user_ubin_table; +static struct wchar_range_table user_compose_table; +static struct wchar_range_table user_prt_table; + +/* + * Set a wchar_range_table to the table in an xbuffer. + */ + static void +wchar_range_table_set(tbl, arr) + struct wchar_range_table *tbl; + struct xbuffer *arr; +{ + tbl->table = (struct wchar_range *) arr->data; + tbl->count = arr->end / sizeof(struct wchar_range); +} + +/* + * Skip over a "U" or "U+" prefix before a hex codepoint. 
+ */ + static char * +skip_uprefix(s) + char *s; +{ + if (*s == 'U' || *s == 'u') + if (*++s == '+') ++s; + return s; +} + +/* + * Parse a dash-separated range of hex values. + */ + static void +wchar_range_get(ss, range) + char **ss; + struct wchar_range *range; +{ + char *s = skip_uprefix(*ss); + range->first = lstrtoul(s, &s, 16); + if (s[0] == '-') + { + s = skip_uprefix(&s[1]); + range->last = lstrtoul(s, &s, 16); + } else + { + range->last = range->first; + } + *ss = s; +} + +/* + * Parse the LESSUTFCHARDEF variable. + */ + static void +ichardef_utf(s) + char *s; +{ + xbuf_init(&user_wide_array); + xbuf_init(&user_ubin_array); + xbuf_init(&user_compose_array); + xbuf_init(&user_prt_array); + + if (s != NULL) + { + while (s[0] != '\0') + { + struct wchar_range range; + wchar_range_get(&s, &range); + if (range.last == 0) + { + error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG); + quit(QUIT_ERROR); + } + if (*s++ != ':') + { + error("missing colon in LESSUTFCHARDEF", NULL_PARG); + quit(QUIT_ERROR); + } + switch (*s++) + { + case 'b': + xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range)); + break; + case 'c': + xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range)); + break; + case 'w': + xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range)); + xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range)); + break; + case 'p': case '.': + xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range)); + break; + case '\0': + s--; + break; + default: + error("invalid unicode attribute in LESSUTFCHARDEF", NULL_PARG); + quit(QUIT_ERROR); + } + if (s[0] == ',') ++s; + } + } + wchar_range_table_set(&user_wide_table, &user_wide_array); + wchar_range_table_set(&user_ubin_table, &user_ubin_array); + wchar_range_table_set(&user_compose_table, &user_compose_array); + wchar_range_table_set(&user_prt_table, &user_prt_array); +} /* * Define a charset, given a description string. 
@@ -327,6 +440,9 @@ set_charset(VOID_PARAM) if (icharset("utf-8", 1)) return; #endif + + ichardef_utf(lgetenv("LESSUTFCHARDEF")); + /* * See if environment variable LESSCHARSET is defined. */ @@ -767,7 +883,7 @@ is_in_table(ch, table) int lo; /* Binary search in the table. */ - if (ch < table->table[0].first) + if (table->table == NULL || table->count == 0 || ch < table->table[0].first) return 0; lo = 0; hi = table->count - 1; @@ -792,7 +908,9 @@ is_in_table(ch, table) is_composing_char(ch) LWCHAR ch; { - return is_in_table(ch, &compose_table) || + if (is_in_table(ch, &user_prt_table)) return 0; + return is_in_table(ch, &user_compose_table) || + is_in_table(ch, &compose_table) || (bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table)); } @@ -803,9 +921,10 @@ is_composing_char(ch) is_ubin_char(ch) LWCHAR ch; { - int ubin = is_in_table(ch, &ubin_table) || - (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); - return ubin; + if (is_in_table(ch, &user_prt_table)) return 0; + return is_in_table(ch, &user_ubin_table) || + is_in_table(ch, &ubin_table) || + (bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table)); } /* @@ -815,7 +934,8 @@ is_ubin_char(ch) is_wide_char(ch) LWCHAR ch; { - return is_in_table(ch, &wide_table); + return is_in_table(ch, &user_wide_table) || + is_in_table(ch, &wide_table); } /* diff --git a/less.h b/less.h index 2f880eb1..222a7114 100644 --- a/less.h +++ b/less.h @@ -582,6 +582,7 @@ void linenumtoa LESSPARAMS ((LINENUM, char*)); void inttoa LESSPARAMS ((int, char*)); int lstrtoi LESSPARAMS ((char*, char**, int)); POSITION lstrtopos LESSPARAMS ((char*, char**, int)); +unsigned long lstrtoul LESSPARAMS ((char*, char**, int)); #if MSDOS_COMPILER==WIN32C int pclose LESSPARAMS ((FILE*)); #endif diff --git a/less.nro.VER b/less.nro.VER index 330de31e..22ebd235 100644 --- a/less.nro.VER +++ b/less.nro.VER @@ -1671,7 +1671,43 @@ octets of a complete but non-shortest form sequence, invalid octets, and stray trailing octets) are displayed individually 
using LESSBINFMT so as to facilitate diagnostic of how the UTF-8 file is ill-formed. -. +.PP +When the character set is utf-8, in rare cases it may be desirable to +override the Unicode definition of the type of certain characters. +For example, characters in a Private Use Area are normally treated as control +characters, but if you are using a custom font with printable characters +in that range, it may be desirable to tell +.I less +to treat such characters as printable. +This can be done by setting the LESSUTFCHARDEF environment variable +to a comma-separated list of +.I "character type" +definitions. +Each character type definition consists of either one hexadecimal codepoint +or a pair of codepoints separated by a dash, +followed by a colon and a type character. +Each hexadecimal codepoint may optionally be preceded by a "U" or "U+". +If a pair of codepoints is given, the type is set for +all characters between the two values, inclusive. +The type character may be one of: +.RS +.IP "p" +A normal printable character. +.IP "w" +A wide (2-space) printable character. +.IP "b" +A binary (non-printable) character. +.IP "c" +A composing (zero-width) character. +.RE +.PP +For example, setting LESSUTFCHARDEF to +.nf +.sp + E000-F8FF:p,F0000-FFFFD:p,100000-10FFFD:p +.sp +.fi +would cause all Private Use Area characters to be treated as printable. .SH "PROMPTS" The \-P option allows you to tailor the prompt to your preference. The string given to the \-P option replaces the specified prompt string. @@ -2025,6 +2061,8 @@ See discussion under SECURITY. String to be appended to a directory name in filename completion. .IP LESSUTFBINFMT Format for displaying non-printable Unicode code points. +.IP LESSUTFCHARDEF +Overrides the type of specified Unicode characters. 
.IP LESS_IS_MORE Emulate the .IR more (1) diff --git a/output.c b/output.c index 0017c525..175d13ef 100644 --- a/output.c +++ b/output.c @@ -522,6 +522,7 @@ type funcname(buf, ebuf, radix) \ STR_TO_TYPE_FUNC(lstrtopos, POSITION) STR_TO_TYPE_FUNC(lstrtoi, int) +STR_TO_TYPE_FUNC(lstrtoul, unsigned long) /* * Output an integer in a given radix. diff --git a/version.c b/version.c index 3c4d7220..0529dfdc 100644 --- a/version.c +++ b/version.c @@ -954,7 +954,8 @@ v606 7/17/22 Fix bug with multibyte chars and --incsearch; escape filenames in LESSCLOSE; fix bin_file overrun. v607 7/19/22 Update Unicode tables. v608 7/22/22 Fix highlighting on colored text boundaries. -v609 +v609 Add LESSUTFCHARDEF; fix overstrike color bug; + fix procfs bug; fix signal race. */ char version[] = "609x";