Skip to content

Commit

Permalink
Add LESSUTFCHARDEF.
Browse files Browse the repository at this point in the history
  • Loading branch information
gwsw committed Sep 25, 2022
1 parent 8c77e72 commit dc4fa8c
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 8 deletions.
4 changes: 4 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@

Major changes between "less" versions 608 and 609

* Add LESSUTFCHARDEF environment variable (github #275).

* Fix incorrect handling of some Private Use Unicode characters.

* Fix ANSI color bug when overstriking with colored chars (github #276).

* Fix compiler const warning (github #279).
Expand Down
132 changes: 126 additions & 6 deletions charset.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#endif

#include "charset.h"
#include "xbuf.h"

#if MSDOS_COMPILER==WIN32C
#define WIN32_LEAN_AND_MEAN
Expand Down Expand Up @@ -123,6 +124,118 @@ static char *binfmt = NULL;
static char *utfbinfmt = NULL;
public int binattr = AT_STANDOUT|AT_COLOR_BIN;

static struct xbuffer user_wide_array;
static struct xbuffer user_ubin_array;
static struct xbuffer user_compose_array;
static struct xbuffer user_prt_array;
static struct wchar_range_table user_wide_table;
static struct wchar_range_table user_ubin_table;
static struct wchar_range_table user_compose_table;
static struct wchar_range_table user_prt_table;

/*
* Set a wchar_range_table to the table in an xbuffer.
*/
static void
wchar_range_table_set(tbl, arr)
struct wchar_range_table *tbl;
struct xbuffer *arr;
{
tbl->table = (struct wchar_range *) arr->data;
tbl->count = arr->end / sizeof(struct wchar_range);
}

/*
 * Step over an optional "U" or "U+" (either case of the letter)
 * that may precede a hex codepoint.
 * Returns a pointer to the first character after the prefix,
 * or s itself if there is no prefix.
 */
static char *
skip_uprefix(s)
	char *s;
{
	/* No leading letter: there is nothing to skip. */
	if (*s != 'U' && *s != 'u')
		return s;
	/* Pass the letter, then a '+' only if one immediately follows. */
	s++;
	if (*s == '+')
		s++;
	return s;
}

/*
 * Read one codepoint, or two codepoints joined by a dash, from *ss.
 * Each codepoint is hexadecimal and may carry a "U" or "U+" prefix.
 * A lone codepoint produces a range whose first and last are equal.
 * On return *ss points just past the text that was consumed.
 */
static void
wchar_range_get(ss, range)
	char **ss;
	struct wchar_range *range;
{
	char *p = skip_uprefix(*ss);
	unsigned long first = lstrtoul(p, &p, 16);
	unsigned long last = first;

	if (*p == '-')
	{
		/* A second codepoint follows the dash. */
		p = skip_uprefix(p + 1);
		last = lstrtoul(p, &p, 16);
	}
	range->first = first;
	range->last = last;
	*ss = p;
}

/*
 * Build the user-defined character tables from the value of LESSUTFCHARDEF.
 *
 * The string is a list of "range:type" entries, each optionally followed
 * by a comma.  The type letter selects the table(s) the range is added to:
 *	b	binary
 *	c	composing
 *	w	wide (also recorded as printable)
 *	p or .	printable
 * A string that ends right after the colon is accepted and ends parsing.
 * A range whose last value is zero, a missing colon, or any other type
 * letter is reported as an error and less quits.
 * A NULL string leaves all four tables empty.
 */
static void
ichardef_utf(s)
	char *s;
{
	xbuf_init(&user_wide_array);
	xbuf_init(&user_ubin_array);
	xbuf_init(&user_compose_array);
	xbuf_init(&user_prt_array);

	while (s != NULL && *s != '\0')
	{
		struct wchar_range range;
		char sep;
		char type;

		wchar_range_get(&s, &range);
		if (range.last == 0)
		{
			error("invalid hex number(s) in LESSUTFCHARDEF", NULL_PARG);
			quit(QUIT_ERROR);
		}

		sep = *s++;
		if (sep != ':')
		{
			error("missing colon in LESSUTFCHARDEF", NULL_PARG);
			quit(QUIT_ERROR);
		}

		type = *s++;
		if (type == 'b')
		{
			xbuf_add_data(&user_ubin_array, (unsigned char *) &range, sizeof(range));
		} else if (type == 'c')
		{
			xbuf_add_data(&user_compose_array, (unsigned char *) &range, sizeof(range));
		} else if (type == 'w')
		{
			/* A wide character is also a printable one. */
			xbuf_add_data(&user_wide_array, (unsigned char *) &range, sizeof(range));
			xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
		} else if (type == 'p' || type == '.')
		{
			xbuf_add_data(&user_prt_array, (unsigned char *) &range, sizeof(range));
		} else if (type == '\0')
		{
			/* Back up onto the terminator so the loop ends. */
			s--;
		} else
		{
			error("invalid unicode attribute in LESSUTFCHARDEF", NULL_PARG);
			quit(QUIT_ERROR);
		}

		if (*s == ',')
			s++;
	}

	wchar_range_table_set(&user_wide_table, &user_wide_array);
	wchar_range_table_set(&user_ubin_table, &user_ubin_array);
	wchar_range_table_set(&user_compose_table, &user_compose_array);
	wchar_range_table_set(&user_prt_table, &user_prt_array);
}

/*
* Define a charset, given a description string.
Expand Down Expand Up @@ -327,6 +440,9 @@ set_charset(VOID_PARAM)
if (icharset("utf-8", 1))
return;
#endif

ichardef_utf(lgetenv("LESSUTFCHARDEF"));

/*
* See if environment variable LESSCHARSET is defined.
*/
Expand Down Expand Up @@ -767,7 +883,7 @@ is_in_table(ch, table)
int lo;

/* Binary search in the table. */
if (ch < table->table[0].first)
if (table->table == NULL || table->count == 0 || ch < table->table[0].first)
return 0;
lo = 0;
hi = table->count - 1;
Expand All @@ -792,7 +908,9 @@ is_in_table(ch, table)
is_composing_char(ch)
LWCHAR ch;
{
return is_in_table(ch, &compose_table) ||
if (is_in_table(ch, &user_prt_table)) return 0;
return is_in_table(ch, &user_compose_table) ||
is_in_table(ch, &compose_table) ||
(bs_mode != BS_CONTROL && is_in_table(ch, &fmt_table));
}

Expand All @@ -803,9 +921,10 @@ is_composing_char(ch)
is_ubin_char(ch)
LWCHAR ch;
{
int ubin = is_in_table(ch, &ubin_table) ||
(bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
return ubin;
if (is_in_table(ch, &user_prt_table)) return 0;
return is_in_table(ch, &user_ubin_table) ||
is_in_table(ch, &ubin_table) ||
(bs_mode == BS_CONTROL && is_in_table(ch, &fmt_table));
}

/*
Expand All @@ -815,7 +934,8 @@ is_ubin_char(ch)
is_wide_char(ch)
LWCHAR ch;
{
return is_in_table(ch, &wide_table);
return is_in_table(ch, &user_wide_table) ||
is_in_table(ch, &wide_table);
}

/*
Expand Down
1 change: 1 addition & 0 deletions less.h
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,7 @@ void linenumtoa LESSPARAMS ((LINENUM, char*));
void inttoa LESSPARAMS ((int, char*));
int lstrtoi LESSPARAMS ((char*, char**, int));
POSITION lstrtopos LESSPARAMS ((char*, char**, int));
unsigned long lstrtoul LESSPARAMS ((char*, char**, int));
#if MSDOS_COMPILER==WIN32C
int pclose LESSPARAMS ((FILE*));
#endif
40 changes: 39 additions & 1 deletion less.nro.VER
Original file line number Diff line number Diff line change
Expand Up @@ -1671,7 +1671,43 @@ octets of a complete but non-shortest form sequence, invalid octets,
and stray trailing octets)
are displayed individually using LESSBINFMT so as to facilitate diagnosis
of how the UTF-8 file is ill-formed.
.
.PP
When the character set is utf-8, in rare cases it may be desirable to
override the Unicode definition of the type of certain characters.
For example, characters in a Private Use Area are normally treated as control
characters, but if you are using a custom font with printable characters
in that range, it may be desirable to tell
.I less
to treat such characters as printable.
This can be done by setting the LESSUTFCHARDEF environment variable
to a comma-separated list of
.I "character type"
definitions.
Each character type definition consists of either one hexadecimal codepoint
or a pair of codepoints separated by a dash,
followed by a colon and a type character.
Each hexadecimal codepoint may optionally be preceded by a "U" or "U+".
If a pair of codepoints is given, the type is set for
all characters inclusively between the two values.
The type character may be one of:
.RS
.IP "p"
A normal printable character.
.IP "w"
A wide (2-space) printable character.
.IP "b"
A binary (non-printable) character.
.IP "c"
A composing (zero width) character.
.RE
.PP
For example, setting LESSUTFCHARDEF to
.nf
.sp
E000-F8FF:p,F0000-FFFFD:p,100000-10FFFD:p
.sp
.fi
would make all Private Use Area characters be treated as printable.
.SH "PROMPTS"
The \-P option allows you to tailor the prompt to your preference.
The string given to the \-P option replaces the specified prompt string.
Expand Down Expand Up @@ -2025,6 +2061,8 @@ See discussion under SECURITY.
String to be appended to a directory name in filename completion.
.IP LESSUTFBINFMT
Format for displaying non-printable Unicode code points.
.IP LESSUTFCHARDEF
Overrides the type of specified Unicode characters.
.IP LESS_IS_MORE
Emulate the
.IR more (1)
Expand Down
1 change: 1 addition & 0 deletions output.c
Original file line number Diff line number Diff line change
Expand Up @@ -522,6 +522,7 @@ type funcname(buf, ebuf, radix) \

STR_TO_TYPE_FUNC(lstrtopos, POSITION)
STR_TO_TYPE_FUNC(lstrtoi, int)
STR_TO_TYPE_FUNC(lstrtoul, unsigned long)

/*
* Output an integer in a given radix.
Expand Down
3 changes: 2 additions & 1 deletion version.c
Original file line number Diff line number Diff line change
Expand Up @@ -954,7 +954,8 @@ v606 7/17/22 Fix bug with multibyte chars and --incsearch;
escape filenames in LESSCLOSE; fix bin_file overrun.
v607 7/19/22 Update Unicode tables.
v608 7/22/22 Fix highlighting on colored text boundaries.
v609
v609 Add LESSUTFCHARDEF; fix overstrike color bug;
fix procfs bug; fix signal race.
*/

char version[] = "609x";

0 comments on commit dc4fa8c

Please sign in to comment.