Skip to content

Commit

Permalink
join,uniq: support multi-byte separators
Browse files Browse the repository at this point in the history
* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth.  All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int.  All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code
will need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh
  • Loading branch information
eggert committed Oct 30, 2023
1 parent 2709bea commit 11b01fc
Show file tree
Hide file tree
Showing 14 changed files with 244 additions and 128 deletions.
5 changes: 5 additions & 0 deletions NEWS
Expand Up @@ -8,6 +8,11 @@ GNU coreutils NEWS -*- outline -*-
to preserve ownership" when copying to GNU/Linux CIFS file systems.
They do this by working around some Linux CIFS bugs.

join and uniq now support multi-byte characters better.
For example, 'join -tX' now works even if X is a multi-byte character,
and both programs now treat multi-byte characters like U+3000
IDEOGRAPHIC SPACE as blanks if the current locale treats them so.

numfmt options like --suffix no longer have an arbitrary 127-byte limit.
[bug introduced with numfmt in coreutils-8.21]

Expand Down
2 changes: 1 addition & 1 deletion bootstrap.conf
Expand Up @@ -70,7 +70,6 @@ gnulib_modules="
crypto/sha256
crypto/sha512
crypto/sm3
cu-ctype
cycle-check
d-ino
d-type
Expand Down Expand Up @@ -241,6 +240,7 @@ gnulib_modules="
settime
sig2str
sigaction
skipchars
smack
ssize_t
stat-macros
Expand Down
3 changes: 0 additions & 3 deletions gl/lib/cu-ctype.c

This file was deleted.

35 changes: 0 additions & 35 deletions gl/lib/cu-ctype.h

This file was deleted.

3 changes: 3 additions & 0 deletions gl/lib/skipchars.c
@@ -0,0 +1,3 @@
#include <config.h>
#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
#include <skipchars.h>
56 changes: 56 additions & 0 deletions gl/lib/skipchars.h
@@ -0,0 +1,56 @@
/* Skipping sequences of characters satisfying a predicate
Copyright 2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */

#include "mcel.h"

_GL_INLINE_HEADER_BEGIN
#ifndef SKIPCHARS_INLINE
# define SKIPCHARS_INLINE _GL_INLINE
#endif

/* Scan the NUL-terminated string STR one (possibly multi-byte)
   character or encoding error at a time, calling PREDICATE on each
   scanned element G.  Keep advancing while the predicate's result
   equals OK; stop at the first element where it differs, or at the
   terminating NUL.  Return the address where scanning stopped, with
   the const qualifier cast away.  */

SKIPCHARS_INLINE char *
skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
{
  char const *p = str;

  while (*p)
    {
      /* Decode the element starting at P; its length tells how far
         to step if it is accepted.  */
      mcel_t c = mcel_scanz (p);
      if (predicate (c) != ok)
        break;
      p += c.len;
    }

  return (char *) p;
}

/* Scan the buffer starting at BUF and ending just before LIM one
   (possibly multi-byte) character or encoding error at a time,
   calling PREDICATE on each scanned element G.  Keep advancing while
   the predicate's result equals OK; stop at the first element where
   it differs, or once LIM is reached.  Return the address where
   scanning stopped, with the const qualifier cast away.  */

SKIPCHARS_INLINE char *
skip_buf_matching (char const *buf, char const *lim,
                   bool (*predicate) (mcel_t), bool ok)
{
  char const *p = buf;

  while (p < lim)
    {
      /* Decode the element starting at P, looking no further than LIM.  */
      mcel_t c = mcel_scan (p, lim);
      if (predicate (c) != ok)
        break;
      p += c.len;
    }

  return (char *) p;
}

_GL_INLINE_HEADER_END
24 changes: 0 additions & 24 deletions gl/modules/cu-ctype

This file was deleted.

24 changes: 24 additions & 0 deletions gl/modules/skipchars
@@ -0,0 +1,24 @@
Description:
Skip sequences of multi-byte characters or encoding errors

Files:
lib/skipchars.c
lib/skipchars.h

Depends-on:
extern-inline
mcel

configure.ac:

Makefile.am:
lib_SOURCES += skipchars.c

Include:
"skipchars.h"

License:
GPL

Maintainer:
all
119 changes: 74 additions & 45 deletions src/join.c
Expand Up @@ -23,12 +23,13 @@

#include "system.h"
#include "assure.h"
#include "cu-ctype.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "linebuffer.h"
#include "mcel.h"
#include "memcasecmp.h"
#include "quote.h"
#include "skipchars.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
Expand Down Expand Up @@ -135,10 +136,14 @@ static struct outlist outlist_head;
/* Last element in 'outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;

/* Tab character separating fields. If negative, fields are separated
by any nonempty string of blanks, otherwise by exactly one
tab character whose value (when cast to unsigned char) equals TAB. */
static int tab = -1;
/* Tab character (or encoding error) separating fields. If TAB.len == 0,
fields are separated by any nonempty string of blanks, otherwise by
exactly one tab character (or encoding error) equal to TAB. */
static mcel_t tab;

/* The output separator to use, and its length in bytes. */
static char const *output_separator = " ";
static idx_t output_seplen = 1;

/* If nonzero, check that the input is correctly ordered. */
static enum
Expand Down Expand Up @@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
++(line->nfields);
}

/* Return true if G compares equal to the field separator TAB
   according to mcel_cmp.  */
static bool
eq_tab (mcel_t g)
{
  return !mcel_cmp (g, tab);
}

/* Return true if G's character is a newline, or if c32isblank
   reports it as a blank.  */
static bool
newline_or_blank (mcel_t g)
{
  if (g.ch == '\n')
    return true;
  return c32isblank (g.ch) != 0;
}

/* Fill in the 'fields' structure in LINE. */

static void
Expand All @@ -278,34 +295,29 @@ xfields (struct line *line)
if (ptr == lim)
return;

if (0 <= tab && tab != '\n')
{
char *sep;
for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
extract_field (line, ptr, sep - ptr);
}
else if (tab < 0)
if (!tab.len)
{
/* Skip leading blanks before the first field. */
while (field_sep (*ptr))
if (++ptr == lim)
return;

do
while (ptr < lim)
{
char *sep;
for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
continue;
ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
if (!*ptr)
break;
char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
extract_field (line, ptr, sep - ptr);
if (sep == lim)
return;
for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
continue;
ptr = sep;
}
while (ptr != lim);
}
else
{
if (tab.ch != '\n')
for (char *sep;
((sep = skip_buf_matching (ptr, lim, eq_tab, false))
< lim);
ptr = sep + mcel_scan (sep, lim).len)
extract_field (line, ptr, sep - ptr);

extract_field (line, ptr, lim - ptr);
extract_field (line, ptr, lim - ptr);
}
}

static void
Expand Down Expand Up @@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
{
idx_t i;
idx_t nfields = autoformat ? autocount : line->nfields;
char output_separator = tab < 0 ? ' ' : tab;

for (i = 0; i < join_field && i < nfields; ++i)
{
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
for (i = join_field + 1; i < nfields; ++i)
{
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
}
Expand All @@ -588,7 +599,6 @@ static void
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
char output_separator = tab < 0 ? ' ' : tab;
idx_t field;
struct line const *line;

Expand Down Expand Up @@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
o = o->next;
if (o == nullptr)
break;
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
}
putchar (eolchar);
}
Expand Down Expand Up @@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
}
}

/* Return true if G's character is a comma, or if c32isblank
   reports it as a blank.  */
static bool
comma_or_blank (mcel_t g)
{
  if (g.ch == ',')
    return true;
  return c32isblank (g.ch) != 0;
}

/* Add the comma or blank separated field spec(s) in STR to 'outlist'. */

static void
Expand All @@ -898,14 +914,17 @@ add_field_list (char *str)
int file_index;
idx_t field_index;
char const *spec_item = p;

p = strpbrk (p, ", \t");
if (p)
*p++ = '\0';
p = skip_str_matching (spec_item, comma_or_blank, false);
if (*p)
{
mcel_t g = mcel_scanz (p);
*p = '\0';
p += g.len;
}
decode_field_spec (spec_item, &file_index, &field_index);
add_field (file_index, field_index);
}
while (p);
while (*p);
}

/* Set the join field *VAR to VAL, but report an error if *VAR is set
Expand Down Expand Up @@ -1087,20 +1106,30 @@ main (int argc, char **argv)

case 't':
{
unsigned char newtab = optarg[0];
if (! newtab)
newtab = '\n'; /* '' => process the whole line. */
else if (optarg[1])
mcel_t newtab;
if (!*optarg)
{
/* '' => process the whole line. */
newtab = mcel_ch ('\n', 1);
/* output_separator does not matter. */
}
else if (STREQ (optarg, "\\0"))
{
newtab = mcel_ch ('\0', 1);
output_separator = "";
}
else
{
if (STREQ (optarg, "\\0"))
newtab = '\0';
else
newtab = mcel_scanz (optarg);
if (optarg[newtab.len])
error (EXIT_FAILURE, 0, _("multi-character tab %s"),
quote (optarg));
output_separator = optarg;
}
if (0 <= tab && tab != newtab)
if (tab.len && mcel_cmp (tab, newtab) != 0)
error (EXIT_FAILURE, 0, _("incompatible tabs"));
tab = newtab;
output_seplen = newtab.len;
}
break;

Expand Down

0 comments on commit 11b01fc

Please sign in to comment.