Skip to content

Commit 06d2280

Browse files
committed
paste: support multi-byte delimiters
* src/paste.c (collapse_escapes): This is the central --delimiters parsing function, so adjust to handle multi-byte chars with mcel_scanz(). Populate a delimiters length array to support characters of differing lengths.
(paste_serial): Use the delimiters length array to output the appropriate delimiter.
(paste_parallel): Likewise.
* tests/paste/multi-byte.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
1 parent e326d0d commit 06d2280

File tree

5 files changed

+211
-74
lines changed

5 files changed

+211
-74
lines changed

NEWS

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,8 @@ GNU coreutils NEWS -*- outline -*-
8181
'du' now processes directories with 10,000 or more entries up to 9 times
8282
faster on the Lustre file system.
8383

84+
'paste' now supports multi-byte --delimiters characters.
85+
8486
'pinky' will now exit immediately upon receiving a write error, which is
8587
significant when reading large plan or project files.
8688

src/paste.c

Lines changed: 104 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include <sys/types.h>
4343
#include "system.h"
4444
#include "fadvise.h"
45+
#include "mcel.h"
4546

4647
/* The official name of this program (e.g., no 'g' prefix). */
4748
#define PROGRAM_NAME "paste"
@@ -50,21 +51,23 @@
5051
proper_name ("David M. Ihnat"), \
5152
proper_name ("David MacKenzie")
5253

53-
/* Indicates that no delimiter should be added in the current position. */
54-
#define EMPTY_DELIM '\0'
55-
5654
/* If nonzero, we have read standard input at some point. */
5755
static bool have_read_stdin;
5856

5957
/* If nonzero, merge subsequent lines of each file rather than
6058
corresponding lines from each file in parallel. */
6159
static bool serial_merge;
6260

63-
/* The delimiters between lines of input files (used cyclically). */
61+
/* The delimiters between lines of input files (used cyclically).
62+
This stores the raw bytes of all delimiters concatenated. */
6463
static char *delims;
6564

66-
/* A pointer to the character after the end of 'delims'. */
67-
static char const *delim_end;
65+
/* Length of each delimiter in bytes (supports multi-byte characters).
66+
A length of 0 indicates no delimiter at this position (from \0 escape). */
67+
static size_t *delim_lens;
68+
69+
/* Number of delimiters. */
70+
static idx_t num_delims;
6871

6972
static unsigned char line_delim = '\n';
7073

@@ -78,10 +81,10 @@ static struct option const longopts[] =
7881
{nullptr, 0, nullptr, 0}
7982
};
8083

81-
/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting
82-
backslash representations of special characters in STRPTR to their actual
83-
values. The set of possible backslash characters has been expanded beyond
84-
that recognized by the Unix version.
84+
/* Set globals delims, delim_lens, and num_delims.
85+
Process STRPTR converting backslash representations of special characters
86+
to their actual values. The set of possible backslash characters has been
87+
expanded beyond that recognized by the Unix version.
8588
Return 0 upon success.
8689
If the string ends in an odd number of backslashes, ignore the
8790
final backslash and return nonzero. */
@@ -93,62 +96,65 @@ collapse_escapes (char const *strptr)
9396
bool backslash_at_end = false;
9497

9598
delims = strout;
99+
delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
100+
101+
char const *s = strptr;
102+
idx_t idx = 0;
96103

97-
while (*strptr)
104+
while (*s)
98105
{
99-
if (*strptr != '\\') /* Is it an escape character? */
100-
*strout++ = *strptr++; /* No, just transfer it. */
101-
else
106+
if (*s == '\\')
102107
{
103-
switch (*++strptr)
108+
s++;
109+
if (*s == '\0')
104110
{
105-
case '0':
106-
*strout++ = EMPTY_DELIM;
107-
break;
108-
109-
case 'b':
110-
*strout++ = '\b';
111-
break;
112-
113-
case 'f':
114-
*strout++ = '\f';
115-
break;
116-
117-
case 'n':
118-
*strout++ = '\n';
119-
break;
120-
121-
case 'r':
122-
*strout++ = '\r';
111+
backslash_at_end = true;
123112
break;
113+
}
114+
else if (*s == '0')
115+
{
116+
/* Empty delimiter at this position. */
117+
s++;
118+
delim_lens[idx++] = 0;
119+
}
120+
else
121+
{
122+
switch (*s)
123+
{
124+
case 'b': *strout++ = '\b'; break;
125+
case 'f': *strout++ = '\f'; break;
126+
case 'n': *strout++ = '\n'; break;
127+
case 'r': *strout++ = '\r'; break;
128+
case 't': *strout++ = '\t'; break;
129+
case 'v': *strout++ = '\v'; break;
130+
case '\\': *strout++ = '\\'; break;
131+
default: goto copy_character;
132+
}
124133

125-
case 't':
126-
*strout++ = '\t';
127-
break;
134+
s++;
135+
delim_lens[idx++] = 1;
136+
}
128137

129-
case 'v':
130-
*strout++ = '\v';
131-
break;
138+
continue;
139+
}
132140

133-
case '\\':
134-
*strout++ = '\\';
135-
break;
141+
copy_character:
142+
mcel_t g = mcel_scanz (s);
143+
strout = mempcpy (strout, s, g.len);
144+
s += g.len;
145+
delim_lens[idx++] = g.len;
146+
}
136147

137-
case '\0':
138-
backslash_at_end = true;
139-
goto done;
148+
*strout = '\0';
140149

141-
default:
142-
*strout++ = *strptr;
143-
break;
144-
}
145-
strptr++;
146-
}
150+
if (idx == 0)
151+
{
152+
delim_lens[0] = 0;
153+
idx = 1;
147154
}
148155

149-
done:
156+
num_delims = idx;
150157

151-
delim_end = strout;
152158
return backslash_at_end ? 1 : 0;
153159
}
154160

@@ -161,6 +167,16 @@ xputchar (char c)
161167
write_error ();
162168
}
163169

170+
/* Output the delimiter at DELIMPTR with length LEN.
171+
If LEN is 0, nothing is output (empty delimiter from \0 escape). */
172+
173+
static inline void
174+
output_delim (char const *delimptr, size_t len)
175+
{
176+
if (len > 0 && fwrite (delimptr, 1, len, stdout) != len)
177+
write_error ();
178+
}
179+
164180
/* Perform column paste on the NFILES files named in FNAMPTR.
165181
Return true if successful, false if one or more files could not be
166182
opened or read. */
@@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
171187
bool ok = true;
172188
/* If all files are just ready to be closed, or will be on this
173189
round, the string of delimiters must be preserved.
174-
delbuf[0] through delbuf[nfiles]
175-
store the delimiters for closed files. */
176-
char *delbuf = xmalloc (nfiles + 2);
190+
delbuf stores the delimiter bytes for closed files.
191+
Size it to hold up to (nfiles - 1) delimiters. */
192+
char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
177193

178194
/* Streams open to the files to process; null if the corresponding
179195
stream is closed. */
@@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
218234
{
219235
/* Set up for the next line. */
220236
bool somedone = false;
221-
char const *delimptr = delims;
222-
size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */
237+
idx_t delimidx = 0; /* Current delimiter index. */
238+
idx_t delimoff = 0; /* Current offset into delims. */
239+
idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */
223240

224241
for (size_t i = 0; i < nfiles && files_open; i++)
225242
{
@@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr)
292309
else
293310
{
294311
/* Closed file; add delimiter to 'delbuf'. */
295-
if (*delimptr != EMPTY_DELIM)
296-
delbuf[delims_saved++] = *delimptr;
297-
if (++delimptr == delim_end)
298-
delimptr = delims;
312+
size_t len = delim_lens[delimidx];
313+
if (len > 0)
314+
{
315+
memcpy (delbuf + delims_saved, delims + delimoff, len);
316+
delims_saved += len;
317+
}
318+
delimoff += len;
319+
if (++delimidx == num_delims)
320+
{
321+
delimidx = 0;
322+
delimoff = 0;
323+
}
299324
}
300325
}
301326
else
@@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr)
308333
{
309334
if (chr != line_delim && chr != EOF)
310335
xputchar (chr);
311-
if (*delimptr != EMPTY_DELIM)
312-
xputchar (*delimptr);
313-
if (++delimptr == delim_end)
314-
delimptr = delims;
336+
output_delim (delims + delimoff, delim_lens[delimidx]);
337+
delimoff += delim_lens[delimidx];
338+
if (++delimidx == num_delims)
339+
{
340+
delimidx = 0;
341+
delimoff = 0;
342+
}
315343
}
316344
else
317345
{
@@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr)
337365
{
338366
bool ok = true; /* false if open or read errors occur. */
339367
int charnew, charold; /* Current and previous char read. */
340-
char const *delimptr; /* Current delimiter char. */
341368
FILE *fileptr; /* Open for reading current file. */
342369

343370
for (; nfiles; nfiles--, fnamptr++)
@@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr)
361388
fadvise (fileptr, FADVISE_SEQUENTIAL);
362389
}
363390

364-
delimptr = delims; /* Set up for delimiter string. */
391+
idx_t delimidx = 0; /* Current delimiter index. */
392+
idx_t delimoff = 0; /* Current offset into delims. */
365393

366394
charold = getc (fileptr);
367395
saved_errno = errno;
@@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr)
378406
/* Process the old character. */
379407
if (charold == line_delim)
380408
{
381-
if (*delimptr != EMPTY_DELIM)
382-
xputchar (*delimptr);
383-
384-
if (++delimptr == delim_end)
385-
delimptr = delims;
409+
output_delim (delims + delimoff, delim_lens[delimidx]);
410+
delimoff += delim_lens[delimidx];
411+
if (++delimidx == num_delims)
412+
{
413+
delimidx = 0;
414+
delimoff = 0;
415+
}
386416
}
387417
else
388418
xputchar (charold);
@@ -520,6 +550,7 @@ main (int argc, char **argv)
520550
(nfiles, &argv[optind]));
521551

522552
free (delims);
553+
free (delim_lens);
523554

524555
if (have_read_stdin && fclose (stdin) == EOF)
525556
error (EXIT_FAILURE, errno, "-");

tests/local.mk

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,10 @@ all_tests = \
377377
tests/od/od-j.sh \
378378
tests/od/od-multiple-t.sh \
379379
tests/od/od-x8.sh \
380-
tests/misc/paste.pl \
381380
tests/misc/pathchk.sh \
382381
tests/misc/printenv.sh \
382+
tests/paste/paste.pl \
383+
tests/paste/multi-byte.sh \
383384
tests/printf/printf.sh \
384385
tests/printf/printf-cov.pl \
385386
tests/printf/printf-hex.sh \

0 commit comments

Comments
 (0)