Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

add & use Alexey Vatchenko's utf8-to-wchar routines

  • Loading branch information...
commit 6e2be7626fa5bcc02c41012362069c2d99198373 1 parent 2d26ed1
@juphoff juphoff authored
Showing with 427 additions and 13 deletions.
  1. +3 −3 util/Makefile
  2. +22 −10 util/fault.c
  3. +342 −0 util/utf8.c
  4. +60 −0 util/utf8.h
View
6 util/Makefile
@@ -65,7 +65,7 @@ include ../Makedefs
FAULT_DEBUG = -DDEBUG
CC += -g
-all: log.o config.o fault.o misc.o wc.o hash.o data.o euca_auth.o euca_axis.o ipc.o windows-bundle.o euca_rootwrap euca_mountwrap
+all: utf8.o log.o config.o fault.o misc.o wc.o hash.o data.o euca_auth.o euca_axis.o ipc.o windows-bundle.o euca_rootwrap euca_mountwrap
build: all
@@ -86,8 +86,8 @@ test_wc: wc.c misc.o log.o ../storage/diskutil.o ipc.o
# I plan to roll the test_fault main() back into fault.c:
# I only kept them separate to ensure fault.c could be used cleanly via extern
-test_fault: fault.c misc.o log.o wc.o ../storage/diskutil.o ipc.o
- $(CC) $(CFLAGS) $(INCLUDES) `xslt-config --cflags` $(FAULT_DEBUG) -D_UNIT_TEST -o test_fault fault.c misc.o log.o wc.o ../storage/diskutil.o ipc.o $(LIBS) $(LDFLAGS)
+test_fault: fault.c misc.o log.o wc.o ../storage/diskutil.o ipc.o utf8.o
+ $(CC) $(CFLAGS) $(INCLUDES) `xslt-config --cflags` $(FAULT_DEBUG) -D_UNIT_TEST -o test_fault fault.c misc.o log.o wc.o ../storage/diskutil.o ipc.o utf8.o $(LIBS) $(LDFLAGS)
../storage/diskutil.o:
make -C ../storage
View
32 util/fault.c
@@ -26,6 +26,7 @@
#include <stdio.h>
#include <stdlib.h>
#define _GNU_SOURCE
+#include <assert.h>
#include <dirent.h>
#include <errno.h>
#include <limits.h>
@@ -43,6 +44,7 @@
#include <misc.h>
#include <fault.h>
#include <wc.h>
+#include <utf8.h>
/*
* These definitions are all easily customized.
@@ -542,28 +544,39 @@ format_eucafault (const char *fault_id, const char_map **map)
// Determine alignment (but only once)
if (!max_label_len) {
for (int i = 0; fault_labels[i]; i++) {
- int this_label_len = 0;
+ int label_len = 0;
+ int w_label_len = 0;
+
char *label = get_common_var (fault_labels[i]);
- this_label_len = strlen (label);
+ label_len = strlen (label);
+
+ w_label_len = utf8_to_wchar (label, label_len, NULL, 0, 0);
free (label);
- if (this_label_len > max_label_len) {
- max_label_len = this_label_len;
+ if (w_label_len > max_label_len) {
+ max_label_len = w_label_len;
}
}
}
-
- // Now spit it out!
fprintf (logfile, "%s\n", STARS);
for (int i = 0; fault_labels[i]; i++) {
+ int w_common_var_len = 0;
+ int common_var_len = 0;
+ int padding = 0;
char *fault_var = NULL;
char *common_var = get_common_var (fault_labels[i]);
- fprintf (logfile, "%s %*s: ", BARS, max_label_len, common_var);
+
+ common_var_len = strlen (common_var);
+ w_common_var_len = utf8_to_wchar (common_var, common_var_len, NULL, 0,
+ 0);
+ padding = max_label_len - w_common_var_len + 1;
+ fprintf (logfile, "%s%*s %s: ", BARS, padding, " ", common_var);
free (common_var);
fault_var = get_fault_var (fault_labels[i], fault_node);
if (fault_var != NULL) {
char *fault_subbed = NULL;
+
if ((fault_subbed = c_varsub (fault_var, map)) != NULL) {
fprintf (logfile, "%s", fault_subbed);
} else {
@@ -577,7 +590,6 @@ format_eucafault (const char *fault_id, const char_map **map)
}
fprintf (logfile, "\n");
}
-
fprintf (logfile, "%s\n\n", STARS);
}
@@ -590,7 +602,7 @@ int
log_eucafault (char *fault_id, const char_map **map)
{
//va_list argv;
- char *token;
+ //char *token;
int count = 0;
initialize_eucafaults ();
@@ -635,7 +647,7 @@ main (int argc, char ** argv)
int dump = 0;
int opt;
- setlocale (LC_ALL, "en_US.utf-8");
+ //setlocale (LC_ALL, "en_US.utf-8");
while ((opt = getopt (argc, argv, "d")) != -1) {
switch (opt) {
View
342 util/utf8.c
@@ -0,0 +1,342 @@
+/*************************************************************************
+ * Copyright 2012 Eucalyptus Systems, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ *
+ * Please contact Eucalyptus Systems, Inc., 6755 Hollister Ave., Goleta
+ * CA 93117, USA or visit http://www.eucalyptus.com/licenses/ if you need
+ * additional information or have any questions.
+ *
+ * This file may incorporate work covered under the following copyright
+ * and permission notice:
+ *
+ * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ ************************************************************************/
+#include <arpa/inet.h> /* for htonl() */
+#include <sys/types.h>
+
+#include <wchar.h>
+
+#include "utf8.h"
+
+#define _NXT 0x80
+#define _SEQ2 0xc0
+#define _SEQ3 0xe0
+#define _SEQ4 0xf0
+#define _SEQ5 0xf8
+#define _SEQ6 0xfc
+
+#define _BOM 0xfeff
+
+static int __wchar_forbitten(wchar_t sym);
+static int __utf8_forbitten(u_char octet);
+
+static int
+__wchar_forbitten(wchar_t sym)
+{
+
+ /*
+  * Surrogate pairs: a symbol in the range 0xd800..0xdfff is reported
+  * as forbidden (-1); every other value is accepted (0).
+  */
+ if (sym >= 0xd800 && sym <= 0xdfff)
+ return (-1);
+
+ return (0);
+}
+
+static int
+__utf8_forbitten(u_char octet)
+{
+
+ switch (octet) { /* returns -1 for the octets listed here, 0 for any other */
+ case 0xc0:
+ case 0xc1:
+ case 0xf5:
+ case 0xff: /* NOTE(review): presumably the octets RFC3629 never allows
+             * in a UTF-8 stream; 0xf6..0xfe are not listed -- confirm
+             * whether that is intentional. */
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * DESCRIPTION
+ * This function translates UTF-8 string into UCS-4 string (all symbols
+ * will be in local machine byte order).
+ *
+ * It takes the following arguments:
+ * in - input UTF-8 string. It can be null-terminated.
+ * insize - size of input string in bytes.
+ * out - result buffer for UCS-4 string. If out is NULL,
+ * function returns size of result buffer.
+ * outsize - size of out buffer in wide characters.
+ * flags - 0, or a bitwise OR of:
+ *   UTF8_IGNORE_ERROR - instead of returning 0, skip a malformed
+ *     or truncated sequence one byte at a time and drop decoded
+ *     surrogate values from the output.
+ *   UTF8_SKIP_BOM - drop a decoded 0xfeff symbol from the output.
+ *
+ * RETURN VALUES
+ * The function returns size of result buffer (in wide characters).
+ * Zero is returned in case of error. Zero is also returned when in is
+ * NULL, insize is 0, out is non-NULL with outsize 0, or out has no
+ * space left for the next symbol.
+ *
+ * CAVEATS
+ * 1. If UTF-8 string contains zero symbols, they will be translated
+ * as regular symbols.
+ * 2. If UTF8_IGNORE_ERROR or UTF8_SKIP_BOM flag is set, sizes may vary
+ * when `out' is NULL and not NULL. It's because of special UTF-8
+ * sequences which may result in forbidden (by RFC3629) UNICODE
+ * characters. So, the caller must check return value every time and
+ * not prepare buffer in advance (\0 terminate) but after calling this
+ * function.
+ * (The surrogate and BOM checks below run on the decoded value, which
+ * is only computed when `out' is not NULL.)
+ */
+size_t
+utf8_to_wchar(const char *in, size_t insize, wchar_t *out, size_t outsize,
+ int flags)
+{
+ u_char *p, *lim;
+ wchar_t *wlim, high;
+ size_t n, total, i, n_bits;
+
+ if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
+ return (0);
+
+ total = 0;
+ p = (u_char *)in;
+ lim = p + insize; /* one past the last input byte */
+ wlim = out + outsize; /* one past the last output slot */
+
+ for (; p < lim; p += n) { /* n = bytes consumed by this iteration */
+ if (__utf8_forbitten(*p) != 0 &&
+ (flags & UTF8_IGNORE_ERROR) == 0)
+ return (0); /* with UTF8_IGNORE_ERROR set the octet is not
+              * skipped here; it falls through to the
+              * classification below */
+
+ /*
+ * Get number of bytes for one wide character.
+ * `high' receives the payload bits carried by the lead byte.
+ */
+ n = 1; /* default: 1 byte. Used when skipping bytes. */
+ if ((*p & 0x80) == 0)
+ high = (wchar_t)*p;
+ else if ((*p & 0xe0) == _SEQ2) {
+ n = 2;
+ high = (wchar_t)(*p & 0x1f);
+ } else if ((*p & 0xf0) == _SEQ3) {
+ n = 3;
+ high = (wchar_t)(*p & 0x0f);
+ } else if ((*p & 0xf8) == _SEQ4) {
+ n = 4;
+ high = (wchar_t)(*p & 0x07);
+ } else if ((*p & 0xfc) == _SEQ5) {
+ n = 5;
+ high = (wchar_t)(*p & 0x03);
+ } else if ((*p & 0xfe) == _SEQ6) {
+ n = 6;
+ high = (wchar_t)(*p & 0x01);
+ } else {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ continue; /* not a recognized lead byte: skip it (n is 1) */
+ }
+
+ /* does the sequence header tell us truth about length?
+  * i.e. are at least n bytes left before lim? */
+ if (lim - p <= n - 1) {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ n = 1; /* advance by the lead byte only */
+ continue; /* skip */
+ }
+
+ /*
+ * Validate sequence.
+ * All symbols must have higher bits set to 10xxxxxx
+ */
+ if (n > 1) {
+ for (i = 1; i < n; i++) {
+ if ((p[i] & 0xc0) != _NXT)
+ break;
+ }
+ if (i != n) { /* loop stopped early: bad continuation byte */
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ n = 1; /* advance by the lead byte only */
+ continue; /* skip */
+ }
+ }
+
+ total++;
+
+ if (out == NULL)
+ continue; /* counting only: nothing is decoded or checked */
+
+ if (out >= wlim)
+ return (0); /* no space left */
+
+ *out = 0;
+ n_bits = 0;
+ for (i = 1; i < n; i++) { /* walk continuation bytes last to first */
+ *out |= (wchar_t)(p[n - i] & 0x3f) << n_bits;
+ n_bits += 6; /* 6 low bits in every byte */
+ }
+ *out |= high << n_bits; /* lead-byte bits go above the rest */
+
+ if (__wchar_forbitten(*out) != 0) {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0); /* forbidden character */
+ else {
+ total--; /* do not count the dropped symbol */
+ out--; /* cancels the out++ below: slot is reused */
+ }
+ } else if (*out == _BOM && (flags & UTF8_SKIP_BOM) != 0) {
+ total--; /* do not count the dropped BOM */
+ out--; /* cancels the out++ below: slot is reused */
+ }
+
+ out++;
+ }
+
+ return (total);
+}
+
+/*
+ * DESCRIPTION
+ * This function translates UCS-4 symbols (given in local machine
+ * byte order) into UTF-8 string.
+ *
+ * It takes the following arguments:
+ * in - input unicode string. It can be null-terminated.
+ * insize - size of input string in wide characters.
+ * out - result buffer for utf8 string. If out is NULL,
+ * function returns size of result buffer.
+ * outsize - size of result buffer, in bytes.
+ * flags - 0, or a bitwise OR of:
+ *   UTF8_IGNORE_ERROR - instead of returning 0, skip input symbols
+ *     that are surrogates (0xd800..0xdfff) or negative.
+ *   UTF8_SKIP_BOM - produce no output for a 0xfeff input symbol.
+ *
+ * RETURN VALUES
+ * The function returns size of result buffer (in bytes). Zero is returned
+ * in case of error. Zero is also returned when in is NULL, insize is 0,
+ * out is non-NULL with outsize 0, or out has no space left for the next
+ * sequence.
+ *
+ * CAVEATS
+ * If UCS-4 string contains zero symbols, they will be translated
+ * as regular symbols.
+ */
+size_t
+wchar_to_utf8(const wchar_t *in, size_t insize, char *out, size_t outsize,
+ int flags)
+{
+ wchar_t *w, *wlim, ch;
+ u_char *p, *lim, *oc;
+ size_t total, n;
+
+ if (in == NULL || insize == 0 || (outsize == 0 && out != NULL))
+ return (0);
+
+ w = (wchar_t *)in;
+ wlim = w + insize; /* one past the last input symbol */
+ p = (u_char *)out;
+ lim = p + outsize; /* one past the last output byte */
+ total = 0;
+ for (; w < wlim; w++) {
+ if (__wchar_forbitten(*w) != 0) {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ else
+ continue; /* drop the surrogate, emit nothing */
+ }
+
+ if (*w == _BOM && (flags & UTF8_SKIP_BOM) != 0)
+ continue;
+
+ /* pick the sequence length n from the magnitude of the symbol */
+ if (*w < 0) {
+ if ((flags & UTF8_IGNORE_ERROR) == 0)
+ return (0);
+ continue;
+ } else if (*w <= 0x0000007f)
+ n = 1;
+ else if (*w <= 0x000007ff)
+ n = 2;
+ else if (*w <= 0x0000ffff)
+ n = 3;
+ else if (*w <= 0x001fffff)
+ n = 4;
+ else if (*w <= 0x03ffffff)
+ n = 5;
+ else /* if (*w <= 0x7fffffff) */
+ n = 6;
+
+ total += n;
+
+ if (out == NULL)
+ continue; /* counting only: nothing is written */
+
+ if (lim - p <= n - 1) /* fewer than n bytes remain in out */
+ return (0); /* no space left */
+
+ /* make it work under different endians:
+  * after htonl() oc[0] is the most significant byte and oc[3]
+  * the least significant one, whatever the host byte order.
+  * NOTE(review): indexing oc[0..3] assumes wchar_t is 4 bytes
+  * wide -- confirm for every target platform. */
+ ch = htonl(*w);
+ oc = (u_char *)&ch;
+ switch (n) { /* each case sets the trailing byte (low 6 bits) first
+               * and the lead byte p[0] last */
+ case 1:
+ *p = oc[3];
+ break;
+
+ case 2:
+ p[1] = _NXT | (oc[3] & 0x3f);
+ p[0] = _SEQ2 | (oc[3] >> 6) | ((oc[2] & 0x07) << 2);
+ break;
+
+ case 3:
+ p[2] = _NXT | (oc[3] & 0x3f);
+ p[1] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[0] = _SEQ3 | ((oc[2] & 0xf0) >> 4);
+ break;
+
+ case 4:
+ p[3] = _NXT | (oc[3] & 0x3f);
+ p[2] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[1] = _NXT | ((oc[2] & 0xf0) >> 4) |
+ ((oc[1] & 0x03) << 4);
+ p[0] = _SEQ4 | ((oc[1] & 0x1f) >> 2);
+ break;
+
+ case 5:
+ p[4] = _NXT | (oc[3] & 0x3f);
+ p[3] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[2] = _NXT | ((oc[2] & 0xf0) >> 4) |
+ ((oc[1] & 0x03) << 4);
+ p[1] = _NXT | (oc[1] >> 2);
+ p[0] = _SEQ5 | (oc[0] & 0x03);
+ break;
+
+ case 6:
+ p[5] = _NXT | (oc[3] & 0x3f);
+ p[4] = _NXT | (oc[3] >> 6) | ((oc[2] & 0x0f) << 2);
+ p[3] = _NXT | (oc[2] >> 4) | ((oc[1] & 0x03) << 4);
+ p[2] = _NXT | (oc[1] >> 2);
+ p[1] = _NXT | (oc[0] & 0x3f);
+ p[0] = _SEQ6 | ((oc[0] & 0x40) >> 6);
+ break;
+ }
+
+ /*
+ * NOTE: do not check here for forbidden UTF-8 characters.
+ * They cannot appear here because we do proper conversion.
+ */
+
+ p += n;
+ }
+
+ return (total);
+}
View
60 util/utf8.h
@@ -0,0 +1,60 @@
+/*************************************************************************
+ * Copyright 2012 Eucalyptus Systems, Inc.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 3 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see http://www.gnu.org/licenses/.
+ *
+ * Please contact Eucalyptus Systems, Inc., 6755 Hollister Ave., Goleta
+ * CA 93117, USA or visit http://www.eucalyptus.com/licenses/ if you need
+ * additional information or have any questions.
+ *
+ * This file may incorporate work covered under the following copyright
+ * and permission notice:
+ *
+ * Copyright (c) 2007 Alexey Vatchenko <av@bsdua.org>
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ ************************************************************************/
+
+/*
+ * utf8: implementation of UTF-8 charset encoding (RFC3629).
+ */
+#ifndef _UTF8_H_
+#define _UTF8_H_
+
+#include <sys/types.h>
+
+#include <wchar.h>
+
+#define UTF8_IGNORE_ERROR 0x01 /* skip invalid input instead of returning 0 */
+#define UTF8_SKIP_BOM 0x02 /* drop the 0xfeff byte-order mark symbol */
+
+__BEGIN_DECLS
+
+size_t utf8_to_wchar(const char *in, size_t insize, wchar_t *out,
+ size_t outsize, int flags);
+size_t wchar_to_utf8(const wchar_t *in, size_t insize, char *out,
+ size_t outsize, int flags);
+
+__END_DECLS
+
+#endif /* !_UTF8_H_ */
Please sign in to comment.
Something went wrong with that request. Please try again.