Skip to content

Commit

Permalink
Fix #252: Make EncodingUtil platform-independent.
Browse files Browse the repository at this point in the history
With this CL, EncodingUtil::SJISToUTF8 no longer depends on any
platform-dependent API, including iconv and Win32 APIs.

Keep in mind that EncodingUtil::SJISToUTF8 is mainly used for importing
MS-IME user dictionaries.  Hence we want EncodingUtil::SJISToUTF8 to
be compatible with the behavior on Windows.

This also closes #27 because we no longer depend on iconv.

BUG=#27,#252
TEST=unittest
REF_BUG=19010851
REF_CL=91370456,107291445
  • Loading branch information
Noriyuki Takahashi authored and yukawa committed Nov 15, 2015
1 parent f2cc056 commit 766685b
Show file tree
Hide file tree
Showing 5 changed files with 11,537 additions and 130 deletions.
211 changes: 95 additions & 116 deletions src/gui/base/encoding_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,145 +29,124 @@

#include "gui/base/encoding_util.h"

// No implementation for Android.
#ifndef OS_ANDROID
#include "base/port.h"
#include "base/string_piece.h"
#include "base/util.h"

#ifndef OS_WIN
#include <iconv.h>
#include <algorithm>
#else
#include <windows.h>
#include <memory>
#endif
namespace mozc {
namespace {

#include <string>
#include "base/logging.h"
#include "gui/base/sjis_to_ucs2_table.h"

#ifdef OS_WIN
using std::unique_ptr;
#endif // OS_WIN
// Each character of SJIS is encoded in one or two bytes.
//
// For first byte, there are 4 valid ranges (closed intervals):
// * FirstByteRange1: [0x00, 0x80]
// * FirstByteRange2: [0x81, 0x9F]
// * FirstByteRange3: [0xA1, 0xDF]
// * FirstByteRange4: [0xE0, 0xFF]
// Ranges 2 and 4 are for two bytes encoding, so one more byte is needed to
// decode a character.
//
// For second byte, there are 2 valid ranges (closed intervals):
// * SecondByteRange1: [0x40, 0x7E]
// * SecondByteRange2: [0x80, 0xFF]
// Two byte characters are decoded using the conversion table defined in
// sjis_to_ucs2_table.h.
// Returns true if |byte| lies in FirstByteRange1, the closed interval
// [0x00, 0x80].  Only the upper bound is tested because |byte| is unsigned.
inline bool IsInFirstByteRange1(uint8_t byte) {
  const uint8_t kUpperBound = 0x80;
  return !(byte > kUpperBound);
}

namespace {
// Returns true if |byte| lies in FirstByteRange2, the closed interval
// [0x81, 0x9F].
inline bool IsInFirstByteRange2(uint8_t byte) {
  if (byte < 0x81) {
    return false;
  }
  return byte <= 0x9F;
}

#ifndef OS_WIN

bool IconvHelper(iconv_t ic, const string &input, string *output) {
size_t ilen = input.size();
size_t olen = ilen * 4;
string tmp;
tmp.reserve(olen);
char *ibuf = const_cast<char *>(input.data());
char *obuf_org = const_cast<char *>(tmp.data());
char *obuf = obuf_org;
std::fill(obuf, obuf + olen, 0);
size_t olen_org = olen;
iconv(ic, 0, &ilen, 0, &olen); // reset iconv state
while (ilen != 0) {
if (iconv(ic, reinterpret_cast<char **>(&ibuf), &ilen, &obuf, &olen)
== static_cast<size_t>(-1)) {
return false;
}
}
output->assign(obuf_org, olen_org - olen);
return true;
// Returns true if |byte| lies in FirstByteRange3, the closed interval
// [0xA1, 0xDF].
inline bool IsInFirstByteRange3(uint8_t byte) {
  const bool below = byte < 0xA1;
  const bool above = byte > 0xDF;
  return !(below || above);
}

inline bool Convert(const char *from, const char *to,
const string &input, string *output) {
iconv_t ic = iconv_open(to, from); // note the order
if (ic == reinterpret_cast<iconv_t>(-1)) {
LOG(WARNING) << "iconv_open failed";
*output = input;
return false;
}
bool result = IconvHelper(ic, input, output);
iconv_close(ic);
return result;
// Returns true if |byte| lies in FirstByteRange4, the closed interval
// [0xE0, 0xFF].  Only the lower bound is tested because 0xFF is the largest
// value a uint8_t can hold.
inline bool IsInFirstByteRange4(uint8_t byte) {
  const uint8_t kLowerBound = 0xE0;
  return !(byte < kLowerBound);
}
#else
// Returns the code-page identifier for the specified encoding string.
// This function scans a list of mappings from an encoding name to a
// code-page identifier of Windows according to the encoding name.
// If the given encoding string does not have any matching code-page
// identifiers, this function returns 0.
// To add a mapping from an encoding name to its code-page identifier:
// 1. Read the list of code-page identifiers supported by Windows (*1), and;
// 2. Find a code-page identifier matching to the encoding name:
// (*1) "http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx".
int GetCodepage(const char* name) {
static const struct {
const char* name;
int codepage;
} kCodePageMap[] = {
{ "UTF8", CP_UTF8 }, // Unicode UTF-8
{ "SJIS", 932 }, // ANSI/OEM - Japanese, Shift-JIS
};

for (size_t i = 0; i < arraysize(kCodePageMap); i++) {
if (strcmp(kCodePageMap[i].name, name) == 0) {
return kCodePageMap[i].codepage;
}
}
return 0;

// Returns true if |byte| lies in SecondByteRange1, the closed interval
// [0x40, 0x7E].
inline bool IsInSecondByteRange1(uint8_t byte) {
  if (byte > 0x7E) {
    return false;
  }
  return byte >= 0x40;
}

// Converts the encoding of the specified string.
// This function firstly converts the source string to create a temporary
// UTF-16 string, and encodes the UTF-16 string with the destination encoding.
inline bool Convert(const char *from, const char *to,
const string &input, string *output) {
const int codepage_from = GetCodepage(from);
const int codepage_to = GetCodepage(to);
if (codepage_from == 0 || codepage_to == 0) {
return false;
}
// Returns true if |byte| lies in SecondByteRange2, the closed interval
// [0x80, 0xFF].  Only the lower bound is tested because 0xFF is the largest
// value a uint8_t can hold.
inline bool IsInSecondByteRange2(uint8_t byte) {
  const uint8_t kLowerBound = 0x80;
  return !(byte < kLowerBound);
}

const int wide_length = MultiByteToWideChar(codepage_from, 0, input.c_str(),
-1, nullptr, 0);
if (wide_length == 0) {
return false;
// Maps a two-byte SJIS sequence (|first|, |second|) to a position in a flat,
// row-major table.
//
// The row is the rank of |first| within FirstByteRange2 followed by
// FirstByteRange4; the column is the rank of |second| within SecondByteRange1
// followed by SecondByteRange2.  A byte that belongs to none of the expected
// ranges contributes 0 to its coordinate, so validating the bytes beforehand
// is the caller's job.
size_t ComputeIndex(uint8_t first, uint8_t second) {
  // Number of values in each closed interval used below.
  const size_t kFirstRange2Size = 0x9F - 0x81 + 1;
  const size_t kSecondRange1Size = 0x7E - 0x40 + 1;
  const size_t kSecondRange2Size = 0xFF - 0x80 + 1;
  // One row holds every possible second byte.
  const size_t kRowWidth = kSecondRange1Size + kSecondRange2Size;

  size_t row = 0;
  if (IsInFirstByteRange2(first)) {
    row = first - 0x81;
  } else if (IsInFirstByteRange4(first)) {
    // Rows for FirstByteRange4 come right after those for FirstByteRange2.
    row = kFirstRange2Size + (first - 0xE0);
  }

  size_t column = 0;
  if (IsInSecondByteRange1(second)) {
    column = second - 0x40;
  } else if (IsInSecondByteRange2(second)) {
    // Columns for SecondByteRange2 come right after those for
    // SecondByteRange1.
    column = kSecondRange1Size + (second - 0x80);
  }

  return row * kRowWidth + column;
}

unique_ptr<char[]> multibyte(new char[output_length + 1]);
if (multibyte.get() == nullptr) {
return false;
}
// Decodes |input| as Shift_JIS and appends the UTF-8 encoding of each decoded
// character to |output|.
//
// Returns true if every byte of |input| was consumed as part of a valid
// character.  Returns false on an invalid first byte, an invalid second byte,
// a two-byte sequence that has no usable table entry, or input that ends in
// the middle of a two-byte sequence.  On failure |output| may already hold the
// characters decoded before the error; discarding them is up to the caller.
bool SJISToUTF8Internal(StringPiece input, string* output) {
  // Number of entries in the conversion table.  sizeof() on the array alone
  // yields its size in bytes, which overstates the valid index range whenever
  // an entry is wider than one byte, so divide by the size of one entry.
  const size_t kTableSize =
      sizeof(kSJISToUCS2Table) / sizeof(kSJISToUCS2Table[0]);

  bool expect_first_byte = true;
  uint8_t first_byte = 0;
  for (const char c : input) {
    const uint8_t byte = static_cast<uint8_t>(c);

    if (expect_first_byte) {
      if (IsInFirstByteRange1(byte)) {
        // Single-byte character: the byte value itself is the code point.
        Util::UCS4ToUTF8Append(byte, output);
      } else if (IsInFirstByteRange3(byte)) {
        // Single-byte character in [0xA1, 0xDF]: adding 0xFEC0 maps it onto
        // U+FF61..U+FF9F.
        Util::UCS4ToUTF8Append(byte + 0xFEC0, output);
      } else if (IsInFirstByteRange2(byte) || IsInFirstByteRange4(byte)) {
        // Lead byte of a two-byte character: remember it and decode once the
        // next byte arrives.
        first_byte = byte;
        expect_first_byte = false;
      } else {
        return false;  // Invalid first byte.
      }
      continue;
    }

    if (!IsInSecondByteRange1(byte) && !IsInSecondByteRange2(byte)) {
      return false;  // Invalid second byte.
    }
    const size_t index = ComputeIndex(first_byte, byte);
    if (index >= kTableSize) {
      return false;  // The sequence falls outside the conversion table.
    }
    const uint16_t ucs2 = kSJISToUCS2Table[index];
    if (ucs2 == 0) {
      return false;  // A zero entry is treated as "no mapping".
    }
    Util::UCS4ToUTF8Append(ucs2, output);
    expect_first_byte = true;
  }

  // Still waiting for a second byte here means the input was truncated in the
  // middle of a two-byte character.
  return expect_first_byte;
}

#endif
} // namespace

namespace mozc {

// Converts the Shift_JIS string |input| to UTF-8 and stores it in |output|.
// Any previous content of |output| is discarded.  If |input| cannot be
// decoded, |output| is left empty rather than holding a partial result.
void EncodingUtil::SJISToUTF8(const string &input, string *output) {
  output->clear();
  const bool converted = SJISToUTF8Internal(input, output);
  if (converted) {
    return;
  }
  // Throw away whatever was appended before the decoder gave up.
  output->clear();
}

} // namespace mozc

#endif // OS_ANDROID
128 changes: 124 additions & 4 deletions src/gui/base/encoding_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,23 +29,143 @@

#include "gui/base/encoding_util.h"

#ifdef OS_WIN
#include <windows.h>
#include <codecvt>
#include <cstring>
#include <memory>
#endif // OS_WIN

#include <string>

#include "base/logging.h"
#include "base/port.h"
#ifdef OS_WIN
#include "base/string_piece.h"
#endif // OS_WIN
#include "testing/base/public/gunit.h"

namespace mozc {
namespace {

#ifdef OS_ANDROID
// At the moment, encoding is not the target of build for Android.
#else
#ifdef OS_WIN

// Reference SJIS -> UTF-8 converter built on the Win32 API, used to check
// EncodingUtil against the platform's own code page 932 conversion.
//
// Stores the converted text in |output| and returns true on success.  Returns
// false if MultiByteToWideChar() reports failure; MB_ERR_INVALID_CHARS is
// passed to both calls.  An empty |input| yields an empty |output| and true.
bool Convert(StringPiece input, string* output) {
  const int kCodePageSJIS = 932;

  output->clear();
  if (input.empty()) {
    return true;
  }

  // First pass: no destination buffer, so the call only reports how many wide
  // characters the conversion needs.
  const int required = MultiByteToWideChar(
      kCodePageSJIS, MB_ERR_INVALID_CHARS, input.data(), input.size(),
      nullptr, 0);
  if (required == 0) {
    return false;
  }

  // Second pass: perform the conversion and make sure it produced exactly the
  // number of characters announced by the first pass.
  unique_ptr<wchar_t[]> buffer(new wchar_t[required + 1]);
  const int written = MultiByteToWideChar(
      kCodePageSJIS, MB_ERR_INVALID_CHARS, input.data(), input.size(),
      buffer.get(), required + 1);
  if (written != required) {
    return false;
  }

  const wchar_t* const begin = buffer.get();
  const wchar_t* const end = begin + required;
  std::wstring_convert<std::codecvt_utf8<wchar_t>> utf8_encoder;
  *output = utf8_encoder.to_bytes(begin, end);
  return true;
}

// Checks that EncodingUtil::SJISToUTF8 produces the same UTF-8 output as the
// Convert() helper for a handful of valid SJIS byte strings.
TEST(EncodingUtilTest, CompareToWinAPI) {
// Each entry is an SJIS-encoded byte string; the comment above it shows the
// text it encodes.
const char* kTestCases[] = {
// "私の名前はGoogleです。" ("My name is Google.")
"\x8E\x84\x82\xCC\x96\xBC\x91\x4F\x82\xCD\x47\x6F\x6F\x67\x6C\x65"
"\x82\xC5\x82\xB7\x81\x42",
// "今日はとても良い天気です。" ("The weather is very nice today.")
"\x8D\xA1\x93\xFA\x82\xCD\x82\xC6\x82\xC4\x82\xE0\x97\xC7\x82\xA2"
"\x93\x56\x8B\x43\x82\xC5\x82\xB7\x81\x42",
// "This is a test for SJIS." (single-byte characters only)
"\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
"\x6F\x72\x20\x53\x4A\x49\x53\x2E",
// "あいうえおアイウエオアイウエオ": hiragana and katakana "a i u e o" as
// two-byte characters, followed by the single bytes 0xB1..0xB5.
"\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8\x83\x41\x83\x43\x83\x45"
"\x83\x47\x83\x49\xB1\xB2\xB3\xB4\xB5",
};
for (const char* sjis : kTestCases) {
string actual;
EncodingUtil::SJISToUTF8(sjis, &actual);
string expected;
// The reference conversion itself must succeed, otherwise there is nothing
// meaningful to compare against.
ASSERT_TRUE(Convert(sjis, &expected));
EXPECT_EQ(expected, actual);
}
}

#endif // OS_WIN

// Converts the single two-byte SJIS sequence 0x82 0xA0 and expects the
// three-byte UTF-8 sequence E3 81 82.
TEST(EncodingUtilTest, Issue2190350) {
  const string kExpectedUtf8 = "\xE3\x81\x82";
  string converted;
  EncodingUtil::SJISToUTF8("\x82\xA0", &converted);
  EXPECT_EQ(3, converted.length());
  EXPECT_EQ(kExpectedUtf8, converted);
}
#endif // OS_ANDROID

// Checks EncodingUtil::SJISToUTF8 against hard-coded expected UTF-8 output
// for valid SJIS input.
TEST(EncodingUtilTest, ValidSJIS) {
// Pairs of SJIS input bytes and the UTF-8 bytes expected for them.
struct {
const char *sjis;
const char *utf8;
} kTestCases[] = {
// "私の名前はGoogleです。" ("My name is Google.")
{"\x8E\x84\x82\xCC\x96\xBC\x91\x4F\x82\xCD\x47\x6F\x6F\x67\x6C\x65"
"\x82\xC5\x82\xB7\x81\x42",
"\xE7\xA7\x81\xE3\x81\xAE\xE5\x90\x8D\xE5\x89\x8D\xE3\x81\xAF\x47"
"\x6F\x6F\x67\x6C\x65\xE3\x81\xA7\xE3\x81\x99\xE3\x80\x82"
},
// "今日はとても良い天気です。" ("The weather is very nice today.")
{"\x8D\xA1\x93\xFA\x82\xCD\x82\xC6\x82\xC4\x82\xE0\x97\xC7\x82\xA2"
"\x93\x56\x8B\x43\x82\xC5\x82\xB7\x81\x42",
"\xE4\xBB\x8A\xE6\x97\xA5\xE3\x81\xAF\xE3\x81\xA8\xE3\x81\xA6\xE3"
"\x82\x82\xE8\x89\xAF\xE3\x81\x84\xE5\xA4\xA9\xE6\xB0\x97\xE3\x81"
"\xA7\xE3\x81\x99\xE3\x80\x82"},
// "This is a test for SJIS." (single-byte characters only; the expected
// UTF-8 bytes are identical to the input bytes)
{"\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
"\x6F\x72\x20\x53\x4A\x49\x53\x2E",
"\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
"\x6F\x72\x20\x53\x4A\x49\x53\x2E"},
// "あいうえおアイウエオアイウエオ": hiragana and katakana "a i u e o" as
// two-byte characters, followed by the single bytes 0xB1..0xB5, which are
// expected to become U+FF71..U+FF75 (EF BD B1 .. EF BD B5).
{"\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8\x83\x41\x83\x43\x83\x45"
"\x83\x47\x83\x49\xB1\xB2\xB3\xB4\xB5",
"\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A\xE3"
"\x82\xA2\xE3\x82\xA4\xE3\x82\xA6\xE3\x82\xA8\xE3\x82\xAA\xEF\xBD"
"\xB1\xEF\xBD\xB2\xEF\xBD\xB3\xEF\xBD\xB4\xEF\xBD\xB5"},
};
for (const auto &tc : kTestCases) {
string actual;
EncodingUtil::SJISToUTF8(tc.sjis, &actual);
EXPECT_EQ(tc.utf8, actual);
}
}

// Feeds malformed SJIS byte strings to EncodingUtil::SJISToUTF8 and expects
// the output string to come back empty every time, even though it starts out
// non-empty.
TEST(EncodingUtilTest, InvalidSJIS) {
  const char* kMalformed[] = {
    // 0xA0 is not a valid first byte: at the very beginning,
    "\xA0\x61\x62\x63",
    // in the middle (4th byte),
    "\x61\x62\x63\xA0\x64\x65\x66",
    // and as the last byte.
    "\x61\x62\x63\xA0",
    // 0xE0 starts a two-byte character, but the input ends right after it.
    "\x61\x62\x63\xE0",
    // First byte 0x90 (range 2) followed by the invalid second byte 0x15.
    "\x61\x62\x63\x90\x15\x64\x65\x66",
    // First byte 0xEE (range 4) followed by the invalid second byte 0x01.
    "\x61\x62\x63\xEE\x01\x64\x65\x66",
  };
  for (const char* sjis : kMalformed) {
    // Pre-filled so that an untouched output would be detected as well.
    string converted = "to be cleared";
    EncodingUtil::SJISToUTF8(sjis, &converted);
    EXPECT_TRUE(converted.empty());
  }
}

} // namespace
} // namespace mozc

0 comments on commit 766685b

Please sign in to comment.