Fix #252: Make EncodingUtil platform-independent.

With this CL, EncodingUtil::SJISToUTF8 no longer depends on any platform-dependent API, including iconv and Win32 APIs. Keep in mind that EncodingUtil::SJISToUTF8 is mainly used for importing user dictionary for MS-IME. Hence we want EncodingUtil::SJISToUTF8 to be compatible with the behavior on Windows. This also closes #27 because we no longer depend on iconv. BUG=#27,#252 TEST=unittest REF_BUG=19010851 REF_CL=91370456,107291445
google · Nov 15, 2015 · 766685b · 766685b
1 parent f2cc056
commit 766685b
Show file tree

Hide file tree

Showing 5 changed files with 11,537 additions and 130 deletions.
diff --git a/src/gui/base/encoding_util.cc b/src/gui/base/encoding_util.cc
@@ -29,145 +29,124 @@
 
 #include "gui/base/encoding_util.h"
 
-// No implementation for Android.
-#ifndef OS_ANDROID
+#include "base/port.h"
+#include "base/string_piece.h"
+#include "base/util.h"
 
-#ifndef OS_WIN
-#include <iconv.h>
-#include <algorithm>
-#else
-#include <windows.h>
-#include <memory>
-#endif
+namespace mozc {
+namespace {
 
-#include <string>
-#include "base/logging.h"
+#include "gui/base/sjis_to_ucs2_table.h"
 
-#ifdef OS_WIN
-using std::unique_ptr;
-#endif  // OS_WIN
+// Each character of SJIS is encoded in one or two bytes.
+//
+// For first byte, there are 4 valid ranges (closed intervals):
+//   * FirstByteRange1: [0x00, 0x80]
+//   * FirstByteRange2: [0x81, 0x9F]
+//   * FirstByteRange3: [0xA1, 0xDF]
+//   * FirstByteRange4: [0xE0, 0xFF]
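+// Ranges 1 and 3 are single-byte characters. Ranges 2 and 4 are the first byte
+// of a two-byte character, so one more byte is needed to decode a character.
+// 0xA0 belongs to none of these ranges and is rejected as an invalid first
+// byte.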
+//
+// For second byte, there are 2 valid ranges (closed intervals):
+//   * SecondByteRange1: [0x40, 0x7E]
+//   * SecondByteRange2: [0x80, 0xFF]
+// Two byte characters are decoded using the conversion table defined in
+// sjis_to_ucs2_table.h.
+inline bool IsInFirstByteRange1(uint8_t byte) {
+  // FirstByteRange1 is [0x00, 0x80]; the lower bound is implied by the
+  // unsigned type, so only the upper bound is tested.
+  const uint8_t kUpperBound = 0x80;
+  return !(byte > kUpperBound);
+}
 
-namespace {
+// Tells whether |byte| lies in FirstByteRange2 ([0x81, 0x9F]), one of the two
+// ranges that start a two-byte character.
+inline bool IsInFirstByteRange2(uint8_t byte) {
+  if (byte < 0x81) {
+    return false;
+  }
+  return byte <= 0x9F;
+}
 
-#ifndef OS_WIN
-
-bool IconvHelper(iconv_t ic, const string &input, string *output) {
-  size_t ilen = input.size();
-  size_t olen = ilen * 4;
-  string tmp;
-  tmp.reserve(olen);
-  char *ibuf = const_cast<char *>(input.data());
-  char *obuf_org = const_cast<char *>(tmp.data());
-  char *obuf = obuf_org;
-  std::fill(obuf, obuf + olen, 0);
-  size_t olen_org = olen;
-  iconv(ic, 0, &ilen, 0, &olen);  // reset iconv state
-  while (ilen != 0) {
-    if (iconv(ic, reinterpret_cast<char **>(&ibuf), &ilen, &obuf, &olen)
-        == static_cast<size_t>(-1)) {
-      return false;
-    }
-  }
-  output->assign(obuf_org, olen_org - olen);
-  return true;
+// Tells whether |byte| lies in FirstByteRange3 ([0xA1, 0xDF]).
+inline bool IsInFirstByteRange3(uint8_t byte) {
+  const bool below_range = byte < 0xA1;
+  const bool above_range = byte > 0xDF;
+  return !(below_range || above_range);
 }
 
-inline bool Convert(const char *from, const char *to,
-                    const string &input, string *output) {
-  iconv_t ic = iconv_open(to, from);   // note the order
-  if (ic == reinterpret_cast<iconv_t>(-1)) {
-    LOG(WARNING) << "iconv_open failed";
-    *output = input;
-    return false;
-  }
-  bool result = IconvHelper(ic, input, output);
-  iconv_close(ic);
-  return result;
+// Tells whether |byte| lies in FirstByteRange4 ([0xE0, 0xFF]), one of the two
+// ranges that start a two-byte character.
+inline bool IsInFirstByteRange4(uint8_t byte) {
+  // A uint8_t never exceeds 0xFF, so only the lower bound needs testing.
+  return byte >= 0xE0;
 }
-#else
-// Returns the code-page identifier for the specified encoding string.
-// This function scans a list of mappings from an encoding name to a
-// code-page identifier of Windows according to the encoding name.
-// If the given encoding string does not have any matching code-page
-// identifiers, this function returns 0.
-// To add a mapping from an encoding name to its code-page identifier:
-// 1. Read the list of code-page identifiers supported by Windows (*1), and;
-// 2. Find a code-page identifier matching to the encoding name:
-// (*1) "http://msdn.microsoft.com/en-us/library/ms776446(VS.85).aspx".
-int GetCodepage(const char* name) {
-  static const struct {
-    const char* name;
-    int codepage;
-  } kCodePageMap[] = {
-    { "UTF8",      CP_UTF8 },  // Unicode UTF-8
-    { "SJIS",      932     },  // ANSI/OEM - Japanese, Shift-JIS
-  };
-
-  for (size_t i = 0; i < arraysize(kCodePageMap); i++) {
-    if (strcmp(kCodePageMap[i].name, name) == 0) {
-      return kCodePageMap[i].codepage;
-    }
-  }
-  return 0;
+
+// Tells whether |byte| lies in SecondByteRange1 ([0x40, 0x7E]).
+inline bool IsInSecondByteRange1(uint8_t byte) {
+  return byte >= 0x40 && !(byte > 0x7E);
 }
 
-// Converts the encoding of the specified string.
-// This function firstly converts the source string to create a temporary
-// UTF-16 string, and encodes the UTF-16 string with the destination encoding.
-inline bool Convert(const char *from, const char *to,
-                    const string &input, string *output) {
-  const int codepage_from = GetCodepage(from);
-  const int codepage_to = GetCodepage(to);
-  if (codepage_from == 0 || codepage_to == 0) {
-    return false;
-  }
+// Tells whether |byte| lies in SecondByteRange2 ([0x80, 0xFF]).
+inline bool IsInSecondByteRange2(uint8_t byte) {
+  // The upper bound 0xFF is implied by the width of uint8_t.
+  return byte >= 0x80;
+}
 
-  const int wide_length = MultiByteToWideChar(codepage_from, 0, input.c_str(),
-                                              -1, nullptr, 0);
-  if (wide_length == 0) {
-    return false;
+// Computes the flat index used to look up the two-byte pair (|first|,
+// |second|) in kSJISToUCS2Table. The index is row-major: the row is the rank
+// of |first| among the two-byte first-byte ranges (Range2, then Range4), and
+// the column is the rank of |second| among the second-byte ranges (Range1,
+// then Range2). A byte outside its ranges contributes rank 0.
+size_t ComputeIndex(uint8_t first, uint8_t second) {
+  // Number of byte values in each closed interval.
+  const size_t kFirstRange2Length = 0x9F - 0x81 + 1;
+  const size_t kSecondRange1Length = 0x7E - 0x40 + 1;
+  const size_t kSecondRange2Length = 0xFF - 0x80 + 1;
+
+  size_t row = 0;
+  if (IsInFirstByteRange2(first)) {
+    // Offset of |first| inside FirstByteRange2.
+    row = first - 0x81;
+  } else if (IsInFirstByteRange4(first)) {
+    // FirstByteRange4 rows are placed right after all FirstByteRange2 rows.
+    row = kFirstRange2Length + (first - 0xE0);
   }
 
-  unique_ptr<wchar_t[]> wide(new wchar_t[wide_length + 1]);
-  if (wide.get() == nullptr) {
-    return false;
+  size_t column = 0;
+  if (IsInSecondByteRange1(second)) {
+    // Offset of |second| inside SecondByteRange1.
+    column = second - 0x40;
+  } else if (IsInSecondByteRange2(second)) {
+    // SecondByteRange2 columns follow all SecondByteRange1 columns.
+    column = kSecondRange1Length + (second - 0x80);
   }
 
-  if (MultiByteToWideChar(codepage_from, 0, input.c_str(), -1,
-                          wide.get(), wide_length + 1) == 0)
-    return false;
-
-  const int output_length = WideCharToMultiByte(codepage_to, 0, wide.get(), -1,
-                                                nullptr, 0, nullptr, nullptr);
-  if (output_length == 0) {
-    return false;
-  }
+  // Every row holds one entry per valid second byte.
+  const size_t row_width = kSecondRange1Length + kSecondRange2Length;
+  return row * row_width + column;
+}
 
-  unique_ptr<char[]> multibyte(new char[output_length + 1]);
-  if (multibyte.get() == nullptr) {
-    return false;
-  }
+// Decodes the SJIS byte sequence |input| and appends the result, encoded as
+// UTF-8, to |output|.
+//
+// Returns false as soon as an invalid first byte, an invalid second byte, or a
+// two-byte sequence without a usable table entry is found, and also when
+// |input| ends in the middle of a two-byte character. |output| may already
+// hold partially converted text when false is returned.
+bool SJISToUTF8Internal(StringPiece input, string* output) {
+  // Number of entries (not bytes) in the conversion table.
+  const size_t kTableSize =
+      sizeof(kSJISToUCS2Table) / sizeof(kSJISToUCS2Table[0]);
+
+  bool expect_first_byte = true;
+  uint8_t first_byte = 0;
+  for (const char c : input) {
+    const uint8_t byte = static_cast<uint8_t>(c);
+
+    if (expect_first_byte) {
+      if (IsInFirstByteRange1(byte)) {
+        // Single-byte range 1 maps to the code point of the same value.
+        Util::UCS4ToUTF8Append(byte, output);
+      } else if (IsInFirstByteRange3(byte)) {
+        // Single-byte range 3 is shifted by a fixed offset:
+        // 0xA1..0xDF -> U+FF61..U+FF9F.
+        Util::UCS4ToUTF8Append(byte + 0xFEC0, output);
+      } else if (IsInFirstByteRange2(byte) || IsInFirstByteRange4(byte)) {
+        // First byte of a two-byte character; decode when the next one comes.
+        first_byte = byte;
+        expect_first_byte = false;
+      } else {
+        return false;  // Invalid first byte.
+      }
+      continue;
+    }
 
-  const int result = WideCharToMultiByte(codepage_to, 0, wide.get(),
-                                         wide_length, multibyte.get(),
-                                         output_length + 1, nullptr, nullptr);
-  if (result == 0) {
-    return false;
+    if (!IsInSecondByteRange1(byte) && !IsInSecondByteRange2(byte)) {
+      return false;  // Invalid second byte.
+    }
+    const size_t index = ComputeIndex(first_byte, byte);
+    // Compare against the element count. sizeof(kSJISToUCS2Table) alone is the
+    // size in bytes, which is larger than the element count whenever an entry
+    // is wider than one byte (entries are read as uint16_t below), so it
+    // would let an out-of-range index through.
+    if (index >= kTableSize) {
+      return false;
+    }
+    const uint16_t ucs2 = kSJISToUCS2Table[index];
+    if (ucs2 == 0) {
+      return false;  // A zero entry is treated as a conversion failure.
+    }
+    Util::UCS4ToUTF8Append(ucs2, output);
+    expect_first_byte = true;
   }
-
-  output->assign(multibyte.get());
-  return true;
+  // Still waiting for a second byte here means the input was truncated.
+  return expect_first_byte;
 }
 
-#endif
 }   // namespace
 
-namespace mozc {
-
 void EncodingUtil::SJISToUTF8(const string &input, string *output) {
-  Convert("SJIS", "UTF8", input, output);
+  // Start from an empty string because the converter only appends.
+  output->clear();
+  const bool converted = SJISToUTF8Internal(input, output);
+  if (!converted) {
+    // Drop any partially converted text; failure yields an empty |output|.
+    output->clear();
+  }
 }
 
 }  // namespace mozc
-
-#endif  // OS_ANDROID
diff --git a/src/gui/base/encoding_util_test.cc b/src/gui/base/encoding_util_test.cc
@@ -29,23 +29,143 @@
 
 #include "gui/base/encoding_util.h"
 
+#ifdef OS_WIN
+#include <windows.h>
+#include <codecvt>
+#include <cstring>
+#include <memory>
+#endif  // OS_WIN
+
 #include <string>
 
+#include "base/logging.h"
+#include "base/port.h"
+#ifdef OS_WIN
+#include "base/string_piece.h"
+#endif  // OS_WIN
 #include "testing/base/public/gunit.h"
 
 namespace mozc {
 namespace {
 
-#ifdef OS_ANDROID
-// At the moment, encoding is not the target of build for Android.
-#else
+#ifdef OS_WIN
+
+// Reference converter for the tests below: converts code page 932 |input| to
+// UTF-8 by going through the Win32 MultiByteToWideChar API. Returns false when
+// the Win32 conversion fails; an empty |input| yields an empty |output|.
+bool Convert(StringPiece input, string* output) {
+  const int kCodePage932 = 932;
+
+  output->clear();
+  if (input.empty()) {
+    // Handled up front, since a zero length below is treated as failure.
+    return true;
+  }
+
+  // First call: query the required length in wide characters.
+  const int required = MultiByteToWideChar(
+      kCodePage932, MB_ERR_INVALID_CHARS, input.data(), input.size(),
+      nullptr, 0);
+  if (required == 0) {
+    return false;
+  }
+
+  // Second call: perform the conversion into the allocated buffer.
+  unique_ptr<wchar_t[]> buffer(new wchar_t[required + 1]);
+  const int written = MultiByteToWideChar(
+      kCodePage932, MB_ERR_INVALID_CHARS, input.data(), input.size(),
+      buffer.get(), required + 1);
+  if (written != required) {
+    return false;
+  }
+
+  std::wstring_convert<std::codecvt_utf8<wchar_t>> utf8_converter;
+  *output = utf8_converter.to_bytes(buffer.get(), buffer.get() + required);
+  return true;
+}
+
+// Checks that EncodingUtil::SJISToUTF8 produces the same bytes as the
+// Win32-based Convert() helper for several valid SJIS inputs.
+TEST(EncodingUtilTest, CompareToWinAPI) {
+  const char* kSjisInputs[] = {
+    // "私の名前はGoogleです。" ("My name is Google.")
+    "\x8E\x84\x82\xCC\x96\xBC\x91\x4F\x82\xCD\x47\x6F\x6F\x67\x6C\x65"
+    "\x82\xC5\x82\xB7\x81\x42",
+    // "今日はとても良い天気です。" ("The weather is very nice today.")
+    "\x8D\xA1\x93\xFA\x82\xCD\x82\xC6\x82\xC4\x82\xE0\x97\xC7\x82\xA2"
+    "\x93\x56\x8B\x43\x82\xC5\x82\xB7\x81\x42",
+    // "This is a test for SJIS."
+    "\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
+    "\x6F\x72\x20\x53\x4A\x49\x53\x2E",
+    // "あいうえおアイウエオｱｲｳｴｵ" (hiragana, katakana, half-width katakana)
+    "\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8\x83\x41\x83\x43\x83\x45"
+    "\x83\x47\x83\x49\xB1\xB2\xB3\xB4\xB5",
+  };
+  for (const char* sjis_input : kSjisInputs) {
+    string converted;
+    EncodingUtil::SJISToUTF8(sjis_input, &converted);
+    string reference;
+    ASSERT_TRUE(Convert(sjis_input, &reference));
+    EXPECT_EQ(reference, converted);
+  }
+}
+
+#endif  // OS_WIN
+
+// Checks that the two-byte SJIS sequence 0x82 0xA0 converts to the three-byte
+// UTF-8 sequence 0xE3 0x81 0x82.
 TEST(EncodingUtilTest, Issue2190350) {
   string result = "";
   EncodingUtil::SJISToUTF8("\x82\xA0", &result);
   EXPECT_EQ(3, result.length());
   EXPECT_EQ("\xE3\x81\x82", result);
 }
-#endif  // OS_ANDROID
+
+// Checks EncodingUtil::SJISToUTF8 against hard-coded UTF-8 expectations, so
+// the conversion is verified on every platform.
+TEST(EncodingUtilTest, ValidSJIS) {
+  struct Sample {
+    const char *sjis;
+    const char *utf8;
+  };
+  const Sample kSamples[] = {
+    // "私の名前はGoogleです。" ("My name is Google.")
+    {"\x8E\x84\x82\xCC\x96\xBC\x91\x4F\x82\xCD\x47\x6F\x6F\x67\x6C\x65"
+     "\x82\xC5\x82\xB7\x81\x42",
+     "\xE7\xA7\x81\xE3\x81\xAE\xE5\x90\x8D\xE5\x89\x8D\xE3\x81\xAF\x47"
+     "\x6F\x6F\x67\x6C\x65\xE3\x81\xA7\xE3\x81\x99\xE3\x80\x82"},
+    // "今日はとても良い天気です。" ("The weather is very nice today.")
+    {"\x8D\xA1\x93\xFA\x82\xCD\x82\xC6\x82\xC4\x82\xE0\x97\xC7\x82\xA2"
+     "\x93\x56\x8B\x43\x82\xC5\x82\xB7\x81\x42",
+     "\xE4\xBB\x8A\xE6\x97\xA5\xE3\x81\xAF\xE3\x81\xA8\xE3\x81\xA6\xE3"
+     "\x82\x82\xE8\x89\xAF\xE3\x81\x84\xE5\xA4\xA9\xE6\xB0\x97\xE3\x81"
+     "\xA7\xE3\x81\x99\xE3\x80\x82"},
+    // "This is a test for SJIS."
+    {"\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
+     "\x6F\x72\x20\x53\x4A\x49\x53\x2E",
+     "\x54\x68\x69\x73\x20\x69\x73\x20\x61\x20\x74\x65\x73\x74\x20\x66"
+     "\x6F\x72\x20\x53\x4A\x49\x53\x2E"},
+    // "あいうえおアイウエオｱｲｳｴｵ" (hiragana, katakana, half-width katakana)
+    {"\x82\xA0\x82\xA2\x82\xA4\x82\xA6\x82\xA8\x83\x41\x83\x43\x83\x45"
+     "\x83\x47\x83\x49\xB1\xB2\xB3\xB4\xB5",
+     "\xE3\x81\x82\xE3\x81\x84\xE3\x81\x86\xE3\x81\x88\xE3\x81\x8A\xE3"
+     "\x82\xA2\xE3\x82\xA4\xE3\x82\xA6\xE3\x82\xA8\xE3\x82\xAA\xEF\xBD"
+     "\xB1\xEF\xBD\xB2\xEF\xBD\xB3\xEF\xBD\xB4\xEF\xBD\xB5"},
+  };
+  for (const auto &sample : kSamples) {
+    string converted;
+    EncodingUtil::SJISToUTF8(sample.sjis, &converted);
+    EXPECT_EQ(sample.utf8, converted);
+  }
+}
+
+// Checks that malformed SJIS input leaves the output string empty, even when
+// the output held text before the call.
+TEST(EncodingUtilTest, InvalidSJIS) {
+  const char* kMalformedInputs[] = {
+    // 0xA0 is not a valid first byte: at the very beginning,
+    "\xA0\x61\x62\x63",
+    // in the middle (4th byte),
+    "\x61\x62\x63\xA0\x64\x65\x66",
+    // and as the last byte.
+    "\x61\x62\x63\xA0",
+    // Valid first byte (0xE0), but the input ends before the second byte.
+    "\x61\x62\x63\xE0",
+    // Valid first byte in range 2 (0x90) followed by invalid second byte 0x15.
+    "\x61\x62\x63\x90\x15\x64\x65\x66",
+    // Valid first byte in range 4 (0xEE) followed by invalid second byte 0x01.
+    "\x61\x62\x63\xEE\x01\x64\x65\x66",
+  };
+  for (const char* malformed : kMalformedInputs) {
+    string converted("to be cleared");
+    EncodingUtil::SJISToUTF8(malformed, &converted);
+    EXPECT_TRUE(converted.empty());
+  }
+}
 
 }  // namespace
 }  // namespace mozc