Skip to content

Commit

Permalink
encoding, refactor: use icu built-in encoding for iconv if icu built-…
Browse files Browse the repository at this point in the history
…in encoding.
  • Loading branch information
xicilion committed Oct 11, 2023
1 parent 5e16d74 commit 4cb10eb
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 103 deletions.
41 changes: 40 additions & 1 deletion fibjs/src/encoding/encoding_iconv.cpp
Expand Up @@ -47,7 +47,7 @@ static size_t iconv(iconv_t cd, const char** inbuf, size_t* inbytesleft,
#include "object.h"
#include "encoding_iconv.h"
#include "ifs/encoding.h"

#include <unicode/include/unicode/ucnv.h>
namespace fibjs {

DECLARE_MODULE(iconv);
Expand Down Expand Up @@ -92,9 +92,25 @@ void encoding_iconv::open(const char* charset)

result_t encoding_iconv::encode(exlib::string data, exlib::string& retVal)
{
if (data.empty()) {
retVal.clear();
return 0;
}

if (ucs_encode(data, retVal) == 0)
return 0;

int32_t _sz;
UErrorCode errorCode = U_ZERO_ERROR;

_sz = ucnv_convert(m_charset.c_str(), "utf-8", NULL, 0, data.c_str(), data.length(), &errorCode);
if (_sz) {
retVal.resize(_sz);
errorCode = U_ZERO_ERROR;
ucnv_convert(m_charset.c_str(), "utf-8", retVal.c_buffer(), _sz, data.c_str(), data.length(), &errorCode);
return 0;
}

if (!m_iconv_en) {
m_iconv_en = iconv_open(m_charset.c_str(), "utf-8");
if (m_iconv_en == (iconv_t)(-1)) {
Expand Down Expand Up @@ -135,6 +151,11 @@ result_t encoding_iconv::encode(exlib::string data, obj_ptr<Buffer_base>& retVal

result_t encoding_iconv::decode(const char* data, size_t sz, exlib::string& retVal)
{
if (sz == 0) {
retVal.clear();
return 0;
}

if (ucs_decode(data, sz, retVal) == 0)
return 0;

Expand All @@ -146,6 +167,17 @@ result_t encoding_iconv::decode(const char* data, size_t sz, exlib::string& retV
}
}

int32_t _sz;
UErrorCode errorCode = U_ZERO_ERROR;

_sz = ucnv_convert("utf-8", m_charset.c_str(), NULL, 0, data, sz, &errorCode);
if (_sz) {
retVal.resize(_sz);
errorCode = U_ZERO_ERROR;
ucnv_convert("utf-8", m_charset.c_str(), retVal.c_buffer(), _sz, data, sz, &errorCode);
return 0;
}

exlib::string strBuf;

strBuf.resize(sz * 2);
Expand Down Expand Up @@ -210,6 +242,13 @@ bool encoding_iconv::is_encoding(exlib::string charset)
if (is_ucs_encoding(charset))
return true;

UErrorCode err = U_ZERO_ERROR;
UConverter* icu_ec = ucnv_open(charset.c_str(), &err);
if (icu_ec) {
ucnv_close(icu_ec);
return true;
}

void* iconv_ec = iconv_open(charset.c_str(), "utf-8");
if (iconv_ec != (iconv_t)(-1)) {
iconv_close((iconv_t)iconv_ec);
Expand Down
237 changes: 135 additions & 102 deletions test/encoding_test.js
Expand Up @@ -328,111 +328,144 @@ describe('encoding', () => {
}
});

it('iconv ucs2', () => {
for (var i = 0; i < 0xd800; i++) {
var s = String.fromCharCode(i);
var buf = iconv.encode('utf16le', s);
var n = buf.readUInt16LE();
assert.equal(i, n);
assert.equal(iconv.decode('utf16le', buf), s);
}
describe("iconv", () => {
it('ucs2', () => {
for (var i = 0; i < 0xd800; i++) {
var s = String.fromCharCode(i);
var buf = iconv.encode('utf16le', s);
var n = buf.readUInt16LE();
assert.equal(i, n);
assert.equal(iconv.decode('utf16le', buf), s);
}

for (var i = 0; i < 0xd800; i++) {
var s = String.fromCharCode(i);
var buf = iconv.encode('utf16be', s);
var n = buf.readUInt16BE();
assert.equal(i, n);
assert.equal(iconv.decode('utf16be', buf), s);
}
for (var i = 0; i < 0xd800; i++) {
var s = String.fromCharCode(i);
var buf = iconv.encode('utf16be', s);
var n = buf.readUInt16BE();
assert.equal(i, n);
assert.equal(iconv.decode('utf16be', buf), s);
}

assert.equal(new Buffer([0xc8]).toString(), '\ufffd');
assert.equal(Buffer.from('3DD84DDC', 'hex').toString('utf16le'), '👍');
});
assert.equal(new Buffer([0xc8]).toString(), '\ufffd');
assert.equal(Buffer.from('3DD84DDC', 'hex').toString('utf16le'), '👍');
});

// UTF-32 fixtures shared by the 'ucs2 multi' and 'ucs4' tests. Each row is:
//   [0] the 32-bit value written into a 4-byte buffer with writeUInt32LE,
//   [1] the expected hex dump of that raw buffer,
//   [2] the expected hex dump after iconv.decode('utf32le') followed by
//       iconv.encode('utf32le') on the decoded string.
// Columns [1] and [2] are identical for every row except the last two.
var datas = [
    [0x7f, "7f000000", "7f000000"],
    [0x80, "80000000", "80000000"],
    [0x7ff, "ff070000", "ff070000"],
    [0x800, "00080000", "00080000"],
    [0xffff, "ffff0000", "ffff0000"],
    [0x10000, "00000100", "00000100"],
    [0x10ffff, "ffff1000", "ffff1000"],
    [0x110000, "00001100", "00001100"],
    [0x1fffff, "ffff1f00", "ffff1f00"],
    [0x200000, "00002000", "00002000"],
    [0x3ffffff, "ffffff03", "bfff0000ffdf0000"],
    [0x4000000, "00000004", "c0ff000000dc0000"]
];

var datas = [
[
0x7f,
"7f000000",
"7f000000"
],
[
0x80,
"80000000",
"80000000"
],
[
0x7ff,
"ff070000",
"ff070000"
],
[
0x800,
"00080000",
"00080000"
],
[
0xffff,
"ffff0000",
"ffff0000"
],
[
0x10000,
"00000100",
"00000100"
],
[
0x10ffff,
"ffff1000",
"ffff1000"
],
[
0x110000,
"00001100",
"00001100"
],
[
0x1fffff,
"ffff1f00",
"ffff1f00"
],
[
0x200000,
"00002000",
"00002000"
],
[
0x3ffffff,
"ffffff03",
"bfff0000ffdf0000"
],
[
0x4000000,
"00000004",
"c0ff000000dc0000"
]
];

it('iconv ucs2 multi', () => {
datas.forEach(d => {
var buf = Buffer.alloc(4);
buf.writeUInt32LE(d[0]);
var s = iconv.decode('utf32le', buf);
var buf2 = Buffer.alloc(s.length * 2);
buf2.writeUInt16LE(s.charCodeAt(0));
if (s.length > 1)
buf2.writeUInt16LE(s.charCodeAt(1), 2);
assert.equal(iconv.decode('utf16le', buf2), s);
});
});
// For every fixture row: decode the 32-bit value as utf32le, rebuild the
// resulting string's first one or two UTF-16 code units by hand, and check
// that decoding that hand-built buffer as utf16le yields the same string.
it('ucs2 multi', () => {
    for (const row of datas) {
        const utf32 = Buffer.alloc(4);
        utf32.writeUInt32LE(row[0]);
        const text = iconv.decode('utf32le', utf32);

        // Two bytes per UTF-16 code unit; at most the first two units are copied.
        const utf16 = Buffer.alloc(text.length * 2);
        utf16.writeUInt16LE(text.charCodeAt(0));
        if (text.length > 1)
            utf16.writeUInt16LE(text.charCodeAt(1), 2);

        assert.equal(iconv.decode('utf16le', utf16), text);
    }
});

// Round-trips each fixture value through utf32le decode/encode and compares
// both the raw input bytes and the re-encoded bytes against the expected hex.
it('ucs4', () => {
    for (const [value, rawHex, roundTripHex] of datas) {
        const raw = Buffer.alloc(4);
        raw.writeUInt32LE(value);

        const text = iconv.decode('utf32le', raw);
        const reencoded = iconv.encode('utf32le', text);

        assert.deepEqual(raw.hex(), rawHex);
        assert.deepEqual(reencoded.hex(), roundTripHex);
    }
});

// Fixtures for the "builtin codec" test: a charset name, a sample text, and
// the hex dump that iconv.encode(name, text) is expected to produce.
const builtin_datas = [
    { "name": "gbk", "text": "你好", "hex": "c4e3bac3" },
    { "name": "big5", "text": "你好", "hex": "a741a66e" },
    { "name": "shift_jis", "text": "こんにちは", "hex": "82b182f182c982bf82cd" },
    { "name": "euc-kr", "text": "안녕하세요", "hex": "bec8b3e7c7cfbcbcbfe4" }
];

it('iconv ucs4', () => {
datas.forEach(d => {
var buf = Buffer.alloc(4);
buf.writeUInt32LE(d[0]);
var s = iconv.decode('utf32le', buf);
var buf1 = iconv.encode('utf32le', s);
assert.deepEqual(buf.hex(), d[1]);
assert.deepEqual(buf1.hex(), d[2]);
// Encodes each fixture text with its named charset, checks the bytes against
// the expected hex, then decodes those bytes and checks the original text.
it("builtin codec", () => {
    builtin_datas.forEach(({ name, text, hex }) => {
        const encoded = iconv.encode(name, text);
        assert.equal(encoded.hex(), hex);
        assert.equal(iconv.decode(name, encoded), text);
    });
});
});

Expand Down Expand Up @@ -755,7 +788,7 @@ describe('encoding', () => {
});

it('test for Map', () => {
var tmp = {a: 12, b: [2, 3, 5], c: true};
var tmp = { a: 12, b: [2, 3, 5], c: true };
var map = new Map(Object.entries(tmp));
assert.deepEqual(tmp, msgpack.decode(msgpack.encode(map)));
assert.isObject(msgpack.decode(msgpack.encode(map)));
Expand Down

0 comments on commit 4cb10eb

Please sign in to comment.