Skip to content

Commit

Permalink
Merge pull request #12 from jlank/utf8-fix
Browse files Browse the repository at this point in the history
added fix for utf8 multi-byte boundary when it is split between chunks, ...
  • Loading branch information
dscape committed Jun 18, 2013
2 parents 2dcfd6d + b9a16c1 commit 4828c01
Show file tree
Hide file tree
Showing 2 changed files with 87 additions and 10 deletions.
63 changes: 53 additions & 10 deletions clarinet.js
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ if(typeof FastList === 'function') {
, NULL2 : S++ // l
, NULL3 : S++ // l
, NUMBER_DECIMAL_POINT : S++ // .
, NUMBER_DIGIT : S++ // [0-9]
, NUMBER_DIGIT : S++ // [0-9]
};

for (var s_ in clarinet.STATE) clarinet.STATE[clarinet.STATE[s_]] = s_;
Expand Down Expand Up @@ -122,7 +122,7 @@ if(typeof FastList === 'function') {
parser[buffers[i]] = "";
}
}

var stringTokenPattern = /[\\"\n]/g;

function CParser (opt) {
Expand Down Expand Up @@ -165,6 +165,12 @@ if(typeof FastList === 'function') {
this.writable = true;
this.readable = true;

//var Buffer = this.Buffer || function Buffer () {}; // if we don't have Buffers, fake it so we can do `var instanceof Buffer` and not throw an error
this.bytes_remaining = 0; // number of bytes remaining in multi byte utf8 char to read after split boundary
this.bytes_in_sequence = 0; // bytes in multi byte utf8 char to read
this.temp_buffs = { "2": new Buffer(2), "3": new Buffer(3), "4": new Buffer(4) }; // for rebuilding chars split before boundary is reached
this.string = '';

var me = this;
Stream.apply(me);

Expand Down Expand Up @@ -195,9 +201,46 @@ if(typeof FastList === 'function') {
{ constructor: { value: CStream } });

// Decode one chunk of UTF-8 input and forward the text to the parser.
//
// A multi-byte UTF-8 character may be split across two (or more) chunks.
// The leading bytes of such a character are stashed in `this.temp_buffs`
// (sized 2, 3 or 4) and completed by the following write() call(s), so the
// parser never sees a half-decoded character.
//
// data    - a string, Buffer, or array of byte values.
// returns - always `true`.
//
// Side effects: at most one `this._parser.write(text)` call and one
// "data" event per invocation, both carrying the decoded string. Nothing is
// forwarded when the chunk only contains part of a character.
CStream.prototype.write = function (data) {
  // Buffer.from never hands back uninitialised memory (unlike `new Buffer`),
  // and an existing Buffer is only read, so it does not need copying.
  var bytes = Buffer.isBuffer(data) ? data : Buffer.from(data);
  var pos = 0;
  var text = "";

  // 1. Finish a character whose leading bytes arrived in an earlier chunk.
  if (this.bytes_remaining > 0) {
    var pending = this.temp_buffs[this.bytes_in_sequence];
    var filled = this.bytes_in_sequence - this.bytes_remaining;
    // Only continuation bytes (10xxxxxx) can complete the sequence; never
    // read past the end of a chunk that is shorter than what is missing.
    while (this.bytes_remaining > 0 && pos < bytes.length &&
           (bytes[pos] & 0xC0) === 0x80) {
      pending[filled++] = bytes[pos++];
      this.bytes_remaining--;
    }
    if (this.bytes_remaining > 0 && pos === bytes.length) {
      return true; // still incomplete, wait for the next chunk
    }
    // Either the character is complete, or malformed input interrupted it.
    // In the latter case decode what we have (the decoder substitutes
    // U+FFFD) instead of swallowing the bytes that follow.
    text = pending.toString("utf8", 0, filled);
    this.bytes_in_sequence = this.bytes_remaining = 0;
  }

  // 2. Look at the last (up to three) bytes for a lead byte whose sequence
  //    runs past the end of this chunk.
  var cut = bytes.length;
  var needed = 0;
  for (var back = 1; back <= 3 && back <= bytes.length - pos; back++) {
    var b = bytes[bytes.length - back];
    if (b < 128) break;               // ASCII: the tail is complete
    if (b >= 192) {                   // lead byte of a multi-byte sequence
      var length = 1;                 // invalid leads are left to the decoder
      if (b >= 194 && b <= 223) length = 2;
      else if (b >= 224 && b <= 239) length = 3;
      else if (b >= 240 && b <= 244) length = 4;
      if (length > back) {
        cut = bytes.length - back;
        needed = length;
      }
      break;
    }
    // otherwise a continuation byte: keep walking back towards its lead
  }

  // 3. Stash the incomplete tail for the next write().
  if (needed > 0) {
    var stash = this.temp_buffs[needed];
    var available = bytes.length - cut;
    for (var k = 0; k < available; k++) {
      stash[k] = bytes[cut + k];
    }
    this.bytes_in_sequence = needed;
    this.bytes_remaining = needed - available;
  }

  // 4. Everything between the carry-over and the stashed tail is whole
  //    characters and can be decoded in one go.
  text += bytes.toString("utf8", pos, cut);
  if (text.length > 0) {
    this.string = text;
    this._parser.write(text);
    this.emit("data", text);
  }
  return true;
};

CStream.prototype.end = function (chunk) {
Expand Down Expand Up @@ -243,7 +286,7 @@ if(typeof FastList === 'function') {
}

// Flush the number text buffered on the parser: when any has accumulated,
// emit it as an "onvalue" event (converted with parseFloat), then clear
// the buffer so the next number starts from scratch.
function closeNumber(parser) {
  var buffered = parser.numberNode;
  if (buffered) {
    emit(parser, "onvalue", parseFloat(buffered));
  }
  parser.numberNode = "";
}
Expand Down Expand Up @@ -306,7 +349,7 @@ if(typeof FastList === 'function') {
case S.BEGIN:
if (c === "{") parser.state = S.OPEN_OBJECT;
else if (c === "[") parser.state = S.OPEN_ARRAY;
else if (c !== '\r' && c !== '\n' && c !== ' ' && c !== '\t')
else if (c !== '\r' && c !== '\n' && c !== ' ' && c !== '\t')
error(parser, "Non-whitespace before {[.");
continue;

Expand Down Expand Up @@ -424,7 +467,7 @@ if(typeof FastList === 'function') {
}
break;
}
if (c === '\\' && !slashed) {
if (c === '\\' && !slashed) {
slashed = true;
parser.textNode += chunk.substring(starti, i-1);
c = chunk.charAt(i++);
Expand All @@ -449,7 +492,7 @@ if(typeof FastList === 'function') {
if (!c) break;
else continue;
}

stringTokenPattern.lastIndex = i;
var reResult = stringTokenPattern.exec(chunk);
if (reResult === null) {
Expand Down Expand Up @@ -548,7 +591,7 @@ if(typeof FastList === 'function') {
error(parser, 'Invalid number has two dots');
parser.numberNode += c;
} else if (c==='e' || c==='E') {
if(parser.numberNode.indexOf('e')!==-1 ||
if(parser.numberNode.indexOf('e')!==-1 ||
parser.numberNode.indexOf('E')!==-1 )
error(parser, 'Invalid number has two exponential');
parser.numberNode += c;
Expand Down
34 changes: 34 additions & 0 deletions test/utf8-chunks.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
var fs = require('fs')
  , clarinet = require('../clarinet.js')
  , assert = require('assert')
  ;

// '我' (U+6211) is encoded in UTF-8 as the three bytes e6 88 91. The fixtures
// below deliver those bytes either split 2 + 1 across chunks or in one piece.
var han_value = '我';
var han_obj = '{"thing":"' + han_value + '"}';
var han_buffer_first = Buffer.from([0xe6, 0x88]);
var han_buffer_second = Buffer.from([0x91]);
var han_buffer_full = Buffer.from([0xe6, 0x88, 0x91]);

// Writes every entry of `writes` to a fresh stream and checks that the
// concatenated "data" events reproduce han_obj. A new stream per call keeps
// listeners and accumulated output from leaking between test cases.
function expectHanObject(writes, done) {
  var chunks = clarinet.createStream();
  var han_test_obj = '';

  chunks.on("error", function (err) { done(err); });
  chunks.on("data", function (data) {
    han_test_obj += data;
  });
  chunks.on("end", function () {
    // actual first, expected second, so failure messages read correctly
    assert.strictEqual(han_test_obj, han_obj);
    done();
  });

  writes.forEach(function (piece) { chunks.write(piece); });
  chunks.end();
}

describe('clarinet', function(){
  describe('#utf8-chunks', function() {
    it('should be able to parse utf8 characters split across data chunks', function (done){
      expectHanObject(['{"thing":"', han_buffer_first, han_buffer_second, '"}'], done);
    });

    // control case: the same character delivered without a split
    it('should be able to parse utf8 characters contained in a single data chunk', function (done){
      expectHanObject(['{"thing":"', han_buffer_full, '"}'], done);
    });
  });
});

0 comments on commit 4828c01

Please sign in to comment.