Merge pull request #12 from jlank/utf8-fix

Added fix for UTF-8 multi-byte characters that are split across chunk boundaries, ...
dscape · Jun 18, 2013 · 4828c01 · 4828c01
2 parents 2dcfd6d + b9a16c1
commit 4828c01
Show file tree

Hide file tree

Showing 2 changed files with 87 additions and 10 deletions.
diff --git a/clarinet.js b/clarinet.js
@@ -65,7 +65,7 @@ if(typeof FastList === 'function') {
     , NULL2                             : S++ // l
     , NULL3                             : S++ // l
     , NUMBER_DECIMAL_POINT              : S++ // .
-    , NUMBER_DIGIT                      : S++ // [0-9]    
+    , NUMBER_DIGIT                      : S++ // [0-9]
     };
 
   for (var s_ in clarinet.STATE) clarinet.STATE[clarinet.STATE[s_]] = s_;
@@ -122,7 +122,7 @@ if(typeof FastList === 'function') {
       parser[buffers[i]] = "";
     }
   }
-  
+
   var stringTokenPattern = /[\\"\n]/g;
 
   function CParser (opt) {
@@ -165,6 +165,12 @@ if(typeof FastList === 'function') {
     this.writable = true;
     this.readable = true;
 
+    //var Buffer = this.Buffer || function Buffer () {}; // if we don't have Buffers, fake it so we can do `var instanceof Buffer` and not throw an error
+    this.bytes_remaining = 0; // number of bytes remaining in multi byte utf8 char to read after split boundary
+    this.bytes_in_sequence = 0; // bytes in multi byte utf8 char to read
+    this.temp_buffs = { "2": new Buffer(2), "3": new Buffer(3), "4": new Buffer(4) }; // for rebuilding chars split before boundary is reached
+    this.string = '';
+
     var me = this;
     Stream.apply(me);
 
@@ -195,9 +201,46 @@ if(typeof FastList === 'function') {
     { constructor: { value: CStream } });
 
   CStream.prototype.write = function (data) {
-    this._parser.write(data.toString());
-    this.emit("data", data);
-    return true;
+    var data = new Buffer(data);
+    for (var i = 0; i < data.length; i++) {
+      var n = data[i];
+      // check for carry-over of a multi-byte char split between data chunks
+      // & fill the temp buffer with the start of this data chunk, up to the boundary limit set while processing the previous chunk
+      if (this.bytes_remaining > 0) {
+        for (var j = 0; j < this.bytes_remaining; j++) {
+          this.temp_buffs[this.bytes_in_sequence][this.bytes_in_sequence - this.bytes_remaining + j] = data[j];
+        }
+        this.string = this.temp_buffs[this.bytes_in_sequence].toString();
+        this.bytes_in_sequence = this.bytes_remaining = 0;
+        i = i + j - 1;
+
+        this._parser.write(this.string);
+        this.emit("data", this.string);
+        return true;
+      } else if (this.bytes_remaining === 0 && n >= 128) { // else if no remainder bytes carried over, parse multi byte (>=128) chars one at a time
+        if ((n >= 194) && (n <= 223)) this.bytes_in_sequence = 2;
+        if ((n >= 224) && (n <= 239)) this.bytes_in_sequence = 3;
+        if ((n >= 240) && (n <= 244)) this.bytes_in_sequence = 4;
+        if ((this.bytes_in_sequence + i) > data.length) { // if bytes needed to complete char fall outside data length, we have a boundary split
+          for (var k = 0; k <= (data.length - 1 - i); k++) {
+            this.temp_buffs[this.bytes_in_sequence][k] = data[i + k]; // fill temp data of correct size with bytes available in this chunk
+          }
+          this.bytes_remaining = (i + this.bytes_in_sequence) - data.length;
+          i = data.length - 1;
+        } else {
+          this.string = data.slice(i, (i + this.bytes_in_sequence)).toString();
+          i = i + this.bytes_in_sequence - 1;
+
+          this._parser.write(this.string);
+          this.emit("data", this.string);
+          return true;
+        }
+      } else {
+        this._parser.write(data.toString());
+        this.emit("data", data);
+        return true;
+      }
+    }
   };
 
   CStream.prototype.end = function (chunk) {
@@ -243,7 +286,7 @@ if(typeof FastList === 'function') {
   }
 
   function closeNumber(parser) {
-    if (parser.numberNode) 
+    if (parser.numberNode)
       emit(parser, "onvalue", parseFloat(parser.numberNode));
     parser.numberNode = "";
   }
@@ -306,7 +349,7 @@ if(typeof FastList === 'function') {
         case S.BEGIN:
           if (c === "{") parser.state = S.OPEN_OBJECT;
           else if (c === "[") parser.state = S.OPEN_ARRAY;
-          else if (c !== '\r' && c !== '\n' && c !== ' ' && c !== '\t') 
+          else if (c !== '\r' && c !== '\n' && c !== ' ' && c !== '\t')
             error(parser, "Non-whitespace before {[.");
         continue;
 
@@ -424,7 +467,7 @@ if(typeof FastList === 'function') {
               }
               break;
             }
-            if (c === '\\' && !slashed) { 
+            if (c === '\\' && !slashed) {
               slashed = true;
               parser.textNode += chunk.substring(starti, i-1);
               c = chunk.charAt(i++);
@@ -449,7 +492,7 @@ if(typeof FastList === 'function') {
               if (!c) break;
               else continue;
             }
-            
+
             stringTokenPattern.lastIndex = i;
             var reResult = stringTokenPattern.exec(chunk);
             if (reResult === null) {
@@ -548,7 +591,7 @@ if(typeof FastList === 'function') {
               error(parser, 'Invalid number has two dots');
             parser.numberNode += c;
           } else if (c==='e' || c==='E') {
-            if(parser.numberNode.indexOf('e')!==-1 || 
+            if(parser.numberNode.indexOf('e')!==-1 ||
                parser.numberNode.indexOf('E')!==-1 )
                error(parser, 'Invalid number has two exponential');
             parser.numberNode += c;

diff --git a/test/utf8-chunks.js b/test/utf8-chunks.js
@@ -0,0 +1,34 @@
+var fs             = require('fs')
+  , clarinet       = require('../clarinet.js')
+  , chunks         = clarinet.createStream()
+  , assert         = require('assert')
+  ;
+
+var han_value = '我';
+var han_obj = '{"thing":"' + han_value + '"}';
+var han_test_obj = '';
+var han_buffer_first = new Buffer([0xe6, 0x88]);
+var han_buffer_second = new Buffer([0x91]);
+var han_buffer_full = new Buffer([0xe6, 0x88, 0x91]);
+
+describe('clarinet', function(){
+  describe('#utf8-chunks', function() {
+    it('should be able to parse utf8 characters split across data chunks', function (done){
+      chunks.on("error", function (err) { done(err); });
+      chunks.on("data", function (data) {
+        han_test_obj += data;
+      });
+      chunks.on("end", function () {
+        assert.equal(han_obj, han_test_obj);
+        done();
+      });
+
+      chunks.write('{"thing":"');
+      chunks.write(han_buffer_first);
+      chunks.write(han_buffer_second);
+      //chunks.write(han_buffer_full);
+      chunks.write('"}');
+      chunks.end();
+    });
+  });
+});