Permalink
Browse files

initial commit

  • Loading branch information...
goldenlentils committed Jan 5, 2013
0 parents commit 5e9627611d8346e1f9e68cab9fa7edad8249ddc4
Showing with 3,405 additions and 0 deletions.
  1. +18 −0 LICENSE
  2. +167 −0 README.md
  3. +80 −0 lib/dom.js
  4. +141 −0 lib/encoder.js
  5. +97 −0 lib/handlers.js
  6. +15 −0 lib/html.js
  7. +2,233 −0 lib/named.json
  8. +408 −0 lib/parser.js
  9. +36 −0 lib/special.json
  10. +26 −0 package.json
  11. +41 −0 tests/bench.js
  12. +3 −0 tests/runtests.js
  13. +24 −0 tests/test-dom.js
  14. +10 −0 tests/test-encoder.js
  15. +106 −0 tests/test-parser.js
18 LICENSE
@@ -0,0 +1,18 @@
+Copyright (c) 2012 Igor Sadikov <igor@vokidas.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
167 README.md
@@ -0,0 +1,167 @@
+# html-pe
+
+`html-pe` is for those who expect accuracy from their HTML parser, but are
+unwilling to compromise on performance. This parser handles a variety of edge
+cases as prescribed by the HTML5 standard and remains performant thanks to
+optimizations such as avoiding regular expressions.
+
+Chunked parsing is supported, and an efficient HTML entity encoder/decoder is
+included.
+
+<!-- ## Installation -->
+
+## Usage
+
+```javascript
+var html = require('html-pe');
+
+var parser = new html.Parser();
+
+// parse chunks
+parser.write('<p style="color:red">foo');
+parser.write('</p>');
+parser.end();
+
+// parse entire string
+parser.parse('<div id="main">bar</div>');
+```
+
+Optionally, an `options` object may be passed to the constructor (default values
+shown):
+
+```javascript
+var parser = new html.Parser({
+ trim: false, // trim whitespace in text nodes
+ decode: true // decode HTML character entities in attribute values and text nodes
+});
+```
+
+### Event Handling
+
+The parser is an `EventEmitter` and emits the following events:
+
+```javascript
+// opening tag
+parser.on('open', function (name, attributes, empty) {
+ // <string> name
+ // <object> attributes
+ // <bool> empty: is this an empty (self-closing) tag
+});
+
+// text node
+parser.on('text', function (text) {
+ // <string> text
+});
+
+// closing tag
+parser.on('close', function (name) {
+ // <string> name
+});
+
+// end
+parser.on('end', function () {});
+```
+
+Two handlers are provided:
+
+```javascript
+var dom = new html.DOM(function (document) {
+ // <object> document
+});
+dom.listen(parser);
+// ...
+dom.reset();
+
+var printer = new html.Printer();
+printer.listen(parser);
+```
+
+The `Printer` handler pretty-prints the parsed HTML to the console. The `DOM`
+handler passes a `document` object to its callback, which is detailed below.
+Follow the structure in `handlers.js` to create new handlers.
+
+### DOM Manipulation
+
+The `DOM` handler will parse `<div id="main">hello <b>world</b></div>` into the
+following `document`:
+
+```javascript
+{
+ type: 'element',
+ parent: {},
+ name: '',
+ attributes: {},
+ children: [{
+ type: 'element',
+ parent : [Circular],
+ name: 'div',
+ attributes: {
+ id: 'main'
+ },
+ children: [{
+ type: 'text',
+ parent: [Circular],
+ text: 'hello '
+ },
+ {
+ type: 'element',
+ parent: [Circular],
+ name: 'b',
+ attributes: {},
+ children: [{
+ type: 'text',
+ parent: [Circular],
+ text: 'world'
+ }]
+ }]
+ }]
+}
+```
+
+Nodes are either `Element` nodes or `Text` nodes. All nodes have a `textContent`
+property. `Element` nodes additionally have two primitive methods:
+`getElementsBy(test)` and `getElementBy(test)` where `test` is a function
+`Element -> bool`. In this example, `getElementsBy` is used to implement
+`getElementsByTagName`:
+
+```javascript
+var Element = html.Element;
+
+Element.prototype.getElementsByTagName = function (name) {
+ return this.getElementsBy(function (elem) {
+ return elem.name == name;
+ });
+}
+```
+
+See `dom.js` for details.
+
+### HTML Character Entity Encoder/Decoder
+
+```javascript
+var html = require('html-pe');
+
+html.encode('<p id="main">Hello &amp; world!</p>',
+ false // encode &<>"' only (default)
+ );
+// '&lt;p id=&quot;main&quot;&gt;Hello &amp;amp; world!&lt;/p&gt;'
+html.encode('<p>здравствуйте!</p>',
+ true // also encode all non-ASCII and non-printable characters
+ );
+// '&lt;p&gt;&#1079;&#1076;&#1088;&#1072;&#1074;&#1089;&#1090;&#1074;&#1091;&#1081;&#1090;&#1077;!&lt;/p&gt;'
+
+html.decode('It&apos;s &not me');
+// 'It\'s ¬ me'
+```
+
+See `encoder.js` for details.
+
+## Performance
+
+`html-pe` has slightly worse performance than
+[htmlparser2](https://github.com/fb55/node-htmlparser). A benchmark is available
+in `tests/bench.js`.
+
+## License
+
+MIT. See `LICENSE`.
@@ -0,0 +1,80 @@
+/* Basic DOM methods */
+
+var Node = function(type) {
+ this.type = type; // 'element' or 'text'
+ this.parent = {};
+};
+
+var Text = exports.Text = function(text) {
+ Node.call(this, 'text');
+ this.text = text;
+};
+
+var Element = exports.Element = function(name, attributes) {
+ Node.call(this, 'element');
+ this.name = name || '';
+ this.attributes = attributes || '';
+ this.children = [];
+};
+
+Text.prototype = Object.create(Node.prototype);
+Element.prototype = Object.create(Node.prototype);
+
+Node.prototype.__defineGetter__('textContent', function() {
+ var text = '';
+
+ if (this.type == 'text') {
+ return this.text;
+ }
+
+ for (var i = 0; i < this.children.length; i++) {
+ text += this.children[i].textContent;
+ }
+
+ return text;
+});
+
+Element.prototype.getElementsBy = function(test) {
+ // test: element -> bool
+ // recursively filter element's children by test
+ var elems = [], children = this.children;
+
+ for (var i = 0; i < children.length; i++) {
+ if (children[i].type != 'element') {
+ continue;
+ }
+
+ if (test(children[i]) === true) {
+ elems[elems.length] = children[i];
+ }
+
+ if (children[i].children.length > 0) {
+ elems = elems.concat(children[i].getElementsBy(test));
+ }
+ }
+ return elems;
+};
+
+Element.prototype.getElementBy = function(test) {
+ // test: element -> bool
+ // equivalent to getElementsBy(test)[0]
+ var elem, children = this.children;
+
+ for (var i = 0; i < children.length; i++) {
+ if (children[i].type != 'element') {
+ continue;
+ }
+
+ if (test(children[i]) === true) {
+ return children[i];
+ }
+
+ if (children[i].children.length > 0 &&
+ (elem = children[i].getElementBy(test), elem !== {})) {
+
+ return elem;
+ }
+ }
+
+ return {};
+};
Oops, something went wrong.

0 comments on commit 5e96276

Please sign in to comment.