feat: improve depth of data returned on match. enable multi-word pattern matching
distsys-labs · Jun 13, 2018 · 3722015 · 3722015
1 parent 33acb4d
commit 3722015
Show file tree

Hide file tree

Showing 8 changed files with 312 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -143,14 +143,30 @@ match.value(match.data.values)
 
 The data returned in the match will contain the properties:
 
+ * `sentiment` - 'positive'|'neutral'|'mixed'|'negative'
+ * `confidence` - %
+ * `degree` - %
+ * `dirtiness` - %
+ * `ordered` - true|false
+ * `politeness` - %
+ * `tense` - 'past'|'present'
+ * `tokens` - the full token array extracted during sentence analysis
+ * `type` - 'declarative'|'imperative'|'interrogative'
  * `values` - a hash of name/value pairs extracted
- * `ordered`: true|false,
- * `type`: 'declarative'|'imperative'|'interrogative',
- * `sentiment`: 'positive'|'neutral'|'mixed'|'negative',
- * `degree`: %,
- * `confidence`: %,
- * `tense`: 'past'|'present'
 
+Each token has the following properties:
+
+ * `abbreviation` - true|false
+ * `acronym` - true|false
+ * `alt` - alternate text for the tag's value
+ * `entity` - either undefined or a hash with the following details:
+    * `alt` - alternate text content
+    * `value` - the value of the entity detected
+    * `type` - 'unknown'|'email'|'ip'|etc.
+ * `plural` - true|false
+ * `pos` - part of speech tag
+ * `value` - text content for the tag
+ * `verb` - true|false
 
 [travis-image]: https://travis-ci.org/deftly/nlp-router.svg?branch=master
 [travis-url]: https://travis-ci.org/deftly/nlp-router

diff --git a/docs/PoS-tags.md b/docs/PoS-tags.md
@@ -12,6 +12,7 @@
 | `CC` | Coordinating conjunction  | `and`, `but`, `or` |
 | `CD` | Cardinal number  | `one`, `two`, `1`, `2` |
 | `DT` | Determiner  | `the`, `some` |
+| `EM` | Emoticon | `:)`, `:(` |
 | `EX` | Existential there | `there` |
 | `FW` | Foreign word | `mon dieu` |
 | `IN` | Preposition | `of`, `in`, `by` |

diff --git a/docs/rule-definition.md b/docs/rule-definition.md
@@ -23,6 +23,7 @@ Each rule can be defined using the following data structure. After showing the l
       pos: ''|[], // part of speech tag(s) to filter by
       match: ''|[], // literal values to limit matches by
       pattern: //,  // regex to limit matches
+      multiple: true|false, // allows regex to match multiple words - not allowed with pos
       negated: true|false // only valid if one or more pos tags are included
     }
   ]
@@ -46,7 +47,7 @@ Examples:
  * `{state,!NN=[on,running,healthy]}` - negated tag named 'state'
  * `{address,NN=/.../}` - tag named 'address' with regex for matching valid forms of address
  * `{inquiry,[WDT,WP]}` - tag named 'inquiry' which allows for either part of speech 
-
+ * `{phrase,=/.../+}` - tag named 'phrase' with regex. The `+` allows it to span multiple words. Only valid without part-of-speech tags.
 
 ### ordered
 
@@ -62,11 +63,19 @@ Allows for matching based on detected sentiment.
 
 ### degree
 
-Match if the degree of the sentiment matches the percentage (expressed in decimal form) or higher.
+Match if the degree of the sentiment is equal to or greater than the percentage (expressed in decimal form).
 
 ### confidence
 
-Match if the degree of confidence in the sentence's evaluation meets the percentage (expressed in decimal form) or higher.
+Match if the degree of confidence in the sentence's evaluation is equal to or greater than the percentage (expressed in decimal form).
+
+### politeness
+
+Match if the politeness ranking is equal to or greater than the percentage (expressed in decimal form).
+
+### dirtiness
+
+Match if the dirtiness ranking is equal to or greater than the percentage (expressed in decimal form).
 
 ### tense
 
@@ -90,7 +99,11 @@ The words that are legal matches for the part(s) of speech.
 
 ### tags - pattern
 
-A regex to limit legal matches for the part(s) of speech.
+A regular expression to limit legal matches for the part(s) of speech. Can also be used without a `pos` tag to capture one or more tokens (see `multiple`).
+
+### tags - multiple
+
+Allows the regular expression to capture more than one token. Not allowed with `pos`.
 
 ### tags - negated
 

diff --git a/spec/evaluator.spec.js b/spec/evaluator.spec.js
@@ -4,7 +4,7 @@ const evaluator = require('../src/evaluator')
 
 describe('Evaluator', function () {
   let rules = []
-  before(function() {
+  before(function () {
     rules.push({ fn:
       evaluator.compile({
         name: 'command',
@@ -23,7 +23,7 @@ describe('Evaluator', function () {
       })
     })
 
-    rules.push({ fn :
+    rules.push({ fn:
       evaluator.compile({
         name: 'whatever',
         tokens: [
@@ -47,16 +47,60 @@ describe('Evaluator', function () {
     })
   })
 
-  it('should match ordered rule', function() {
+  it('should match ordered rule', function () {
     return evaluator.match(rules, 'run the jewels fast')
       .should.eql({
         name: 'command',
         data: {
           confidence: 1,
           degree: 0.0825,
+          dirtiness: 0,
           ordered: true,
+          politeness: 0,
           sentiment: 'neutral',
           tense: 'present',
+          tokens: [
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'run',
+              entity: undefined,
+              plural: false,
+              pos: 'VB',
+              verb: false,
+              value: 'run'
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'the',
+              entity: undefined,
+              plural: false,
+              pos: 'DT',
+              verb: false,
+              value: 'the'
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'jewels',
+              entity: undefined,
+              plural: false,
+              pos: 'NNS',
+              verb: false,
+              value: 'jewels'
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'fast',
+              entity: undefined,
+              plural: false,
+              pos: 'RB',
+              verb: false,
+              value: 'fast'
+            }
+          ],
           type: 'imperative',
           values: {
             action: 'run',
@@ -66,16 +110,60 @@ describe('Evaluator', function () {
       })
   })
 
-  it('should match unordered rule', function() {
+  it('should match unordered rule', function () {
     return evaluator.match(rules, 'the jewels run fast')
       .should.eql({
         name: 'whatever',
         data: {
           confidence: 0.75,
           degree: 0.0825,
+          dirtiness: 0,
           ordered: true,
+          politeness: 0,
           sentiment: 'neutral',
           tense: 'present',
+          tokens: [
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'the',
+              entity: undefined,
+              plural: false,
+              pos: 'DT',
+              value: 'the',
+              verb: false
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'jewels',
+              entity: undefined,
+              plural: false,
+              pos: 'NNS',
+              value: 'jewels',
+              verb: false
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'run',
+              entity: undefined,
+              plural: false,
+              pos: 'VBP',
+              value: 'run',
+              verb: false
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'fast',
+              entity: undefined,
+              plural: false,
+              pos: 'RB',
+              value: 'fast',
+              verb: false
+            }
+          ],
           type: undefined,
           values: {
             action: 'run',
@@ -85,16 +173,50 @@ describe('Evaluator', function () {
       })
   })
 
-  it('should fall back to catch-all', function() {
+  it('should fall back to catch-all', function () {
     return evaluator.match(rules, 'what what what')
       .should.eql({
         name: 'catch-all',
         data: {
           confidence: 1,
           degree: 0,
+          dirtiness: 0,
           ordered: true,
+          politeness: 0,
           sentiment: 'neutral',
           tense: 'present',
+          tokens: [
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'what',
+              entity: undefined,
+              plural: false,
+              pos: 'WP',
+              value: 'what',
+              verb: false
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'what',
+              entity: undefined,
+              plural: false,
+              pos: 'WP',
+              value: 'what',
+              verb: false
+            },
+            {
+              abbreviation: false,
+              acronym: false,
+              alt: 'what',
+              entity: undefined,
+              plural: false,
+              pos: 'WP',
+              value: 'what',
+              verb: false
+            }
+          ],
           type: 'interrogative',
           values: {}
         }

diff --git a/spec/parser.spec.js b/spec/parser.spec.js
@@ -4,8 +4,8 @@ const parser = require('../src/parser')
 
 describe('Parser', function () {
   describe('when extracting tokens from pattern', function () {
-    it('should ', function () {
-      parser.extract(`{action,VBP} the {target,[NN NNP NNPS NNS]} and go {direction,IN=[up,down, left, right]} {location=/[a-z]+town/}`)
+    it('should extract pattern correctly', function () {
+      parser.extract(`{action,VBP} the {target,[NN NNP NNPS NNS]} and go {direction,IN=[up,down, left, right]} {location=/[a-z]+town/} {rest=/.+/+}`)
         .should.eql(
           [
             {
@@ -36,6 +36,11 @@ describe('Parser', function () {
             {
               name: 'location',
               pattern: /[a-z]+town/
+            },
+            {
+              name: 'rest',
+              pattern: /.+/,
+              multiple: true
             }
           ]
         )
@@ -222,30 +227,30 @@ describe('Parser', function () {
   describe('when performing full analysis', function () {
     it('should resolve rule when valid', function () {
       return parser.analyze({
-        pattern: '{action,VRB} {target,[NN NRP NNS NRPS]}',
+        pattern: '{action,VBP} {target,[NN NNP NNS NNPS]}',
         type: 'imperative'
       }, 'one').should.eventually.eql({
         name: 'one',
-        pattern: '{action,VRB} {target,[NN NRP NNS NRPS]}',
+        pattern: '{action,VBP} {target,[NN NNP NNS NNPS]}',
         type: 'imperative',
         ordered: true,
         rank: 21,
         tokens: [
           {
             name: 'action',
-            pos: ['VRB']
+            pos: ['VBP']
           },
           {
             name: 'target',
-            pos: ['NN', 'NRP', 'NNS', 'NRPS']
+            pos: ['NN', 'NNP', 'NNS', 'NNPS']
           }
         ]
       })
     })
 
     it('should reject with detailed error when invalid', function () {
       return parser.analyze({
-        pattern: '{action,VRB} {target,[NN NRP NNS NRPS]}',
+        pattern: '{action,VBP} {target,[NN NNP NNS NNPS]}',
         type: 'interclarative'
       }, 'one').should.be.rejectedWith(
         'child "type" fails because ["type" must be one of [declarative, imperative, interrogative]]'