// Package nlp enables natural language processing through functions around
// tokenization and stemming.
package nlp
import (
	"database/sql/driver"
	"encoding/csv"
	"errors"
	"io"
	"regexp"
	"strings"

	"github.com/dchest/stemmer/porter2"
	"github.com/itsabot/abot/shared/log"
)
// StructuredInput is generated by Ava and sent to plugins as a helper tool.
// Additional fields should be added, covering Times, Places, etc. to
// make plugin development even easier. Note that right now People is unused.
type StructuredInput struct {
	// Commands and Objects are StringSlices (see below) so they can be
	// scanned from and stored to Postgres array columns directly.
	// NOTE(review): presumably Commands holds verbs/actions and Objects
	// holds their targets from the parsed sentence — confirm against the
	// code that populates this struct (not visible in this file).
	Commands StringSlice
	Objects  StringSlice
	// TODO
	// People StringSlice
	// Places StringSlice
	// Times []time.Time
}
// SIT is a Structured Input Type. It corresponds to either a Command or an
// Object with additional Structured Input Types to be added later. The planned
// values (CommandI, PersonI, ObjectI) are in the commented-out block at the
// bottom of this file.
type SIT int
// TokenizeSentence returns a sentence broken into tokens. Tokens are individual
// words as well as punctuation. For example, "Hi! How are you?" becomes
// []string{"Hi", "!", "How", "are", "you", "?"}
func TokenizeSentence(sent string) []string {
	tokens := []string{}
	for _, w := range strings.Fields(sent) {
		// Byte indices of punctuation within the word. The punctuation
		// set is all ASCII, so byte indices and single-byte slicing
		// below are safe even if the word contains multi-byte runes.
		found := []int{}
		for i, r := range w {
			switch r {
			case '\'', '"', ',', '.', ':', ';', '!', '?':
				found = append(found, i)
			}
		}
		if len(found) == 0 {
			tokens = append(tokens, w)
			continue
		}
		// Split the word around each punctuation mark. Tracking the
		// end of the previous segment (prev) ensures text between and
		// after marks is emitted exactly once and empty segments are
		// skipped. The previous implementation appended an empty token
		// after trailing punctuation ("Hi!" -> "Hi", "!", "") and
		// re-emitted the word prefix for every mark after the first
		// ("Hi!?" -> "Hi", "!", "Hi!", "?", "").
		prev := 0
		for _, j := range found {
			if j > prev {
				tokens = append(tokens, w[prev:j])
			}
			tokens = append(tokens, string(w[j]))
			prev = j + 1
		}
		if prev < len(w) {
			tokens = append(tokens, w[prev:])
		}
	}
	log.Debug("found tokens", tokens)
	return tokens
}
// StemTokens returns the porter2 (snowball) stems for each token passed into
// it. Single-character punctuation tokens (as produced by TokenizeSentence)
// are dropped, and every remaining token is lowercased before stemming.
func StemTokens(tokens []string) []string {
	stemmer := porter2.Stemmer
	stems := []string{}
	for _, tok := range tokens {
		if len(tok) == 1 {
			switch tok {
			case "'", `"`, ",", ".", ":", ";", "!", "?":
				continue
			}
		}
		stems = append(stems, stemmer.Stem(strings.ToLower(tok)))
	}
	return stems
}
// StringSlice replaces []string, adding custom sql support for arrays in lieu
// of pq.
type StringSlice []string

// QuoteEscapeRegex replaces escaped quotes except if it is preceded by a
// literal backslash, e.g. "\\" should translate to a quoted element whose value
// is \
//
// Capture group 1 is the character before the \" sequence (required to be a
// non-backslash) plus any even-length run of backslashes after it; Scan
// rewrites each match to `$1""`, the doubled-quote escape that encoding/csv
// understands.
var QuoteEscapeRegex = regexp.MustCompile(`([^\\]([\\]{2})*)\\"`)
// Scan implements sql.Scanner, converting a Postgres array literal (e.g.
// `{"a","b"}`) into a slice of strings. See:
// http://www.postgresql.org/docs/9.1/static/arrays.html#ARRAYS-IO
func (s *StringSlice) Scan(src interface{}) error {
	// A NULL column scans to an empty slice rather than erroring.
	if src == nil {
		*s = StringSlice{}
		return nil
	}
	asBytes, ok := src.([]byte)
	if !ok {
		return errors.New("scan source was not []byte")
	}
	str := string(asBytes)
	// Guard the brace-trimming slice below; a valid array literal is at
	// least "{}" (2 bytes). Previously shorter input caused a panic.
	if len(str) < 2 {
		return errors.New("scan source was not a Postgres array literal")
	}
	// Rewrite backslash-escaped quotes to CSV-style doubled quotes and
	// collapse doubled backslashes, then strip the surrounding braces.
	str = QuoteEscapeRegex.ReplaceAllString(str, `$1""`)
	str = strings.Replace(str, `\\`, `\`, -1)
	str = str[1 : len(str)-1]
	// encoding/csv handles the comma-separated, optionally-quoted
	// elements. An empty array ("{}") yields io.EOF, which is not an
	// error here. (Previously this compared err.Error() against "EOF",
	// which breaks on wrapped errors; errors.Is is robust.)
	csvReader := csv.NewReader(strings.NewReader(str))
	slice, err := csvReader.Read()
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}
	*s = StringSlice(slice)
	return nil
}
// Value implements driver.Valuer, converting the slice to a Postgres array
// literal. See:
// http://www.postgresql.org/docs/9.1/static/arrays.html#ARRAYS-IO
//
// Per-element string escapes:
//	\ => \\\
//	" => \"
func (s StringSlice) Value() (driver.Value, error) {
	// Escape into a fresh slice so the receiver's elements are left
	// untouched. The previous implementation wrote the quoted/escaped
	// form back into s, corrupting the caller's data on every call and
	// double-escaping on repeated calls.
	quoted := make([]string, len(s))
	for i, elem := range s {
		e := strings.Replace(elem, `\`, `\\\`, -1)
		e = strings.Replace(e, `"`, `\"`, -1)
		quoted[i] = `"` + e + `"`
	}
	return "{" + strings.Join(quoted, ",") + "}", nil
}
// Last safely returns the final element of a StringSlice, or the empty string
// when the slice is empty. The last element is most often the target of a
// pronoun: in "Where is that?", "that" will usually refer to the last Object
// named in the previous sentence.
func (s StringSlice) Last() string {
	if n := len(s); n > 0 {
		return s[n-1]
	}
	return ""
}
// String converts a StringSlice into a single string with each element
// separated by a space. An empty slice yields the empty string.
func (s StringSlice) String() string {
	return strings.Join(s, " ")
}
// StringSlice converts a StringSlice into a []string, dropping every element
// of length two or fewer — presumably to discard punctuation tokens and very
// short words; confirm against callers.
func (s StringSlice) StringSlice() []string {
	kept := []string{}
	for _, word := range s {
		if len(word) > 2 {
			kept = append(kept, word)
		}
	}
	return kept
}
/*
// TODO with addContext
const (
CommandI SIT = iota + 1
PersonI
ObjectI
)
// Pronouns converts pronouns to the type of object it represents. This will be
// useful for adding context into user messages. For example, when a user says,
// "buy that", Ava should know "that" refers to an Object and is most likely a
// reference to the most recent object discussed.
var Pronouns map[string]SIT = map[string]SIT{
"me": PersonI,
"us": PersonI,
"you": PersonI,
"him": PersonI,
"her": PersonI,
"them": PersonI,
"it": ObjectI,
"that": ObjectI,
// Ultimately Place and Time would be nice-to-have in a structured
// input, but they don't outweigh the cost of training a full NER on
// each new plugin at this point. Additional thought should be given as
// to how this can be enabled more simply than requiring training an ML
// plugin.
// "there": PlaceI,
// "then": TimeI,
}
*/