// Package nlp enables natural language processing through functions around
// tokenization and stemming.
package nlp
import (
	"database/sql/driver"
	"encoding/csv"
	"errors"
	"io"
	"regexp"
	"strings"

	"github.com/dchest/stemmer/porter2"
	"github.com/itsabot/abot/shared/log"
)
// StructuredInput is generated by Ava and sent to plugins as a helper tool.
// Additional fields should be added, covering Times, Places, etc. to
// make plugin development even easier. Note that right now People is unused.
type StructuredInput struct {
	// Commands and Objects are StringSlices (see below) so they can be
	// scanned from and stored to Postgres array columns directly.
	// NOTE(review): presumably Commands holds verbs/actions and Objects
	// holds their targets from the parsed sentence — confirm against the
	// code that populates this struct (not visible in this file).
	Commands StringSlice
	Objects  StringSlice
	// TODO
	// People StringSlice
	// Places StringSlice
	// Times []time.Time
}
// SIT is a Structured Input Type. It corresponds to either a Command or an
// Object with additional Structured Input Types to be added later. The planned
// values (CommandI, PersonI, ObjectI) are in the commented-out block at the
// bottom of this file.
type SIT int
// TokenizeSentence returns a sentence broken into tokens. Tokens are individual
// words as well as punctuation. For example, "Hi! How are you?" becomes
// []string{"Hi", "!", "How", "are", "you", "?"}
func TokenizeSentence(sent string) []string {
	tokens := []string{}
	for _, w := range strings.Fields(sent) {
		// Byte indices of punctuation within the word. The punctuation
		// set is all ASCII, so byte indices and single-byte slicing
		// below are safe even if the word contains multi-byte runes.
		found := []int{}
		for i, r := range w {
			switch r {
			case '\'', '"', ',', '.', ':', ';', '!', '?':
				found = append(found, i)
			}
		}
		if len(found) == 0 {
			tokens = append(tokens, w)
			continue
		}
		// Split the word around each punctuation mark. Tracking the
		// end of the previous segment (prev) ensures text between and
		// after marks is emitted exactly once and empty segments are
		// skipped. The previous implementation appended an empty token
		// after trailing punctuation ("Hi!" -> "Hi", "!", "") and
		// re-emitted the word prefix for every mark after the first
		// ("Hi!?" -> "Hi", "!", "Hi!", "?", "").
		prev := 0
		for _, j := range found {
			if j > prev {
				tokens = append(tokens, w[prev:j])
			}
			tokens = append(tokens, string(w[j]))
			prev = j + 1
		}
		if prev < len(w) {
			tokens = append(tokens, w[prev:])
		}
	}
	log.Debug("found tokens", tokens)
	return tokens
}
// StemTokens returns the porter2 (snowball) stems for each token passed into
// it. Single-character punctuation tokens (as produced by TokenizeSentence)
// are dropped, and every remaining token is lowercased before stemming.
func StemTokens(tokens []string) []string {
	stemmer := porter2.Stemmer
	stems := []string{}
	for _, tok := range tokens {
		if len(tok) == 1 {
			switch tok {
			case "'", `"`, ",", ".", ":", ";", "!", "?":
				continue
			}
		}
		stems = append(stems, stemmer.Stem(strings.ToLower(tok)))
	}
	return stems
}
// StringSlice replaces []string, adding custom sql support for arrays in lieu
// of pq.
type StringSlice []string

// QuoteEscapeRegex replaces escaped quotes except if it is preceded by a
// literal backslash, e.g. "\\" should translate to a quoted element whose value
// is \
//
// Capture group 1 is the character before the \" sequence (required to be a
// non-backslash) plus any even-length run of backslashes after it; Scan
// rewrites each match to `$1""`, the doubled-quote escape that encoding/csv
// understands.
var QuoteEscapeRegex = regexp.MustCompile(`([^\\]([\\]{2})*)\\"`)
// Scan implements sql.Scanner, converting a Postgres array literal (e.g.
// `{"a","b"}`) into a slice of strings. See:
// http://www.postgresql.org/docs/9.1/static/arrays.html#ARRAYS-IO
func (s *StringSlice) Scan(src interface{}) error {
	// A NULL column scans to an empty slice rather than erroring.
	if src == nil {
		*s = StringSlice{}
		return nil
	}
	asBytes, ok := src.([]byte)
	if !ok {
		return errors.New("scan source was not []byte")
	}
	str := string(asBytes)
	// Guard the brace-trimming slice below; a valid array literal is at
	// least "{}" (2 bytes). Previously shorter input caused a panic.
	if len(str) < 2 {
		return errors.New("scan source was not a Postgres array literal")
	}
	// Rewrite backslash-escaped quotes to CSV-style doubled quotes and
	// collapse doubled backslashes, then strip the surrounding braces.
	str = QuoteEscapeRegex.ReplaceAllString(str, `$1""`)
	str = strings.Replace(str, `\\`, `\`, -1)
	str = str[1 : len(str)-1]
	// encoding/csv handles the comma-separated, optionally-quoted
	// elements. An empty array ("{}") yields io.EOF, which is not an
	// error here. (Previously this compared err.Error() against "EOF",
	// which breaks on wrapped errors; errors.Is is robust.)
	csvReader := csv.NewReader(strings.NewReader(str))
	slice, err := csvReader.Read()
	if err != nil && !errors.Is(err, io.EOF) {
		return err
	}
	*s = StringSlice(slice)
	return nil
}
// Value implements driver.Valuer, converting the slice to a Postgres array
// literal. See:
// http://www.postgresql.org/docs/9.1/static/arrays.html#ARRAYS-IO
//
// Per-element string escapes:
//	\ => \\\
//	" => \"
func (s StringSlice) Value() (driver.Value, error) {
	// Escape into a fresh slice so the receiver's elements are left
	// untouched. The previous implementation wrote the quoted/escaped
	// form back into s, corrupting the caller's data on every call and
	// double-escaping on repeated calls.
	quoted := make([]string, len(s))
	for i, elem := range s {
		e := strings.Replace(elem, `\`, `\\\`, -1)
		e = strings.Replace(e, `"`, `\"`, -1)
		quoted[i] = `"` + e + `"`
	}
	return "{" + strings.Join(quoted, ",") + "}", nil
}
// Last safely returns the final element of a StringSlice, or the empty string
// when the slice is empty. The last element is most often the target of a
// pronoun: in "Where is that?", "that" will usually refer to the last Object
// named in the previous sentence.
func (s StringSlice) Last() string {
	if n := len(s); n > 0 {
		return s[n-1]
	}
	return ""
}
// String converts a StringSlice into a single string with each element
// separated by a space. An empty slice yields the empty string.
func (s StringSlice) String() string {
	return strings.Join(s, " ")
}
// StringSlice converts a StringSlice into a []string, dropping every element
// of length two or fewer — presumably to discard punctuation tokens and very
// short words; confirm against callers.
func (s StringSlice) StringSlice() []string {
	kept := []string{}
	for _, word := range s {
		if len(word) > 2 {
			kept = append(kept, word)
		}
	}
	return kept
}
/*
// TODO with addContext
const (
CommandI SIT = iota + 1
PersonI
ObjectI
)
// Pronouns converts pronouns to the type of object it represents. This will be
// useful for adding context into user messages. For example, when a user says,
// "buy that", Ava should know "that" refers to an Object and is most likely a
// reference to the most recent object discussed.
var Pronouns map[string]SIT = map[string]SIT{
"me": PersonI,
"us": PersonI,
"you": PersonI,
"him": PersonI,
"her": PersonI,
"them": PersonI,
"it": ObjectI,
"that": ObjectI,
// Ultimately Place and Time would be nice-to-have in a structured
// input, but they don't outweigh the cost of training a full NER on
// each new plugin at this point. Additional thought should be given as
// to how this can be enabled more simply than requiring training an ML
// plugin.
// "there": PlaceI,
// "then": TimeI,
}
*/