Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
jgrahamc committed Nov 22, 2013
1 parent 204604b commit 8acc45e
Show file tree
Hide file tree
Showing 4 changed files with 580 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Expand Up @@ -20,3 +20,5 @@ _cgo_export.*
_testmain.go

*.exe

*~
67 changes: 67 additions & 0 deletions Makefile
@@ -0,0 +1,67 @@
NAME := ahocorasick
VERSION := 0.1
ITERATION := $(shell date +%s)
# REVISION := $(shell git log -n1 --pretty=format:%h)

export GOPATH := $(PWD)

BUILD_DEPS := go

.PHONY: all
all: $(NAME)

.PHONY: test
test:
@go test $(NAME)

.PHONY: bench
bench:
@go test -bench . $(NAME)

.PHONY: print-builddeps
print-builddeps:
@echo $(BUILD_DEPS)

.PHONY: $(NAME)
$(NAME): bin/$(NAME)

SRC := $(shell find src/$(NAME) -type f)
bin/$(NAME): $(SRC)
@go fmt $(NAME)
@go install -tags "$(TAGS)" -ldflags "$(LDFLAGS)" $(NAME)

BUILD_PATH := build
INSTALL_PREFIX := usr/local
CFSSL_BUILD_PATH := $(BUILD_PATH)/$(INSTALL_PREFIX)/$(NAME)

FPM := fakeroot fpm -C $(BUILD_PATH) \
-s dir \
-t deb \
--deb-compression bzip2 \
-v $(VERSION) \
--iteration $(ITERATION)

DEB_PACKAGE := $(NAME)_$(VERSION)-$(ITERATION)_amd64.deb
$(DEB_PACKAGE): TAGS := release
$(DEB_PACKAGE): LDFLAGS := -X main.version $(VERSION) -X main.revision $(REVISION)
$(DEB_PACKAGE): clean all
mkdir -p $(CFSSL_BUILD_PATH)
cp bin/$(NAME) $(CFSSL_BUILD_PATH)
$(FPM) -n $(NAME) $(INSTALL_PREFIX)/$(NAME)

register-%.deb: ; $(PACKAGE_REGISTER_BIN) $*.deb

.PHONY: package
package: $(DEB_PACKAGE)

.PHONY: clean-package
clean-package:
$(RM) -r $(BUILD_PATH)
$(RM) $(DEB_PACKAGE)

.PHONY: clean
clean: clean-package
@go clean -i $(NAME)/...
@$(RM) -r pkg

print-%: ; @echo $*=$($*)
232 changes: 232 additions & 0 deletions src/ahocorasick/ahocorasick.go
@@ -0,0 +1,232 @@
// ahocorasick.go: implementation of the Aho-Corasick string matching
// algorithm. Actually implemented as matching against []byte rather
// than the Go string type. Throughout this code []byte is referred to
// as a blice.
//
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
//
// Copyright (c) 2013 CloudFlare, Inc.

package ahocorasick

import (
"container/list"
)

// A node in the trie structure used to implement Aho-Corasick
type node struct {
b []byte // The blice at this node

output bool // True means this node represents a blice that should
// be output when matching
index int // index into original dictionary if output is true

counter int // Set to the value of the Matcher.counter when a
// match is output to prevent duplicates output

// The use of fixed size arrays is space-inefficient but fast for
// lookups.

child [256]*node // A non-nil entry in this array means that the
// index represents a byte value which can be
// appended to the current node. Blices in the
// trie are built up byte by byte through these
// child node pointers.

suffix *node // Pointer to the longest possible strict suffix of
// this node

fail *node // Pointer to the next node which is in the dictionary
// which can be reached from here following suffixes. Called fail
// because it is used to fallback in the trie when a match fails.
}

// Matcher is returned by NewMatcher and contains a list of blices to
// match against
type Matcher struct {
root *node // Points to the root of the trie
counter int // Counts the number of matches done, and is used to
// prevent output of multiple matches of the same string
trie []node // preallocated block of memory containing all the
// nodes
extent int // offset into trie that is currently free
}

// finndBlice looks for a blice in the trie starting from the root and
// returns a pointer to the node representing the end of the blice. If
// the blice is not found it returns nil.
func (m *Matcher) findBlice(b []byte) *node {
n := m.root

for n != nil && len(b) > 0 {
n = n.child[int(b[0])]
b = b[1:]
}

return n
}

// getFreeNode: gets a free node structure from the Matcher's trie
// pool and updates the extent to point to the next free node.
func (m *Matcher) getFreeNode() *node {
m.extent += 1

return &m.trie[m.extent-1]
}

// buildTrie builds the fundamental trie structure from a set of
// blices.
func (m *Matcher) buildTrie(dictionary [][]byte) {

// Work out the maximum size for the trie (all dictionary entries
// are distinct plus the root). This is used to preallocate memory
// for it.

max := 1
for _, blice := range dictionary {
max += len(blice)
}
m.trie = make([]node, max)

m.root = m.getFreeNode()

// This loop builds the nodes in the trie by following through
// each dictionary entry building the children pointers.

for i, blice := range dictionary {
n := m.root
path := make([]byte, 0)
for _, b := range blice {
path = append(path, b)

c := n.child[int(b)]

if c == nil {
c = m.getFreeNode()
n.child[int(b)] = c
c.b = make([]byte, len(path))
copy(c.b, path)

// Nodes directly under the root node will have the
// root as their fail point as there are no suffixes
// possible.

if len(path) == 1 {
c.fail = m.root
}

c.suffix = m.root
}

n = c
}

// The last value of n points to the node representing a
// dictionary entry

n.output = true
n.index = i
}

l := new(list.List)
l.PushBack(m.root)

for l.Len() > 0 {
n := l.Remove(l.Front()).(*node)

for i := 0; i < 256; i++ {
c := n.child[i]
if c != nil {
l.PushBack(c)

for j := 1; j < len(c.b); j++ {
c.fail = m.findBlice(c.b[j:])
if c.fail != nil {
break
}
}

if c.fail == nil {
c.fail = m.root
}

for j := 1; j < len(c.b); j++ {
s := m.findBlice(c.b[j:])
if s != nil && s.output {
c.suffix = s
break
}
}
}
}
}
}

// NewMatcher creates a new Matcher used to match against a set of
// blices
func NewMatcher(dictionary [][]byte) *Matcher {
m := new(Matcher)

m.buildTrie(dictionary)

return m
}

// NewStringMatcher creates a new Matcher used to match against a set
// of strings (this is a helper to make initialization easy)
func NewStringMatcher(dictionary []string) *Matcher {
m := new(Matcher)

d := make([][]byte, 0)
for _, s := range dictionary {
d = append(d, []byte(s))
}

m.buildTrie(d)

return m
}

// Match searches in for blices and returns all the blices found as
// indexes into the original dictionary
func (m *Matcher) Match(in []byte) []int {
m.counter += 1
hits := make([]int, 0)

n := m.root

for _, b := range in {
c := int(b)

for n.child[c] == nil && n != m.root {
n = n.fail
}

if n.child[c] != nil {
f := n.child[c]
n = f

if f.output && f.counter != m.counter {
hits = append(hits, f.index)
f.counter = m.counter
}

for f.suffix != m.root {
f = f.suffix
if f.counter != m.counter {
hits = append(hits, f.index)
f.counter = m.counter
} else {

// There's no point working our way up the
// suffixes if it's been done before for this call
// to Match. The matches are already in hits.

break
}
}
}
}

return hits
}

0 comments on commit 8acc45e

Please sign in to comment.