Skip to content

Commit

Permalink
chore: adds memoize implementation for regexes and ahocorasick (#836)
Browse files Browse the repository at this point in the history
* chore: adds memoize implementation for regexes.

Currently we create and allocate memory for every regex we compile, however there are cases where you compile the same regex over and over e.g. corazawaf/coraza-caddy#76. Here we implement the memoize pattern to be able to reuse the regex and reduce the memory consumption.

* docs: adds comments to code.

* chore: simplify the memoize package by using sync.Map.

* feat: extends memoize to ahocorasick and allow impl for tinygo but not synced as no concurrency.

* tests: covers memoize_builders in tinygo.

* chore: fixes nosync for tinygo.

* docs: updates docs.

---------

Co-authored-by: Juan Pablo Tosso <jptosso@gmail.com>
  • Loading branch information
jcchavezs and jptosso committed Aug 6, 2023
1 parent b3490b4 commit 4f30afe
Show file tree
Hide file tree
Showing 21 changed files with 518 additions and 26 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/tinygo.yml
Expand Up @@ -47,3 +47,6 @@ jobs:
- name: Tests
run: tinygo test ./...

- name: Tests memoize
run: tinygo test -tags=memoize_builders ./...
3 changes: 3 additions & 0 deletions README.md
Expand Up @@ -106,6 +106,9 @@ have compatibility guarantees across minor versions - use with care.
the operator with `plugins.RegisterOperator` to reduce binary size / startup overhead.
* `coraza.rule.multiphase_valuation` - enables evaluation of rule variables in the phases that they are ready, not
only the phase the rule is defined for.
* `memoize_builders` - enables memoization of builders for regex and aho-corasick
dictionaries to reduce memory consumption in deployments that launch several coraza
instances. For more context check [this issue](https://github.com/corazawaf/coraza-caddy/issues/76)

## E2E Testing

Expand Down
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -24,6 +24,7 @@ require (
github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9
github.com/tidwall/gjson v1.14.4
golang.org/x/net v0.11.0
golang.org/x/sync v0.1.0
rsc.io/binaryregexp v0.2.0
)

Expand Down
1 change: 1 addition & 0 deletions go.sum
Expand Up @@ -37,6 +37,7 @@ golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
Expand Down
14 changes: 12 additions & 2 deletions internal/corazawaf/rule.go
Expand Up @@ -15,6 +15,7 @@ import (
"github.com/corazawaf/coraza/v3/experimental/plugins/macro"
"github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes"
"github.com/corazawaf/coraza/v3/internal/corazarules"
"github.com/corazawaf/coraza/v3/internal/memoize"
"github.com/corazawaf/coraza/v3/types"
"github.com/corazawaf/coraza/v3/types/variables"
)
Expand Down Expand Up @@ -456,7 +457,12 @@ func (r *Rule) AddVariable(v variables.RuleVariable, key string, iscount bool) e
var re *regexp.Regexp
if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' {
key = key[1 : len(key)-1]
re = regexp.MustCompile(key)

if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil {
return err
} else {
re = vare.(*regexp.Regexp)
}
}

if multiphaseEvaluation {
Expand Down Expand Up @@ -521,7 +527,11 @@ func (r *Rule) AddVariableNegation(v variables.RuleVariable, key string) error {
var re *regexp.Regexp
if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' {
key = key[1 : len(key)-1]
re = regexp.MustCompile(key)
if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil {
return err
} else {
re = vare.(*regexp.Regexp)
}
}
// Prevent sigsev
if r == nil {
Expand Down
17 changes: 17 additions & 0 deletions internal/memoize/README.md
@@ -0,0 +1,17 @@
# Memoize

Memoize allows to cache certain expensive function calls and
cache the result. The main advantage in Coraza is to memoize
the regexes and aho-corasick dictionaries when the connects
spins up more than one WAF in the same process and hence same
regexes are being compiled over and over.

Currently it is opt-in under the `memoize_builders` build tag
as under a misuse (e.g. using after build time) it could lead
to a memory leak as currently the cache is global.

**Important:** Connectors with *live reload* functionality (e.g. Caddy)
could lead to memory leaks which might or might not be negligible in
most of the cases as usually config changes in a WAF are about a few
rules, this is old objects will be still alive in memory until the program
stops.
10 changes: 10 additions & 0 deletions internal/memoize/noop.go
@@ -0,0 +1,10 @@
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
// SPDX-License-Identifier: Apache-2.0

//go:build !memoize_builders

package memoize

func Do(_ string, fn func() (interface{}, error)) (interface{}, error) {
return fn()
}
36 changes: 36 additions & 0 deletions internal/memoize/nosync.go
@@ -0,0 +1,36 @@
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
// SPDX-License-Identifier: Apache-2.0

//go:build tinygo && memoize_builders

package memoize

import "sync"

var doer = makeDoer(new(sync.Map))

// Do executes and returns the results of the given function, unless there was a cached
// value of the same key. Only one execution is in-flight for a given key at a time.
// The boolean return value indicates whether v was previously stored.
func Do(key string, fn func() (interface{}, error)) (interface{}, error) {
value, err, _ := doer(key, fn)
return value, err
}

// makeDoer returns a function that executes and returns the results of the given function
func makeDoer(cache *sync.Map) func(string, func() (interface{}, error)) (interface{}, error, bool) {
return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) {
// Check cache
value, found := cache.Load(key)
if found {
return value, nil, true
}

data, err := fn()
if err == nil {
cache.Store(key, data)
}

return data, err, false
}
}
167 changes: 167 additions & 0 deletions internal/memoize/nosync_test.go
@@ -0,0 +1,167 @@
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
// SPDX-License-Identifier: Apache-2.0

//go:build tinygo && memoize_builders

// https://github.com/kofalt/go-memoize/blob/master/memoize.go

package memoize

import (
"errors"
"sync"
"testing"
)

func TestDo(t *testing.T) {
expensiveCalls := 0

// Function tracks how many times its been called
expensive := func() (interface{}, error) {
expensiveCalls++
return expensiveCalls, nil
}

// First call SHOULD NOT be cached
result, err := Do("key1", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 1, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

// Second call on same key SHOULD be cached
result, err = Do("key1", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 1, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

// First call on a new key SHOULD NOT be cached
result, err = Do("key2", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 2, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}
}

func TestSuccessCall(t *testing.T) {
do := makeDoer(new(sync.Map))

expensiveCalls := 0

// Function tracks how many times its been called
expensive := func() (interface{}, error) {
expensiveCalls++
return expensiveCalls, nil
}

// First call SHOULD NOT be cached
result, err, cached := do("key1", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 1, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := false, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}

// Second call on same key SHOULD be cached
result, err, cached = do("key1", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 1, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := true, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}

// First call on a new key SHOULD NOT be cached
result, err, cached = do("key2", expensive)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 2, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := false, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}
}

func TestFailedCall(t *testing.T) {
do := makeDoer(new(sync.Map))

calls := 0

// This function will fail IFF it has not been called before.
twoForTheMoney := func() (interface{}, error) {
calls++

if calls == 1 {
return calls, errors.New("Try again")
} else {
return calls, nil
}
}

// First call should fail, and not be cached
result, err, cached := do("key1", twoForTheMoney)
if err == nil {
t.Fatalf("expected error")
}

if want, have := 1, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := false, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}

// Second call should succeed, and not be cached
result, err, cached = do("key1", twoForTheMoney)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 2, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := false, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}

// Third call should succeed, and be cached
result, err, cached = do("key1", twoForTheMoney)
if err != nil {
t.Fatalf("unexpected error: %s", err.Error())
}

if want, have := 2, result.(int); want != have {
t.Fatalf("unexpected value, want %d, have %d", want, have)
}

if want, have := true, cached; want != have {
t.Fatalf("unexpected caching, want %t, have %t", want, have)
}
}
47 changes: 47 additions & 0 deletions internal/memoize/sync.go
@@ -0,0 +1,47 @@
// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors
// SPDX-License-Identifier: Apache-2.0

//go:build !tinygo && memoize_builders

// https://github.com/kofalt/go-memoize/blob/master/memoize.go

package memoize

import (
"sync"

"golang.org/x/sync/singleflight"
)

var doer = makeDoer(new(sync.Map), new(singleflight.Group))

// Do executes and returns the results of the given function, unless there was a cached
// value of the same key. Only one execution is in-flight for a given key at a time.
// The boolean return value indicates whether v was previously stored.
func Do(key string, fn func() (interface{}, error)) (interface{}, error) {
value, err, _ := doer(key, fn)
return value, err
}

// makeDoer returns a function that executes and returns the results of the given function
func makeDoer(cache *sync.Map, group *singleflight.Group) func(string, func() (interface{}, error)) (interface{}, error, bool) {
return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) {
// Check cache
value, found := cache.Load(key)
if found {
return value, nil, true
}

// Combine memoized function with a cache store
value, err, _ := group.Do(key, func() (interface{}, error) {
data, innerErr := fn()
if innerErr == nil {
cache.Store(key, data)
}

return data, innerErr
})

return value, err, false
}
}

0 comments on commit 4f30afe

Please sign in to comment.