forked from redpanda-data/connect
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hash_sample.go
150 lines (123 loc) · 4.58 KB
/
hash_sample.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
package processor
import (
"math"
"time"
"github.com/OneOfOne/xxhash"
"github.com/dafanshu/benthos/v3/internal/docs"
"github.com/dafanshu/benthos/v3/lib/log"
"github.com/dafanshu/benthos/v3/lib/metrics"
"github.com/dafanshu/benthos/v3/lib/response"
"github.com/dafanshu/benthos/v3/lib/types"
)
//------------------------------------------------------------------------------
func init() {
Constructors[TypeHashSample] = TypeSpec{
constructor: NewHashSample,
Status: docs.StatusDeprecated,
Footnotes: `
## Alternatives
All functionality of this processor has been superseded by the
[bloblang](/docs/components/processors/bloblang) processor.`,
FieldSpecs: docs.FieldSpecs{
docs.FieldCommon("retain_min", "The lower percentage of the sample range."),
docs.FieldCommon("retain_max", "The upper percentage of the sample range."),
docs.FieldAdvanced("parts", "An array of message indexes within the batch to sample based on. If left empty all messages are included. This field is only applicable when batching messages [at the input level](/docs/configuration/batching).").Array(),
},
}
}
//------------------------------------------------------------------------------
// hashSamplingNorm is the multiplier that maps a uint64 onto the closed
// percentage range [0.0, 100.0]: 0 maps to 0.0 and math.MaxUint64 maps to
// 100.0.
const hashSamplingNorm = 100.0 / float64(math.MaxUint64)

// scaleNum converts a 64-bit hash value into a percentage between 0.0 and
// 100.0 (both ends reachable).
func scaleNum(n uint64) float64 {
	asFloat := float64(n)
	return asFloat * hashSamplingNorm
}
//------------------------------------------------------------------------------
// HashSampleConfig holds the settings of the HashSample processor.
type HashSampleConfig struct {
	// RetainMin is the lower bound of the retained sample range, as a
	// percentage. A batch is kept when its rate is >= RetainMin.
	RetainMin float64 `json:"retain_min" yaml:"retain_min"`
	// RetainMax is the upper bound of the retained sample range, as a
	// percentage. A batch is kept when its rate is < RetainMax.
	RetainMax float64 `json:"retain_max" yaml:"retain_max"`
	// Parts lists the indexes of the message parts that are fed into the hash.
	Parts []int `json:"parts" yaml:"parts"`
}

// NewHashSampleConfig builds a HashSampleConfig populated with the defaults:
// the [0, 10%) interval is retained and only the first part is hashed.
func NewHashSampleConfig() HashSampleConfig {
	var defaults HashSampleConfig
	defaults.RetainMin = 0.0
	defaults.RetainMax = 10.0
	// A fresh slice per call so callers never share backing storage.
	defaults.Parts = []int{0}
	return defaults
}
//------------------------------------------------------------------------------
// HashSample is a processor that removes messages based on a sample factor by
// hashing its contents.
//
// The selected message parts are hashed, the hash is scaled to a percentage,
// and the batch is kept only when that percentage lies in
// [RetainMin, RetainMax) of the HashSample configuration section.
type HashSample struct {
// conf is the processor configuration; ProcessMessage reads its HashSample
// section (Parts, RetainMin, RetainMax) on every call.
conf Config
// log receives the debug messages emitted when a batch is dropped early.
log log.Modular
// stats is the metrics aggregator the counters below were obtained from.
stats metrics.Type
// mCount ("count") is incremented once per ProcessMessage call.
mCount metrics.StatCounter
// mDropOOB ("dropped_part_out_of_bounds") is incremented when a configured
// part index falls outside the batch.
mDropOOB metrics.StatCounter
// mDropped ("dropped") is incremented by 1 on an out-of-bounds part index and
// by the batch length when the rate falls outside the retained range.
mDropped metrics.StatCounter
// mErr ("error") is incremented when writing a part into the hash fails.
mErr metrics.StatCounter
// mSent ("sent") is incremented by the batch length when a batch is retained.
mSent metrics.StatCounter
// mBatchSent ("batch.sent") is incremented by 1 when a batch is retained.
mBatchSent metrics.StatCounter
}
// NewHashSample constructs a HashSample processor from conf, wiring in the
// given logger and obtaining all of its counters from stats. The mgr argument
// is accepted but not used. The returned error is always nil.
func NewHashSample(
	conf Config, mgr types.Manager, log log.Modular, stats metrics.Type,
) (Type, error) {
	h := &HashSample{conf: conf, log: log, stats: stats}

	// Counters are fetched in a fixed order, one per tracked outcome.
	h.mCount = stats.GetCounter("count")
	h.mDropOOB = stats.GetCounter("dropped_part_out_of_bounds")
	h.mDropped = stats.GetCounter("dropped")
	h.mErr = stats.GetCounter("error")
	h.mSent = stats.GetCounter("sent")
	h.mBatchSent = stats.GetCounter("batch.sent")

	return h, nil
}
//------------------------------------------------------------------------------
// ProcessMessage applies the processor to a message, either creating >0
// resulting messages or a response to be sent back to the message source.
//
// The configured parts of msg are written into a 64-bit xxhash, the hash is
// scaled to a percentage with scaleNum, and the batch is retained only when
// that percentage is >= RetainMin and < RetainMax.
//
// Returns:
//   - a single-element slice holding msg unchanged, and a nil response, when
//     the batch is retained;
//   - nil messages and the response from response.NewAck() when the batch is
//     dropped, a configured part index is out of bounds, or hashing fails.
func (s *HashSample) ProcessMessage(msg types.Message) ([]types.Message, types.Response) {
	s.mCount.Incr(1)

	hash := xxhash.New64()
	lParts := msg.Len()

	// The parts field is documented as "If left empty all messages are
	// included". Expand an empty list to every index of the batch; without
	// this nothing would be hashed and every batch would receive the same
	// constant rate, defeating the sampling entirely.
	parts := s.conf.HashSample.Parts
	if len(parts) == 0 {
		parts = make([]int, lParts)
		for i := range parts {
			parts[i] = i
		}
	}

	for _, index := range parts {
		if index < 0 {
			// Negative indexes count backwards from the end.
			index = lParts + index
		}

		// Check boundary of part index. Note that this path counts a single
		// drop, whereas a range-based drop below counts the whole batch.
		if index < 0 || index >= lParts {
			s.mDropOOB.Incr(1)
			s.mDropped.Incr(1)
			s.log.Debugf("Cannot sample message part %v for parts count: %v\n", index, lParts)
			return nil, response.NewAck()
		}

		// Attempt to add part to hash.
		if _, err := hash.Write(msg.Get(index).Get()); err != nil {
			s.mErr.Incr(1)
			s.log.Debugf("Cannot hash message part for sampling: %v\n", err)
			return nil, response.NewAck()
		}
	}

	// Retain the batch only when its rate lies in [RetainMin, RetainMax).
	rate := scaleNum(hash.Sum64())
	if rate >= s.conf.HashSample.RetainMin && rate < s.conf.HashSample.RetainMax {
		s.mBatchSent.Incr(1)
		s.mSent.Incr(int64(msg.Len()))
		return []types.Message{msg}, nil
	}

	s.mDropped.Incr(int64(msg.Len()))
	return nil, response.NewAck()
}
// CloseAsync requests that the processor shut down. For HashSample this is a
// no-op: the method body does nothing.
func (s *HashSample) CloseAsync() {}
// WaitForClose reports whether the processor has finished closing. For
// HashSample it returns nil immediately and never consults timeout.
func (s *HashSample) WaitForClose(timeout time.Duration) error { return nil }
//------------------------------------------------------------------------------