-
Notifications
You must be signed in to change notification settings - Fork 49
/
tokenizer_spec.rb
344 lines (325 loc) · 11.9 KB
/
tokenizer_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# encoding: utf-8
#
require 'spec_helper'
describe Picky::Tokenizer do
# An unknown option (here a misspelling of rejects_token_if) must not
# fail silently: the tokenizer raises with a help text listing every
# valid option.
describe 'with wrong/incorrectly spelled option' do
  it 'informs the user nicely' do
    expect do
      described_class.new rejetcs_token_if: :blank?.to_proc
    end.to raise_error(<<-MESSAGE)
The option "rejetcs_token_if" is not a valid option for a Picky tokenizer.
Please see https://github.com/floere/picky/wiki/Indexing-configuration for valid options.
A short overview:
removes_characters /regexp/
stopwords /regexp/
splits_text_on /regexp/ or "String", default /\s/
normalizes_words [[/replace (this)/, 'with this \\1'], ...]
rejects_token_if Proc/lambda, default :blank?.to_proc
substitutes_characters_with Picky::CharacterSubstituter or responds to #substitute(String)
stems_with Instance responds to #stem(String)
case_sensitive true/false
MESSAGE
  end
end
# A tokenizer configured with a custom rejection lambda (drops tokens
# shorter than 2 chars or equal to :hello) and case sensitivity on.
context 'with special instance' do
  # No space between `let` and `(` — the original `let (:tokenizer)`
  # triggers Ruby's parentheses-as-grouped-expression warning.
  let(:tokenizer) { described_class.new rejects_token_if: lambda { |token| token.to_s.length < 2 || token == :hello }, case_sensitive: true }
  it 'rejects tokens with length < 2' do
    tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
  end
  it 'rejects tokens that are called :hello' do
    tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
  end
  describe 'to_s' do
    it 'spits out the right text' do
      # to_s reflects the configuration: rejection and case sensitivity set.
      tokenizer.to_s.should == <<-EXPECTED
Removes characters: -
Stopwords: -
Splits text on: /\\s/
Normalizes words: -
Rejects tokens? Yes, see line 29 in app/application.rb
Substitutes chars? -
Stems? -
Case sensitive? Yes.
EXPECTED
    end
  end
end
context 'with normal instance' do
let(:tokenizer) { described_class.new }
describe 'to_s' do
  it 'spits out the right text' do
    # A default tokenizer has no configuration: every aspect prints "-".
    tokenizer.to_s.should == <<-EXPECTED
Removes characters: -
Stopwords: -
Splits text on: /\\s/
Normalizes words: -
Rejects tokens? -
Substitutes chars? -
Stems? -
Case sensitive? -
EXPECTED
  end
end
describe 'rejects_token_if' do
  it 'rejects empty tokens by default' do
    # Default criterion is :blank?.to_proc, so nil and '' are dropped.
    tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
  end
  it 'rejects tokens based on the given rejection criteria if set' do
    tokenizer.rejects_token_if :nil?.to_proc
    # With :nil? only nil is dropped — the empty string now survives.
    tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
  end
end
describe "substitute(s)_characters*" do
  it "doesn't substitute if there is no substituter" do
    # Without a substituter the text passes through unchanged.
    tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
  end
  it 'raises if nothing with #substitute is given' do
    # The option requires an object responding to #substitute(text).
    expect { tokenizer.substitutes_characters_with Object.new }.
      to raise_error(<<-ERROR)
The substitutes_characters_with option needs a character substituter,
which responds to #substitute(text) and returns substituted_text."
ERROR
  end
  it "uses the substituter to replace characters" do
    tokenizer.substitutes_characters_with Picky::CharacterSubstituters::WestEuropean.new
    tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
  end
  it "uses the european substituter as default" do
    # Calling the setter without an argument falls back to WestEuropean.
    tokenizer.substitutes_characters_with
    tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
  end
end
describe "normalizes_words" do
  it 'handles broken arguments' do
    # normalizes_words only accepts an array of [pattern, replacement] pairs.
    expect { tokenizer.normalizes_words(:not_an_array) }.to raise_error(ArgumentError)
  end
  context "without normalizes_words called" do
    it "has normalize_with_patterns" do
      expect { tokenizer.normalize_with_patterns('any') }.to_not raise_error
    end
    it 'should define a method normalize_with_patterns that does nothing' do
      # `double` replaces the deprecated `stub` (removed in RSpec 3).
      unchanging = double :unchanging
      tokenizer.normalize_with_patterns(unchanging).should == unchanging
    end
  end
  context "with normalizes_words called" do
    before(:each) do
      tokenizer.normalizes_words([
        [/st\./, 'sankt'],
        [/stras?s?e?/, 'str'],
        [/\+/, 'plus'],
        [/\&/, 'and']
      ])
    end
    it "has normalize_with_patterns" do
      expect { tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
    end
    it "normalizes, but just the first one" do
      # Only the first matching pattern is applied — "wegstrasse" is untouched.
      tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
    end
    it "normalizes + to plus" do
      tokenizer.normalize_with_patterns('camera +').should == 'camera plus'
    end
    it "normalizes & to and" do
      tokenizer.normalize_with_patterns('alice & bob').should == 'alice and bob'
    end
  end
  # TODO Reinstate.
  #
  # context 'with a normalizer' do
  #   let(:normalizer) {
  #     Class.new do
  #       def normalize_with_patterns text
  #         text.reverse
  #       end
  #     end.new
  #   }
  #   before(:each) do
  #     tokenizer.normalizes_words normalizer
  #   end
  #   it "has normalize_with_patterns" do
  #     expect { tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
  #   end
  #   it "normalizes, but just the first one" do
  #     tokenizer.normalize_with_patterns('1234567890').should == '0987654321'
  #   end
  #   it "works correctly" do
  #     tokenizer.normalize_with_patterns('camera +').should == '+ aremac'
  #   end
  #   it "works correctly" do
  #     tokenizer.normalize_with_patterns('alice & bob').should == 'bob & ecila'
  #   end
  # end
end
describe "splits_text_on" do
  it 'handles nonbroken arguments' do
    # A negative raise_error expectation must not name a specific error
    # class: any *other* error would make it pass vacuously, and modern
    # RSpec rejects the form outright.
    expect { tokenizer.splits_text_on("hello") }.to_not raise_error
  end
  it 'handles broken arguments' do
    expect { tokenizer.splits_text_on(:gnorf) }.to raise_error(ArgumentError)
  end
  context "without splits_text_on called" do
    it "has split" do
      # expect {} used consistently (the file elsewhere mixed in
      # lambda {}.should_not).
      expect { tokenizer.split('any') }.to_not raise_error
    end
    it 'should define a method split that splits by default on \s' do
      tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
    end
    it 'splits text on /\s/ by default' do
      tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
    end
  end
  context "with specific splitting pattern" do
    before(:each) do
      tokenizer.splits_text_on(/[\s\.\/]/)
    end
    it "splits without raising" do
      expect { tokenizer.split('a b/c.d') }.to_not raise_error
    end
    it "splits text correctly" do
      tokenizer.split('a b/c.d').should == ['a','b','c','d']
    end
  end
  context "with a splitter given" do
    # Any object responding to #split(text) can be used as the splitter.
    let(:splitter) do
      Class.new do
        def split text
          text.split(/,/)
        end
      end.new
    end
    before(:each) do
      tokenizer.splits_text_on splitter
    end
    it "splits text correctly" do
      tokenizer.split('a,b/c.d').should == ['a', 'b/c.d']
    end
  end
end
describe "removes_characters" do
  it 'handles broken arguments' do
    # removes_characters requires a regexp, not a plain string.
    expect { tokenizer.removes_characters("hello") }.to raise_error(ArgumentError)
  end
  context "without removes_characters called" do
    it "has remove_illegals" do
      expect { tokenizer.remove_illegals('any') }.to_not raise_error
    end
    it 'should define a method remove_illegals that does nothing' do
      # `double` replaces the deprecated `stub`, and the result is now
      # actually asserted — the original example asserted nothing.
      # Mirrors the analogous normalize_with_patterns pass-through test.
      unchanging = double :unchanging
      tokenizer.remove_illegals(unchanging).should == unchanging
    end
  end
  context "with removes_characters called" do
    before(:each) do
      tokenizer.removes_characters(/[afo]/)
    end
    it "has remove_illegals" do
      expect { tokenizer.remove_illegals('abcdefghijklmnop') }.to_not raise_error
    end
    it "removes illegal characters" do
      tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
    end
  end
end
describe 'stopwords' do
  it 'handles broken arguments' do
    # stopwords requires a regexp, not a plain string.
    expect { tokenizer.stopwords("hello") }.to raise_error(ArgumentError)
  end
  context 'without stopwords given' do
    it 'should define a method remove_stopwords' do
      # expect {} used consistently (was lambda {}.should_not).
      expect { tokenizer.remove_stopwords('from this text') }.to_not raise_error
    end
    it 'should define a method remove_stopwords that does nothing' do
      tokenizer.remove_stopwords('from this text').should == 'from this text'
    end
    it 'should define a method remove_non_single_stopwords' do
      expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
    end
  end
  context 'with stopwords given' do
    before(:each) do
      tokenizer.stopwords(/r|e/)
    end
    it 'should define a method remove_stopwords' do
      expect { tokenizer.remove_stopwords('from this text') }.to_not raise_error
    end
    it 'should define a method stopwords that removes stopwords' do
      tokenizer.remove_stopwords('from this text').should == 'fom this txt'
    end
    it 'should define a method remove_non_single_stopwords' do
      expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
    end
    it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
      tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
    end
    it 'should define a method remove_non_single_stopwords that does not remove single stopwords' do
      tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
    end
  end
  context 'error case' do
    before(:each) do
      tokenizer.stopwords(/any/)
    end
    it 'should not remove non-single stopwords with a star' do
      tokenizer.remove_non_single_stopwords('a*').should == 'a*'
    end
    it 'should not remove non-single stopwords with a tilde' do
      tokenizer.remove_non_single_stopwords('a~').should == 'a~'
    end
  end
end
end
context 'from' do
  context 'options hash' do
    it 'creates a tokenizer' do
      # tokenize returns [tokens, originals].
      described_class.from(splits_text_on: /\t/).
        tokenize("hello\tworld").should == [['hello', 'world'], ['hello', 'world']]
    end
  end
  context 'tokenizer' do
    # Any object responding to #tokenize(text) is accepted as-is.
    let(:tokenizer) do
      Class.new do
        def tokenize text
          ['unmoved', 'by', 'your', 'texts']
        end
      end.new
    end
    it 'creates a tokenizer' do
      described_class.from(tokenizer).
        tokenize("hello\tworld").should == ['unmoved', 'by', 'your', 'texts']
    end
  end
  context 'invalid tokenizer' do
    # Three variants of the error message: bare, with the index name,
    # and with index:category. Distinct descriptions avoid duplicated
    # example names in the output.
    it 'raises with a nice error message' do
      expect {
        described_class.from Object.new
      }.to raise_error(<<-ERROR)
indexing options should be either
* a Hash
or
* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]
ERROR
    end
    it 'raises with a nice error message mentioning the index' do
      expect {
        described_class.from Object.new, 'some_index'
      }.to raise_error(<<-ERROR)
indexing options for some_index should be either
* a Hash
or
* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]
ERROR
    end
    it 'raises with a nice error message mentioning index and category' do
      expect {
        described_class.from Object.new, 'some_index', 'some_category'
      }.to raise_error(<<-ERROR)
indexing options for some_index:some_category should be either
* a Hash
or
* an object that responds to #tokenize(text) => [[token1, token2, ...], [original1, original2, ...]]
ERROR
    end
  end
end
end