-
Notifications
You must be signed in to change notification settings - Fork 49
/
tokenizer_spec.rb
214 lines (198 loc) · 7.78 KB
/
tokenizer_spec.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
# encoding: utf-8
#
require 'spec_helper'
describe Picky::Tokenizer do
context 'with special instance' do
let (:tokenizer) { described_class.new rejects_token_if: lambda { |token| token.to_s.length < 2 || token == :hello }, case_sensitive: true }
it 'rejects tokens with length < 2' do
tokenizer.reject([:'', :a, :ab, :abc]).should == [:ab, :abc]
end
it 'rejects tokens that are called :hello' do
tokenizer.reject([:hel, :hell, :hello]).should == [:hel, :hell]
end
describe 'to_s' do
it 'spits out the right text' do
tokenizer.to_s.should == <<-EXPECTED
Removes characters: -
Stopwords: -
Splits text on: /\\s/
Normalizes words: -
Rejects tokens? Yes, see line 8 in app/application.rb
Substitutes chars? -
Case sensitive? Yes.
EXPECTED
end
end
end
context 'with normal instance' do
let(:tokenizer) { described_class.new }
describe 'to_s' do
it 'spits out the right text' do
tokenizer.to_s.should == <<-EXPECTED
Removes characters: -
Stopwords: -
Splits text on: /\\s/
Normalizes words: -
Rejects tokens? -
Substitutes chars? -
Case sensitive? -
EXPECTED
end
end
describe 'rejects_token_if' do
it 'rejects empty tokens by default' do
tokenizer.reject(['a', nil, '', 'b']).should == ['a', 'b']
end
it 'rejects tokens based on the given rejection criteria if set' do
tokenizer.rejects_token_if &:nil?
tokenizer.reject(['a', nil, '', 'b']).should == ['a', '', 'b']
end
end
describe "substitute(s)_characters*" do
it "doesn't substitute if there is no substituter" do
tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzäöü'
end
it 'raises if nothing with #substitute is given' do
expect { tokenizer.substitutes_characters_with Object.new }.to raise_error("The substitutes_characters_with option needs a character substituter, which responds to #substitute.")
end
it "uses the substituter to replace characters" do
tokenizer.substitutes_characters_with Picky::CharacterSubstituters::WestEuropean.new
tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
end
it "uses the european substituter as default" do
tokenizer.substitutes_characters_with
tokenizer.substitute_characters('abcdefghijklmnopqrstuvwxyzäöü').should == 'abcdefghijklmnopqrstuvwxyzaeoeue'
end
end
describe "normalizes_words" do
it 'handles broken arguments' do
expect { tokenizer.normalizes_words(:not_an_array) }.to raise_error(ArgumentError)
end
context "without normalizes_words called" do
it "has normalize_with_patterns" do
expect { tokenizer.normalize_with_patterns('any') }.to_not raise_error
end
it 'should define a method normalize_with_patterns does nothing' do
unchanging = stub :unchanging
tokenizer.normalize_with_patterns(unchanging).should == unchanging
end
end
context "with normalizes_words called" do
before(:each) do
tokenizer.normalizes_words([
[/st\./, 'sankt'],
[/stras?s?e?/, 'str']
])
end
it "has normalize_with_patterns" do
expect { tokenizer.normalize_with_patterns('a b/c.d') }.to_not raise_error
end
it "normalizes, but just the first one" do
tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
end
end
end
describe "splits_text_on" do
it 'handles nonbroken arguments' do
expect { tokenizer.splits_text_on("hello") }.to_not raise_error(ArgumentError)
end
it 'handles broken arguments' do
expect { tokenizer.splits_text_on(:gnorf) }.to raise_error(ArgumentError)
end
context "without splits_text_on called" do
it "has split" do
lambda { tokenizer.split('any') }.should_not raise_error
end
it 'should define a method split that splits by default on \s' do
tokenizer.split('a b/c.d').should == ['a', 'b/c.d']
end
it 'splits text on /\s/ by default' do
tokenizer.split('this is a test').should == ['this', 'is', 'a', 'test']
end
end
context "with removes_characters called" do
before(:each) do
tokenizer.splits_text_on(/[\s\.\/]/)
end
it "has split" do
expect { tokenizer.split('a b/c.d') }.to_not raise_error
end
it "removes illegal characters" do
tokenizer.split('a b/c.d').should == ['a','b','c','d']
end
end
end
describe "removes_characters" do
it 'handles broken arguments' do
expect { tokenizer.removes_characters("hello") }.to raise_error(ArgumentError)
end
context "without removes_characters called" do
it "has remove_illegals" do
expect { tokenizer.remove_illegals('any') }.to_not raise_error
end
it 'should define a method remove_illegals that does nothing' do
unchanging = stub :unchanging
tokenizer.remove_illegals unchanging
end
end
context "with removes_characters called" do
before(:each) do
tokenizer.removes_characters(/[afo]/)
end
it "has remove_illegals" do
expect { tokenizer.remove_illegals('abcdefghijklmnop') }.to_not raise_error
end
it "removes illegal characters" do
tokenizer.remove_illegals('abcdefghijklmnop').should == 'bcdeghijklmnp'
end
end
end
describe 'stopwords' do
it 'handles broken arguments' do
expect { tokenizer.stopwords("hello") }.to raise_error(ArgumentError)
end
context 'without stopwords given' do
it 'should define a method remove_stopwords' do
lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
end
it 'should define a method remove_stopwords that does nothing' do
tokenizer.remove_stopwords('from this text').should == 'from this text'
end
it 'should define a method remove_non_single_stopwords' do
expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
end
end
context 'with stopwords given' do
before(:each) do
tokenizer.stopwords(/r|e/)
end
it 'should define a method remove_stopwords' do
lambda { tokenizer.remove_stopwords('from this text') }.should_not raise_error
end
it 'should define a method stopwords that removes stopwords' do
tokenizer.remove_stopwords('from this text').should == 'fom this txt'
end
it 'should define a method remove_non_single_stopwords' do
expect { tokenizer.remove_non_single_stopwords('from this text') }.to_not raise_error
end
it 'should define a method remove_non_single_stopwords that removes non-single stopwords' do
tokenizer.remove_non_single_stopwords('rerere rerere').should == ' '
end
it 'should define a method remove_non_single_stopwords that does not single stopwords' do
tokenizer.remove_non_single_stopwords('rerere').should == 'rerere'
end
end
context 'error case' do
before(:each) do
tokenizer.stopwords(/any/)
end
it 'should not remove non-single stopwords with a star' do
tokenizer.remove_non_single_stopwords('a*').should == 'a*'
end
it 'should not remove non-single stopwords with a tilde' do
tokenizer.remove_non_single_stopwords('a~').should == 'a~'
end
end
end
end
end