Skip to content
Browse files

+ needed functional specs to communicate functionality and intent

  • Loading branch information...
1 parent 1a004e9 commit 96a5d6e81273da40fc9a4f2f00c96b620e8d1330 @floere committed Mar 16, 2012
Showing with 81 additions and 1 deletion.
  1. +72 −0 server/spec/functional/tokenizer_spec.rb
  2. +9 −1 server/spec/lib/tokenizer_spec.rb
View
72 server/spec/functional/tokenizer_spec.rb
@@ -0,0 +1,72 @@
+# encoding: utf-8
+#
+require 'spec_helper'
+
+describe Picky::Tokenizer do
+ describe 'examples' do
+ it 'works correctly' do
+ tokenizer = described_class.new(split_words_on: /\&/, normalizes_words: [[/\&/, 'and']])
+
+ # TODO Is this really correct? Shouldn't we split after normalizing?
+ #
+ tokenizer.tokenize('M & M').should == [['m', 'and', 'm'], ['m', 'and', 'm']]
+ end
+ it 'works correctly' do
+ tokenizer = described_class.new(stopwords: /\b(and)\b/, normalizes_words: [[/\&/, 'and']])
+
+ # TODO Is this really correct? Shouldn't we stop words after normalizing?
+ #
+ tokenizer.tokenize('M & M').should == [['m', 'and', 'm'], ['m', 'and', 'm']]
+ end
+ it 'removes all stopwords if they do not occur alone' do
+ tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
+ tokenizer.tokenize('and then').should == [[], []]
+ end
+ it 'does not remove a stopword if it occurs alone' do
+ tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
+ tokenizer.tokenize('and').should == [['and'], ['and']]
+ end
+ it 'does not remove a stopword if it occurs alone (even with qualifier)' do
+ tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
+ tokenizer.tokenize('title:and').should == [['title:and'], ['title:and']]
+ end
+ it 'does not remove a stopword if it occurs alone (even with qualifier)' do
+ tokenizer = described_class.new(stopwords: /\b(and|then)\b/)
+ tokenizer.tokenize('title:and').should == [['title:and'], ['title:and']]
+ end
+ it 'removes stopwords, then only lets through max words words' do
+ tokenizer = described_class.new(stopwords: /\b(and|then|to|the)\b/, max_words: 2)
+ tokenizer.tokenize('and then they went to the house').should == [['they', 'went'], ['they', 'went']]
+ end
+ it 'can take freaky splits_text_on options' do
+ tokenizer = described_class.new(splits_text_on: /([A-Z]?[a-z]+)/, case_sensitive: false) # Explicit case, is false by default.
+ tokenizer.tokenize('TOTALCamelCaseExample').should == [
+ ["total", "camel", "case", "example"],
+ ["total", "camel", "case", "example"]
+ ]
+ end
+ it 'substitutes and removes in the right order' do
+ tokenizer = described_class.new(
+ substitutes_characters_with: Picky::CharacterSubstituters::WestEuropean.new,
+ removes_characters: /e/
+ )
+
+ # Ä -> Ae -> A
+ #
+ tokenizer.tokenize('Ä ä').should == [['a', 'a'], ['a', 'a']]
+ end
+ it 'removes characters, then only lets through an ok sized token' do
+ tokenizer = described_class.new(rejects_token_if: ->(token){ token.size >= 5 }, removes_characters: /e/)
+ tokenizer.tokenize('hullo').should == [[], []]
+ tokenizer.tokenize('hello').should == [['hllo'], ['hllo']]
+ end
+ it 'is case sensitive' do
+ tokenizer = described_class.new(case_sensitive: true)
+ tokenizer.tokenize('Kaspar codes').should == [['Kaspar', 'codes'], ['Kaspar', 'codes']]
+ end
+ it 'is case sensitive, also for removing characters' do
+ tokenizer = described_class.new(case_sensitive: true, removes_characters: /K/)
+ tokenizer.tokenize('Kaspar codes').should == [['aspar', 'codes'], ['aspar', 'codes']]
+ end
+ end
+end
View
10 server/spec/lib/tokenizer_spec.rb
@@ -96,7 +96,9 @@
before(:each) do
tokenizer.normalizes_words([
[/st\./, 'sankt'],
- [/stras?s?e?/, 'str']
+ [/stras?s?e?/, 'str'],
+ [/\+/, 'plus'],
+ [/\&/, 'and']
])
end
it "has normalize_with_patterns" do
@@ -105,6 +107,12 @@
it "normalizes, but just the first one" do
tokenizer.normalize_with_patterns('st. wegstrasse').should == 'sankt wegstrasse'
end
+ it "works correctly" do
+ tokenizer.normalize_with_patterns('camera +').should == 'camera plus'
+ end
+ it "works correctly" do
+ tokenizer.normalize_with_patterns('alice & bob').should == 'alice and bob'
+ end
end
end

0 comments on commit 96a5d6e

Please sign in to comment.
Something went wrong with that request. Please try again.