+ pushed to_s into the Picky tokenizer, + integration specs, ! doc
floere committed Oct 30, 2011
1 parent 74af6fd commit ca6d440
Showing 7 changed files with 33 additions and 4 deletions.
1 change: 1 addition & 0 deletions history.textile
@@ -3,6 +3,7 @@ h2. Upcoming Version
h2. Version 3.2.1

* hanke: (server) ActiveRecord is not loaded anymore by default, as only few users use the Picky db source (if you do, Picky will try to require it).
+* hanke: (server) Only Picky's tokenizers call @to_s@ on data anymore. This means that you can write tokenizers that work on whatever kind of object you like. The Picky tokenizers just ensure that they get to work with a string.
* hanke: (server) Exchangeable backends (please see blog post for an explanation).
* hanke: (project) RSpec 1 has been updated to RSpec 2.

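This changelog entry is the core of the commit: the indexing call sites no longer coerce data to a String, so a custom tokenizer receives the raw field value. A minimal sketch of what that enables, assuming only that #tokenize returns the tokens (the TagTokenizer name and its data are hypothetical; the app.rb change below does the same thing with NonStringTokenizer):

    # Hypothetical tokenizer that works on an Array instead of a String.
    # Picky no longer calls to_s on the data before handing it over.
    class TagTokenizer
      def tokenize tags
        [tags.map(&:to_sym)] # just tokens; no originals needed
      end
    end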
2 changes: 1 addition & 1 deletion server/lib/picky/category_realtime.rb
@@ -14,7 +14,7 @@ def remove id
# given object.
#
def add object
-tokens, _ = tokenizer.tokenize object.send(from).to_s
+tokens, _ = tokenizer.tokenize object.send(from)
add_tokenized object.id, tokens
end

2 changes: 1 addition & 1 deletion server/lib/picky/index.rb
@@ -102,7 +102,7 @@ class Index
# * source: Where the data comes from, e.g. Sources::CSV.new(...). Optional, can be defined in the block using #source.
# * result_identifier: Use if you'd like a different identifier/name in the results than the name of the index.
# * after_indexing: As of this writing only used in the db source. Executes the given after_indexing as SQL after the indexing process.
-# * tokenizer: Call and pass either a tokenizer (responds to #tokenize) or the options for a tokenizer..
+# * indexing: Call and pass either a tokenizer (responds to #tokenize) or the options for a tokenizer..
# * key_format: Call and pass in a format method for the ids (default is #to_i).
#
# Example:
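The renamed indexing option is the one exercised in app.rb below. A minimal sketch of the two documented call styles (the index name, source, and option values are hypothetical):

    Picky::Index.new(:example) do
      source { [] }                              # hypothetical empty source
      indexing MyTokenizer.new                   # anything responding to #tokenize, or:
      # indexing removes_characters: /[^a-z\s]/i #   ...options for a tokenizer
      key_format :to_i                           # format method for the ids
      category :name
    end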
2 changes: 1 addition & 1 deletion server/lib/picky/indexers/parallel.rb
@@ -45,7 +45,7 @@ def process categories
# Is it a good idea that not the tokenizer has control over when he gets the next text?
#
combined.each do |category, cache, _, tokenizer|
-tokens, _ = tokenizer.tokenize object.send(category.from).to_s # Note: Originals not needed.
+tokens, _ = tokenizer.tokenize object.send(category.from) # Note: Originals not needed.
tokens.each do |token_text|
next unless token_text
cache << id << comma << token_text << newline
2 changes: 1 addition & 1 deletion server/lib/picky/tokenizer.rb
@@ -172,7 +172,7 @@ def initialize options = {}
# [[:token1, :token2], ["Original1", "Original2"]]
#
def tokenize text
-text = preprocess text # processing the text
+text = preprocess text.to_s # processing the text
return empty_tokens if text.blank?
words = pretokenize text # splitting and preparations for tokenizing
return empty_tokens if words.empty?
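This is where the to_s removed from the call sites above ends up: #tokenize now coerces its input itself. A sketch of the effect, assuming a tokenizer constructed with default options (the return value is the [tokens, originals] pair documented in the comment above):

    tokenizer = Picky::Tokenizer.new
    # Callers may now pass any object; tokenize calls to_s on it first.
    tokens, originals = tokenizer.tokenize :any_object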
23 changes: 23 additions & 0 deletions server/test_project_sinatra/app.rb
@@ -292,6 +292,25 @@ def []= key, value
category :name
end

+# This checks that we can use a funky customized tokenizer.
+#
+NonStringDataSource = Struct.new :id, :nonstring
+class NonStringTokenizer < Picky::Tokenizer
+  def tokenize nonstring
+    [nonstring.map(&:to_sym)]
+  end
+end
+nonstring_data_index = Picky::Index.new(:nonstring) do
+  source {
+    [
+      NonStringDataSource.new(1, ['gaga', :blabla, 'haha']),
+      NonStringDataSource.new(2, [:meow, 'moo', :bang, 'zap'])
+    ]
+  }
+  indexing NonStringTokenizer.new
+  category :nonstring
+end
+
weights = {
[:author] => +6,
[:title, :author] => +5,
@@ -377,6 +396,10 @@ def []= key, value
get %r{\A/backends\Z} do
backends_search.search(params[:query], params[:ids] || 20, params[:offset] || 0).to_json
end
+nonstring_search = Search.new nonstring_data_index
+get %r{\A/nonstring\Z} do
+  nonstring_search.search(params[:query], params[:ids] || 20, params[:offset] || 0).to_json
+end
all_search = Search.new books_index, csv_test_index, isbn_index, mgeo_index do boost weights end
get %r{\A/all\Z} do
all_search.search(params[:query], params[:ids] || 20, params[:offset] || 0).to_json
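The new /nonstring route mirrors the integration spec below; the index can also be queried directly, a sketch assuming Picky::Search#search defaults ids and offset as the routes above do:

    results = Picky::Search.new(nonstring_data_index).search('moo zap')
    results.ids # => [2]; only the second data object contains both tokens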
5 changes: 5 additions & 0 deletions server/test_project_sinatra/spec/integration_spec.rb
@@ -24,6 +24,7 @@
let(:file) { Picky::TestClient.new(described_class, :path => '/file') }
let(:japanese) { Picky::TestClient.new(described_class, :path => '/japanese') }
let(:backends) { Picky::TestClient.new(described_class, :path => '/backends') }
+let(:nonstring) { Picky::TestClient.new(described_class, :path => '/nonstring') }

it 'can generate a single index category without failing' do
book_each_index = Picky::Indexes[:book_each][:title]
@@ -271,6 +272,10 @@
#
it { backends.search("Memor").ids.should == [1] }

+# Different tokenizer.
+#
+it { nonstring.search("moo zap").ids.should == [2] }
+
# Search#ignore option.
#
it { book_each.search("alan history").ids.should == ["259", "307"] } # Ignores History or Alan in title.
