Permalink
Browse files

+ tokenize option

  • Loading branch information...
1 parent e73227e commit cea55fdc317b75c62a44e86593163bb9e4f73416 @floere committed Mar 2, 2013
@@ -28,6 +28,7 @@ class Category
#
# Advanced Options:
# * source: Use if the category should use a different source.
+ # * tokenize: Whether to use the tokenizer (default is true).
# * tokenizer: Use a subclass of Tokenizers::Base that implements #tokens_for and #empty_tokens.
# * weight: Weights::Logarithmic.new, Weights::Constant.new(int = 0),
# Weights::Dynamic.new(&block) or an object that responds
@@ -47,6 +48,7 @@ def configure_from options
# Extracted right away so that an error is raised instantly.
#
@source = Source.from options[:source], true, @index.name
+ @tokenize = options[:tokenize] != false
@tokenizer = Tokenizer.from options[:indexing], @index.name, name
@ranger = options[:ranging] || Range
@@ -78,7 +80,7 @@ def configure_indexes_from options
#
# TODO Rewrite it such that this does not need to be maintained separately.
#
- @@known_keys = [:indexing, :partial, :qualifier, :qualifiers, :ranging, :similarity, :source, :weight]
+ @@known_keys = [:indexing, :partial, :qualifier, :qualifiers, :ranging, :similarity, :source, :tokenize, :tokenizer, :weight]
def warn_if_unknown options
warn <<-WARNING if options && (options.keys - @@known_keys).size > 0
@@ -95,8 +95,10 @@ def indexer
# If one isn't set on this category, will try the index,
# and finally the default index tokenizer.
#
+ # Will return nil if tokenize is set to false.
+ #
def tokenizer
- @tokenizer || @index.tokenizer
+ @tokenizer || @index.tokenizer if @tokenize
end
# Clears the caches.
@@ -57,10 +57,17 @@ def unshift thing
# Tokenizes the given text and adds the tokens to the index for the given id.
# Without a tokenizer, the argument is used as the list of tokens as is.
#
- def add_text id, text, where = :unshift
+ def add_text id, text_or_tokens, where = :unshift
# text = text.to_sym if @symbols # SYMBOLS.
- tokens, _ = tokenizer.tokenize text
+ tokens = nil
+ if tokenizer
+ tokens, _ = tokenizer.tokenize text_or_tokens
+ else
+ tokens = text_or_tokens
+ end
tokens.each { |text| add_tokenized_token id.send(key_format), text, where, false }
+ rescue NoMethodError
+ raise %Q{You probably set tokenize: false on category "#{name}". It will need an Enumerator of previously tokenized tokens.}
end
#
@@ -58,12 +58,26 @@ def process source_for_prepare, categories, scheduler = Scheduler.new
def index_flush objects, file, category, cache, tokenizer
comma = ?,
newline = ?\n
-
- objects.each do |object|
- tokens, _ = tokenizer.tokenize object.send(category.from) # Note: Originals not needed.
- tokens.each do |token_text|
- next unless token_text
- cache << object.id << comma << token_text << newline
+
+ # The tokenizer check is done once, outside of the loop, as an optimization.
+ # This is why the loop is duplicated in both branches.
+ #
+ # TODO Deoptimize?
+ #
+ if tokenizer
+ objects.each do |object|
+ tokens, _ = tokenizer.tokenize object.send(category.from) # Note: Originals not needed.
+ tokens.each do |token_text|
+ next unless token_text
+ cache << object.id << comma << token_text << newline
+ end
+ end
+ else
+ objects.each do |object|
+ tokens = object.send(category.from) # Note: Originals not needed.
+ tokens.each do |token_text|
+ next unless token_text
+ cache << object.id << comma << token_text << newline
+ end
end
end
@@ -54,12 +54,25 @@ def process source_for_prepare, categories, scheduler = Scheduler.new
def index_flush datas, file, cache, tokenizer
comma = ?,
newline = ?\n
-
- datas.each do |indexed_id, text|
- tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
- tokens.each do |token_text|
- next unless token_text
- cache << indexed_id << comma << token_text << newline
+
+ # The tokenizer check is done once, outside of the loop, as an optimization.
+ # This is why the loop is duplicated in both branches.
+ #
+ # TODO Deoptimize?
+ #
+ if tokenizer
+ datas.each do |indexed_id, text|
+ tokens, _ = tokenizer.tokenize text # Note: Originals not needed.
+ tokens.each do |token_text|
+ next unless token_text
+ cache << indexed_id << comma << token_text << newline
+ end
+ end
+ else
+ datas.each do |indexed_id, tokens|
+ tokens.each do |token_text|
+ next unless token_text
+ cache << indexed_id << comma << token_text << newline
+ end
end
end
@@ -0,0 +1,38 @@
+# encoding: utf-8
+#
+require 'spec_helper'
+
+describe 'Category#tokenize(false)' do
+
+ it 'does tokenize' do
+ index = Picky::Index.new :thing do
+ category :text, tokenize: true
+ end
+
+ thing = Struct.new :id, :text
+ # expect do # Does not fail – because #to_s is called on the Array.
+ index.add thing.new(1, ['already', 'tokenized'])
+ # end.to raise_error
+ index.add thing.new(2, 'this should fail')
+
+ try = Picky::Search.new index
+
+ try.search('already').ids.should == [] # Not found because ['already', is indexed.
+ end
+ it 'does not tokenize' do
+ index = Picky::Index.new :thing do
+ category :text, tokenize: false
+ end
+
+ thing = Struct.new :id, :text
+ index.add thing.new(1, ['already', 'tokenized'])
+ expect do
+ index.add thing.new(2, 'this should fail')
+ end.to raise_error('You probably set tokenize: false on category "text". It will need an Enumerator of previously tokenized tokens.')
+
+ try = Picky::Search.new index
+
+ try.search('already').ids.should == [1]
+ end
+
+end
@@ -52,7 +52,7 @@
category.should_receive(:warn).once.with <<-WARNING
Warning: Category options {:weights=>:some_weight} for category some_category contain an unknown option.
- Working options are: [:indexing, :partial, :qualifier, :qualifiers, :ranging, :similarity, :source, :weight].
+ Working options are: [:indexing, :partial, :qualifier, :qualifiers, :ranging, :similarity, :source, :tokenize, :tokenizer, :weight].
WARNING
category.warn_if_unknown :weights => :some_weight

0 comments on commit cea55fd

Please sign in to comment.