Commit b26806a
+ tokenizing completely rewritten
floere committed Apr 6, 2011
1 parent f5c7e98 commit b26806a
Showing 7 changed files with 123 additions and 129 deletions.
36 changes: 21 additions & 15 deletions server/lib/picky/internals/query/token.rb
@@ -29,13 +29,16 @@ def initialize text
# and normalized token. I.e. one prepared for a search.
#
def self.processed text
token = new text
token.qualify
token.extract_original
token.partialize
token.similarize
token.remove_illegals
token
new(text).process
end
def process
qualify
extract_original
partialize
similarize
remove_illegals
symbolize # NEW
self
end
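The rewrite folds the old class-side call chain into an instance-side #process, so one call turns a raw query word into a fully prepared token. A rough usage sketch (the input string is made up; qualifying and partializing behave as documented below):

    token = Internals::Query::Token.processed 'title:hello*'
    token.partial? # => true, the trailing * requests a partial search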

# This returns a predefined category name if the user has given one.
@@ -61,23 +64,27 @@ def extract_original
# It is only settable if it hasn't been set yet.
#
def partial= partial
@partial = partial if @partial.nil?
@partial ||= partial
end
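Note a subtle semantic shift in the new setter: ||= re-assigns over a stored false, whereas the old nil-check kept it. In plain Ruby:

    @partial = nil;   @partial ||= true # => true, same as the old nil-check
    @partial = false; @partial ||= true # => true, where the old code kept false

In practice the two partialize patterns below are anchored at the end of the text and cannot both match, so the difference should not surface.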
def partial?
!@similar && @partial
end

# If the text ends with *, partialize it. If with ", don't.
#
# The latter wins. So "hello*" will not be partially searched.
#
@@no_partial = /\"\Z/
@@partial = /\*\Z/
def partialize
self.partial = false and return if @text =~ @@no_partial
self.partial = true if @text =~ @@partial
self.partial = false and return unless @text !~ @@no_partial
self.partial = true unless @text !~ @@partial
end
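Old and new bodies encode the same precedence: both patterns are anchored at the very end of the text, so at most one rule fires, and a trailing " vetoes partialization outright. A standalone sketch of the decision (hypothetical helper, not part of the commit):

    NO_PARTIAL = /"\Z/
    PARTIAL    = /\*\Z/

    def partial_flag text
      return false if text =~ NO_PARTIAL # a closing " wins
      return true  if text =~ PARTIAL
      nil # undecided; Tokens#partialize_last may still partialize it
    end

    partial_flag 'hello*'  # => true
    partial_flag '"hello"' # => false
    partial_flag 'hello*"' # => false, the quote vetoes the asterisk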

# If the text ends with ~ similarize it. If with ", don't.
#
# The latter wins.
#
@@no_similar = /\"\Z/
@@similar = /\~\Z/
def similarize
@@ -95,14 +102,13 @@ def similar?
def remove_illegals
@text.gsub! @@illegals, '' unless @text.blank?
end

# Visitor for tokenizer.

#
# TODO Rewrite!!!
#
def tokenize_with tokenizer
@text = tokenizer.normalize @text
def symbolize
@text = @text.to_sym
end

# TODO spec!
#
# TODO Rewrite!!
5 changes: 5 additions & 0 deletions server/lib/picky/internals/query/tokens.rb
@@ -19,8 +19,13 @@ class Tokens # :nodoc:all
def initialize tokens = []
@tokens = tokens
end
def self.processed words
new words.collect! { |word| Token.processed word }
end
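Assuming words is an array of already pretokenized strings, the new constructor maps each one through Token.processed (sketch; input made up):

    tokens = Internals::Query::Tokens.processed ['title:hello*', 'world']
    # => a Tokens collection wrapping fully processed Token objects

Note that collect! mutates the passed array in place; a caller that still needs the raw words would have to pass in a copy.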

# Tokenizes each token.
#
# Note: Passed tokenizer needs to offer #normalize(text).
#
def tokenize_with tokenizer
@tokens.each { |token| token.tokenize_with(tokenizer) }
51 changes: 34 additions & 17 deletions server/lib/picky/internals/tokenizers/base.rb
@@ -163,37 +163,54 @@ def initialize options = {}
reject_token_if &(options[:reject_token_if] || :blank?)
end

# Hooks.
#

# Preprocessing.
#
def preprocess text; end
# Default preprocessing hook.
#
# Does:
# 1. Character substitution.
# 2. Remove illegal expressions.
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
#
def preprocess text
text = substitute_characters text
remove_illegals text
# We do not remove single stopwords e.g. in the indexer for
# an entirely different reason than in the query tokenizer.
# An indexed thing with just name "UND" (a possible stopword)
# should not lose its name.
#
remove_non_single_stopwords text
text
end
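A hypothetical run of the hook, assuming a tokenizer configured via the options documented for the query tokenizer below and helpers that behave as their names suggest:

    tokenizer = Internals::Tokenizers::Base.new :removes_characters => /[',]/,
                                                :stopwords          => /\b(und|and)\b/i
    tokenizer.preprocess "Peter, Paul und Mary" # commas stripped, "und" removed
    tokenizer.preprocess "und"                  # => "und", a single stopword survives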
# Pretokenizing.
#
def pretokenize text; end
# Does:
# 1. Split the text into words.
# 2. Normalize each word.
#
def pretokenize text
words = split text
words.collect! do |word|
normalize_with_patterns word
word
end
end
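The block returns word explicitly because normalize_with_patterns is presumably applied for its side effect on word rather than for its return value. Roughly, with hypothetical options in the documented format:

    tokenizer = Internals::Tokenizers::Base.new :splits_text_on   => /\s/,
                                                :normalizes_words => [[/\$(\w+)/i, '\1 dollars']]
    tokenizer.pretokenize 'a few $words' # => the split words, each pattern-normalized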
# Postprocessing.
#
def process tokens
reject tokens # Reject any tokens that don't meet criteria
tokens
end

# Converts words into real tokens.
#
def tokens_for words
Internals::Query::Tokens.new words.collect! { |word| token_for word }
end
# # Converts words into real tokens.
# #
# def tokens_for words
# Internals::Query::Tokens.new words.collect! { |word| token_for word }
# end
# Turns non-blank text into symbols.
#
def symbolize text
text.blank? ? nil : text.to_sym
end
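The blank? guard (an ActiveSupport-style String extension, used elsewhere in this file) means empty or whitespace-only text yields no symbol:

    symbolize ''      # => nil
    symbolize 'hello' # => :hello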
# Returns a tokens object.
#
def empty_tokens
Internals::Query::Tokens.new
end

end

45 changes: 13 additions & 32 deletions server/lib/picky/internals/tokenizers/index.rb
@@ -15,45 +15,26 @@ def self.default
@default ||= new
end

# Default indexing preprocessing hook.
# Postprocessing.
#
# Does:
# 1. Character substitution.
# 2. Downcasing.
# 3. Remove illegal expressions.
# 4. Remove non-single stopwords. (Stopwords that occur with other words)
# In indexing, we work with symbol tokens.
#
def preprocess text
text = substitute_characters text
text.downcase!
remove_illegals text
# we do not remove single stopwords for an entirely different
# reason than in the query tokenizer.
# An indexed thing with just name "UND" (a possible stopword) should not lose its name.
#
remove_non_single_stopwords text
text
end

# Default indexing pretokenizing hook.
#
# Does:
# 1. Split the text into words.
# 2. Normalize each word.
#
def pretokenize text
words = split text
words.collect! do |word|
normalize_with_patterns word
word
end
def process tokens
reject tokens # Reject any tokens that don't meet criteria
downcase tokens
tokens
end

# Does not actually return a token, but a
# symbol "token".
#
def token_for text
symbolize text
def tokens_for words
words.collect! { |word| word.to_sym }
end
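On the indexing side tokens are bare symbols rather than Query::Token objects, so the conversion is a straight collect! (again mutating the words array in place), with downcasing deferred to #process above:

    tokens_for ['Meier', 'Peter'] # => [:Meier, :Peter]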
# Returns empty tokens.
#
def empty_tokens
[]
end

end
61 changes: 21 additions & 40 deletions server/lib/picky/internals/tokenizers/query.rb
@@ -3,7 +3,7 @@
module Internals

module Tokenizers

# There are a few class methods that you can use to configure how a query works.
#
# removes_characters regexp
@@ -14,66 +14,47 @@ module Tokenizers
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
#
class Query < Base

def self.default= new_default
@default = new_default
end
def self.default
@default ||= new
end

attr_reader :maximum_tokens

def initialize options = {}
super options
@maximum_tokens = options[:maximum_tokens] || 5
end
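The cap is configurable per tokenizer; superfluous words are cut off later by tokens.cap (hypothetical instantiation):

    tokenizer = Internals::Tokenizers::Query.new :maximum_tokens => 3
    # only the first three query words survive processing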

def preprocess text
remove_illegals text # Remove illegal characters
remove_non_single_stopwords text # remove stop words
text
end

# Split the text and put some back together.
#
# TODO Make the same as in indexing?
#
def pretokenize text
split text
end


# Let each token process itself.
# Reject, limit, and partialize tokens.
#
# In querying we work with real tokens (in indexing it's just symbols).
#
def process tokens
tokens.tokenize_with self
tokens.reject # Reject any tokens that don't meet criteria
tokens.cap maximum_tokens # Cut off superfluous tokens
tokens.partialize_last # Set certain tokens as partial
tokens.reject # Reject any tokens that don't meet criteria.
tokens.downcase # Downcase all.
tokens.cap maximum_tokens # Cut off superfluous tokens.
tokens.partialize_last # Set certain tokens as partial.
tokens
end
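End to end, a query string now flows through the Base hooks into this postprocessing step. A sketch, assuming the usual tokenize entry point that chains preprocess, pretokenize, tokens_for and process (the spec below exercises tokenize directly):

    tokens = Internals::Tokenizers::Query.default.tokenize 'title:hello* world "exact"'
    # => an Internals::Query::Tokens with at most maximum_tokens entries,
    #    filtered, downcased and, where requested, partialized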

# Called by the token.
#
# TODO Perhaps move to Normalizer?

# Converts words into real tokens.
#
def normalize text
text = substitute_characters text # Substitute special characters
text.downcase! # Downcase all text
normalize_with_patterns text # normalize
text.to_sym # symbolize
def tokens_for words
Internals::Query::Tokens.processed words
end

# Returns a token for a word.
# The basic query tokenizer uses new tokens.
# Returns a tokens object.
#
def token_for word
Internals::Query::Token.processed word
def empty_tokens
Internals::Query::Tokens.new
end

end

end

end
16 changes: 2 additions & 14 deletions server/spec/lib/internals/tokenizers/query_spec.rb
@@ -41,6 +41,8 @@
it 'should call methods in order' do
text = stub :text

tokenizer.should_receive(:substitute_characters).once.with(text).and_return text
text.should_receive(:downcase!).once.ordered.with()
tokenizer.should_receive(:remove_illegals).once.ordered.with text
tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text

@@ -57,13 +59,7 @@
before(:each) do
@tokens = mock :tokens, :null_object => true
end
it 'should tokenize the tokens' do
@tokens.should_receive(:tokenize_with).once.with tokenizer

tokenizer.process @tokens
end
it 'should call methods on the tokens in order' do
@tokens.should_receive(:tokenize_with).once.ordered
@tokens.should_receive(:reject).once.ordered
@tokens.should_receive(:cap).once.ordered
@tokens.should_receive(:partialize_last).once.ordered
@@ -122,13 +118,5 @@ def self.it_should_pattern_normalize original, expected
tokenizer.tokenize('').map(&:to_s).should == []
end
end
describe "token_for" do
it "should get a preprocessed token" do
text = stub(:text)
Internals::Query::Token.should_receive(:processed).with text

tokenizer.token_for text
end
end

end