Commit b26806a
+ tokenizing completely rewritten
floere committed Apr 6, 2011
1 parent f5c7e98 commit b26806a
Showing 7 changed files with 123 additions and 129 deletions.
36 changes: 21 additions & 15 deletions server/lib/picky/internals/query/token.rb
@@ -29,13 +29,16 @@ def initialize text
# and normalized token. I.e. one prepared for a search.
#
def self.processed text
token = new text
token.qualify
token.extract_original
token.partialize
token.similarize
token.remove_illegals
token
new(text).process
end
def process
qualify
extract_original
partialize
similarize
remove_illegals
symbolize # NEW
self
end
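The rewrite folds the old class-side call chain into an instance-side #process, so one call turns a raw query word into a fully prepared token. A rough usage sketch (the input string is made up; qualifying and partializing behave as documented below):

    token = Internals::Query::Token.processed 'title:hello*'
    token.partial? # => true, the trailing * requests a partial search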

# This returns a predefined category name if the user has given one.
@@ -61,23 +64,27 @@ def extract_original
# It is only settable if it hasn't been set yet.
#
def partial= partial
@partial = partial if @partial.nil?
@partial ||= partial
end
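Note a subtle semantic shift in the new setter: ||= re-assigns over a stored false, whereas the old nil-check kept it. In plain Ruby:

    @partial = nil;   @partial ||= true # => true, same as the old nil-check
    @partial = false; @partial ||= true # => true, where the old code kept false

In practice the two partialize patterns below are anchored at the end of the text and cannot both match, so the difference should not surface.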
def partial?
!@similar && @partial
end

# If the text ends with *, partialize it. If with ", don't.
#
# The latter wins. So "hello*" will not be partially searched.
#
@@no_partial = /\"\Z/
@@partial = /\*\Z/
def partialize
self.partial = false and return if @text =~ @@no_partial
self.partial = true if @text =~ @@partial
self.partial = false and return unless @text !~ @@no_partial
self.partial = true unless @text !~ @@partial
end
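Old and new bodies encode the same precedence: both patterns are anchored at the very end of the text, so at most one rule fires, and a trailing " vetoes partialization outright. A standalone sketch of the decision (hypothetical helper, not part of the commit):

    NO_PARTIAL = /"\Z/
    PARTIAL    = /\*\Z/

    def partial_flag text
      return false if text =~ NO_PARTIAL # a closing " wins
      return true  if text =~ PARTIAL
      nil # undecided; Tokens#partialize_last may still partialize it
    end

    partial_flag 'hello*'  # => true
    partial_flag '"hello"' # => false
    partial_flag 'hello*"' # => false, the quote vetoes the asterisk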

# If the text ends with ~ similarize it. If with ", don't.
#
# The latter wins.
#
@@no_similar = /\"\Z/
@@similar = /\~\Z/
def similarize
@@ -95,14 +102,13 @@ def similar?
def remove_illegals
@text.gsub! @@illegals, '' unless @text.blank?
end

# Visitor for tokenizer.

#
# TODO Rewrite!!!
#
def tokenize_with tokenizer
@text = tokenizer.normalize @text
def symbolize
@text = @text.to_sym
end

# TODO spec!
#
# TODO Rewrite!!
5 changes: 5 additions & 0 deletions server/lib/picky/internals/query/tokens.rb
@@ -19,8 +19,13 @@ class Tokens # :nodoc:all
def initialize tokens = []
@tokens = tokens
end
def self.processed words
new words.collect! { |word| Token.processed word }
end
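Assuming words is an array of already pretokenized strings, the new constructor maps each one through Token.processed (sketch; input made up):

    tokens = Internals::Query::Tokens.processed ['title:hello*', 'world']
    # => a Tokens collection wrapping fully processed Token objects

Note that collect! mutates the passed array in place; a caller that still needs the raw words would have to pass in a copy.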

# Tokenizes each token.
#
# Note: Passed tokenizer needs to offer #normalize(text).
#
def tokenize_with tokenizer
@tokens.each { |token| token.tokenize_with(tokenizer) }
51 changes: 34 additions & 17 deletions server/lib/picky/internals/tokenizers/base.rb
@@ -163,37 +163,54 @@ def initialize options = {}
reject_token_if &(options[:reject_token_if] || :blank?)
end

# Hooks.
#

# Preprocessing.
#
def preprocess text; end
# Default preprocessing hook.
#
# Does:
# 1. Character substitution.
# 2. Remove illegal expressions.
# 3. Remove non-single stopwords. (Stopwords that occur with other words)
#
def preprocess text
text = substitute_characters text
remove_illegals text
# We do not remove single stopwords e.g. in the indexer for
# an entirely different reason than in the query tokenizer.
# An indexed thing with just name "UND" (a possible stopword)
# should not lose its name.
#
remove_non_single_stopwords text
text
end
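A hypothetical run of the hook, assuming a tokenizer configured via the options documented for the query tokenizer below and helpers that behave as their names suggest:

    tokenizer = Internals::Tokenizers::Base.new :removes_characters => /[',]/,
                                                :stopwords          => /\b(und|and)\b/i
    tokenizer.preprocess "Peter, Paul und Mary" # commas stripped, "und" removed
    tokenizer.preprocess "und"                  # => "und", a single stopword survives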
# Pretokenizing.
#
def pretokenize text; end
# Does:
# 1. Split the text into words.
# 2. Normalize each word.
#
def pretokenize text
words = split text
words.collect! do |word|
normalize_with_patterns word
word
end
end
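The block returns word explicitly because normalize_with_patterns is presumably applied for its side effect on word rather than for its return value. Roughly, with hypothetical options in the documented format:

    tokenizer = Internals::Tokenizers::Base.new :splits_text_on   => /\s/,
                                                :normalizes_words => [[/\$(\w+)/i, '\1 dollars']]
    tokenizer.pretokenize 'a few $words' # => the split words, each pattern-normalized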
# Postprocessing.
#
def process tokens
reject tokens # Reject any tokens that don't meet criteria
tokens
end

# Converts words into real tokens.
#
def tokens_for words
Internals::Query::Tokens.new words.collect! { |word| token_for word }
end
# # Converts words into real tokens.
# #
# def tokens_for words
# Internals::Query::Tokens.new words.collect! { |word| token_for word }
# end
# Turns non-blank text into symbols.
#
def symbolize text
text.blank? ? nil : text.to_sym
end
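The blank? guard (an ActiveSupport-style String extension, used elsewhere in this file) means empty or whitespace-only text yields no symbol:

    symbolize ''      # => nil
    symbolize 'hello' # => :hello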
# Returns a tokens object.
#
def empty_tokens
Internals::Query::Tokens.new
end

end

45 changes: 13 additions & 32 deletions server/lib/picky/internals/tokenizers/index.rb
@@ -15,45 +15,26 @@ def self.default
@default ||= new
end

# Default indexing preprocessing hook.
# Postprocessing.
#
# Does:
# 1. Character substitution.
# 2. Downcasing.
# 3. Remove illegal expressions.
# 4. Remove non-single stopwords. (Stopwords that occur with other words)
# In indexing, we work with symbol tokens.
#
def preprocess text
text = substitute_characters text
text.downcase!
remove_illegals text
# we do not remove single stopwords for an entirely different
# reason than in the query tokenizer.
# An indexed thing with just name "UND" (a possible stopword) should not lose its name.
#
remove_non_single_stopwords text
text
end

# Default indexing pretokenizing hook.
#
# Does:
# 1. Split the text into words.
# 2. Normalize each word.
#
def pretokenize text
words = split text
words.collect! do |word|
normalize_with_patterns word
word
end
def process tokens
reject tokens # Reject any tokens that don't meet criteria
downcase tokens
tokens
end

# Does not actually return a token, but a
# symbol "token".
#
def token_for text
symbolize text
def tokens_for words
words.collect! { |word| word.to_sym }
end
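On the indexing side tokens are bare symbols rather than Query::Token objects, so the conversion is a straight collect! (again mutating the words array in place), with downcasing deferred to #process above:

    tokens_for ['Meier', 'Peter'] # => [:Meier, :Peter]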
# Returns empty tokens.
#
def empty_tokens
[]
end

end
61 changes: 21 additions & 40 deletions server/lib/picky/internals/tokenizers/query.rb
@@ -3,7 +3,7 @@
module Internals

module Tokenizers

# There are a few class methods that you can use to configure how a query works.
#
# removes_characters regexp
@@ -14,66 +14,47 @@ module Tokenizers
# normalizes_words [[/regexp1/, 'replacement1'], [/regexp2/, 'replacement2']]
#
class Query < Base

def self.default= new_default
@default = new_default
end
def self.default
@default ||= new
end

attr_reader :maximum_tokens

def initialize options = {}
super options
@maximum_tokens = options[:maximum_tokens] || 5
end
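The cap is configurable per tokenizer; superfluous words are cut off later by tokens.cap (hypothetical instantiation):

    tokenizer = Internals::Tokenizers::Query.new :maximum_tokens => 3
    # only the first three query words survive processing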

def preprocess text
remove_illegals text # Remove illegal characters
remove_non_single_stopwords text # remove stop words
text
end

# Split the text and put some back together.
#
# TODO Make the same as in indexing?
#
def pretokenize text
split text
end


# Let each token process itself.
# Reject, limit, and partialize tokens.
#
# In querying we work with real tokens (in indexing it's just symbols).
#
def process tokens
tokens.tokenize_with self
tokens.reject # Reject any tokens that don't meet criteria
tokens.cap maximum_tokens # Cut off superfluous tokens
tokens.partialize_last # Set certain tokens as partial
tokens.reject # Reject any tokens that don't meet criteria.
tokens.downcase # Downcase all.
tokens.cap maximum_tokens # Cut off superfluous tokens.
tokens.partialize_last # Set certain tokens as partial.
tokens
end
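End to end, a query string now flows through the Base hooks into this postprocessing step. A sketch, assuming the usual tokenize entry point that chains preprocess, pretokenize, tokens_for and process (the spec below exercises tokenize directly):

    tokens = Internals::Tokenizers::Query.default.tokenize 'title:hello* world "exact"'
    # => an Internals::Query::Tokens with at most maximum_tokens entries,
    #    filtered, downcased and, where requested, partialized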

# Called by the token.
#
# TODO Perhaps move to Normalizer?

# Converts words into real tokens.
#
def normalize text
text = substitute_characters text # Substitute special characters
text.downcase! # Downcase all text
normalize_with_patterns text # normalize
text.to_sym # symbolize
def tokens_for words
Internals::Query::Tokens.processed words
end

# Returns a token for a word.
# The basic query tokenizer uses new tokens.
# Returns a tokens object.
#
def token_for word
Internals::Query::Token.processed word
def empty_tokens
Internals::Query::Tokens.new
end

end

end

end
16 changes: 2 additions & 14 deletions server/spec/lib/internals/tokenizers/query_spec.rb
@@ -41,6 +41,8 @@
it 'should call methods in order' do
text = stub :text

tokenizer.should_receive(:substitute_characters).once.with(text).and_return text
text.should_receive(:downcase!).once.ordered.with()
tokenizer.should_receive(:remove_illegals).once.ordered.with text
tokenizer.should_receive(:remove_non_single_stopwords).once.ordered.with text

@@ -57,13 +59,7 @@
before(:each) do
@tokens = mock :tokens, :null_object => true
end
it 'should tokenize the tokens' do
@tokens.should_receive(:tokenize_with).once.with tokenizer

tokenizer.process @tokens
end
it 'should call methods on the tokens in order' do
@tokens.should_receive(:tokenize_with).once.ordered
@tokens.should_receive(:reject).once.ordered
@tokens.should_receive(:cap).once.ordered
@tokens.should_receive(:partialize_last).once.ordered
@@ -122,13 +118,5 @@ def self.it_should_pattern_normalize original, expected
tokenizer.tokenize('').map(&:to_s).should == []
end
end
describe "token_for" do
it "should get a preprocessed token" do
text = stub(:text)
Internals::Query::Token.should_receive(:processed).with text

tokenizer.token_for text
end
end

end