Permalink
Browse files

Removed a huge amount of generation code; added dynamic weights

  • Loading branch information...
1 parent 59b596c commit c783dbda10db60578fa52d376fc0abc1a782a9c1 @floere committed Nov 13, 2011
Showing with 913 additions and 822 deletions.
  1. +26 −1 server/lib/picky/bundle.rb
  2. +6 −6 server/lib/picky/bundle_indexed.rb
  3. +0 −22 server/lib/picky/bundle_indexing.rb
  4. +26 −16 server/lib/picky/bundle_realtime.rb
  5. +0 −19 server/lib/picky/generators/base.rb
  6. +0 −47 server/lib/picky/generators/partial/infix.rb
  7. +0 −6 server/lib/picky/generators/partial/none.rb
  8. +0 −47 server/lib/picky/generators/partial/substring.rb
  9. +3 −3 server/lib/picky/generators/similarity/double_metaphone.rb
  10. +3 −3 server/lib/picky/generators/similarity/metaphone.rb
  11. +12 −24 server/lib/picky/generators/similarity/phonetic.rb
  12. +3 −3 server/lib/picky/generators/similarity/soundex.rb
  13. +46 −0 server/lib/picky/generators/weights/constant.rb
  14. +37 −0 server/lib/picky/generators/weights/dynamic.rb
  15. +0 −10 server/lib/picky/generators/weights/logarithmic.rb
  16. +41 −0 server/lib/picky/generators/weights/runtime.rb
  17. +3 −3 server/lib/picky/loader.rb
  18. +2 −1 server/lib/picky/query/allocations.rb
  19. +1 −1 server/spec/lib/category_indexed_spec.rb
  20. +11 −13 server/spec/lib/character_substituters/west_european_spec.rb
  21. +161 −161 server/spec/lib/generators/partial/infix_spec.rb
  22. +3 −3 server/spec/lib/generators/partial/none_spec.rb
  23. +109 −109 server/spec/lib/generators/partial/postfix_spec.rb
  24. +190 −190 server/spec/lib/generators/partial/substring_spec.rb
  25. +38 −38 server/spec/lib/generators/similarity/double_metaphone_spec.rb
  26. +38 −38 server/spec/lib/generators/similarity/metaphone_spec.rb
  27. +38 −38 server/spec/lib/generators/similarity/soundex_spec.rb
  28. +37 −0 server/spec/lib/generators/weights/constant_spec.rb
  29. +27 −0 server/spec/lib/generators/weights/dynamic_spec.rb
  30. +10 −15 server/spec/lib/generators/weights/logarithmic_spec.rb
  31. +3 −2 server/spec/lib/indexed/bundle_spec.rb
  32. +5 −3 server/spec/lib/indexing/bundle_spec.rb
  33. +34 −0 server/spec/specific/dynamic_weights_spec.rb
View
@@ -66,8 +66,11 @@ def initialize name, category, backend, weights_strategy, partial_strategy, simi
# Initial indexes.
#
+ # Note that if the weights strategy doesn't need to be saved,
+ # the strategy itself pretends to be an index.
+ #
@inverted = @backend_inverted.initial
- @weights = @backend_weights.initial
+ @weights = @weights_strategy.saved?? @backend_weights.initial : @weights_strategy
@similarity = @backend_similarity.initial
@configuration = @backend_configuration.initial
@@ -77,6 +80,28 @@ def identifier
"#{category.identifier}:#{name}"
end
+ # "Empties" the index(es) by getting a new empty
+ # internal backend instance.
+ #
+ def empty
+ empty_inverted
+ empty_weights
+ empty_similarity
+ empty_configuration
+ end
+ def empty_inverted
+ @inverted = @backend_inverted.empty
+ end
+ def empty_weights
+ @weights = @weights_strategy.saved?? @backend_weights.empty : @weights_strategy
+ end
+ def empty_similarity
+ @similarity = @backend_similarity.empty
+ end
+ def empty_configuration
+ @configuration = @backend_configuration.empty
+ end
+
# Get a list of similar texts.
#
# Note: Does not return itself.
@@ -22,24 +22,24 @@ class Bundle
#
# Returns a (potentially empty) array of ids.
#
- def ids sym
- @inverted[sym] || []
+ def ids sym_or_string
+ @inverted[sym_or_string] || []
end
# Get a weight for the given symbol.
#
# Returns a number, or nil.
#
- def weight sym
- @weights[sym]
+ def weight sym_or_string
+ @weights[sym_or_string]
end
# Get settings for this bundle.
#
# Returns an object.
#
- def [] sym
- @configuration[sym]
+ def [] sym_or_string
+ @configuration[sym_or_string]
end
# Loads all indexes.
@@ -35,28 +35,6 @@ class Bundle
delegate :clear,
:to => :inverted
- # "Empties" the index(es) by getting a new empty
- # internal backend instance.
- #
- def empty
- empty_inverted
- empty_weights
- empty_similarity
- empty_configuration
- end
- def empty_inverted
- @inverted = @backend_inverted.empty
- end
- def empty_weights
- @weights = @backend_weights.empty
- end
- def empty_similarity
- @similarity = @backend_similarity.empty
- end
- def empty_configuration
- @configuration = @backend_configuration.empty
- end
-
# Saves the indexes in a dump file.
#
def dump
@@ -32,39 +32,49 @@ def remove id
# Returns a reference to the array where the id has been added.
#
- # TODO Rename sym.
- #
- def add id, sym, where = :unshift
- ary = @inverted[sym]
+ def add id, str_or_sym, where = :unshift
+ ary = @inverted[str_or_sym]
- syms = @realtime_mapping[id]
- syms = (@realtime_mapping[id] = []) unless syms # TODO Nicefy.
+ str_or_syms = @realtime_mapping[id]
+ str_or_syms = (@realtime_mapping[id] = []) unless str_or_syms # TODO Nicefy.
# Inverted.
#
- ids = if syms.include? sym
- ids = @inverted[sym]
+ ids = if str_or_syms.include? str_or_sym
+ ids = @inverted[str_or_sym]
ids.delete id
ids.send where, id
else
- syms << sym
- ids = @inverted[sym] ||= []
+ str_or_syms << str_or_sym
+ ids = @inverted[str_or_sym] ||= []
ids.send where, id
end
# Weights.
#
- @weights[sym] = self.weights_strategy.weight_for ids.size
+ @weights[str_or_sym] = self.weights_strategy.weight_for ids.size
# Similarity.
#
- if encoded = self.similarity_strategy.encoded(sym)
+ add_similarity str_or_sym, where
+
+ # Return reference.
+ #
+ ids
+ end
+
+ # Add string/symbol to similarity index.
+ #
+ # TODO The `where` parameter probably makes no sense here. Similarity should have its own order.
+ #
+ def add_similarity str_or_sym, where = :unshift
+ if encoded = self.similarity_strategy.encoded(str_or_sym)
similarity = @similarity[encoded] ||= []
- if similarity.include? sym
- similarity.delete sym # Not completely correct, as others will also be affected, but meh.
- similarity.send where, sym #
+ if similarity.include? str_or_sym
+ similarity.delete str_or_sym # Not completely correct, as others will also be affected, but meh.
+ similarity.send where, str_or_sym #
else
- similarity.send where, sym
+ similarity.send where, str_or_sym
end
end
end
@@ -1,19 +0,0 @@
-module Picky
-
- module Generators # :nodoc:all
-
- # A cache generator holds an index.
- #
- class Base
-
- attr_reader :inverted
-
- def initialize inverted
- @inverted = inverted
- end
-
- end
-
- end
-
-end
@@ -41,53 +41,6 @@ def each_partial token, &block
token.each_intoken min, max, &block
end
- # Generates a partial index from the given inverted index.
- #
- def generate_from inverted
- result = {}
-
- # Generate for each key token the subtokens.
- #
- i = 0
- j = 0
- inverted.each_key do |token|
- i += 1
- if i == 5000
- j += 1
- timed_exclaim %Q{#{"%8i" % (i*j)} generated (current token: "#{token}").}
- i = 0
- end
- generate_for token, inverted, result
- end
-
- # Remove duplicate ids.
- #
- # THINK If it is unique for a subtoken, it is
- # unique for all derived longer tokens.
- #
- result.each_value &:uniq!
-
- result
- end
-
- # To each shortened token of :test
- # :test, :tes, :te, :t
- # add all ids of :test
- #
- # "token" here means just text.
- #
- # THINK Could be improved by appending the aforegoing ids?
- #
- def generate_for token, inverted, result
- each_partial token do |intoken|
- if result[intoken]
- result[intoken] += inverted[token] # unique
- else
- result[intoken] = inverted[token].dup
- end
- end
- end
-
end
end
@@ -14,12 +14,6 @@ def each_partial token
# yields nothing
end
- # Returns an empty index.
- #
- def generate_from index
- {}
- end
-
# Returns if this strategy's generated file is saved.
#
def saved?
@@ -78,53 +78,6 @@ def each_partial token, &block
@generator.each_subtoken token, &block
end
- # Generates a partial index from the given inverted index.
- #
- def generate_from inverted
- result = {}
-
- # Generate for each key token the subtokens.
- #
- i = 0
- j = 0
- inverted.each_key do |token|
- i += 1
- if i == 5000
- j += 1
- timed_exclaim %Q{#{"%8i" % (i*j)} generated (current token: "#{token}").}
- i = 0
- end
- generate_for token, inverted, result
- end
-
- # Remove duplicate ids.
- #
- # THINK If it is unique for a subtoken, it is
- # unique for all derived longer tokens.
- #
- result.each_value &:uniq!
-
- result
- end
-
- # To each shortened token of :test
- # :test, :tes, :te, :t
- # add all ids of :test
- #
- # "token" here means just text.
- #
- # THINK Could be improved by appending the aforegoing ids?
- #
- def generate_for token, inverted, result
- each_partial token do |subtoken|
- if result[subtoken]
- result[subtoken] += inverted[token] # unique
- else
- result[subtoken] = inverted[token].dup
- end
- end
- end
-
end
end
@@ -14,12 +14,12 @@ module Similarity
#
class DoubleMetaphone < Phonetic
- # Encodes the given symbol.
+ # Encodes the given string/symbol.
#
# Returns a symbol.
#
- def encoded sym
- codes = Text::Metaphone.double_metaphone sym.to_s
+ def encoded str_or_sym
+ codes = Text::Metaphone.double_metaphone str_or_sym.to_s
codes.first.intern unless codes.empty?
end
@@ -14,12 +14,12 @@ module Similarity
#
class Metaphone < Phonetic
- # Encodes the given symbol.
+ # Encodes the given string/symbol.
#
# Returns a symbol.
#
- def encoded sym
- code = Text::Metaphone.metaphone sym.to_s
+ def encoded str_or_sym
+ code = Text::Metaphone.metaphone str_or_sym.to_s
code.intern if code
end
@@ -23,20 +23,23 @@ def initialize amount = 10
@amount = amount
end
- # Generates an index for the given index (in exact index style).
- #
- # In the following form:
- # [:meier, :mueller, :peter, :pater] => { MR: [:meier], MLR: [:mueller], PTR: [:peter, :pater] }
- #
- def generate_from inverted
- hash = hashify inverted.keys
- sort hash
- end
+ # # Generates an index for the given index (in exact index style).
+ # #
+ # # In the following form:
+ # # [:meier, :mueller, :peter, :pater] => { MR: [:meier], MLR: [:mueller], PTR: [:peter, :pater] }
+ # #
+ # def generate_from inverted
+ # hash = hashify inverted.keys
+ # sort hash
+ # end
protected
# Sorts the index values in place.
#
+ # TODO Include this again. Sort at the end.
+ # Or sort when inserting in realtime.
+ #
def sort hash
hash.each_pair.each do |code, ary|
ary.sort_by_levenshtein! code
@@ -45,21 +48,6 @@ def sort hash
hash
end
- # Hashifies a list of symbols.
- #
- # Where:
- # { encoded_sym => [syms] }
- #
- def hashify list
- list.inject({}) do |total, element|
- if code = encoded(element)
- total[code] ||= []
- total[code] << element
- end
- total
- end
- end
-
end
end
Oops, something went wrong.

0 comments on commit c783dbd

Please sign in to comment.