Skip to content

Commit

Permalink
+ parallel indexing
Browse files Browse the repository at this point in the history
  • Loading branch information
floere committed Apr 13, 2011
1 parent 252bcf3 commit e929ec0
Show file tree
Hide file tree
Showing 9 changed files with 75 additions and 46 deletions.
5 changes: 3 additions & 2 deletions server/lib/picky/index/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class Base
# === Options
# * result_identifier: Use if you'd like a different identifier/name in the results than the name of the index.
# * after_indexing: As of this writing only used in the db source. Executes the given after_indexing as SQL after the indexing process.
# * tokenizer: The tokenizer to use for this index.
#
# Example:
# my_index = Index::Memory.new(:my_index, some_source) do
Expand Down Expand Up @@ -54,12 +55,12 @@ def check name, source
NAME
) unless name.respond_to?(:to_sym)
raise ArgumentError.new(<<-SOURCE
The index "#{name}" should use a data source that responds to the method #harvest, which yields(id, text).
The index "#{name}" should use a data source that responds to either the method #each, or the method #harvest, which yields(id, text).
Or it could use one of the built-in sources:
Sources::#{(Sources.constants - [:Base, :Wrappers, :NoCSVFileGiven, :NoCouchDBGiven]).join(',
Sources::')}
SOURCE
) unless source.respond_to?(:harvest)
) unless source.respond_to?(:each) || source.respond_to?(:harvest)
end

def to_stats
Expand Down
22 changes: 11 additions & 11 deletions server/lib/picky/internals/index/file/text.rb
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
module Internals

module Index

module File

# Index data dumped in the text format.
#
class Text < Basic

# Uses the extension "txt".
#
def extension
Expand All @@ -20,12 +20,12 @@ def load
raise "Can't load from text file. Use JSON or Marshal."
end
# Text files are used exclusively for
# prepared data files.
# prepared data files.
#
def dump(hash)
  # Dumping is not supported for the text format: the given hash is
  # ignored and a RuntimeError is always raised.
  raise RuntimeError, "Can't dump to text file. Use JSON or Marshal."
end

# Retrieves prepared index data in the form
# * id,data\n
# * id,data\n
Expand All @@ -43,18 +43,18 @@ def retrieve
end
end
end

# Opens the file at cache_path for writing ('w:binary' mode) and
# hands the given block on to ::File.open.
#
def open_for_indexing(&block)
  ::File.open(cache_path, 'w:binary', &block)
end


end

end

end

end
24 changes: 6 additions & 18 deletions server/lib/picky/internals/indexers/serial.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,30 +6,16 @@ module Indexers
#
# Note: It is called serial since it indexes each
#
class Serial

attr_accessor :tokenizer, :source
# FIXME Giving the serial a category would be enough, since it already contains a configuration!
#
class Serial < Base

def initialize(configuration, source, tokenizer)
  # The configuration is stored first, since raise_no_source
  # uses it in its error message.
  @configuration = configuration
  # A missing (nil/false) source is handled by raise_no_source.
  @source        = source ? source : raise_no_source
  @tokenizer     = tokenizer
end

# Raises a NoSourceSpecifiedException whose message
# includes the configuration.
#
def raise_no_source
  raise NoSourceSpecifiedException, "No source given for #{@configuration}."
end

# Asks the source for its key format.
#
# Falls back to :to_i if the source returns nil or false.
#
def key_format
  source_format = @source.key_format
  source_format ? source_format : :to_i
end

# Selects the original id (indexed id) and a column to process. The column data is called "token".
#
# Note: Collects the parts in an array first, then releases the array from time to time by joining it.
Expand Down Expand Up @@ -66,8 +52,10 @@ def process
file.write result.join
end
end
#
#
def indexing_message
timed_exclaim %Q{"#{@configuration.identifier}": Starting indexing.}
timed_exclaim %Q{"#{@configuration.identifier}": Starting serial indexing.}
end

end
Expand Down
22 changes: 17 additions & 5 deletions server/lib/picky/internals/indexing/categories.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,28 @@ module Indexing

class Categories

attr_reader :categories
attr_reader :categories, :tokenizer

delegate :<<, :to => :categories
delegate :<<, :map, :size, :zip, :to => :categories

each_delegate :index,
:cache,
each_delegate :cache,
:generate_caches,
:index,
:backup_caches,
:restore_caches,
:check_caches,
:clear_caches,
:create_directory_structure,
:to => :categories

def initialize
def initialize tokenizer
@tokenizer = tokenizer

@categories = []
end

#
#
def find category_name
category_name = category_name.to_sym

Expand All @@ -33,6 +37,14 @@ def find category_name
raise %Q{Index category "#{category_name}" not found. Possible categories: "#{categories.map(&:name).join('", "')}".}
end

# Indexes all categories using a single parallel indexer.
#
# The indexer is built from the given index and source, these
# categories, and the tokenizer. The index directory is prepared
# through the first category before the indexer is run.
#
def index_parallel(index, source)
  parallel_indexer = Indexers::Parallel.new(index, self, source, tokenizer)
  categories.first.prepare_index_directory
  parallel_indexer.index
end

def to_s
categories.indented_to_s
end
Expand Down
2 changes: 1 addition & 1 deletion server/lib/picky/internals/indexing/category.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ class Category

attr_reader :exact, :partial, :name, :configuration

delegate :identifier, :prepare_index_directory, :to => :configuration
delegate :identifier, :prepare_index_directory, :prepared_index_file, :to => :configuration
delegate :source, :source=, :tokenizer, :tokenizer=, :to => :indexer

# Mandatory params:
Expand Down
27 changes: 19 additions & 8 deletions server/lib/picky/internals/indexing/index.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,21 @@ module Indexing

class Index

attr_reader :name, :source, :categories, :after_indexing
attr_reader :name, :source, :categories, :after_indexing, :tokenizer

# Delegators for indexing.
#
delegate :connect_backend,
:to => :source

delegate :index,
delegate :backup_caches,
:cache,
:generate_caches,
:backup_caches,
:restore_caches,
:check_caches,
:clear_caches,
:create_directory_structure,
:generate_caches,
:index,
:restore_caches,
:to => :categories

def initialize name, source, options = {}
Expand All @@ -30,7 +30,7 @@ def initialize name, source, options = {}
@after_indexing = options[:after_indexing]
@bundle_class = options[:indexing_bundle_class] # TODO This should actually be a fixed parameter.

@categories = Categories.new
@categories = Categories.new options[:tokenizer]
end

# TODO Spec. Doc.
Expand All @@ -42,7 +42,6 @@ def define_category category_name, options = {}
categories << new_category
new_category
end

# By default, the category uses
# * the index's source.
# * the index's bundle type.
Expand All @@ -54,10 +53,22 @@ def default_category_options
}
end

# Chooses the indexing strategy based on the source.
#
# A source that responds to #each is indexed in parallel (with a
# warning if it reports itself as empty). Any other source is
# indexed by delegating to the categories.
#
def index
  return categories.index unless source.respond_to?(:each)

  source_is_empty = source.respond_to?(:empty?) && source.empty?
  warn "Warning: Source #{source} is empty." if source_is_empty
  categories.index_parallel(self, source)
end

# Indexing.
#
def take_snapshot
source.take_snapshot self
source.take_snapshot self unless source.respond_to? :each
end

def to_s
Expand Down
2 changes: 2 additions & 0 deletions server/lib/picky/loader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ def self.load_framework_internals
# Index generation strategies.
#
load_internals 'indexers/no_source_specified_exception'
load_internals 'indexers/base'
load_internals 'indexers/serial'
load_internals 'indexers/parallel'

# Generators.
#
Expand Down
5 changes: 4 additions & 1 deletion server/spec/lib/index/base_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
end
it 'fails correctly' do
expect { described_class.new :some_index_name, :some_source }.to raise_error(<<-ERROR
The index "some_index_name" should use a data source that responds to the method #harvest, which yields(id, text).
The index "some_index_name" should use a data source that responds to either the method #each, or the method #harvest, which yields(id, text).
Or it could use one of the built-in sources:
Sources::DB,
Sources::CSV,
Expand All @@ -30,6 +30,9 @@
ERROR
)
end
it 'does not fail' do
expect { described_class.new :some_index_name, [] }.to_not raise_error
end
it 'registers with the indexes' do
@api = described_class.allocate

Expand Down
12 changes: 12 additions & 0 deletions server/test_project/app/application.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,18 @@ class BookSearch < Application
category :isbn, :qualifiers => [:i, :isbn]
end

# Simple record with an id and an isbn reader.
# Each new instance gets the next running id and a random fake ISBN.
class ISBN
  # Running counter; it is incremented before use,
  # so the first instance gets id 2.
  @@id = 1

  attr_reader :id, :isbn

  def initialize
    @@id += 1
    @id   = @@id
    @isbn = rand(1000000).to_s # Fake ISBN.
  end
end
isbn_each_index = Index::Memory.new :isbn_each, [ISBN.new, ISBN.new] do
category :isbn, :qualifiers => [:i, :isbn]
end

mgeo_index = Index::Memory.new :memory_geo, Sources::CSV.new(:location, :north, :east, file: 'data/ch.csv', col_sep: ',') do
category :location
map_location :north1, 1, precision: 3, from: :north
Expand Down

0 comments on commit e929ec0

Please sign in to comment.