Navigation Menu

Skip to content

Commit

Permalink
Add data:convert:ja:groonga task
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 4, 2014
1 parent 4bcd3a5 commit 6e1c73d
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 5 deletions.
25 changes: 20 additions & 5 deletions bin/wikipedia-to-groonga.rb
@@ -1,6 +1,7 @@
#!/usr/bin/env ruby

require "pathname"
require "ostruct"
require "optparse"

base_dir_path = Pathname.new(__FILE__).dirname
Expand All @@ -10,16 +11,30 @@

require "wikipedia-search/groonga-converter"

options = {
options = OpenStruct.new
options.output = "-"
converter_options = {
:max_n_records => -1,
}
parser = OptionParser.new
parser.on("--max-n-records=N", Integer,
"The number of maximum records. -1 means unlimited.",
"(#{options[:max_n_records]})") do |n|
options[:max_n_records] = n
"(#{converter_options[:max_n_records]})") do |n|
converter_options[:max_n_records] = n
end
parser.on("--output=PATH",
"Output to PATH. '-' means the standard output.",
"(#{options.output})") do |path|
options.output = path
end
parser.parse!(ARGV)

converter = WikipediaSearch::GroongaConverter.new(ARGF, options)
converter.convert($stdout)
# Build the converter from whatever ARGF provides and the parsed converter
# options, then send the converted data to the requested destination.
converter = WikipediaSearch::GroongaConverter.new(ARGF, converter_options)
if options.output != "-"
  # A real path was given: write into that file; the block form of File.open
  # closes it when conversion ends.
  File.open(options.output, "w") { |file| converter.convert(file) }
else
  # "-" is the sentinel for the standard output.
  converter.convert($stdout)
end
29 changes: 29 additions & 0 deletions lib/wikipedia-search/task.rb
@@ -1,3 +1,6 @@
require "rbconfig"
require "shellwords"

require "wikipedia-search/downloader"

module WikipediaSearch
Expand All @@ -13,6 +16,7 @@ def define
namespace :data do
directory data_dir_path.to_s
define_download_tasks
define_convert_tasks
end
end

Expand All @@ -29,6 +33,27 @@ def define_download_tasks
end
end

# Defines the convert:ja:groonga Rake task (nested under whatever namespace
# the caller has opened).
#
# The task depends on the file at ja_data_path and runs a shell pipeline that
# streams that file through bzcat into bin/wikipedia-to-groonga.rb, limiting
# the run to 5000 records and writing the result to ja_groonga_output_path.
def define_convert_tasks
  namespace :convert do
    namespace :ja do
      desc "Convert Japanese Wikipedia data to Groonga data."
      task :groonga => ja_data_path.to_s do
        # The pipeline is handed to the shell as ONE string, so every token
        # that is a file system path is shell-escaped; otherwise a space or
        # metacharacter in the interpreter path or in the output path would
        # split or alter the command. The "|" must stay unescaped.
        command_line = [
          "bzcat",
          Shellwords.escape(ja_data_path.to_s),
          "|",
          Shellwords.escape(RbConfig.ruby),
          "bin/wikipedia-to-groonga.rb",
          "--max-n-records",
          "5000",
          "--output",
          Shellwords.escape(ja_groonga_output_path.to_s),
        ]
        sh(command_line.join(" "))
      end
    end
  end
end

# Returns the data directory as a Pathname for "data".
# The Pathname is built on the first call and the same object is reused
# on every later call.
def data_dir_path
  return @data_dir_path if @data_dir_path

  @data_dir_path = Pathname.new("data")
end
Expand All @@ -40,5 +65,9 @@ def ja_data_path
# Returns the base file name (no directory part) of the Japanese Wikipedia
# articles dump; the ".xml.bz2" suffix indicates bzip2-compressed XML.
# NOTE(review): presumably combined with the data directory by ja_data_path,
# whose body is not visible here -- confirm.
def ja_data_base_name
"jawiki-latest-pages-articles.xml.bz2"
end

# Returns the path of the Groonga output file for Japanese data:
# "ja-data.grn" under the data directory.
# The value is computed on the first call and reused afterwards.
def ja_groonga_output_path
  return @ja_groonga_output_path if @ja_groonga_output_path

  @ja_groonga_output_path = data_dir_path + "ja-data.grn"
end
end
end

0 comments on commit 6e1c73d

Please sign in to comment.