Navigation Menu

Skip to content

Commit

Permalink
Use the leading 1000 characters
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 4, 2014
1 parent a3c84f4 commit ddd2f55
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 2 deletions.
6 changes: 6 additions & 0 deletions bin/wikipedia-to-groonga.rb
Expand Up @@ -15,13 +15,19 @@
# Default output destination; the --output help text describes "-" as the
# standard output.
options.output = "-"
# Converter limits and their defaults. -1 means "unlimited" for both entries
# (see the help texts of the matching command line options below).
converter_options = {
:max_n_records => -1,
:max_n_characters => -1,
}
parser = OptionParser.new
# --max-n-records=N: N is coerced to Integer by OptionParser and replaces the
# default. The default value is interpolated into the help text.
parser.on("--max-n-records=N", Integer,
"The number of maximum records. -1 means unlimited.",
"(#{converter_options[:max_n_records]})") do |n|
converter_options[:max_n_records] = n
end
# --max-n-characters=N: per-record character limit, handled the same way as
# --max-n-records.
parser.on("--max-n-characters=N", Integer,
"The number of maximum characters in a record. -1 means unlimited.",
"(#{converter_options[:max_n_characters]})") do |n|
converter_options[:max_n_characters] = n
end
parser.on("--output=PATH",
"Output to PATH. '-' means the standard output.",
"(#{options.output})") do |path|
Expand Down
14 changes: 12 additions & 2 deletions lib/wikipedia-search/groonga-converter.rb
Expand Up @@ -29,8 +29,10 @@ def initialize(output, options)
@text_stack = [""]
@first_page = true
@n_records = 0
@max_n_records = @options[:max_n_records]
@max_n_records = @options[:max_n_records] || -1
@max_n_records = nil if @max_n_records < 0
@max_n_characters = @options[:max_n_characters] || -1
@max_n_characters = nil if @max_n_characters < 0
end

def start(abort_tag)
Expand Down Expand Up @@ -68,7 +70,7 @@ def tag_end(name)
page = {
"_key" => @id,
"title" => @title,
"text" => @text,
"text" => shorten_text(@text),
}
@output.print(JSON.generate(page))
@n_records += 1
Expand Down Expand Up @@ -98,6 +100,14 @@ def push_stacks
# Removes the last entry from @text_stack and returns it (Array#pop
# semantics; @text_stack is the Array created in initialize).
def pop_stacks
@text_stack.pop
end

# Returns +text+ limited to its leading @max_n_characters characters.
#
# When @max_n_characters is nil (or false) no limit applies and +text+ is
# returned as is. Otherwise the result is the substring that starts at
# offset 0 and is at most @max_n_characters characters long.
def shorten_text(text)
  return text unless @max_n_characters

  text[0, @max_n_characters]
end
end
end
end
2 changes: 2 additions & 0 deletions lib/wikipedia-search/task.rb
Expand Up @@ -45,6 +45,8 @@ def define_convert_tasks
command_line << "bin/wikipedia-to-groonga.rb"
command_line << "--max-n-records"
command_line << "5000"
command_line << "--max-n-characters"
command_line << "1000"
command_line << "--output"
command_line << ja_groonga_data_path.to_s
sh(command_line.join(" "))
Expand Down
21 changes: 21 additions & 0 deletions test/test-groonga-converter.rb
Expand Up @@ -68,6 +68,27 @@ def test_max_n_records
load --table Pages
[
{"_key":1,"title":"Title1","text":"Text1"}
]
GROONGA
end

# Checks that :max_n_characters => 2 keeps only the leading two characters
# of a page's text ("Text" becomes "Te") in the generated load command.
def test_max_n_characters
  source_xml = <<-XML
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
<page>
<title>Title</title>
<id>1</id>
<revision>
<id>1001</id>
<text>Text</text>
</revision>
</page>
</mediawiki>
  XML
  expected = <<-GROONGA
load --table Pages
[
{"_key":1,"title":"Title","text":"Te"}
]
  GROONGA
  actual = convert(source_xml, :max_n_characters => 2)
  assert_equal(expected, actual)
end
Expand Down

0 comments on commit ddd2f55

Please sign in to comment.