Navigation Menu

Skip to content

Commit

Permalink
Add --max-n-records option
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 4, 2014
1 parent 2a9c677 commit 7fd702d
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 8 deletions.
14 changes: 13 additions & 1 deletion bin/wikipedia-to-groonga.rb
@@ -1,6 +1,7 @@
#!/usr/bin/env ruby

require "pathname"
require "optparse"

base_dir_path = Pathname.new(__FILE__).dirname
lib_dir_path = base_dir_path + "lib"
Expand All @@ -9,5 +10,16 @@

require "wikipedia-search/groonga-converter"

converter = WikipediaSearch::GroongaConverter.new(ARGF)
# Command-line settings; the values given here are the defaults and are
# also what the option's help text displays.
options = {:max_n_records => -1}

option_parser = OptionParser.new do |opts|
  opts.on("--max-n-records=N", Integer,
          "The number of maximum records. -1 means unlimited.",
          "(#{options[:max_n_records]})") do |max_n_records|
    options[:max_n_records] = max_n_records
  end
end
# parse! removes the options it recognizes from ARGV in place.
option_parser.parse!(ARGV)

# Convert whatever ARGF provides and write the result to standard output.
converter = WikipediaSearch::GroongaConverter.new(ARGF, options)
converter.convert($stdout)
25 changes: 20 additions & 5 deletions lib/wikipedia-search/groonga-converter.rb
Expand Up @@ -5,24 +5,35 @@

module WikipediaSearch
class GroongaConverter
def initialize(input)
# Prepares a converter for the given source.
#
# input   - the XML source; #convert hands it to REXML's stream parser.
# options - Hash of settings that #convert passes on to the Listener
#           (defaults to an empty Hash).
def initialize(input, options={})
  @input, @options = input, options
end

# Parses the XML from @input with REXML's stream parser and lets a Listener
# write the converted result to output.
#
# output - the object handed to the Listener as its write target.
def convert(output)
# NOTE(review): the next three statements build a listener without options
# and run a full parse before the catch-guarded version below does the same
# work; they look like the pre-change body shown next to its replacement.
# Confirm they are not meant to remain.
listener = Listener.new(output)
parser = REXML::Parsers::StreamParser.new(@input, listener)
parser.parse
listener = Listener.new(output, @options)
# Block-form catch supplies a fresh tag; the listener keeps it and throws it
# from tag_end once the record limit is reached, which ends parsing early.
catch do |tag|
parser = REXML::Parsers::StreamParser.new(@input, listener)
# start stores the tag and writes the opening "load --table Pages" lines.
listener.start(tag)
parser.parse
end
# Called after the catch block, i.e. both after a complete parse and after
# an early abort via throw.
listener.finish
end

class Listener
include REXML::StreamListener

def initialize(output)
# Sets up the listener's write target and its per-run state.
#
# output  - object that receives the generated text via puts/print.
# options - Hash of settings (defaults to an empty Hash).
#           :max_n_records caps how many pages are emitted; nil or a
#           negative value (the command line documents -1) means unlimited.
def initialize(output, options={})
  @output = output
  @options = options
  @text_stack = [""]
  @first_page = true
  @n_records = 0
  # tag_end aborts as soon as @n_records >= @max_n_records whenever the limit
  # is non-nil. A negative "unlimited" marker therefore has to become nil
  # here; stored as-is, the default of -1 would abort before the first page.
  limit = @options[:max_n_records]
  @max_n_records = (limit && limit >= 0) ? limit : nil
end

# Begins a conversion run: remembers the tag used to abort parsing early
# and writes the two lines that open the load command.
#
# abort_tag - the catch tag that tag_end throws when the record limit is hit.
def start(abort_tag)
  @abort_tag = abort_tag
  # puts writes each argument on its own line.
  @output.puts("load --table Pages", "[")
end
Expand All @@ -45,6 +56,9 @@ def tag_start(name, attributes)
def tag_end(name)
case name
when "page"
if @max_n_records and @n_records >= @max_n_records
throw(@abort_tag)
end
if @first_page
@first_page = false
else
Expand All @@ -56,6 +70,7 @@ def tag_end(name)
"text" => @text,
}
@output.print(JSON.generate(page))
@n_records += 1
when "title"
@title = @text_stack.last
when "id"
Expand Down
33 changes: 31 additions & 2 deletions test/test-groonga-converter.rb
Expand Up @@ -2,10 +2,10 @@
require "wikipedia-search/groonga-converter"

class TestGroongaConverter < Test::Unit::TestCase
# NOTE(review): two signatures and two constructor calls appear back to back
# here; the first of each pair looks like the pre-change line shown next to
# its replacement. Confirm which version is intended.
def convert(xml)
# Test helper: runs GroongaConverter over the given XML string and returns
# everything it wrote, as a String.
#
# xml     - String holding the XML document to convert.
# options - Hash forwarded to GroongaConverter.new (defaults to empty).
def convert(xml, options={})
input = StringIO.new(xml)
output = StringIO.new
converter = WikipediaSearch::GroongaConverter.new(input)
converter = WikipediaSearch::GroongaConverter.new(input, options)
converter.convert(output)
# StringIO#string exposes the text accumulated in the output buffer.
output.string
end
Expand Down Expand Up @@ -39,6 +39,35 @@ def test_one
load --table Pages
[
{"_key":1,"title":"Title","text":"Text1 & Text2"}
]
GROONGA
end

# With :max_n_records => 1, only the first of the two pages in the input
# may appear in the converted output.
def test_max_n_records
  source = <<-XML
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
<page>
<title>Title1</title>
<id>1</id>
<revision>
<id>1001</id>
<text>Text1</text>
</revision>
</page>
<page>
<title>Title2</title>
<id>2</id>
<revision>
<id>1002</id>
<text>Text2</text>
</revision>
</page>
</mediawiki>
  XML
  expected = <<-GROONGA
load --table Pages
[
{"_key":1,"title":"Title1","text":"Text1"}
]
  GROONGA
  actual = convert(source, :max_n_records => 1)
  assert_equal(expected, actual)
end
Expand Down

0 comments on commit 7fd702d

Please sign in to comment.