Navigation Menu

Skip to content

Commit

Permalink
Add Wikipedia to Groonga converter
Browse files Browse the repository at this point in the history
GitHub: #1
  • Loading branch information
kou committed Apr 4, 2014
1 parent 0dbc3f3 commit 2a9c677
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
/Gemfile.lock
/data/
5 changes: 5 additions & 0 deletions Gemfile
@@ -0,0 +1,5 @@
# -*- ruby -*-

source "https://rubygems.org/"

# The converter requires rexml/* explicitly. REXML has been a bundled gem
# (no longer a default gem) since Ruby 3.0, so it must be listed here to
# stay loadable once bundler/setup is active (as in test/run-test.rb).
gem "rexml", :require => false
gem "test-unit", :require => false
13 changes: 13 additions & 0 deletions bin/wikipedia-to-groonga.rb
@@ -0,0 +1,13 @@
#!/usr/bin/env ruby
#
# Command-line entry point: reads a MediaWiki XML dump from the files named
# on the command line (or standard input, via ARGF) and writes the converted
# Groonga load command to standard output.

require "pathname"

# This script lives in bin/, so the project root (the directory that holds
# lib/) is the parent of this file's directory, not the directory itself.
base_dir_path = Pathname.new(__FILE__).dirname.parent
lib_dir_path = base_dir_path + "lib"

$LOAD_PATH.unshift(lib_dir_path.to_s)

require "wikipedia-search/groonga-converter"

converter = WikipediaSearch::GroongaConverter.new(ARGF)
converter.convert($stdout)
87 changes: 87 additions & 0 deletions lib/wikipedia-search/groonga-converter.rb
@@ -0,0 +1,87 @@
require "json"
require "rexml/streamlistener"
require "rexml/parsers/baseparser"
require "rexml/parsers/streamparser"

module WikipediaSearch
  # Converts a MediaWiki XML export into a Groonga "load --table Pages"
  # command whose body is a JSON array with one object per <page> element.
  class GroongaConverter
    # @param input [Object] XML source handed as-is to
    #   REXML::Parsers::StreamParser (e.g. an IO, StringIO or ARGF).
    def initialize(input)
      @input = input
    end

    # Parses the whole input and writes the load command to +output+.
    #
    # @param output [#puts, #print] destination for the generated command.
    def convert(output)
      listener = Listener.new(output)
      parser = REXML::Parsers::StreamParser.new(@input, listener)
      parser.parse
      listener.finish
    end

    # Stream listener that collects the title, id and text of each <page>
    # and prints the page as a JSON object when the element closes.
    class Listener
      include REXML::StreamListener

      # Writes the command header immediately.
      #
      # @param output [#puts, #print] destination for the generated command.
      def initialize(output)
        @output = output
        # One text buffer per currently open element. The bottom entry
        # receives character data that appears outside of any element.
        @text_stack = [""]
        @first_page = true
        @output.puts("load --table Pages")
        @output.puts("[")
      end

      # Terminates the JSON array. Records are emitted with #print, so the
      # last one still needs its line break before the closing bracket.
      def finish
        @output.puts unless @first_page
        @output.puts("]")
      end

      # Opens a fresh text buffer for the element and, for <page>, resets
      # the per-page fields.
      def tag_start(name, attributes)
        push_stacks
        case name
        when "page"
          @title = nil
          @id = nil
          @text = nil
        end
      end

      # Captures the text of the elements of interest and emits the record
      # when a <page> closes.
      def tag_end(name)
        case name
        when "page"
          # Records are separated by ",\n"; the separator is written before
          # every record except the first one.
          if @first_page
            @first_page = false
          else
            @output.puts(",")
          end
          page = {
            "_key" => @id,
            "title" => @title,
            "text" => @text,
          }
          @output.print(JSON.generate(page))
        when "title"
          @title = @text_stack.last
        when "id"
          # Only the first <id> closed inside a page is kept; later ones
          # (such as ids nested deeper in the page) are ignored.
          @id ||= Integer(@text_stack.last)
        when "text"
          @text = @text_stack.last
        end
        pop_stacks
      end

      # Appends character data to the innermost open element's buffer.
      def text(data)
        @text_stack.last << data
      end

      # Appends the content of a CDATA section to the innermost open
      # element's buffer, exactly like ordinary character data.
      def cdata(content)
        @text_stack.last << content
      end

      private

      def push_stacks
        @text_stack << ""
      end

      def pop_stacks
        @text_stack.pop
      end
    end
  end
end
14 changes: 14 additions & 0 deletions test/run-test.rb
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
#
# Test runner: puts the project's lib/ directory on the load path and runs
# the test-unit auto runner over the test/ directory.

require "pathname"

require "bundler/setup"
require "test-unit"

# This file sits in test/, so the project root is one level above it.
project_root = Pathname.new(__FILE__).dirname.parent
library_dir = project_root + "lib"
tests_dir = project_root + "test"

$LOAD_PATH.unshift(library_dir.to_s)

# The runner's result is passed straight to exit as the process status.
succeeded = Test::Unit::AutoRunner.run(true, tests_dir.to_s)
exit(succeeded)
45 changes: 45 additions & 0 deletions test/test-groonga-converter.rb
@@ -0,0 +1,45 @@
require "stringio"
require "wikipedia-search/groonga-converter"

# Tests for WikipediaSearch::GroongaConverter: each case feeds a small
# MediaWiki XML document through the converter and compares the complete
# generated output.
class TestGroongaConverter < Test::Unit::TestCase
  # Runs the converter over +xml+ and returns everything it wrote.
  def convert(xml)
    sink = StringIO.new
    WikipediaSearch::GroongaConverter.new(StringIO.new(xml)).convert(sink)
    sink.string
  end

  # A dump without any <page> yields a load command with an empty array.
  def test_empty
    source = <<-XML
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
</mediawiki>
    XML
    expected = <<-GROONGA
load --table Pages
[
]
    GROONGA
    assert_equal(expected, convert(source))
  end

  # A single page becomes one record keyed by the page's own <id> (not the
  # revision's), with entity references in the text decoded.
  def test_one
    source = <<-XML
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
<page>
<title>Title</title>
<id>1</id>
<revision>
<id>1001</id>
<text>Text1 &amp; Text2</text>
</revision>
</page>
</mediawiki>
    XML
    expected = <<-GROONGA
load --table Pages
[
{"_key":1,"title":"Title","text":"Text1 & Text2"}
]
    GROONGA
    assert_equal(expected, convert(source))
  end
end

0 comments on commit 2a9c677

Please sign in to comment.