Navigation Menu

Skip to content

Commit

Permalink
Extract categories for drilldown
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 4, 2014
1 parent 87561bd commit d43b46b
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 12 deletions.
39 changes: 30 additions & 9 deletions lib/wikipedia-search/groonga-converter.rb
Expand Up @@ -50,9 +50,7 @@ def tag_start(name, attributes)
push_stacks
case name
when "page"
@title = nil
@id = nil
@text = nil
@page = Page.new
end
end

Expand All @@ -68,18 +66,19 @@ def tag_end(name)
@output.puts(",")
end
page = {
"_key" => @id,
"title" => @title,
"text" => shorten_text(@text),
"_key" => @page.id,
"title" => @page.title,
"text" => shorten_text(@page.text),
"categories" => @page.extract_categories,
}
@output.print(JSON.generate(page))
@n_records += 1
when "title"
@title = @text_stack.last
@page.title = @text_stack.last
when "id"
@id ||= Integer(@text_stack.last)
@page.id ||= Integer(@text_stack.last)
when "text"
@text = @text_stack.last
@page.text = @text_stack.last
end
pop_stacks
end
Expand Down Expand Up @@ -108,6 +107,28 @@ def shorten_text(text)
text
end
end

# Value holder for the fields collected from one <page> element, plus
# helpers that derive data from the page's wiki text.
class Page
  # Any double-bracketed wiki link; the capture is everything between
  # "[[" and the first following "]]" on the same line.
  LINK_PATTERN = /\[\[(.+?)\]\]/

  # A link whose target is in the Category namespace. The namespace prefix
  # is matched case-insensitively and whitespace around the prefix and the
  # name is ignored, mirroring MediaWiki's title normalization. Anything
  # after the first "|" (the sort key) is discarded. A leading ":" (a plain
  # link *to* a category page) is deliberately not matched.
  CATEGORY_PATTERN = /\A\s*category\s*:\s*([^|\s][^|]*?)\s*(?:\|.*)?\z/i

  attr_accessor :id, :title, :text

  def initialize
    @id = nil
    @title = nil
    @text = nil
  end

  # Collects the category names referenced from the page text.
  #
  # @return [Array<String>] category names in order of appearance, without
  #   the "Category:" prefix, sort key or surrounding whitespace; empty when
  #   the page has no text or no category links.
  def extract_categories
    return [] if @text.nil?

    categories = []
    @text.scan(LINK_PATTERN) do |link,|
      # Use explicit MatchData instead of the $1 global so the result can
      # not be clobbered by an unrelated regexp match.
      matched = CATEGORY_PATTERN.match(link)
      categories << matched[1] if matched
    end
    categories
  end
end
end
end
end
27 changes: 24 additions & 3 deletions test/test-groonga-converter.rb
Expand Up @@ -38,7 +38,7 @@ def test_one
assert_equal(<<-GROONGA, convert(xml))
load --table Pages
[
{"_key":1,"title":"Title","text":"Text1 & Text2"}
{"_key":1,"title":"Title","text":"Text1 & Text2","categories":[]}
]
GROONGA
end
Expand Down Expand Up @@ -67,7 +67,7 @@ def test_max_n_records
assert_equal(<<-GROONGA, convert(xml, :max_n_records => 1))
load --table Pages
[
{"_key":1,"title":"Title1","text":"Text1"}
{"_key":1,"title":"Title1","text":"Text1","categories":[]}
]
GROONGA
end
Expand All @@ -88,7 +88,28 @@ def test_max_n_characters
assert_equal(<<-GROONGA, convert(xml, :max_n_characters => 2))
load --table Pages
[
{"_key":1,"title":"Title","text":"Te"}
{"_key":1,"title":"Title","text":"Te","categories":[]}
]
GROONGA
end

# A page whose text holds a single category link must be emitted with that
# category name listed in the "categories" column.
def test_categories
  source_xml = <<-XML
<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.8/">
<page>
<title>Title</title>
<id>1</id>
<revision>
<id>1001</id>
<text>[[Category:Groonga]]</text>
</revision>
</page>
</mediawiki>
  XML
  expected_output = <<-GROONGA
load --table Pages
[
{"_key":1,"title":"Title","text":"[[Category:Groonga]]","categories":["Groonga"]}
]
  GROONGA
  actual_output = convert(source_xml)
  assert_equal(expected_output, actual_output)
end
Expand Down

0 comments on commit d43b46b

Please sign in to comment.