Navigation Menu

Skip to content

Commit

Permalink
Introduce path object
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 7, 2014
1 parent d6521b1 commit 9ffa222
Show file tree
Hide file tree
Showing 2 changed files with 166 additions and 83 deletions.
131 changes: 131 additions & 0 deletions lib/wikipedia-search/path.rb
@@ -0,0 +1,131 @@
require "pathname"

module WikipediaSearch
# Entry point of the path hierarchy.
#
# Holds a base directory and a language, exposes the directories derived
# from the base directory, and builds the tool-specific path objects
# (Wikipedia, Groonga, Droonga) that hang off this one.
class Path
  # @param base [String, Pathname] directory all other paths are resolved
  #   against; it is wrapped in a Pathname.
  # @param language [String] language passed on to every sub-path object.
  def initialize(base, language)
    @language = language
    @base = Pathname.new(base)
  end

  # @return [Pathname] "data" directory under the base directory.
  def data_dir
    @base.join("data")
  end

  # @return [Pathname] "download" directory under {#data_dir}.
  def download_dir
    data_dir.join("download")
  end

  # @return [Pathname] "config" directory under the base directory.
  def config_dir
    @base.join("config")
  end

  # @return [WikipediaPath] Wikipedia paths bound to this object.
  def wikipedia
    sub_path(WikipediaPath)
  end

  # @return [GroongaPath] Groonga paths bound to this object.
  def groonga
    sub_path(GroongaPath)
  end

  # @return [DroongaPath] Droonga paths bound to this object.
  def droonga
    sub_path(DroongaPath)
  end

  private

  # Builds a fresh sub-path object of the given class for this path and
  # language. A new instance is created on every call.
  def sub_path(path_class)
    path_class.new(self, @language)
  end
end

# Local paths and remote URLs of the Wikipedia dump files for one language.
#
# NOTE(review): the download URL uses plain "http://"; switching to HTTPS
# would need confirmation that the downloader handles TLS.
class WikipediaPath
  # @param base_path [#download_dir] object whose +download_dir+ is the
  #   directory the dump files are placed in.
  # @param language [String] language used as the wiki name prefix.
  def initialize(base_path, language)
    @base_path, @language = base_path, language
  end

  # @return [String] URL prefix of the latest dump for the language.
  def download_base_url
    "http://dumps.wikimedia.org/#{@language}wiki/latest"
  end

  # @return [Pathname] local path of the pages archive.
  def pages
    local_path(pages_base_name)
  end

  # @return [String] file name of the pages archive.
  def pages_base_name
    "#{@language}wiki-latest-pages-articles.xml.bz2"
  end

  # @return [String] URL of the pages archive.
  def pages_url
    remote_url(pages_base_name)
  end

  # @return [Pathname] local path of the titles archive.
  def titles
    local_path(titles_base_name)
  end

  # @return [String] file name of the titles archive.
  def titles_base_name
    "#{@language}wiki-latest-all-titles.gz"
  end

  # @return [String] URL of the titles archive.
  def titles_url
    remote_url(titles_base_name)
  end

  private

  # Resolves a dump file name inside the download directory.
  def local_path(base_name)
    @base_path.download_dir.join(base_name)
  end

  # Appends a dump file name to the download base URL.
  def remote_url(base_name)
    "#{download_base_url}/#{base_name}"
  end
end

# Groonga-related paths: configuration files, converted page data, and the
# database directory with its log files.
class GroongaPath
  # @param base_path [#config_dir, #data_dir] object providing the shared
  #   config and data directories.
  # @param language [String] language used in the page data file name.
  def initialize(base_path, language)
    @base_path, @language = base_path, language
  end

  # @return [Pathname] "groonga" directory under the shared config directory.
  def config_dir
    @base_path.config_dir.join("groonga")
  end

  # @return [Pathname] "groonga" directory under the shared data directory.
  def data_dir
    @base_path.data_dir.join("groonga")
  end

  # @return [Pathname] path of schema.grn in {#config_dir}.
  def schema
    config_dir.join("schema.grn")
  end

  # @return [Pathname] path of indexes.grn in {#config_dir}.
  def indexes
    config_dir.join("indexes.grn")
  end

  # @return [Pathname] path of the per-language pages .grn file in {#data_dir}.
  def pages
    pages_base_name = "#{@language}-pages.grn"
    data_dir.join(pages_base_name)
  end

  # @return [Pathname] "db" directory under {#data_dir}.
  def database_dir
    data_dir.join("db")
  end

  # @return [Pathname] path named "wikipedia" inside {#database_dir}.
  def database
    database_dir.join("wikipedia")
  end

  # @return [Pathname] path of groonga.log inside {#database_dir}.
  def log
    database_dir.join("groonga.log")
  end

  # @return [Pathname] path of query.log inside {#database_dir}.
  def query_log
    database_dir.join("query.log")
  end
end

# Droonga-related paths: its config directory, data directory, and the
# per-language pages file.
class DroongaPath
  # @param base_path [#config_dir, #data_dir] object providing the shared
  #   config and data directories.
  # @param language [String] language used in the page data file name.
  def initialize(base_path, language)
    @base_path, @language = base_path, language
  end

  # @return [Pathname] "droonga" directory under the shared config directory.
  def config_dir
    @base_path.config_dir.join("droonga")
  end

  # @return [Pathname] "droonga" directory under the shared data directory.
  def data_dir
    @base_path.data_dir.join("droonga")
  end

  # @return [Pathname] path of the per-language pages .jsons file in {#data_dir}.
  def pages
    pages_base_name = "#{@language}-pages.jsons"
    data_dir.join(pages_base_name)
  end
end
end
118 changes: 35 additions & 83 deletions lib/wikipedia-search/task.rb
Expand Up @@ -2,6 +2,7 @@
require "shellwords"

require "wikipedia-search/downloader"
require "wikipedia-search/path"

module WikipediaSearch
class Task
Expand All @@ -12,6 +13,10 @@ def define
end
include Rake::DSL

def initialize
@path = Path.new(".", "ja")
end

def define
define_data_tasks
define_groonga_tasks
Expand All @@ -20,37 +25,40 @@ def define
private
def define_data_tasks
namespace :data do
directory data_dir_path.to_s
define_data_download_tasks
define_data_convert_tasks
end
end

def define_data_download_tasks
path = @path.wikipedia
directory @path.download_dir.to_s

namespace :download do
namespace :pages do
file ja_pages_path.to_s => data_dir_path.to_s do
url = "#{ja_download_base_url}/#{ja_pages_base_name}"
WikipediaSearch::Downloader.download(url, ja_pages_path)
file path.pages.to_s => @path.download_dir.to_s do
WikipediaSearch::Downloader.download(path.pages_url, path.pages)
end

desc "Download the latest Japanese Wikipedia pages."
task :ja => ja_pages_path.to_s
task :ja => path.pages.to_s
end

namespace :titles do
file ja_titles_path.to_s => data_dir_path.to_s do
url = "#{ja_download_base_url}/#{ja_titles_base_name}"
WikipediaSearch::Downloader.download(url, ja_titles_path)
file path.titles.to_s => @path.download_dir.to_s do
WikipediaSearch::Downloader.download(path.titles_url,
path.titles)
end

desc "Download the latest Japanese Wikipedia titles."
task :ja => ja_titles_path.to_s
task :ja => path.titles.to_s
end
end
end

def define_data_convert_tasks
directory @path.data_dir.to_s

namespace :convert do
define_data_convert_groonga_tasks
define_data_convert_droonga_tasks
Expand All @@ -59,10 +67,10 @@ def define_data_convert_tasks

def define_data_convert_groonga_tasks
namespace :groonga do
file ja_groonga_pages_path.to_s => ja_pages_path.to_s do
file @path.groonga.pages.to_s => @path.wikipedia.pages.to_s do
command_line = []
command_line << "bzcat"
command_line << Shellwords.escape(ja_pages_path.to_s)
command_line << Shellwords.escape(@path.wikipedia.pages.to_s)
command_line << "|"
command_line << RbConfig.ruby
command_line << "bin/wikipedia-to-groonga.rb"
Expand All @@ -71,110 +79,54 @@ def define_data_convert_groonga_tasks
command_line << "--max-n-characters"
command_line << "1000"
command_line << "--output"
command_line << ja_groonga_pages_path.to_s
command_line << @path.groonga.pages.to_s
sh(command_line.join(" "))
end

desc "Convert Japanese Wikipedia page data to Groonga page data."
task :ja => ja_groonga_pages_path.to_s
task :ja => @path.groonga.pages.to_s
end
end

def define_data_convert_droonga_tasks
namespace :droonga do
file ja_droonga_pages_path.to_s => ja_groonga_pages_path.to_s do
file @path.droonga.pages.to_s => @path.groonga.pages.to_s do
sh("grn2drn",
"--dataset", "Wikipedia",
"--output", ja_droonga_pages_path.to_s,
ja_groonga_pages_path.to_s)
"--output", @path.droonga.pages.to_s,
@path.groonga.pages.to_s)
end

desc "Convert Japanese Wikipedia page data to Droonga page data."
task :ja => ja_droonga_pages_path.to_s
task :ja => @path.droonga.pages.to_s
end
end

def define_groonga_tasks
namespace :groonga do
desc "Load data."
task :load do
rm_rf(groonga_database_dir_path.to_s)
mkdir_p(groonga_database_dir_path.to_s)
groonga_run(groonga_schema_path.to_s)
groonga_run(ja_groonga_pages_path.to_s.to_s)
groonga_run(groonga_indexes_path.to_s)
task :load => @path.groonga.pages.to_s do
rm_rf(@path.groonga.database_dir.to_s)
mkdir_p(@path.groonga.database_dir.to_s)
groonga_run(@path.groonga.schema.to_s)
groonga_run(@path.groonga.pages.to_s)
groonga_run(@path.groonga.indexes.to_s)
end
end
end

def groonga_run(input)
command_line = [
"groonga",
"--log-path", (groonga_database_dir_path + "groonga.log").to_s,
"--query-log-path", (groonga_database_dir_path + "query.log").to_s,
"--log-path", @path.groonga.log.to_s,
"--query-log-path", @path.groonga.query_log.to_s,
"--file", input,
]
unless groonga_database_path.exist?
unless @path.groonga.database.exist?
command_line << "-n"
end
command_line << groonga_database_path.to_s
command_line << @path.groonga.database.to_s
sh(*command_line)
end

def download_base_url(language)
"http://dumps.wikimedia.org/#{language}wiki/latest"
end

def ja_download_base_url
download_base_url("ja")
end

def data_dir_path
Pathname.new("data")
end

def ja_pages_path
data_dir_path + ja_pages_base_name
end

def ja_pages_base_name
"jawiki-latest-pages-articles.xml.bz2"
end

def ja_groonga_pages_path
data_dir_path + "ja-pages.grn"
end

def ja_droonga_pages_path
data_dir_path + "ja-pages.jsons"
end

def ja_titles_path
data_dir_path + ja_titles_base_name
end

def ja_titles_base_name
"jawiki-latest-all-titles.gz"
end

def config_dir
Pathname.new("config")
end

def groonga_schema_path
config_dir + "groonga" + "schema.grn"
end

def groonga_indexes_path
config_dir + "groonga" + "indexes.grn"
end

def groonga_database_dir_path
data_dir_path + "groonga"
end

def groonga_database_path
groonga_database_dir_path + "db"
end
end
end

0 comments on commit 9ffa222

Please sign in to comment.