Navigation Menu

Skip to content

Commit

Permalink
Add a task to download Wikipedia data in Japanese
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Apr 4, 2014
1 parent 642f462 commit 6c632dd
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -0,0 +1 @@
/data/
64 changes: 64 additions & 0 deletions Rakefile
@@ -0,0 +1,64 @@
# -*- ruby -*-

require "pathname"
require "open-uri"

# Renders a byte count as a short human-readable string using binary
# (1024-based) units.
#
# size:: Number of bytes (numeric).
#
# Returns a String:
# * below 1 KiB: the plain integer, with no unit suffix;
# * KiB, MiB and GiB ranges: two decimals, with the number right-aligned in
#   a 7-character field so consecutive progress lines keep the same width;
# * 1 TiB and above: two decimals in TiB, without padding.
def format_size(size)
  kib = 1024
  mib = kib * 1024
  gib = mib * 1024
  tib = gib * 1024

  return "%d" % size if size < kib
  return "%7.2fKiB" % (size.to_f / kib) if size < mib
  return "%7.2fMiB" % (size.to_f / mib) if size < gib
  return "%7.2fGiB" % (size.to_f / gib) if size < tib

  "%.2fTiB" % (size.to_f / tib)
end

# Downloads +url+ and stores the response body at +output_path+, printing a
# single-line progress indicator on standard output while the transfer runs.
#
# url::         URL string to fetch. It must use a scheme that open-uri can
#               open (http, https or ftp).
# output_path:: Destination file, as a Pathname or a path string.
#
# The body is first written to "<output_path>.partial" and renamed to
# +output_path+ only after it has been written completely, so a failed or
# interrupted run never leaves a truncated file at the final location.
#
# Errors raised by open-uri or by the file system propagate to the caller.
def download(url, output_path)
  output_path = Pathname.new(output_path.to_s)
  base_name = File.basename(url)
  max = nil
  content_length_proc = lambda do |content_length|
    max = content_length
  end
  progress_proc = lambda do |current|
    # No percentage can be shown without a positive total size.
    next unless max && max > 0

    percent = (current / max.to_f) * 100
    formatted_size = "[%s/%s]" % [format_size(current), format_size(max)]
    # "\r" rewinds to the start of the line so the indicator updates in place.
    print("\r%s - %06.2f%% %s" % [base_name, percent, formatted_size])
    puts if current == max
  end
  options = {
    :content_length_proc => content_length_proc,
    :progress_proc => progress_proc,
  }

  partial_path = Pathname.new("#{output_path}.partial")
  begin
    # Kernel#open no longer accepts URLs on Ruby 3.0+, so open the parsed URI
    # object itself; open-uri provides #open on HTTP(S)/FTP URIs.
    URI.parse(url).open(options) do |input|
      partial_path.open("wb") do |output|
        # copy_stream writes the bytes verbatim; IO#print would also append
        # the output record separator ($\) when it is set.
        IO.copy_stream(input, output)
      end
    end
    File.rename(partial_path.to_s, output_path.to_s)
  ensure
    # After a successful rename nothing is left here; otherwise remove the
    # incomplete file.
    partial_path.delete if partial_path.exist?
  end
end

# Tasks that manage files under the data/ directory.
namespace :data do
  data_dir_path = Pathname.new("data")
  # Directory task so that data/ is created before a file is stored in it.
  directory data_dir_path.to_s

  namespace :download do
    base_name = "jawiki-latest-pages-articles.xml.bz2"
    ja_data_path = data_dir_path + base_name
    # File task: Rake runs the block when the target file is missing.
    file ja_data_path.to_s => data_dir_path.to_s do
      download("https://dumps.wikimedia.org/jawiki/latest/#{base_name}",
               ja_data_path)
    end

    desc "Download the latest Japanese Wikipedia data."
    task :ja => ja_data_path.to_s
  end
end

0 comments on commit 6c632dd

Please sign in to comment.