Skip to content

Commit

Permalink
first commit of aozorabunko.rb
Browse files Browse the repository at this point in the history
  • Loading branch information
crepusculum committed Jul 7, 2012
1 parent 3716cf5 commit 702c311
Show file tree
Hide file tree
Showing 4 changed files with 239 additions and 0 deletions.
105 changes: 105 additions & 0 deletions aozorabunko.rb
@@ -0,0 +1,105 @@
#!/usr/bin/env ruby
# 2012.06.15

require File.dirname(__FILE__) + '/support'
require File.dirname(__FILE__) + '/aozorabunko_html2xhtml'
#require File.dirname(__FILE__) + '/aozorabunko_contents'

# Downloader for Aozora Bunko works. Fetches an HTML page (plus the images
# it references) or a zip archive and stores the results under DIST_TEXT
# and DIST_IMAGES. Uses helpers mixed in from Support (get_web_content,
# convert_utf8, writefile, extract_zipfile) and html2xhtml, which is
# defined in another part of this class.
class AozoraBunko
  include Support

  DIST_TEXT = './text'
  DIST_IMAGES = './images'

  # Prepares the output directories, then dispatches on the URL suffix:
  # ".html" goes to download_html, ".zip" goes to download_zip. Any other
  # URL is ignored and nil is returned.
  def download(url)
    prep_dist
    case url
    when /.+\.html$/ then download_html(url)
    when /.+\.zip$/ then download_zip(url)
    end
  end

  # Fetches the HTML page at +url+, downloads every image it references
  # into DIST_IMAGES, converts the page with html2xhtml and writes it to
  # DIST_TEXT with an ".xhtml" extension.
  def download_html(url)
    contents, filename = get_html(url)
    base_url = File.dirname(url)
    image_links(contents).each do |image_file|
      get_image("#{base_url}/#{image_file}", "#{DIST_IMAGES}/#{File.basename(image_file)}")
    end
    # Anchored so that only the trailing extension is rewritten.
    outfile = "#{DIST_TEXT}/#{filename.sub(/\.html\z/, '.xhtml')}"
    #generate_contents_file(contents, outfile)
    writefile(html2xhtml(contents), outfile)
  end

  # Fetches the zip archive at +url+, writes each entry into DIST_TEXT and
  # removes the temporary zip file that get_text left in the current
  # directory.
  def download_zip(url)
    contents, filenames = get_text(url)
    begin
      filenames.each_with_index do |filename, idx|
        # Directory entries carry no text to write.
        next if filename.end_with?('/')

        # Entry names come from a downloaded archive; keep only the base
        # name so an entry such as "../../x" cannot escape DIST_TEXT.
        #generate_contents_file(contents[idx], outfile)
        writefile(contents[idx], "#{DIST_TEXT}/#{File.basename(filename)}")
      end
    ensure
      # Remove the temporary archive even when writing an entry failed.
      File.delete(File.basename(url))
    end
  end

  # Ensures DIST_TEXT and DIST_IMAGES exist as directories. If either path
  # exists but is not a directory, prints a message for each offender and
  # exits with a non-zero status without creating anything.
  def prep_dist
    targets = [DIST_TEXT, DIST_IMAGES]
    blocked = targets.select { |dist| File.exist?(dist) && !File.directory?(dist) }
    unless blocked.empty?
      blocked.each { |dist| puts " please modify '#{File.basename(dist)}' as a directory." }
      exit(1)
    end
    targets.each { |dist| Dir.mkdir(dist) unless File.exist?(dist) }
  end

  # Returns [lines, filename]: the page body split on "\n" and passed
  # through convert_utf8, and the base name of +url+.
  def get_html(url)
    content, filename = get_web_content(url)
    [convert_utf8(content.split("\n")), filename]
  end

  # Downloads +url+ and stores the raw bytes in +outfile+.
  def get_image(url, outfile)
    content, = get_web_content(url)
    # IO#write stores the payload verbatim; puts could append a newline.
    File.open(outfile, 'wb') { |file| file.write(content) }
  end

  # Returns the distinct src values of self-closing <img ... /> tags found
  # in +contents+ (an array of lines), in order of first appearance.
  def image_links(contents)
    sources = contents.flat_map do |line|
      line.split(/</).map { |fragment| fragment[/img\s+src\s*=\s*\"(.+?)\".*?\/>/, 1] }
    end
    sources.compact.uniq
  end

  # Downloads the zip at +url+ into the current directory (named after the
  # URL's base name) and returns [contents, filenames] from
  # extract_zipfile. The caller is responsible for deleting the zip file.
  def get_text(url)
    content, zipfile = get_web_content(url)
    # File.open rather than Kernel#open: the name is derived from the URL
    # and Kernel#open would treat a leading "|" as a command to run.
    File.open(zipfile, 'wb') { |file| file.write(content) }
    extract_zipfile(zipfile)
  end
end

8 changes: 8 additions & 0 deletions aozorabunko_head.xhtml
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="LANGUAGE" xml:lang="LANGUAGE">
<head>
<title>TITLE</title>
<link href="../styles/style.css" rel="stylesheet" type="text/css" />
</head>

61 changes: 61 additions & 0 deletions aozorabunko_html2xhtml.rb
@@ -0,0 +1,61 @@
#!/usr/bin/env ruby
# 2012.06.17

# HTML-to-XHTML conversion for downloaded pages. Expects a readfile(path)
# method returning an array of lines to be available on the instance; it
# is not defined in this part of the class.
class AozoraBunko
  HEADFILE = File.dirname(__FILE__) + '/aozorabunko_head.xhtml'

  # Returns a new array: the generated head lines followed by the cleaned
  # body lines of +contents+ (an array of HTML lines).
  def html2xhtml(contents)
    generate_head(contents) + eliminate(contents)
  end

  # Returns the lines of +contents+ from the first line containing
  # "<body>" onwards, with anchors unwrapped, script elements removed,
  # image paths redirected to ../images/, "&nbsp;" turned into a space and
  # <rb> tags stripped. The input lines are left unmodified.
  def eliminate(contents)
    in_body = false
    contents.each_with_object([]) do |line, result|
      in_body ||= line.include?('<body>')
      next unless in_body

      result << line.gsub(/<a\s+href.+?>(.+?)<\/a>/, '\1')
                    .gsub(/<script\s+.+?<\/script>/, '')
                    .gsub(/(<img\s+src=\").+\/(.+?\")(.+?\s*\/>)/, '\1../images/\2\3')
                    .gsub(/&nbsp;/, ' ')
                    .gsub(/<\/?rb>/, '')
    end
  end

  # Builds the XHTML head from the HEADFILE template, replacing the
  # LANGUAGE and TITLE placeholders with values scanned from the lines of
  # +contents+ that precede "</head>". A value that is not found is
  # substituted as an empty string.
  def generate_head(contents)
    language = ''
    title = ''
    contents.each do |line|
      break if line =~ /<\/head>/

      # Only the first matching pattern is applied to each line.
      case line
      when /lang=\"(.+?)\"/ then language = Regexp.last_match(1)
      when /title\"\s+content=\"(.+?)\"/i then title = Regexp.last_match(1)
      when /<title>(.+?)<\/title>/ then title = Regexp.last_match(1)
      end
    end
    readfile(HEADFILE).map do |line|
      # Block form inserts the value literally, so a backslash in the page
      # title is not interpreted as a backreference.
      line.gsub(/LANGUAGE/) { language }.gsub(/TITLE/) { title }
    end
  end
end

65 changes: 65 additions & 0 deletions support.rb
@@ -0,0 +1,65 @@
#!/usr/bin/env ruby
# 2012.05.16

require 'rubygems'
require 'kconv'
require 'mechanize'
require 'zipruby'

# File, network and encoding helpers meant to be mixed into other classes.
module Support
  # Reads +filename+ and returns its lines as an array with the trailing
  # line terminators removed.
  def readfile(filename)
    File.open(filename) { |file| file.each_line.map(&:chomp) }
  end

  # Writes each element of +content+ to +filename+ on its own line,
  # replacing any existing file. Returns +content+.
  def writefile(content, filename)
    File.open(filename, 'w') do |file|
      content.each { |line| file.puts line }
    end
  end

  # Fetches +url+ with Mechanize and returns [body, filename], where
  # filename is the base name of the URL. When the request raises a
  # StandardError, prints a diagnostic and exits with a non-zero status.
  def get_web_content(url)
    filename = File.basename(url)
    begin
      [Mechanize.new.get(url).body, filename]
    rescue StandardError => e
      puts ' maybe network has some trouble....'
      puts " #{url}"
      puts " #{e.class}: #{e.message}"
      exit(1)
    end
  end

  # Returns a new array with every element of +lines+ converted to UTF-8
  # by Kconv.toutf8.
  def convert_utf8(lines)
    lines.map do |line|
      # NOTE(review): the pattern is empty as it appears here, which makes
      # this substitution a no-op; it may originally have held an invisible
      # character (e.g. a BOM or CR) -- confirm against the original file.
      line.sub!(//, '')
      Kconv.toutf8(line)
    end
  end

  # Opens +zipfile+ and returns [contents, filenames]: for every entry, its
  # name and its data as an array of UTF-8 lines (line terminators kept).
  def extract_zipfile(zipfile)
    filenames = []
    contents = []
    Zip::Archive.open(zipfile) do |archive|
      archive.num_files.times do |index|
        archive.fopen(archive.get_name(index)) do |entry|
          filenames.push(entry.name)
          # convert_utf8 iterates its argument; String#each was removed in
          # Ruby 1.9, so split the entry into lines explicitly.
          contents.push(convert_utf8(entry.read.lines.to_a))
        end
      end
    end
    [contents, filenames]
  end
end

Expand Down

0 comments on commit 702c311

Please sign in to comment.