Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
crepusculum
committed
Jul 7, 2012
1 parent
3716cf5
commit 702c311
Showing
4 changed files
with
239 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/env ruby | ||
# 2012.06.15 | ||
|
||
require File.dirname(__FILE__) + '/support' | ||
require File.dirname(__FILE__) + '/aozorabunko_html2xhtml' | ||
#require File.dirname(__FILE__) + '/aozorabunko_contents' | ||
|
||
class AozoraBunko | ||
include Support | ||
|
||
DIST_TEXT = './text' | ||
DIST_IMAGES = './images' | ||
|
||
# Downloads the resource at +url+ into the output directories.
#
# prep_dist always runs first. The URL is then handed to download_html
# when it ends in ".html", or to download_zip when it ends in ".zip".
#
# @param url [String] address of an HTML page or a zip archive
# @return [Object, nil] the selected handler's result, or nil when the URL
#   matches neither pattern
def download(url)
  prep_dist
  handlers = {
    /.+\.html$/ => :download_html,
    /.+\.zip$/ => :download_zip
  }
  # Regexp#=== is the comparison a case/when would perform on each pattern.
  _pattern, handler = handlers.find { |pattern, _name| pattern === url }
  send(handler, url) if handler
end
|
||
# Downloads one HTML page plus every image it references, then stores the
# page as XHTML.
#
# Image links are resolved against the directory part of +url+ and each
# image is saved under DIST_IMAGES using only its base name. The page is
# written under DIST_TEXT with the first ".html" in its file name replaced
# by ".xhtml".
#
# @param url [String] address of the HTML page
# @return [Object] whatever writefile returns
def download_html(url)
  contents, filename = get_html(url)
  base_url = File.dirname(url)

  image_links(contents).each do |link|
    get_image("#{base_url}/#{link}", "#{DIST_IMAGES}/#{File.basename(link)}")
  end

  target = "#{DIST_TEXT}/#{filename.sub(/\.html/, '.xhtml')}"
  #generate_contents_file(contents, target)
  writefile(html2xhtml(contents), target)
end
|
||
# Downloads a zip archive, writes each extracted entry under DIST_TEXT and
# removes the downloaded archive afterwards.
#
# Entry names and entry bodies are paired by position in the two lists
# returned by get_text. The archive is deleted by the base name of +url+,
# relative to the current working directory.
#
# @param url [String] address of the zip archive
# @return [Integer] result of File.delete
def download_zip(url)
  contents, filenames = get_text(url)

  filenames.each_index do |position|
    target = "#{DIST_TEXT}/#{filenames[position]}"
    #generate_contents_file(contents[position], target)
    writefile(contents[position], target)
  end

  archive = File.basename(url)
  File.delete(archive)
end
|
||
# Makes sure the output directories DIST_TEXT and DIST_IMAGES exist.
#
# Every target is inspected before anything is created. If any target
# exists but is not a directory, a message is printed for each such entry
# and the process exits without creating anything. Otherwise each missing
# directory is created.
#
# @return [Array<String>] the directories that were missing and got created
def prep_dist
  targets = [DIST_TEXT, DIST_IMAGES]

  # File.exist? is used because File.exists? was removed in Ruby 3.2.
  missing = targets.reject { |dist| File.exist?(dist) }
  blocked = targets.select { |dist| File.exist?(dist) && !File.directory?(dist) }

  unless blocked.empty?
    blocked.each do |dist|
      puts " please modify '#{File.basename(dist)}' as a directory."
    end
    # NOTE(review): this is a failure path but the exit status is 0; kept
    # unchanged so existing callers see the same status.
    exit(0)
  end

  missing.each { |dirname| Dir.mkdir(dirname) }
end
|
||
# Fetches a page and returns it as converted lines plus its file name.
#
# The body returned by get_web_content is split on "\n" and the resulting
# lines are passed through convert_utf8.
#
# @param url [String] address of the page
# @return [Array] two elements: the result of convert_utf8, then the file
#   name reported by get_web_content
def get_html(url)
  body, filename = get_web_content(url)
  lines = body.split("\n")
  [convert_utf8(lines), filename]
end
|
||
# Downloads the image at +url+ and stores its bytes in +outfile+.
#
# The data is written with IO#write so the file holds exactly the bytes
# that were downloaded; IO#puts would append a newline to binary data that
# does not already end in one. File.open is used instead of Kernel#open so
# that the path is always treated as a plain file name.
#
# @param url [String] address of the image
# @param outfile [String] path of the file to create or overwrite
# @return [Integer] number of bytes written
def get_image(url, outfile)
  content, = get_web_content(url)
  File.open(outfile, 'wb') { |file| file.write(content) }
end
|
||
# Collects the distinct image sources referenced in +contents+.
#
# Each line is cut at every "<" and each fragment is checked for an
# img tag with a double-quoted src attribute that is closed by "/>".
# Sources are returned in order of first appearance, without duplicates.
#
# @param contents [Array<String>] lines of the HTML page
# @return [Array<String>] the src values that were found
def image_links(contents)
  img_pattern = /img\s+src\s*=\s*\"(.+?)\".*?\/>/

  sources = contents.flat_map do |line|
    line.split(/</).map { |fragment| fragment[img_pattern, 1] }
  end

  sources.compact.uniq
end
|
||
# Downloads a zip archive, saves it locally and extracts it.
#
# The archive is saved under the file name reported by get_web_content and
# then handed to extract_zipfile. The bytes are written with IO#write so the
# saved archive is identical to the download; IO#puts would append a newline
# to data that does not already end in one. File.open is used instead of
# Kernel#open so that the name is always treated as a plain file path.
#
# @param url [String] address of the zip archive
# @return [Array] two elements, the contents and the file names returned by
#   extract_zipfile
def get_text(url)
  content, zipfile = get_web_content(url)
  File.open(zipfile, 'wb') { |file| file.write(content) }

  contents, filenames = extract_zipfile(zipfile)
  [contents, filenames]
end
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE html> | ||
<html xmlns="http://www.w3.org/1999/xhtml" lang="LANGUAGE" xml:lang="LANGUAGE"> | ||
<head> | ||
<title>TITLE</title> | ||
<link href="../styles/style.css" rel="stylesheet" type="text/css" /> | ||
</head> | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
#!/usr/bin/env ruby | ||
# 2012.06.17 | ||
|
||
class AozoraBunko | ||
HEADFILE = File.dirname(__FILE__) + '/aozorabunko_head.xhtml' | ||
|
||
# Converts the lines of a downloaded HTML page into XHTML lines.
#
# The output is the head produced by generate_head followed by the body
# lines produced by eliminate, in that order.
#
# @param contents [Array<String>] lines of the HTML page
# @return [Array<String>] lines of the XHTML document
def html2xhtml(contents)
  head_lines = generate_head(contents)
  body_lines = eliminate(contents)
  head_lines + body_lines
end
|
||
# Returns the cleaned-up lines from the "<body>" line onwards.
#
# Everything before the first line containing "<body>" is dropped. Each
# kept line is transformed as follows:
# * anchors with an href are replaced by their inner text
# * script elements that carry attributes are removed
# * every img src is pointed at ../images/ followed by its base name
# * rb tags are removed, keeping their content
#
# The input strings are not modified, so frozen lines are accepted.
#
# @param contents [Array<String>] lines of the HTML page
# @return [Array<String>] new strings, one per kept line
def eliminate(contents)
  in_body = false

  contents.each_with_object([]) do |line, kept|
    in_body ||= line.include?('<body>')
    next unless in_body

    cleaned = line.gsub(/<a\s+href.+?>(.+?)<\/a>/, '\1')
    cleaned = cleaned.gsub(/<script\s+.+?<\/script>/, '')
    # The directory part is matched with [^"]* so the pattern cannot run
    # past the closing quote; several images on one line are each rewritten,
    # and a src without any directory is rewritten too.
    cleaned = cleaned.gsub(/(<img\s+src=")(?:[^"]*\/)?([^"\/]+")/, '\1../images/\2')
    # NOTE(review): reproduced as it appears in the source listing; the
    # pattern may originally have held a non-ASCII space -- confirm.
    cleaned = cleaned.gsub(/ /, ' ')
    cleaned = cleaned.gsub(/<\/?rb>/, '')
    kept << cleaned
  end
end
|
||
# Builds the XHTML head by filling in the template stored in HEADFILE.
#
# Lines of +contents+ are scanned up to the first line containing "</head>".
# The language comes from a lang="..." attribute; the title comes from a
# title"/content="..." attribute pair or from a <title> element. When a
# value occurs more than once the last occurrence wins. Both values default
# to an empty string, so a page without a title still yields a head.
#
# In the template every LANGUAGE is replaced by the language and every
# TITLE by the title.
#
# @param contents [Array<String>] lines of the HTML page
# @return [Array<String>] the template lines with placeholders filled in
def generate_head(contents)
  language = ''
  title = ''

  contents.each do |line|
    break if line =~ /<\/head>/

    case line
    when /lang=\"(.+?)\"/
      language = Regexp.last_match(1)
    when /title\"\s+content=\"(.+?)\"/i
      title = Regexp.last_match(1)
    when /<title>(.+?)<\/title>/
      title = Regexp.last_match(1)
    end
  end

  # Block form inserts the values literally, so a backslash in a title is
  # not interpreted as a back-reference.
  readfile(HEADFILE).map do |line|
    line.gsub(/LANGUAGE/) { language }.gsub(/TITLE/) { title }
  end
end
end | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters