Permalink
Browse files

first commit of aozorabunko.rb

  • Loading branch information...
1 parent 3716cf5 commit 702c311fd1513f3a3e86f309c36f09bc6b41a12a crepusculum committed Jul 7, 2012
Showing with 239 additions and 0 deletions.
  1. +105 −0 aozorabunko.rb
  2. +8 −0 aozorabunko_head.xhtml
  3. +61 −0 aozorabunko_html2xhtml.rb
  4. +65 −0 support.rb
View
105 aozorabunko.rb
@@ -0,0 +1,105 @@
+#!/usr/bin/env ruby
+# 2012.06.15
+
+require File.dirname(__FILE__) + '/support'
+require File.dirname(__FILE__) + '/aozorabunko_html2xhtml'
+#require File.dirname(__FILE__) + '/aozorabunko_contents'
+
# Downloads a document given by URL (an .html page or a .zip archive) and
# stores the converted result under DIST_TEXT, with any images referenced by
# an HTML page stored under DIST_IMAGES.
class AozoraBunko
  include Support

  # Output directory for converted text / XHTML files.
  DIST_TEXT = './text'
  # Output directory for images referenced by downloaded HTML pages.
  DIST_IMAGES = './images'

  # Entry point: prepares the output directories, then dispatches on the URL
  # suffix. URLs ending in neither ".html" nor ".zip" are ignored.
  #
  # @param url [String] address of the page or archive to fetch
  def download(url)
    prep_dist
    case url
    when /.+\.html$/ then download_html(url)
    when /.+\.zip$/ then download_zip(url)
    end
  end

  # Fetches an HTML page, downloads every image it references into
  # DIST_IMAGES and writes the converted page to DIST_TEXT as ".xhtml".
  #
  # @param url [String] address of the HTML page
  def download_html(url)
    contents, filename = get_html(url)
    base_url = File.dirname(url)
    image_links(contents).each do |image_file|
      # Image references are resolved relative to the page's own directory.
      get_image("#{base_url}/#{image_file}", "#{DIST_IMAGES}/#{File.basename(image_file)}")
    end
    # Anchored so only the trailing extension is rewritten.
    outfile = "#{DIST_TEXT}/#{filename.sub(/\.html\z/, '.xhtml')}"
    # generate_contents_file(contents, outfile)  # disabled together with the aozorabunko_contents require
    writefile(html2xhtml(contents), outfile)
  end

  # Fetches a zip archive, writes each contained file to DIST_TEXT and removes
  # the temporary archive that get_text saved in the current directory.
  #
  # @param url [String] address of the zip archive
  def download_zip(url)
    contents, filenames = get_text(url)
    filenames.zip(contents).each do |filename, content|
      # Entry names come from the downloaded archive; keep only the base name
      # so an entry such as "../x" cannot be written outside DIST_TEXT.
      outfile = "#{DIST_TEXT}/#{File.basename(filename)}"
      # generate_contents_file(content, outfile)  # disabled together with the aozorabunko_contents require
      writefile(content, outfile)
    end
  ensure
    # Clean up the temporary archive even when extraction or writing failed.
    zipfile = File.basename(url)
    File.delete(zipfile) if File.exist?(zipfile)
  end

  # Ensures DIST_TEXT and DIST_IMAGES exist as directories. When either path
  # exists but is not a directory, a message is printed for each such path and
  # the process exits with a non-zero status without creating anything.
  def prep_dist
    targets = [DIST_TEXT, DIST_IMAGES]
    blocked = targets.select { |dist| File.exist?(dist) && !File.directory?(dist) }
    unless blocked.empty?
      blocked.each { |dist| puts " please modify '#{File.basename(dist)}' as a directory." }
      exit(1)
    end
    targets.each { |dist| Dir.mkdir(dist) unless File.exist?(dist) }
  end

  # Fetches a page and returns its lines converted by convert_utf8.
  #
  # @param url [String] address of the HTML page
  # @return [Array] two elements: the converted lines and the file name
  #   reported by get_web_content
  def get_html(url)
    content, filename = get_web_content(url)
    [convert_utf8(content.split("\n")), filename]
  end

  # Fetches an image and stores it byte-for-byte in +outfile+.
  #
  # @param url [String] address of the image
  # @param outfile [String] path of the file to create
  def get_image(url, outfile)
    content, _filename = get_web_content(url)
    # write, not puts: puts would append a newline to the binary payload.
    File.open(outfile, 'wb') { |file| file.write(content) }
  end

  # Collects the src value of every self-closing <img ... /> tag, in order of
  # first appearance and without duplicates.
  #
  # @param contents [Array<String>] lines of the HTML page
  # @return [Array<String>] distinct image references
  def image_links(contents)
    sources = contents.flat_map do |line|
      line.split(/</).map { |fragment| fragment[/img\s+src\s*=\s*\"(.+?)\".*?\/>/, 1] }
    end
    sources.compact.uniq
  end

  # Fetches a zip archive, saves it in the current directory under the name
  # reported by get_web_content and extracts it.
  #
  # @param url [String] address of the zip archive
  # @return [Array] two elements: the extracted contents and the entry names,
  #   as returned by extract_zipfile
  def get_text(url)
    content, zipfile = get_web_content(url)
    # write, not puts: puts would append a newline to the binary payload.
    File.open(zipfile, 'wb') { |file| file.write(content) }
    extract_zipfile(zipfile)
  end
end
+
View
8 aozorabunko_head.xhtml
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html>
+<html xmlns="http://www.w3.org/1999/xhtml" lang="LANGUAGE" xml:lang="LANGUAGE">
+<head>
+ <title>TITLE</title>
+ <link href="../styles/style.css" rel="stylesheet" type="text/css" />
+</head>
+
View
61 aozorabunko_html2xhtml.rb
@@ -0,0 +1,61 @@
+#!/usr/bin/env ruby
+# 2012.06.17
+
# HTML-to-XHTML conversion: builds a new head from the HEADFILE template and
# appends the cleaned-up body of the downloaded page.
class AozoraBunko
  # Template file whose LANGUAGE and TITLE placeholders are filled in by
  # generate_head.
  HEADFILE = File.dirname(__FILE__) + '/aozorabunko_head.xhtml'

  # Builds the XHTML document for a downloaded page.
  #
  # @param contents [Array<String>] lines of the original HTML page
  # @return [Array<String>] generated head lines followed by cleaned body lines
  def html2xhtml(contents)
    generate_head(contents) + eliminate(contents)
  end

  # Returns the lines from the opening body tag onwards with links unwrapped,
  # scripts and <rb> tags removed, image paths redirected to ../images/ and
  # &nbsp; replaced by a space. The input lines are left unmodified.
  #
  # @param contents [Array<String>] lines of the original HTML page
  # @return [Array<String>] cleaned lines, starting at the body tag's line
  def eliminate(contents)
    in_body = false
    contents.each_with_object([]) do |line, result|
      # \b instead of a literal ">" so a body tag carrying attributes also counts.
      in_body = true if line =~ /<body\b/
      next unless in_body

      # Non-destructive gsub: callers keep their original lines, and frozen
      # strings are accepted.
      cleaned = line.gsub(/<a\s+href.+?>(.+?)<\/a>/, '\1')
      cleaned = cleaned.gsub(/<script\s+.+?<\/script>/, '')
      cleaned = cleaned.gsub(/(<img\s+src=\").+\/(.+?\")(.+?\s*\/>)/, '\1../images/\2\3')
      cleaned = cleaned.gsub(/&nbsp;/, ' ')
      cleaned = cleaned.gsub(/<\/?rb>/, '')
      result.push(cleaned)
    end
  end

  # Fills the HEADFILE template with the language and title found in the
  # lines preceding </head>. A value that is not found is substituted as an
  # empty string.
  #
  # @param contents [Array<String>] lines of the original HTML page
  # @return [Array<String>] template lines with the placeholders replaced
  def generate_head(contents)
    # Both default to '' so a page without a title no longer hands nil to gsub.
    language = ''
    title = ''
    contents.each do |line|
      break if line =~ /<\/head>/

      case line
      when /lang=\"(.+?)\"/ then language = Regexp.last_match(1)
      when /title\"\s+content=\"(.+?)\"/i then title = Regexp.last_match(1)
      when /<title>(.+?)<\/title>/ then title = Regexp.last_match(1)
      end
    end
    readfile(HEADFILE).map do |line|
      # Block form inserts the value literally; a replacement string would
      # interpret backslash sequences such as \1 inside the title.
      line.gsub(/LANGUAGE/) { language }.gsub(/TITLE/) { title }
    end
  end
end
+
View
65 support.rb
@@ -0,0 +1,65 @@
+#!/usr/bin/env ruby
+# 2012.05.16
+
require 'rubygems'
require 'kconv'
require 'mechanize'
require 'zipruby'
+
# Helper methods for file I/O, HTTP download, UTF-8 conversion and zip
# extraction, intended to be mixed into another class.
module Support
  # Reads a file into an array of lines with the line terminators removed.
  #
  # @param filename [String] path of the file to read
  # @return [Array<String>] the file's lines
  def readfile(filename)
    # File.readlines rather than Kernel#open, which treats a leading "|" as a
    # command to run.
    File.readlines(filename).map(&:chomp)
  end

  # Writes each element of +content+ to +filename+ on its own line.
  #
  # @param content [#each] lines to write
  # @param filename [String] path of the file to create or overwrite
  def writefile(content, filename)
    File.open(filename, 'w') do |file|
      content.each { |line| file.puts line }
    end
  end

  # Downloads +url+ with Mechanize. On any StandardError during the request a
  # message is printed and the process exits with a non-zero status.
  #
  # @param url [String] address to fetch
  # @return [Array] two elements: the response body and the base name of +url+
  def get_web_content(url)
    filename = File.basename(url)
    agent = Mechanize.new
    begin
      [agent.get(url).body, filename]
    rescue StandardError
      puts ' maybe network has some trouble....'
      puts " #{url}"
      exit(1)
    end
  end

  # Converts text to UTF-8 line by line using Kconv.toutf8. Accepts either an
  # array of lines or a single String, which is split into lines first. A
  # line terminator (LF, optionally preceded by CR) is removed from each line
  # before conversion. The input is not modified.
  #
  # @param lines [Array<String>, String] text to convert
  # @return [Array<String>] converted lines
  def convert_utf8(lines)
    # A String has no #each on Ruby >= 1.9, so iterate over its lines instead.
    lines = lines.each_line if lines.is_a?(String)
    lines.map { |line| Kconv.toutf8(line.sub(/\r?\n/, '')) }
  end

  # Reads every entry of a zip archive.
  #
  # @param zipfile [String] path of the archive
  # @return [Array] two elements: an array holding each entry's converted
  #   lines, and an array of the entry names in the same order
  def extract_zipfile(zipfile)
    filenames = []
    contents = []
    Zip::Archive.open(zipfile) do |archive|
      archive.num_files.times do |index|
        archive.fopen(archive.get_name(index)) do |entry|
          filenames.push(entry.name)
          contents.push(convert_utf8(entry.read))
        end
      end
    end
    [contents, filenames]
  end
end

0 comments on commit 702c311

Please sign in to comment.