Permalink
Browse files

The definitions of get_web_content and eliminate are modified

  • Loading branch information...
1 parent 702c311 commit 20a5f0efdf1453f79572c9ca47085c22456e7f43 crepusculum committed Jul 24, 2012
Showing with 25 additions and 11 deletions.
  1. +23 −8 aozorabunko.rb
  2. +1 −1 aozorabunko_html2xhtml.rb
  3. +1 −2 support.rb
View
@@ -68,21 +68,31 @@ def prep_dist
def get_html(url)
content, filename = get_web_content(url)
- return convert_utf8(content.split("\n")), filename
+ unless content.nil? then
+ return convert_utf8(content.split("\n")), filename
+ else
+ puts " please try again."
+ exit(0)
+ end
end
def get_image(url, outfile)
content, filename = get_web_content(url)
- open(outfile, 'wb') do |file|
- file.puts(content)
+ unless content.nil? then
+ open(outfile, 'wb') do |file|
+ file.puts(content)
+ end
+ else
+ puts " please try again."
+ exit(0)
end
end
def image_links(contents)
image_files = Array.new
contents.each do |line|
line.split(/</).each do |inps|
- if /img\s+src\s*=\s*\"(.+?)\".*?\/>/ =~ inps then
+ if /img.+?src\s*=\s*\"(.+?)\".*?\/>/ =~ inps then
image_dist = $1
unless image_files.include?(image_dist) then
image_files.push(image_dist)
@@ -95,11 +105,16 @@ def image_links(contents)
def get_text(url)
content, zipfile = get_web_content(url)
- open(zipfile, 'wb') do |file|
- file.puts(content)
+ unless content.nil? then
+ open(zipfile, 'wb') do |file|
+ file.puts(content)
+ end
+ contents, filenames = extract_zipfile(zipfile)
+ return contents, filenames
+ else
+ puts " please try again."
+ exit(0)
end
- contents, filenames = extract_zipfile(zipfile)
- return contents, filenames
end
end
@@ -25,7 +25,7 @@ def eliminate(contents)
if switch_body == 'on' then
line.gsub!(/<a\s+href.+?>(.+?)<\/a>/, '\1')
line.gsub!(/<script\s+.+?<\/script>/, '')
- line.gsub!(/(<img\s+src=\").+\/(.+?\")(.+?\s*\/>)/, '\1../images/\2\3')
+ line.gsub!(/(<img.+?src=\").+\/(.+?\")(.+?\s*\/>)/, 'img src="../images/\2\3')
line.gsub!(/&nbsp;/, ' ')
line.gsub!(/<\/?rb>/, '')
result.push(line)
View
@@ -30,12 +30,11 @@ def get_web_content(url)
agent = Mechanize.new
begin
content = agent.get(url).body
- return content, filename
rescue
puts ' maybe network has some trouble....'
puts " #{url}"
- exit(0)
end
+ return content, filename
end
def convert_utf8(lines)

0 comments on commit 20a5f0e

Please sign in to comment.