Skip to content

Commit

Permalink
The get_web_content and eliminate methods are modified
Browse files: browse the repository at this point in the history
  • Loading branch information
crepusculum committed Jul 24, 2012
1 parent 702c311 commit 20a5f0e
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 11 deletions.
31 changes: 23 additions & 8 deletions aozorabunko.rb
Expand Up @@ -68,21 +68,31 @@ def prep_dist

def get_html(url)
content, filename = get_web_content(url)
return convert_utf8(content.split("\n")), filename
unless content.nil? then
return convert_utf8(content.split("\n")), filename
else
puts " please try again."
exit(0)
end
end

def get_image(url, outfile)
content, filename = get_web_content(url)
open(outfile, 'wb') do |file|
file.puts(content)
unless content.nil? then
open(outfile, 'wb') do |file|
file.puts(content)
end
else
puts " please try again."
exit(0)
end
end

def image_links(contents)
image_files = Array.new
contents.each do |line|
line.split(/</).each do |inps|
if /img\s+src\s*=\s*\"(.+?)\".*?\/>/ =~ inps then
if /img.+?src\s*=\s*\"(.+?)\".*?\/>/ =~ inps then
image_dist = $1
unless image_files.include?(image_dist) then
image_files.push(image_dist)
Expand All @@ -95,11 +105,16 @@ def image_links(contents)

def get_text(url)
content, zipfile = get_web_content(url)
open(zipfile, 'wb') do |file|
file.puts(content)
unless content.nil? then
open(zipfile, 'wb') do |file|
file.puts(content)
end
contents, filenames = extract_zipfile(zipfile)
return contents, filenames
else
puts " please try again."
exit(0)
end
contents, filenames = extract_zipfile(zipfile)
return contents, filenames
end
end

2 changes: 1 addition & 1 deletion aozorabunko_html2xhtml.rb
Expand Up @@ -25,7 +25,7 @@ def eliminate(contents)
if switch_body == 'on' then
line.gsub!(/<a\s+href.+?>(.+?)<\/a>/, '\1')
line.gsub!(/<script\s+.+?<\/script>/, '')
line.gsub!(/(<img\s+src=\").+\/(.+?\")(.+?\s*\/>)/, '\1../images/\2\3')
line.gsub!(/(<img.+?src=\").+\/(.+?\")(.+?\s*\/>)/, 'img src="../images/\2\3')
line.gsub!(/&nbsp;/, ' ')
line.gsub!(/<\/?rb>/, '')
result.push(line)
Expand Down
3 changes: 1 addition & 2 deletions support.rb
Expand Up @@ -30,12 +30,11 @@ def get_web_content(url)
agent = Mechanize.new
begin
content = agent.get(url).body
return content, filename
rescue
puts ' maybe network has some trouble....'
puts " #{url}"
exit(0)
end
return content, filename
end

def convert_utf8(lines)
Expand Down

0 comments on commit 20a5f0e

Please sign in to comment.