Skip to content

Commit

Permalink
When sanitizing elements add a space when replacing block element suc…
Browse files Browse the repository at this point in the history
…h as br with its text.
  • Loading branch information
libc authored and Andrew Cantino committed Oct 1, 2010
1 parent 3c8ab69 commit 99ecb33
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
14 changes: 12 additions & 2 deletions lib/readability.rb
Expand Up @@ -248,10 +248,15 @@ def sanitize(node, candidates, options = {})

# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
# We'll add whitespace instead of block elements,
# so a<br>b will have a nice space between them
base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]

# Use a hash for speed (don't want to make a million calls to include?)
whitelist = Hash.new
base_whitelist.each {|tag| whitelist[tag] = true }
replace_with_whitespace = Hash[base_replace_with_whitespace.map { |tag| [tag, true] }]

([node] + node.css("*")).each do |el|

# If element is in whitelist, delete all its attributes
Expand All @@ -260,13 +265,18 @@ def sanitize(node, candidates, options = {})

# Otherwise, replace the element with its contents
else
el.swap(el.text)
if replace_with_whitespace[el.node_name]
# Adding &nbsp; here, because swap removes regular spaces
el.swap('&nbsp;' << el.text << '&nbsp;')
else
el.swap(el.text)
end
end

end

# Get rid of duplicate whitespace
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t  ]+/, " ")
end

def clean_conditionally(node, candidates, selector)
Expand Down
20 changes: 19 additions & 1 deletion spec/readability_spec.rb
Expand Up @@ -178,7 +178,25 @@
@doc.content.should_not match("sidebar")
end
end


# Spec group exercising a document whose single paragraph mixes plain text
# with <br>, <hr> and <address> elements.
describe "inserting space for block elements" do
before do
# Build the document with :min_text_length => 0 and :retry_length => 1.
# NOTE(review): the fixture paragraph ends with "f/p>" rather than "f</p>" --
# this looks like a typo in the markup; confirm before relying on how it parses.
@doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
<html><head><title>title!</title></head>
<body>
<div>
<p>a<br>b<hr>c<address>d</address>f/p>
</div>
</body>
</html>
HTML
end

# NOTE(review): the description below mentions a sidebar, which this fixture
# does not contain -- presumably copied from another example. The negated
# matcher (should_not) also looks at odds with the group's name; confirm the
# intended expectation.
it "should not return the sidebar" do
@doc.content.should_not match("a b c d f")
end
end

describe "outputs good stuff for known documents" do
before do
@html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")
Expand Down

0 comments on commit 99ecb33

Please sign in to comment.