diff --git a/lib/readability.rb b/lib/readability.rb
index b2e8465..a7f7ad5 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -248,10 +248,15 @@ def sanitize(node, candidates, options = {})
 
       # We'll sanitize all elements using a whitelist
       base_whitelist = @options[:tags] || %w[div p]
+      # We'll add whitespace instead of block elements,
+      # so a<br>b will have a nice space between them
+      base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
 
       # Use a hash for speed (don't want to make a million calls to include?)
       whitelist = Hash.new
       base_whitelist.each {|tag| whitelist[tag] = true }
+      replace_with_whitespace = Hash[base_replace_with_whitespace.map { |tag| [tag, true] }]
+
       ([node] + node.css("*")).each do |el|
 
         # If element is in whitelist, delete all its attributes
@@ -260,13 +265,18 @@ def sanitize(node, candidates, options = {})
 
           # Otherwise, replace the element with its contents
         else
-          el.swap(el.text)
+          if replace_with_whitespace[el.node_name]
+            # Adding &nbsp; here, because swap removes regular spaces
+            el.swap('&nbsp;' << el.text << '&nbsp;')
+          else
+            el.swap(el.text)
+          end
         end
 
       end
 
       # Get rid of duplicate whitespace
-      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/&nbsp;/, " ")
+      node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t  ]+/, " ")
     end
 
     def clean_conditionally(node, candidates, selector)
diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
index 0ad425f..5021a7a 100644
--- a/spec/readability_spec.rb
+++ b/spec/readability_spec.rb
@@ -178,7 +178,25 @@
       @doc.content.should_not match("sidebar")
     end
   end
-  
+
+  describe "inserting space for block elements" do
+    before do
+      @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
+        <html><head><title>title!</title></head>
+          <body>
+            <div>
+              <p>a<br>b<hr>c<address>d</address>f/p>
+            </div>
+          </body>
+        </html>
+      HTML
+    end
+
+    it "should not return the sidebar" do
+      @doc.content.should_not match("a b c d f")
+    end
+  end
+
   describe "outputs good stuff for known documents" do
     before do
       @html_files = Dir.glob(File.dirname(__FILE__) + "/fixtures/samples/*.html")