diff --git a/lib/readability.rb b/lib/readability.rb
index b2e8465..a7f7ad5 100644
--- a/lib/readability.rb
+++ b/lib/readability.rb
@@ -248,10 +248,15 @@ def sanitize(node, candidates, options = {})
# We'll sanitize all elements using a whitelist
base_whitelist = @options[:tags] || %w[div p]
+ # We'll add whitespace instead of block elements,
+ # so a<br>b will have a nice space between them
+ base_replace_with_whitespace = %w[br hr h1 h2 h3 h4 h5 h6 dl dd ol li ul address blockquote center]
# Use a hash for speed (don't want to make a million calls to include?)
whitelist = Hash.new
base_whitelist.each {|tag| whitelist[tag] = true }
+ replace_with_whitespace = Hash[base_replace_with_whitespace.map { |tag| [tag, true] }]
+
([node] + node.css("*")).each do |el|
# If element is in whitelist, delete all its attributes
@@ -260,13 +265,18 @@ def sanitize(node, candidates, options = {})
# Otherwise, replace the element with its contents
else
- el.swap(el.text)
+ if replace_with_whitespace[el.node_name]
+ # Adding here, because swap removes regular spaces
+ el.swap(' ' << el.text << ' ')
+ else
+ el.swap(el.text)
+ end
end
end
# Get rid of duplicate whitespace
- node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ").gsub(/ /, " ")
+ node.to_html.gsub(/[\r\n\f]+/, "\n" ).gsub(/[\t ]+/, " ")
end
def clean_conditionally(node, candidates, selector)
diff --git a/spec/readability_spec.rb b/spec/readability_spec.rb
index 0ad425f..5021a7a 100644
--- a/spec/readability_spec.rb
+++ b/spec/readability_spec.rb
@@ -178,7 +178,25 @@
@doc.content.should_not match("sidebar")
end
end
-
+
+ describe "inserting space for block elements" do
+ before do
+ @doc = Readability::Document.new(<<-HTML, :min_text_length => 0, :retry_length => 1)
+
a
b